# EDA

In [None]:
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

%load_ext autoreload
%autoreload 2

## 1. Data

In [None]:
# Step 0: locate the resources folder and read every parquet file it contains
DATA_PATH = Path('/home/jovyan/hfactory_magic_folders/polycarbonate_pricing/resources')

# Read parquets into a nested dict: {folder name: {file name: DataFrame}}.
# `iterdir()` already yields paths that include DATA_PATH, so each folder is
# iterated directly: re-joining it onto DATA_PATH only worked because the path
# is absolute and would duplicate the prefix for a relative DATA_PATH.
# Entries are sorted so the dict order does not depend on the file system.
DATAFRAMES = {
    folder.name: {
        data_file.name: pd.read_parquet(data_file)
        for data_file in sorted(folder.iterdir())
        if data_file.suffix == ".pq"
    }
    for folder in sorted(DATA_PATH.iterdir())
    if folder.is_dir()
}
# Each DataFrame is accessed through its folder name and file name, for example:
# df_pc_price_eu = DATAFRAMES["pc_price"]["pc_price_eu.pq"]

In [None]:
# Pick the "pc_price_eu.pq" frame from the "pc_price" folder and preview its first rows
df_pc_price_eu = DATAFRAMES["pc_price"]["pc_price_eu.pq"]
df_pc_price_eu.head()

In [None]:
# Columns holding one price series per EU supplier (computed once, used for
# both the best-price series and the per-supplier traces)
eu_supplier_cols = [col for col in df_pc_price_eu.columns if col.startswith("eu_supplier")]

# Best price = lowest supplier price on each date. The series is named
# explicitly instead of renaming the auto-generated column label `0`.
df_best_pc_price_eu = (
    df_pc_price_eu.set_index("date")[eu_supplier_cols]
    .min(axis=1)
    .rename("price_pc_eur_per_kg")
    .reset_index()
)

# Create figure for the best price
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_best_pc_price_eu["date"],
        y=df_best_pc_price_eu["price_pc_eur_per_kg"],
        mode="lines+markers",
        name="Best Price",
        marker={"size": 5, "color": "red"},
        line={"color": "red"},
    )
)

# Add one dashed, semi-transparent trace per EU supplier
for col in eu_supplier_cols:
    fig.add_trace(
        go.Scatter(
            x=df_pc_price_eu["date"],
            y=df_pc_price_eu[col],
            mode="lines",
            name=col,
            line={"dash": "dash", "width": 2},
            opacity=0.5,
        )
    )

fig.update_layout(
    title={
        # Typo fixed in the displayed title ("accross" -> "across")
        "text": "PolyCarbonate prices in EUR/Kg across SE suppliers, against time",
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
        "font": {"size": 20},
    },
    xaxis_title="Date",
    yaxis_title="Polycarbonate price (EUR/Kg)",
    width=1150,
    height=600,
    template="plotly",
)

# Render as a static SVG image
fig.show("svg")

In [None]:
# Turn the "Mon-YYYY" date labels into real timestamps
df_best_pc_price_eu = df_best_pc_price_eu.assign(
    date=pd.to_datetime(df_best_pc_price_eu["date"], format="%b-%Y")
)

In [None]:
# Display the best-price table after the date conversion
df_best_pc_price_eu

### 1.1 Electricity price

In [None]:
# Pick the per-country electricity price history and preview its first rows
df_elec_price = DATAFRAMES["electricity_price"]["electricity_price_history_per_country.pq"]
df_elec_price.head()

In [None]:
# Parse the "YYYY/MM" dates and render them as "Mon-YYYY" strings.
# The raw frame is re-read from DATAFRAMES and `assign` returns a new frame, so
# the cached raw data is left untouched and the cell can be re-run safely
# (re-parsing an already converted column would fail on the "%Y/%m" format).
df_elec_price = DATAFRAMES["electricity_price"]["electricity_price_history_per_country.pq"]
df_elec_price = df_elec_price.assign(
    date=pd.to_datetime(df_elec_price["date"], format="%Y/%m").dt.strftime("%b-%Y")
)

In [None]:
# Force the price column to numeric; values that cannot be parsed become NaN
elec_price_col = 'price - EUR/MWh (avg)'
df_elec_price[elec_price_col] = pd.to_numeric(df_elec_price[elec_price_col], errors='coerce')
df_elec_price

In [None]:
# Distinct countries present in the electricity price table
df_elec_price["country"].unique()

In [None]:
# Column dtypes and non-null counts of the electricity price table
df_elec_price.info()

In [None]:
# Summary statistics of the electricity price table
df_elec_price.describe()

In [None]:
# Sanity check: 18 * 30 = 540, presumably the expected number of rows
# (periods x countries) — TODO confirm against df_elec_price.shape
18*30

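There are 30 data points of electricity price for each country, one per month from 04/2023 to 09/2024. There are no missing values. **TODO**: 04/2023 to 09/2024 spans only 18 months, so confirm whether the 18 × 30 = 540 rows are 18 months × 30 countries or 30 months × 18 countries, and correct the count or the date range accordingly.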

In [None]:
# Plot the electricity price history, one line per country
fig, ax = plt.subplots(figsize=(12, 8))

for country in df_elec_price['country'].unique():
    country_data = df_elec_price[df_elec_price['country'] == country]
    ax.plot(country_data['date'], country_data['price - EUR/MWh (avg)'], label=country)

# Labels and title (the stray no-argument `plt.plot()` call, which drew nothing, was removed)
ax.set_xlabel('Date')
ax.set_ylabel('Electricity Price - EUR/MWh (avg)')
ax.set_title('Electricity Prices Over Time by Country')
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)  # Legend outside the plot area
ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels for better readability
fig.tight_layout()  # Make room for the rotated labels and the outside legend
plt.show()

### 1.2 Automobile industry

<u>Description</u>: <br>
History of the number of registered vehicles.<br>
<u>Data source</u>: <br>
Eurostat <br>
<u>Columns</u>:
- _freq_: Time frequency (Annual: [A])
- _unit_: Number [NR]
- _mot_nrg_: Motor energy
- _geo_: Geopolitical entity
- _2012 -> 2022_: history of the number of registered vehicles

In [None]:
# Pick the Eurostat vehicle table ("estat_road_eqr_busmot.pq") and display it
df_auto = DATAFRAMES["automobile_industry"]["estat_road_eqr_busmot.pq"]
df_auto

In [None]:
# Column dtypes and non-null counts of the vehicle table
df_auto.info()

In [None]:
# Remove the `freq` and `unit` columns.
# A new frame is bound instead of dropping in place, so the frame cached in
# DATAFRAMES is not modified, and `errors='ignore'` lets the cell be re-run
# without a KeyError once the columns are already gone.
df_auto = df_auto.drop(columns=['freq', 'unit'], errors='ignore')
df_auto

In [None]:
# List the distinct values of every object-typed column
for col, column_values in df_auto.select_dtypes(include='object').items():
    unique_values = column_values.unique()
    print(f"{len(unique_values)} unique values in column '{col}': {set(unique_values)}")

The values of motor energy type are:
- TOTAL: Likely represents the total count or sum of vehicles across different energy sources.
- ELC_DIE_PI: This could stand for Electric Diesel Plug-In Hybrid.
- HYD_FCELL: Likely stands for Hydrogen Fuel Cell.
- ALT: Could refer to Alternative Fuels, a broad category that may include any non-conventional fuel sources.
- CNG: Compressed Natural Gas.
- DIE_X_HYB: Diesel excluding hybrids (in Eurostat codes, `_X_` stands for "excluding").
- ELC_DIE_HYB: Electric Diesel Hybrid.
- PET: Petrol (also known as gasoline in some regions).
- LPG: Liquefied Petroleum Gas.
- LNG: Liquefied Natural Gas.
- GAS: Natural gas in the Eurostat code list (petrol is already covered by PET).
- DIE: Diesel.
- OTH: Other types of motor energy sources not listed explicitly.
- ELC: Electric.

The values of geopolitical entity are:

- AT - Austria
- BE - Belgium
- BG - Bulgaria
- CY - Cyprus
- CZ - Czech Republic
- DE - Germany
- DK - Denmark
- EE - Estonia
- ES - Spain
- FI - Finland
- FR - France
- HR - Croatia
- HU - Hungary
- IE - Ireland
- IT - Italy
- LT - Lithuania
- LU - Luxembourg
- LV - Latvia
- MT - Malta
- NL - Netherlands
- PL - Poland
- PT - Portugal
- RO - Romania
- SE - Sweden
- SI - Slovenia
- SK - Slovakia
- EL - Greece (EL is the code Eurostat uses for Greece; the ISO 3166 code is GR)

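The 27 codes listed above are the member states of the European Union. The data also contains other `geo` values, such as the aggregate `EU27_2020` and `AL`, which are used in the cells below.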

In [None]:
# Rows of the EU27_2020 aggregate
df_auto.loc[df_auto["geo"].eq("EU27_2020")]

In [None]:
# Rows whose geo code is AL
df_auto.loc[df_auto["geo"].eq("AL")]

In [None]:
# TOTAL motor-energy row(s) of the EU27_2020 aggregate
is_total = df_auto["mot_nrg"].eq('TOTAL')
is_eu27 = df_auto["geo"].eq('EU27_2020')
df_auto[is_total & is_eu27]

In [None]:
# Reshape the wide per-year table into one row per year and one column per
# "<geo>_<mot_nrg>" combination.
year_columns = [str(year) for year in range(2012, 2023)]  # "2012" ... "2022"

# Long format: one row per (mot_nrg, geo, year), plus the combined column key
df_melted = df_auto.melt(
    id_vars=['mot_nrg', 'geo'],
    value_vars=year_columns,
    var_name='date',
    value_name='nr_autos',
).assign(geo_mot_nrg=lambda long_df: long_df['geo'] + '_' + long_df['mot_nrg'])

# Wide format again, this time indexed by year
df_pivoted = df_melted.pivot(index='date', columns='geo_mot_nrg', values='nr_autos').reset_index()

# Year labels become timestamps (1 January of each year)
df_pivoted['date'] = pd.to_datetime(df_pivoted['date'], format='%Y')

# Preview a few of the resulting columns
df_pivoted[["date", "AL_GAS", "EU27_2020_TOTAL", "EU27_2020_ELC"]]

Since there are no pc price values for 2012, we drop the first row.

In [None]:
# Remove the 2012 row (the first row of the pivoted table).
# Filtering on the year instead of dropping `index[0]` in place keeps the cell
# idempotent: re-running it no longer removes one more year each time.
df_pivoted = df_pivoted[df_pivoted['date'].dt.year > 2012]
df_pivoted

In [None]:
# Number of missing entries in every column of the pivoted table
missing_values = df_pivoted.isna().sum()

# One histogram bin per integer count, from 0 up to the largest count observed
count_bins = range(missing_values.max() + 2)

plt.figure(figsize=(10, 6))
plt.hist(missing_values, bins=count_bins, color='skyblue', edgecolor='black')
plt.xlabel('Number of Missing Values')
plt.ylabel('Frequency (Number of Columns)')
plt.title('Histogram of Number of Missing Values per Column')
plt.grid(axis='y')
plt.show()

In [None]:
# Columns with fewer than 4 missing values
missing_values.loc[missing_values.lt(4)]

In [None]:
# Missing-value count for the FR_TOTAL column
missing_values.loc['FR_TOTAL']

In [None]:
# Missing-value count for the EU27_2020_ELC column
missing_values.loc['EU27_2020_ELC']

In [None]:
# Missing-value count for the EU27_2020_TOTAL column
missing_values.loc['EU27_2020_TOTAL']

Keep only the columns with at most 3 missing values, then linearly interpolate the remaining gaps.

In [None]:
# Keep the columns that have at most 3 missing values
missing_per_column = df_pivoted.isna().sum()
df_cleaned = df_pivoted.loc[:, missing_per_column.le(3)]
df_cleaned

In [None]:
# Fill gaps by linear interpolation. `limit_direction` is left at its default
# rather than 'both', so NaNs before the first valid value of a column are
# presumably left as they are — TODO confirm this is intended.
df_cleaned = df_cleaned.interpolate(method='linear')
df_cleaned

Merge best price data frame with automobiles data frame.

In [None]:
# Left-join the yearly vehicle counts onto the price table.
# The join key is spelled out so the merge cannot silently start using any
# other column name the two frames might come to share; dates that have no
# match in `df_cleaned` get NaN in the vehicle columns.
df_merged = pd.merge(df_best_pc_price_eu, df_cleaned, how='left', on='date')
df_merged

Linear interpolation on missing values.

In [None]:
# Fill the NaNs of the merged table by linear interpolation between known
# values. `limit_direction` is left at its default rather than 'both', so NaNs
# before the first valid value of a column are presumably left as they are —
# TODO confirm this is intended.
df = df_merged.interpolate(method='linear')
df

In [None]:
# Names of the columns that still contain at least one NaN
df.columns[df.isna().any()]

In [None]:
# Discard every column that still contains at least one NaN
df = df.dropna(axis="columns", how="any")
df

In [None]:
# Correlation of the PC price with every other column (the date is skipped);
# each pair is restricted to the rows where both values are present.
target_col = 'price_pc_eur_per_kg'
feature_cols = [col for col in df.columns if col not in (target_col, 'date')]

correlations = {}
for col in feature_cols:
    pair = df[[target_col, col]].dropna()
    correlations[col] = pair[target_col].corr(pair[col])

# Series form of the results for easy viewing
correlations_series = pd.Series(correlations)

# Order by absolute value (largest first) while keeping the sign
ranked_correlations = correlations_series.sort_values(
    key=lambda values: values.abs(), ascending=False
)

# Show only the correlations whose magnitude exceeds 0.5
print(ranked_correlations[ranked_correlations.abs() > 0.5])

In [None]:
# Correlation of the PC price with the EU27_2020_ELC column
ranked_correlations["EU27_2020_ELC"]

In [None]:
# Correlation of the PC price with the EU27_2020_TOTAL column
ranked_correlations["EU27_2020_TOTAL"]

In [None]:
# Group correlations by vehicle (motor-energy) type.
# `vehicle_types` is built here because the cell that otherwise defines it sits
# further down the notebook, so on a fresh kernel it would not exist yet.
# Codes are tried longest first so the most specific "_<code>" suffix wins.
mot_nrg_codes = sorted(df_auto["mot_nrg"].dropna().unique(), key=len, reverse=True)
vehicle_types = correlations_series.index.map(
    lambda label: next((code for code in mot_nrg_codes if label.endswith(f"_{code}")), None)
)
grouped_correlations = correlations_series.groupby(vehicle_types)

# Plot a histogram for each vehicle type.
# The loop variable is not called `correlations`, so the dict of that name
# built in the correlation cell is no longer overwritten.
for vehicle_type, type_correlations in grouped_correlations:
    plt.figure(figsize=(8, 5))
    # NaN correlations are dropped: they would leave the histogram range undefined
    plt.hist(type_correlations.dropna(), bins=10, color='skyblue', edgecolor='black', alpha=0.7, density=False)
    plt.title(f'Correlation Distribution for {vehicle_type}')
    plt.xlabel('Correlation Value')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

In [None]:
# Motor-energy codes present in the data, longest first so that a code which is
# itself the tail of a longer code can never win the match.
vehicle_types_list = sorted(df_auto["mot_nrg"].dropna().unique(), key=len, reverse=True)


def extract_vehicle_type(index_value):
    """Return the motor-energy code that a "<geo>_<mot_nrg>" label ends with.

    Parameters
    ----------
    index_value : str
        Label such as "EU27_2020_ELC".

    Returns
    -------
    str or None
        The first code of `vehicle_types_list` for which `index_value` ends
        with "_<code>", or None when no code matches.
    """
    for vehicle_type in vehicle_types_list:
        # The code is escaped so it is always matched literally at the end of the label
        if re.search(f'_{re.escape(vehicle_type)}$', index_value):
            return vehicle_type
    return None


# Vehicle type of every correlated column
vehicle_types = correlations_series.index.map(extract_vehicle_type)

# Average correlation per vehicle type
average_correlation_by_vehicle_type = correlations_series.groupby(vehicle_types).mean()

# Sort by magnitude of the average correlation while keeping the sign
average_correlation_sorted = average_correlation_by_vehicle_type.reindex(
    average_correlation_by_vehicle_type.abs().sort_values(ascending=False).index
)

print(average_correlation_sorted)


It seems that the correlation of the number of electric and alternative automobiles with the price of pc is higher than other types of automobiles.

Electric vehicles (EVs) often have different design and production needs compared to conventional vehicles. They can include lightweight materials (like plastic) for battery casing, interior components, and weight reduction for efficiency. Growth in EV production may lead to increased demand for lightweight materials like polycarbonate. Similar forces may be at play for alternative vehicles.

The correlation of the total number of automobiles with the price of pc is negative.

In [None]:
# PC price next to the EU-wide total and electric vehicle counts
df.loc[:, ["date", "price_pc_eur_per_kg", "EU27_2020_TOTAL", "EU27_2020_ELC"]]

In [None]:
# PC price (left axis) against the EU27_2020_TOTAL vehicle count (right axis)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Primary y-axis: PC price
ax1.set_xlabel('Date')
ax1.set_ylabel('Price PC (EUR/kg)', color='b')
ax1.plot(df['date'], df['price_pc_eur_per_kg'], color='b', marker='o', linestyle='-', label='Price PC (EUR/kg)')
ax1.tick_params(axis='y', labelcolor='b')
# Tick rotation and x-grid are set on ax1: twinx() hides the x-axis of the twin
# axes, which is what pyplot-level calls made after twinx() would target.
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='x')

# Secondary y-axis: number of vehicles
ax2 = ax1.twinx()
ax2.set_ylabel('Automobiles', color='g')
ax2.plot(df['date'], df['EU27_2020_TOTAL'], color='g', marker='x', linestyle='--', label='EU27_2020_TOTAL')
ax2.tick_params(axis='y', labelcolor='g')

# Title and a single legend covering both axes
fig.suptitle('PC Price and Automobile Data Over Time')
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.85))
fig.tight_layout()
plt.show()

In [None]:
# PC price (left axis) against FR_ALT, FR_ELC and EU27_2020_ELC (right axis)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Primary y-axis: PC price
ax1.set_xlabel('Date')
ax1.set_ylabel('Price PC (EUR/kg)', color='b')
ax1.plot(df['date'], df['price_pc_eur_per_kg'], color='b', marker='o', linestyle='-', label='Price PC (EUR/kg)')
ax1.tick_params(axis='y', labelcolor='b')
# Tick rotation and x-grid are set on ax1: twinx() hides the x-axis of the twin
# axes, which is what pyplot-level calls made after twinx() would target.
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='x')

# Secondary y-axis: one line per vehicle series, as (column, colour, marker, line style)
ax2 = ax1.twinx()
ax2.set_ylabel('Automobiles', color='g')
for column, colour, marker, line_style in [
    ('FR_ALT', 'g', 'x', '--'),
    ('FR_ELC', 'c', 'x', '--'),
    ('EU27_2020_ELC', 'r', '^', '-.'),
]:
    ax2.plot(df['date'], df[column], color=colour, marker=marker, linestyle=line_style, label=column)
ax2.tick_params(axis='y', labelcolor='g')

# Title and a single legend covering both axes
fig.suptitle('PC Price and Automobile Data Over Time')
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.85))
fig.tight_layout()
plt.show()

In [None]:
# PC price (left axis) against the FR_TOTAL vehicle count (right axis)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Primary y-axis: PC price
ax1.set_xlabel('Date')
ax1.set_ylabel('Price PC (EUR/kg)', color='b')
ax1.plot(df['date'], df['price_pc_eur_per_kg'], color='b', marker='o', linestyle='-', label='Price PC (EUR/kg)')
ax1.tick_params(axis='y', labelcolor='b')
# Tick rotation and x-grid are set on ax1: twinx() hides the x-axis of the twin
# axes, which is what pyplot-level calls made after twinx() would target.
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='x')

# Secondary y-axis: number of vehicles
ax2 = ax1.twinx()
ax2.set_ylabel('Automobiles', color='g')
ax2.plot(df['date'], df['FR_TOTAL'], color='g', marker='x', linestyle='--', label='FR_TOTAL')
ax2.tick_params(axis='y', labelcolor='g')

# Title and a single legend covering both axes
fig.suptitle('PC Price and Automobile Data Over Time')
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.85))
fig.tight_layout()
plt.show()

The number of total automobiles seems to be inversely correlated with the price of pc. This intuitively does not make sense: as the number of automobiles increases, there is more demand for plastic for their production which drives the price of pc up.

The number of electric automobiles seems to be positively correlated with the price of pc.

Feature engineering:
- Ratio of electric to total vehicles (electric / total)

Questions to ask Schneider:
- Are there any countries that you supply more to? (so the automobile industry in that country has more of an effect on the price of pc)
- 