# EDA

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

## 1. Data

In [None]:
# Step 0: locate the resources folder and read every parquet file it contains
DATA_PATH = Path("/home/jovyan/hfactory_magic_folders/polycarbonate_pricing/resources")


def _read_pq_files(folder):
    """Return a {file name: DataFrame} dict for each ".pq" file directly inside `folder`."""
    return {
        pq_file.name: pd.read_parquet(pq_file)
        for pq_file in folder.iterdir()
        if pq_file.suffix == ".pq"
    }


# One entry per sub-folder of DATA_PATH; entries that are not directories are skipped.
# `iterdir()` already yields paths that include DATA_PATH, so each sub-folder is used as-is.
DATAFRAMES = {
    subdir.name: _read_pq_files(subdir)
    for subdir in DATA_PATH.iterdir()
    if subdir.is_dir()
}
# Each DataFrame is addressed by its folder name, then its file name, in DATAFRAMES.
# For example:
# df_pc_price_eu = DATAFRAMES["pc_price"]["pc_price_eu.pq"]

In [None]:
# EU polycarbonate prices: pick the folder first, then the parquet file inside it.
pc_price_frames = DATAFRAMES["pc_price"]
df_pc_price_eu = pc_price_frames["pc_price_eu.pq"]
# Preview the first rows (5 is the pandas default for head()).
df_pc_price_eu.head(5)

In [None]:
# Lowest quoted price per date across all "eu_supplier*" columns.
supplier_cols = [name for name in df_pc_price_eu.columns if name.startswith("eu_supplier")]
best_price_by_date = df_pc_price_eu.set_index("date")[supplier_cols].min(axis=1)
df_best_pc_price_eu = best_price_by_date.rename("price_pc_eur_per_kg").reset_index()

# Figure: the best price is drawn first, as a solid red line with markers.
fig = go.Figure()
best_price_trace = go.Scatter(
    x=df_best_pc_price_eu["date"],
    y=df_best_pc_price_eu["price_pc_eur_per_kg"],
    mode="lines+markers",
    name="Best Price",
    marker=dict(size=5, color="red"),
    line=dict(color="red"),
)
fig.add_trace(best_price_trace)

# Then one dashed, semi-transparent line per EU supplier, in column order.
for col in df_pc_price_eu.columns:
    if not col.startswith("eu_supplier"):
        continue
    supplier_trace = go.Scatter(
        x=df_pc_price_eu["date"],
        y=df_pc_price_eu[col],
        mode="lines",
        name=col,
        line=dict(dash="dash", width=2),
        opacity=0.5,
    )
    fig.add_trace(supplier_trace)

# Centered title, axis labels and a fixed canvas size.
title_settings = dict(
    text="PolyCarbonate prices in EUR/Kg accross SE suppliers, against time",
    x=0.5,
    xanchor="center",
    yanchor="top",
    font=dict(size=20),
)
fig.update_layout(
    title=title_settings,
    xaxis_title="Date",
    yaxis_title="Polycarbonate price (EUR/Kg)",
    width=1150,
    height=600,
    template="plotly",
)

# Render with the static "svg" renderer.
fig.show("svg")

In [None]:
# Parse the "%b-%Y" date labels (e.g. "Apr-2023") into timestamps, then store them back in place.
parsed_dates = pd.to_datetime(df_best_pc_price_eu["date"], format="%b-%Y")
df_best_pc_price_eu["date"] = parsed_dates

In [None]:
# Display the best-price table (columns "date" and "price_pc_eur_per_kg").
df_best_pc_price_eu

In [None]:
# Keep the rows dated from 2023-04-01 to 2024-09-01, both bounds included.
# A chained comparison (`low <= series <= high`) needs the truth value of a whole Series and
# raises ValueError, so the element-wise `Series.between` mask is used for the selection.
df_best_pc_price_eu[df_best_pc_price_eu["date"].between("2023-04-01", "2024-09-01")]

In [None]:
# Join the electricity prices onto the best polycarbonate price of each month.
# - The electricity table is read from DATAFRAMES, so this cell does not rely on a variable
#   defined in another cell. It expects the raw "%Y/%m" dates, i.e. it must run before any
#   cell that rewrites that column in place.
# - Both "date" keys are normalised to "%b-%Y" strings: merging a string key with a
#   datetime64 key raises a ValueError, and the best-price dates may already be parsed.
# - how="right" keeps every best-price row; dropna() then removes rows with a missing value.
df_best_pc_price_ = (
    DATAFRAMES["electricity_price"]["electricity_price_history_per_country.pq"]
    .assign(date=lambda df: pd.to_datetime(df["date"], format="%Y/%m").dt.strftime("%b-%Y"))
    .merge(
        df_best_pc_price_eu.assign(
            date=lambda df: pd.to_datetime(df["date"], format="%b-%Y").dt.strftime("%b-%Y")
        ),
        how="right",
        on="date",
    )
    .dropna()
    .rename(columns={"price - EUR/MWh (avg)": "electricity_price_eur_per_mwh"})
)

### 1.1 Electricity price

In [None]:
# Electricity price history per country: pick the folder first, then the parquet file inside it.
electricity_frames = DATAFRAMES["electricity_price"]
df_elec_price = electricity_frames["electricity_price_history_per_country.pq"]
# Preview the first rows (5 is the pandas default for head()).
df_elec_price.head(5)

In [None]:
# Re-label the dates from "%Y/%m" (e.g. "2023/04") to "%b-%Y" (e.g. "Apr-2023").
# The conversion always starts from the raw table stored in DATAFRAMES and builds a new
# frame, so the raw data keeps its original format and this cell can be re-run safely
# (parsing already re-labelled dates with "%Y/%m" would raise).
df_elec_price = DATAFRAMES["electricity_price"]["electricity_price_history_per_country.pq"].assign(
    date=lambda df: pd.to_datetime(df["date"], format="%Y/%m").dt.strftime("%b-%Y")
)

In [None]:
price_col = 'price - EUR/MWh (avg)'
# Cast the prices to numbers; entries that cannot be parsed become NaN (errors='coerce').
numeric_prices = pd.to_numeric(df_elec_price[price_col], errors='coerce')
df_elec_price[price_col] = numeric_prices
df_elec_price

In [None]:
# Distinct values of the "country" column, in order of first appearance.
df_elec_price["country"].unique()

In [None]:
# Column dtypes and non-null counts of the electricity price table.
df_elec_price.info()

In [None]:
# Summary statistics of the electricity price table.
df_elec_price.describe()

In [None]:
# Expected number of rows: 18 months (04/2023 to 09/2024) x 30 — presumably the number of
# countries; TODO confirm against the row count reported by .info().
18*30

There are 30 electricity price data points (one per country) for each month from 04/2023 to 09/2024, i.e. 18 months × 30 = 540 rows in total. There are no missing values.

In [None]:
# Plot the electricity price over time, one line per country.
# The explicit Figure/Axes interface is used so every call targets this figure; the names
# `elec_fig` / `elec_ax` keep the plotly `fig` created earlier in the notebook untouched.
elec_fig, elec_ax = plt.subplots(figsize=(12, 8))

for country in df_elec_price['country'].unique():
    # Rows of the current country only
    country_data = df_elec_price[df_elec_price['country'] == country]
    elec_ax.plot(country_data['date'], country_data['price - EUR/MWh (avg)'], label=country)

# Labels and title
elec_ax.set_xlabel('Date')
elec_ax.set_ylabel('Electricity Price - EUR/MWh (avg)')
elec_ax.set_title('Electricity Prices Over Time by Country')
# Legend anchored just outside the axes, to the right, so it does not cover the lines
elec_ax.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
# Rotate the x tick labels for readability
elec_ax.tick_params(axis='x', labelrotation=45)
# Fit labels and legend inside the figure
elec_fig.tight_layout()
plt.show()