In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Apply seaborn's default theme to matplotlib/seaborn figures and select the
# Plotly template named "seaborn" as the default for Plotly figures.
sns.set_theme()
pio.templates.default = "seaborn"

In [None]:
# Load the raw training CSV, parsing the `datetime` column into timestamps.
df = pd.read_csv(
    "../data/raw/train.csv",
    parse_dates=["datetime"],
)
# Preview the first rows.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the freshly loaded frame.
df.info()

In [None]:
# Per-column dtypes as a Series (used by the downcasting cell that follows).
df.dtypes

In [None]:
# Downcast every integer column to the smallest unsigned integer dtype that can
# hold its values (columns that cannot be downcast are left unchanged).
# `select_dtypes(include=np.integer)` matches integer columns of any width;
# comparing `df.dtypes == "int"` only matches the platform's default integer
# width, so int64 columns could be skipped silently where "int" means int32.
for col in df.select_dtypes(include=np.integer).columns:
    df[col] = pd.to_numeric(df[col], downcast="unsigned")

In [None]:
# Shrink `target` to the smallest float dtype pandas can downcast it to.
target_downcast = pd.to_numeric(df["target"], downcast="float")
df["target"] = target_downcast

In [None]:
# Re-check dtypes and memory usage after the integer/float downcasts.
df.info()

In [None]:
# Write the downcast frame to Parquet (index not stored).
# The target directory is created first: `to_parquet` does not create missing
# parent directories, so this cell would otherwise fail on a fresh checkout.
# NOTE(review): the raw CSV is read from ../data/raw but this writes under
# ./data — confirm that this is the intended location.
Path("./data").mkdir(parents=True, exist_ok=True)
df.to_parquet("./data/data.parquet", index=False)

In [None]:
# Reload the frame from the Parquet file written above and confirm which
# dtypes come back from the round trip.
parquet_path = "./data/data.parquet"
df = pd.read_parquet(parquet_path)
df.info()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Split the rows on the `is_consumption` flag (0 -> production,
# 1 -> consumption). The flag is constant inside each part, so it is dropped.
is_consumption_flag = df["is_consumption"]
production = df[is_consumption_flag == 0].drop(columns="is_consumption")
consumption = df[is_consumption_flag == 1].drop(columns="is_consumption")

# county

In [None]:
# Seaborn histogram of `county` while the column is still numeric.
ax = sns.histplot(data=consumption, x="county")

In [None]:
# Plotly histogram of `county` while the column is still numeric.
fig = px.histogram(data_frame=consumption, x="county")
fig.show()

In [None]:
# Cast `county` to a categorical column, then list the resulting categories.
county_as_category = consumption["county"].astype("category")
consumption["county"] = county_as_category
consumption["county"].cat.categories

In [None]:
# Same seaborn histogram as before, now with `county` as a categorical column.
ax = sns.histplot(data=consumption, x="county")

In [None]:
# Same Plotly histogram as before, now with `county` as a categorical column.
fig = px.histogram(data_frame=consumption, x="county")
fig.show()

# is_business

In [None]:
# Current dtype of `is_business` (before the boolean cast below).
consumption.dtypes["is_business"]

In [None]:
# Distinct values present in `is_business`.
pd.unique(consumption["is_business"])

In [None]:
# Store the business flag as a boolean column.
is_business_bool = consumption["is_business"].astype(bool)
consumption["is_business"] = is_business_bool

In [None]:
# Display the column after the boolean cast (pandas truncates the output).
consumption["is_business"]  # 0: False, 1: True

In [None]:
# Seaborn histogram of the boolean `is_business` flag.
ax = sns.histplot(data=consumption, x="is_business")

In [None]:
# Plotly histogram of the boolean `is_business` flag.
fig = px.histogram(data_frame=consumption, x="is_business")
fig.show()

# product_type

In [None]:
# Current dtype of `product_type` (before the categorical cast below).
consumption.dtypes["product_type"]

In [None]:
# Distinct values present in `product_type`.
pd.unique(consumption["product_type"])

In [None]:
# Cast `product_type` to a categorical column.
product_type_as_category = consumption["product_type"].astype("category")
consumption["product_type"] = product_type_as_category

In [None]:
# Display the column after the categorical cast (pandas truncates the output
# and lists the categories at the bottom).
consumption["product_type"]

In [None]:
# Seaborn histogram of the categorical `product_type` column.
ax = sns.histplot(data=consumption, x="product_type")

In [None]:
# Plotly histogram of `product_type` for the consumption rows, matching the
# seaborn histogram of the same column. Plotting the full `df` here would also
# count production rows and bypass the categorical cast applied to
# `consumption["product_type"]`.
fig = px.histogram(consumption, x="product_type")
fig.show()

# target

In [None]:
# dtype of the consumption target.
consumption.dtypes["target"]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# consumption target.
consumption["target"].describe()

## Histograms

### Linear scale

In [None]:
# Seaborn histogram of the consumption target on linear axes.
ax = sns.histplot(data=consumption, x="target")

In [None]:
# Plotly histogram of the consumption target on linear axes.
fig = px.histogram(data_frame=consumption, x="target")
fig.show()

### Log scale on y-axis

In [None]:
# Plotly histogram of the consumption target with log-scaled counts (y-axis).
fig = px.histogram(
    data_frame=consumption,
    x="target",
    log_y=True,
)
fig.show()

### Log scale on x-axis

In [None]:
# Seaborn histogram of the consumption target on a log-scaled value axis.
# NOTE(review): the raw target is used here, while other cells add a small
# offset before log-scaling — if the target contains zeros they cannot be
# represented on a log axis; confirm this plot behaves as intended.
ax = sns.histplot(data=consumption, x="target", log_scale=True)

In [None]:
# Plotly histogram of the consumption target on a log-scaled x-axis.
# The 1e-4 offset presumably exists so that zero targets stay plottable on a
# log axis; without switching the axis to log the offset had no purpose, so the
# axis type is set explicitly (same approach as the go.Figure cell below).
x = consumption["target"] + 1e-4
fig = px.histogram(x=x)
fig.update_xaxes(type="log")
fig.show()

## Overlay histograms

In [None]:
# Consumption rows of prediction units 0, 1 and 2, plus their targets shifted
# by 1e-4 (the offset used before plotting on a log axis).
unit_ids = consumption["prediction_unit_id"]
consumption0, consumption1, consumption2 = (
    consumption[unit_ids == unit] for unit in (0, 1, 2)
)
x0, x1, x2 = (
    subset["target"] + 1e-4
    for subset in (consumption0, consumption1, consumption2)
)

In [None]:
# Histogram of unit 0's offset target on a log-scaled x-axis.
fig = go.Figure(data=[go.Histogram(x=x0)])
# Traces for units 1 and 2 are currently disabled:
# fig.add_trace(go.Histogram(x=x1))
# fig.add_trace(go.Histogram(x=x2))
# Bars are drawn on top of each other and made semi-transparent so that
# overlapping traces stay visible once more traces are enabled.
fig.update_layout(barmode="overlay")
fig.update_traces(opacity=0.5)
fig.update_xaxes(type="log")
fig.show()

In [None]:
# Seaborn histogram of the target for prediction units 0-2, one hue per unit,
# on a log-scaled value axis.
# NOTE(review): the raw target is log-scaled here without the offset used in
# the Plotly cells — confirm zeros (if any) are handled as intended.
sample_units = [0, 1, 2]
in_sample = consumption["prediction_unit_id"].isin(sample_units)
consumption_sample = consumption[in_sample]
ax = sns.histplot(
    data=consumption_sample,
    x="target",
    hue="prediction_unit_id",
    log_scale=True,
)

## KDE plot

In [None]:
# KDE of the offset consumption target on a log-scaled axis.
# NOTE(review): this cell offsets by 1e-5 while the histogram cells use 1e-4 —
# confirm whether the difference is intentional.
x = consumption["target"] + 1e-5
sns.kdeplot(
    data=x,
    log_scale=True,
)

In [None]:
# Filled KDE of the target for prediction units 0-2, one hue per unit, on a
# log-scaled axis. The sample is rebuilt here so the cell runs on its own.
# NOTE(review): the raw target is log-scaled without an offset — confirm zeros
# (if any) are handled as intended.
kde_units = [0, 1, 2]
in_kde_sample = consumption["prediction_unit_id"].isin(kde_units)
consumption_sample = consumption[in_kde_sample]
ax = sns.kdeplot(
    data=consumption_sample,
    x="target",
    hue="prediction_unit_id",
    palette="tab10",
    log_scale=True,
    # Filled, outline-free, semi-transparent areas so the units can overlap.
    fill=True,
    linewidth=0,
    alpha=0.4,
)

In [None]:
# Disabled experiment: Plotly `create_distplot` of the forward-filled target
# (histogram and rug turned off) with a log-scaled x-axis.
# NOTE(review): the reason for disabling is not recorded — re-enable or delete.
# fig = ff.create_distplot(
#     [consumption["target"].ffill()],
#     ["target"],
#     show_hist=False,
#     show_rug=False,
# )
# fig.update_xaxes(type="log")
# fig.show()

In [None]:
# Disabled experiment: same `create_distplot` call as the previous cell, but
# with a log-scaled y-axis instead of the x-axis.
# NOTE(review): the reason for disabling is not recorded — re-enable or delete.
# fig = ff.create_distplot(
#     [consumption["target"].ffill()],
#     ["target"],
#     show_hist=False,
#     show_rug=False,
# )
# fig.update_yaxes(type="log")
# fig.show()

## Day hours distribution

In [None]:
# Summary statistics of the production target over all hours.
production["target"].describe()

In [None]:
# Keep production rows whose timestamp hour lies in [8, 20) and summarise
# their target.
production_hour = production["datetime"].dt.hour
mask = (production_hour >= 8) & (production_hour < 20)
production.loc[mask, "target"].describe()

In [None]:
# One-column frame holding the production target of the rows selected by
# `mask` (shape expected by the Plotly histogram in the next cell).
x = production.loc[mask, ["target"]]

In [None]:
# Plotly histogram of the masked production target with log-scaled counts.
fig = px.histogram(
    data_frame=x,
    x="target",
    log_y=True,
)
fig.show()

## Daily data

In [None]:
# One datetime-indexed frame per prediction unit, in order of first appearance
# of each unit id; the cell displays how many units there are.
all_unit_ids = consumption["prediction_unit_id"]
consumption_list = []
for unit in all_unit_ids.unique():
    unit_frame = consumption[all_unit_ids == unit]
    consumption_list.append(unit_frame.set_index("datetime"))
len(consumption_list)

In [None]:
# Take the first per-unit frame and show which unit id it holds.
# Note: this rebinds `consumption0`, which the overlay-histogram section
# defined as the (non-indexed) rows with prediction_unit_id == 0.
consumption0 = consumption_list[0]
pd.unique(consumption0["prediction_unit_id"])

In [None]:
# Daily totals of the target for the selected unit, followed by their summary
# statistics.
daily_bins = consumption0["target"].resample("D")
consumption0_resample = daily_bins.sum()
consumption0_resample.describe()

In [None]:
# Distribution of the unit's daily target totals.
sns.displot(data=consumption0_resample)

## Target vs datetime

In [None]:
# Copy of target + timestamp, extended with calendar features derived from the
# timestamp (hour, day, month, year, weekday, day of year, ISO week).
x = consumption[["target", "datetime"]].copy()
stamp = x["datetime"].dt
for part in ("hour", "day", "month", "year", "dayofweek", "dayofyear"):
    x[part] = getattr(stamp, part)
# The ISO week number comes from isocalendar() rather than a plain attribute.
x["weekofyear"] = stamp.isocalendar().week

In [None]:
# (rows, columns) of the calendar-feature frame.
x.shape

In [None]:
# Target indexed by timestamp, built from the consumption rows like the other
# cells of this section. Building it from the full `df` would add production
# and consumption values together once the frame is resampled and summed.
y = consumption[["target", "datetime"]].copy().set_index("datetime")
y.head()

In [None]:
# Sum the target within each hour and show the resulting shape.
hourly_bins = y.resample("h")
y = hourly_bins.sum()
y.shape

In [None]:
# First rows of the hourly totals.
y.head()

In [None]:
# Seaborn distribution plot of the hourly target totals.
sns.displot(data=y, x="target")

In [None]:
# Plotly histogram of the hourly target totals.
fig = px.histogram(data_frame=y, x="target")
fig.show()

## Aggregated data

In [None]:
# Consumption target indexed by timestamp, ready for resampling.
target_with_time = consumption[["target", "datetime"]].copy()
y = target_with_time.set_index("datetime")
y.head()

In [None]:
# Daily totals of the consumption target and their shape.
daily_groups = y.resample("D")
y_sum = daily_groups.sum()
y_sum.shape

In [None]:
# Line plot of the daily totals using the pandas plotting accessor.
y_sum.plot.line()

In [None]:
# Same daily totals drawn with seaborn (wide-form frame: one line per column).
sns.lineplot(data=y_sum)

# is_consumption

# datetime

# data_block_id

# row_id

# prediction_unit_id