In [1]:
# Move the working directory two levels up (presumably to the repository root, so that the
# relative data paths and the `src` package imports below resolve - confirm).
# NOTE: the change is relative to the current directory, so re-running this cell moves up again.
%cd ../..

C:\Users\fahmi\Documents\Github ML Project\time-series-projects


# Libraries & Dataset
## Load Libraries

In [2]:
import os
from itertools import cycle

import humanize
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.tree import DecisionTreeRegressor

from src.forecasting.ml_forecasting import (
    FeatureConfig,
    MissingValueConfig,
    MLForecast,
    ModelConfig,
    calculate_metrics,
)
from src.utils.general import LogTime

pio.templates.default = "plotly_white"

  from tqdm.autonotebook import tqdm


## Load Train and Val sets

In [None]:
# Training split and the auto-transformer output that is joined onto it in a later cell.
train_data = pd.read_csv(
    "energy-consumption/datasets/preprocessed/train_data_preprocessed.csv"
)
auto_stat_target = pd.read_csv(
    "energy-consumption/datasets/autotransform/train_val_data_autotransformer.csv"
)

# Validation split.
val_data = pd.read_csv(
    "energy-consumption/datasets/preprocessed/val_data_preprocessed.csv"
)

# Persisted transformer pipeline. joblib.load unpickles the file, so only load artifacts
# produced by a trusted source.
transformer_pipeline = joblib.load(
    "energy-consumption/datasets/preprocessed/train_val_data_autotransformer_pipeline.pkl"
)

In [None]:
# NOTE(review): `train_df`, `val_df` and a "timestamp" column do not appear in any cell visible
# above (the loaded frames are `train_data` / `val_data`, keyed by "Month"). Presumably a
# leftover from another notebook - confirm; unless those names are defined elsewhere, this
# cell fails on a fresh Restart & Run All.
pd.concat([train_df, val_df]).reset_index().set_index("timestamp")

In [None]:
# Attach the auto-transformer columns to the training rows via the shared "Month" key,
# then drop the "type" column and parse "Month" as datetimes.
train_data = (
    train_data.set_index("Month")
    .join(auto_stat_target.set_index("Month"))
    .reset_index()
    .drop(columns=["type"])
    .assign(Month=lambda frame: pd.to_datetime(frame["Month"]))
)

# The validation split gets the same clean-up, minus the join.
val_data = (
    val_data.drop(columns=["type"])
    .assign(Month=lambda frame: pd.to_datetime(frame["Month"]))
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
train_data.head()

In [None]:
# Show the column labels of the prepared training frame.
train_data.columns

## Feature Definition

In [None]:
def column_dtypes(df):
    """Split the columns of ``df`` into continuous, categorical and boolean groups.

    Each column is tested against these rules, in order:

    1. Any non-boolean numeric dtype -> continuous. The pandas dtype predicates
       are used so that every integer/float width (int32, float32, nullable
       ``Int64`` ...) is recognised, not only the platform's default ``int`` /
       ``float`` width.
    2. Boolean dtype, or a column label containing the substring "is"
       (case-insensitive) -> boolean.
    3. Everything else -> categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple of (list, list, list)
        ``(continuous_cols, categorical_cols, boolean_cols)``, each preserving
        the column order of ``df``.
    """
    continuous_cols = []
    categorical_cols = []
    boolean_cols = []

    for column, dtype in df.dtypes.items():
        # is_numeric_dtype also reports True for bool, so booleans are excluded explicitly.
        is_bool = pd.api.types.is_bool_dtype(dtype)
        if pd.api.types.is_numeric_dtype(dtype) and not is_bool:
            continuous_cols.append(column)
        # NOTE: the name test is a plain substring match, so labels such as "visits"
        # are also treated as boolean flags.
        elif is_bool or "is" in str(column).lower():
            boolean_cols.append(column)
        else:
            categorical_cols.append(column)

    return continuous_cols, categorical_cols, boolean_cols

In [None]:
# The targets and the date column are not features, so they are excluded before classifying.
non_feature_cols = ["y", "Month", "y_auto_stat"]
continuous_cols, categorical_cols, boolean_cols = column_dtypes(
    train_data.drop(columns=non_feature_cols)
)

In [None]:
# Describe the modelling frame for the forecasting helpers: the date column, the target
# columns, and the feature groups produced by `column_dtypes` above.
feat_config = FeatureConfig(
    date="Month",
    # presumably the column contributed by the `auto_stat_target` join - confirm
    target="y_auto_stat",
    original_target="y",
    continuous_features=continuous_cols,
    categorical_features=categorical_cols,
    boolean_features=boolean_cols,
    index_cols=["Month"]
)

In [None]:
# Both splits are extracted with the same keyword options.
xy_options = dict(categorical=True, exogenous=False)

train_features, train_target, train_original_target = feat_config.get_X_y(
    train_data, **xy_options
)
# The validation split plays the role of the test set in this notebook.
test_features, test_target, test_original_target = feat_config.get_X_y(
    val_data, **xy_options
)

## Handling Missing Values

In [None]:
# Null count per training feature column; display only the columns that have any.
tr_m = train_features.isna().sum()
tr_m.loc[tr_m.gt(0)]

In [None]:
# Same null-count check for the validation ("test") features.
ts_m = test_features.isna().sum()
ts_m.loc[ts_m.gt(0)]

In [None]:
# Every training feature column that contains at least one null is passed as `bfill_columns`.
cols_with_nulls = tr_m.loc[tr_m > 0].index.tolist()
missing_value_config = MissingValueConfig(bfill_columns=cols_with_nulls)

# Helper Functions

In [None]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure and return it.

    Parameters
    ----------
    fig : plotly figure
        Figure to format; it is updated in place.
    legends : sequence of str, optional
        Replacement trace names, assigned in trace order. The names are cycled,
        so a figure with more traces than names reuses them from the start.
    xlabel, ylabel : str
        Axis titles.
    title : str
        Figure title, centred horizontally.
    font_size : int
        Font size used for the legend, axis titles and tick labels. The figure
        title always uses size 20.

    Returns
    -------
    The same ``fig`` object that was passed in.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    # Fonts are set through `title.font` rather than the deprecated `titlefont`
    # attribute, which newer Plotly releases no longer accept.
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [None]:
def evaluate_model(
    model_config,
    feature_config,
    missing_config,
    target_transformer,
    train_features,
    train_target,
    test_features,
    test_target,
    train_target_original=None,
):
    """Fit an ``MLForecast`` model, predict on the test features and score the result.

    Parameters
    ----------
    model_config, feature_config, missing_config, target_transformer
        Passed straight through to ``MLForecast``. ``model_config.name`` is
        also used as the model label given to ``calculate_metrics``.
    train_features, train_target
        Data the model is fitted on (fitted with ``is_transformed=True``).
    test_features
        Features to predict on.
    test_target
        Ground truth the predictions are scored against.
    train_target_original : optional
        Forwarded as the last argument of ``calculate_metrics``.

    Returns
    -------
    tuple
        ``(y_pred, metrics, feat_df)``: the predictions, the output of
        ``calculate_metrics`` and the output of ``feature_importance()``.
    """
    # Use the configs supplied by the caller; the notebook-level `feat_config` /
    # `missing_value_config` globals must not leak in here.
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feature_config,
        missing_config=missing_config,
        target_transformer=target_transformer,
    )
    ml_model.fit(train_features, train_target, is_transformed=True)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    metrics = calculate_metrics(test_target, y_pred, model_config.name, train_target_original)
    return y_pred, metrics, feat_df

In [None]:
from itertools import cycle

def plot_forecast(pred_df, forecast_columns, forecast_display_names=None, actual_column="y"):
    """Plot the actual series together with one line per forecast column.

    Only rows where the first forecast column is non-null are drawn.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Frame holding the actual series and the forecast columns; its index is
        used as the x axis.
    forecast_columns : list of str
        Forecast columns to draw, in order.
    forecast_display_names : list of str, optional
        Legend labels, one per forecast column. Defaults to the column names.
    actual_column : str, default "y"
        Column holding the actual values.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names)

    # Restrict every trace to the rows where the first forecast is available.
    mask = ~pred_df[forecast_columns[0]].isnull()
    visible = pred_df[mask]

    # Turn "rgb(r,g,b)" into an "rgba(r,g,b, <alpha>)" template so the opacity
    # can be filled in per trace.
    palette = [
        c.replace("rgb", "rgba").replace(")", ", <alpha>)")
        for c in px.colors.qualitative.Dark2
    ]
    act_color = palette[0]
    forecast_colors = cycle(palette[1:])
    dash_types = cycle(["dash", "dot", "dashdot"])

    fig = go.Figure()
    # The actual series is drawn faded so the forecasts stand out.
    fig.add_trace(
        go.Scatter(
            x=visible.index,
            y=visible[actual_column],
            mode="lines",
            line=dict(color=act_color.replace("<alpha>", "0.3")),
            name="Actual Consumption",
        )
    )
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=visible.index,
                y=visible[col],
                mode="lines",
                line=dict(
                    dash=next(dash_types),
                    color=next(forecast_colors).replace("<alpha>", "1"),
                ),
                name=display_col,
            )
        )
    return fig

In [None]:
def highlight_abs_min(s, props=''):
    """Return ``props`` for the entries of ``s`` with the smallest absolute value.

    The comparison is done on magnitudes, so a negative value that is closest
    to zero is marked as well. NaN entries are ignored when looking for the
    minimum and are never marked.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to inspect (presumably one column handed over by
        ``Styler.apply`` - confirm against the caller).
    props : str
        String returned for the matching entries, e.g. a CSS declaration.

    Returns
    -------
    numpy.ndarray
        Array of strings with the same length as ``s``: ``props`` where the
        absolute value equals the minimum absolute value, ``''`` elsewhere.
    """
    magnitudes = np.abs(s.values)
    if magnitudes.size == 0:
        # np.nanmin raises on empty input; there is nothing to highlight anyway.
        return np.array([], dtype=str)
    return np.where(magnitudes == np.nanmin(magnitudes), props, '')

# Ensemble Forecasting

In [None]:
# Stack the training target and the validation-period original target into one object.
# NOTE(review): `train_target` is extracted for target="y_auto_stat" while
# `test_original_target` is extracted for original_target="y" - confirm that mixing the
# two is intended (rather than using `train_original_target`).
pred_df = pd.concat([train_target, test_original_target])
# Metric records; converted into the `summary` DataFrame in the evaluation section.
metric_record = []

## Linear Regression

# Evaluation Metrics

In [None]:
# Build the comparison table from the collected metric records.
summary = pd.DataFrame(metric_record)
# With no recorded metrics the frame has no "MAE" column and sorting would raise a
# KeyError, so fall back to showing the (empty) frame as-is.
summary.sort_values("MAE", ascending=True) if "MAE" in summary.columns else summary