In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(
    "../data/consumption.csv", usecols=["prediction_unit_id", "datetime", "target"]
)[["prediction_unit_id", "datetime", "target"]].rename(
    columns={"prediction_unit_id": "unique_id", "datetime": "ds", "target": "y"}
)
df["ds"] = pd.to_datetime(df["ds"])
df.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [3]:
df["y"] = df["y"].interpolate(method="linear")
df.isna().sum()

unique_id    0
ds           0
y            0
dtype: int64

# Train/Test split

In [206]:
# taking the last 48 hours for test
for i in df["unique_id"].unique():
    if i == df["unique_id"].unique()[0]:
        df_test = df[df["unique_id"]==i][-48:]
        continue
    df_test = pd.concat([df_test, df[df["unique_id"]==i][-48:]])
    
df_test.sort_index(inplace=True)
print(df_test.shape)
df_test.head()

(3312, 3)


Unnamed: 0,unique_id,ds,y
906150,26,2023-03-28 00:00:00,5.137
906217,26,2023-03-28 01:00:00,4.212
906284,26,2023-03-28 02:00:00,4.917
906351,26,2023-03-28 03:00:00,4.195
906418,26,2023-03-28 04:00:00,4.518


In [5]:
train_idx = [idx for idx in df.index if idx not in df_test.index]
df_train = df.loc[train_idx]
df_train.shape
df_train.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [6]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [7]:
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

# Set features and target

In [192]:
def get_features(df):
    y = pd.concat([df["y"].shift(-1), df["y"].shift(-2)], axis=1).dropna()
    y.columns = ["lead_1", "lead_2"]
    X = pd.concat([df, df["y"].shift(1)], axis=1).dropna()
    X.columns = ["unique_id", "ds", "lag_0", "lag_1"]
    y, X = y.align(X, axis=0, join="inner")
    return X, y

In [195]:
X_train, y_train = get_features(df_train)
display("X_train", X_train.head())
display("y_train", y_train.head())

'X_train'

Unnamed: 0,unique_id,ds,lag_0,lag_1
1,1,2021-09-01,17.314,96.59
2,2,2021-09-01,656.859,17.314
3,3,2021-09-01,59.0,656.859
4,4,2021-09-01,501.76,59.0
5,5,2021-09-01,5155.056,501.76


'y_train'

Unnamed: 0,lead_1,lead_2
1,656.859,59.0
2,59.0,501.76
3,501.76,5155.056
4,5155.056,2.756
5,2.756,22.841


In [196]:
X_test, y_test = get_features(df_test)
display("X_test", X_test.head())
display("y_test", y_test.head())

'X_test'

Unnamed: 0,unique_id,ds,lag_0,lag_1
906217,26,2023-03-28 01:00:00,4.212,5.137
906284,26,2023-03-28 02:00:00,4.917,4.212
906351,26,2023-03-28 03:00:00,4.195,4.917
906418,26,2023-03-28 04:00:00,4.518,4.195
906485,26,2023-03-28 05:00:00,5.8,4.518


'y_test'

Unnamed: 0,lead_1,lead_2
906217,4.917,4.195
906284,4.195,4.518
906351,4.518,5.8
906418,5.8,5.966
906485,5.966,5.427


# Training

In [143]:
!pip install mlforecast lightgbm



In [230]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler

from sklearn import set_config
set_config(transform_output="pandas")

In [243]:
# to drop 'unique_id' and 'ds' columns
class columnDropperTransformer():
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X, y=None):
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self

pipeline = Pipeline(
    [
        ("columnDropper", columnDropperTransformer(['unique_id','ds'])),
        ("scaler", MaxAbsScaler()),
        ("lgb", MultiOutputRegressor(LGBMRegressor())),
    ]
)

In [244]:
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1005861, number of used features: 2
[LightGBM] [Info] Start training from score 460.951713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1005861, number of used features: 2
[LightGBM] [Info] Start training from score 460.951236


# Inference

In [245]:
X_test.iloc[:10]

Unnamed: 0,unique_id,ds,lag_0,lag_1
906217,26,2023-03-28 01:00:00,4.212,5.137
906284,26,2023-03-28 02:00:00,4.917,4.212
906351,26,2023-03-28 03:00:00,4.195,4.917
906418,26,2023-03-28 04:00:00,4.518,4.195
906485,26,2023-03-28 05:00:00,5.8,4.518
906552,26,2023-03-28 06:00:00,5.966,5.8
906619,26,2023-03-28 07:00:00,5.427,5.966
906686,26,2023-03-28 08:00:00,4.221,5.427
906753,26,2023-03-28 09:00:00,3.638,4.221
906820,26,2023-03-28 10:00:00,3.157,3.638


In [248]:
pd.DataFrame(pipeline.predict(X_test.head(10)), columns=y_test.columns)

Unnamed: 0,lead_1,lead_2
0,145.662004,234.531611
1,145.662004,234.531611
2,145.662004,234.531611
3,145.662004,234.531611
4,145.662004,234.531611
5,151.279857,234.531611
6,151.279857,234.531611
7,151.279857,234.531611
8,145.662004,234.531611
9,145.662004,230.267837


In [252]:
X_test[:1]

Unnamed: 0,unique_id,ds,lag_0,lag_1
906217,26,2023-03-28 01:00:00,4.212,5.137


In [286]:
data_json = X_train[:2].to_json(orient="records", indent=4)
print(data_json)

[
    {
        "unique_id":1,
        "ds":1630454400000,
        "lag_0":17.314,
        "lag_1":96.59
    },
    {
        "unique_id":2,
        "ds":1630454400000,
        "lag_0":656.859,
        "lag_1":17.314
    }
]


In [287]:
from io import StringIO

pd.read_json(StringIO(data_json))

Unnamed: 0,unique_id,ds,lag_0,lag_1
0,1,1630454400000,17.314,96.59
1,2,1630454400000,656.859,17.314


In [289]:
print(data_json)

[
    {
        "unique_id":1,
        "ds":1630454400000,
        "lag_0":17.314,
        "lag_1":96.59
    },
    {
        "unique_id":2,
        "ds":1630454400000,
        "lag_0":656.859,
        "lag_1":17.314
    }
]


In [295]:
data_json = """
[
    {
        "unique_id":1,
        "ds":1630454400000,
        "values":[17.314, 96.59],
    },
    {
        "unique_id":2,
        "ds":1630454400000,
        "values":[656.859, 17.314],
    }
]
"""

In [298]:
pd.read_json(StringIO(data_json)).explode("values")#.reset_index()

Unnamed: 0,index,unique_id,ds,values
0,0,1,1630454400000,17.314
1,0,1,1630454400000,96.59
2,1,2,1630454400000,656.859
3,1,2,1630454400000,17.314


# Visualization

# Save model

In [213]:
import joblib

joblib.dump(model, 'model_test.joblib')

['model_test.joblib']

Load the model for test

In [214]:
with open('model_test.joblib', 'rb') as file:
    loaded_model = joblib.load(file)
    
loaded_model.predict(X_test.iloc[:10, 2:]) == model.predict(X_test.iloc[:10, 2:])

array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])

# App deployment

In [63]:
from flask import jsonify

In [None]:
import joblib
import json
from flask import Flask, request
import numpy as np
import pandas as pd

model = None

def load_model():
    global model
    # model variable refers to the global variable
    with open('model_test.joblib', 'rb') as file:
        model = joblib.load(file)
        
        
app = Flask(__name__)

