In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(
    "../data/consumption.csv", usecols=["prediction_unit_id", "datetime", "target"]
)[["prediction_unit_id", "datetime", "target"]].rename(
    columns={"prediction_unit_id": "unique_id", "datetime": "ds", "target": "y"}
)
df["ds"] = pd.to_datetime(df["ds"])
df.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [3]:
df["y"] = df["y"].interpolate(method="linear")
df.isna().sum()

unique_id    0
ds           0
y            0
dtype: int64

# Train/Test split

In [4]:
# taking the last 48 hours for test
for i in df["unique_id"].unique():
    if i == df["unique_id"].unique()[0]:
        df_test = df[df["unique_id"]==i][-48:]
        continue
    df_test = pd.concat([df_test, df[df["unique_id"]==i][-48:]])
    
df_test.sort_index(inplace=True)
print(df_test.shape)
df_test.head()

(3312, 3)


Unnamed: 0,unique_id,ds,y
906150,26,2023-03-28 00:00:00,5.137
906217,26,2023-03-28 01:00:00,4.212
906284,26,2023-03-28 02:00:00,4.917
906351,26,2023-03-28 03:00:00,4.195
906418,26,2023-03-28 04:00:00,4.518


In [5]:
train_idx = [idx for idx in df.index if idx not in df_test.index]
df_train = df.loc[train_idx]
df_train.shape
df_train.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [6]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [7]:
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

# Set features and target

In [43]:
pd.concat(
    [
        df.groupby('unique_id')["y"].shift(-1).rename("lead1"),
        df,
        df.groupby('unique_id')["y"].shift(1).rename("lag1")
    ], axis=1
)[["unique_id", "ds", "lead1", "y", "lag1"]]#[df["unique_id"]==0]
;

''

In [53]:
pd.concat(
        [
            df,
            df.groupby('unique_id')["y"].shift(1).rename("lag1"),
            df.groupby('unique_id')["y"].shift(2).rename("lag2"),
        ], axis=1
    ).dropna()[df["unique_id"]==67]

  pd.concat(


Unnamed: 0,unique_id,ds,y,lag1,lag2
231564,67,2022-02-01 02:00:00,262.130,267.313,377.355
231630,67,2022-02-01 03:00:00,260.800,262.130,267.313
231696,67,2022-02-01 04:00:00,252.112,260.800,262.130
231762,67,2022-02-01 05:00:00,263.184,252.112,260.800
231828,67,2022-02-01 06:00:00,312.198,263.184,252.112
...,...,...,...,...,...
1008898,67,2023-05-31 19:00:00,313.989,275.831,188.055
1008963,67,2023-05-31 20:00:00,324.015,313.989,275.831
1009028,67,2023-05-31 21:00:00,316.206,324.015,313.989
1009093,67,2023-05-31 22:00:00,333.201,316.206,324.015


In [79]:
def get_features(df):
    y = pd.concat(
        [
            df[["unique_id", "ds"]],
            df.groupby('unique_id')["y"].shift(-1).rename("lead1"),
            df.groupby('unique_id')["y"].shift(-2).rename("lead2"),
        ], axis=1
    ).dropna()
    X = pd.concat(
        [
            df,
            df.groupby('unique_id')["y"].shift(1).rename("lag1"),
            df.groupby('unique_id')["y"].shift(2).rename("lag2"),
        ], axis=1
    ).dropna()
    y, X = y.align(X, axis=0, join="inner")
    return X, y

In [80]:
X_train, y_train = get_features(df_train)
display("X_train", X_train.head())
display("y_train", y_train.head())

'X_train'

Unnamed: 0,unique_id,ds,y,lag1,lag2
122,0,2021-09-01 02:00:00,91.594,77.691,96.59
123,1,2021-09-01 02:00:00,16.51,15.872,17.314
124,2,2021-09-01 02:00:00,598.45,595.498,656.859
125,3,2021-09-01 02:00:00,63.1,61.6,59.0
126,4,2021-09-01 02:00:00,458.562,486.297,501.76


'y_train'

Unnamed: 0,unique_id,ds,lead1,lead2
122,0,2021-09-01 02:00:00,87.955,88.184
123,1,2021-09-01 02:00:00,14.271,18.225
124,2,2021-09-01 02:00:00,622.824,607.308
125,3,2021-09-01 02:00:00,64.0,60.5
126,4,2021-09-01 02:00:00,511.794,520.318


In [85]:
pd.merge(X_train, y_train, on=["unique_id", "ds"])[df["unique_id"]==0
][["unique_id", "ds", "lead2", "lead1", "y", "lag1", "lag2"]]

  pd.merge(X_train, y_train, on=["unique_id", "ds"])[df["unique_id"]==0


Unnamed: 0,unique_id,ds,lead2,lead1,y,lag1,lag2
0,0,2021-09-01 02:00:00,88.184,87.955,91.594,77.691,96.590
61,0,2021-09-01 03:00:00,89.781,88.184,87.955,91.594,77.691
122,0,2021-09-01 04:00:00,96.481,89.781,88.184,87.955,91.594
183,0,2021-09-01 05:00:00,94.592,96.481,89.781,88.184,87.955
244,0,2021-09-01 06:00:00,77.308,94.592,96.481,89.781,88.184
...,...,...,...,...,...,...,...
1005276,10,2023-05-29 17:00:00,376.690,338.154,254.930,234.407,235.288
1005341,10,2023-05-29 18:00:00,343.615,376.690,338.154,254.930,234.407
1005406,10,2023-05-29 19:00:00,380.320,343.615,376.690,338.154,254.930
1005471,10,2023-05-29 20:00:00,368.056,380.320,343.615,376.690,338.154


``ds`` is the date and time of the last measured value ``y``

In [88]:
X_test, y_test = get_features(df_test)
display("X_test", X_test.head())
display("y_test", y_test.head())

'X_test'

Unnamed: 0,unique_id,ds,y,lag1,lag2
906284,26,2023-03-28 02:00:00,4.917,4.212,5.137
906351,26,2023-03-28 03:00:00,4.195,4.917,4.212
906418,26,2023-03-28 04:00:00,4.518,4.195,4.917
906485,26,2023-03-28 05:00:00,5.8,4.518,4.195
906552,26,2023-03-28 06:00:00,5.966,5.8,4.518


'y_test'

Unnamed: 0,unique_id,ds,lead1,lead2
906284,26,2023-03-28 02:00:00,4.195,4.518
906351,26,2023-03-28 03:00:00,4.518,5.8
906418,26,2023-03-28 04:00:00,5.8,5.966
906485,26,2023-03-28 05:00:00,5.966,5.427
906552,26,2023-03-28 06:00:00,5.427,4.221


# Training

In [13]:
!pip install mlforecast lightgbm

Collecting mlforecast
  Downloading mlforecast-0.12.1-py3-none-any.whl.metadata (11 kB)
Collecting lightgbm
  Downloading lightgbm-4.3.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting coreforecast>=0.0.7 (from mlforecast)
  Downloading coreforecast-0.0.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting optuna (from mlforecast)
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting utilsforecast>=0.0.27 (from mlforecast)
  Downloading utilsforecast-0.1.3-py3-none-any.whl.metadata (7.4 kB)
Collecting window-ops (from mlforecast)
  Downloading window_ops-0.0.15-py3-none-any.whl.metadata (6.8 kB)
Collecting al

In [89]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler

from sklearn import set_config
set_config(transform_output="pandas")

In [90]:
# to drop 'unique_id' and 'ds' columns
class columnDropperTransformer():
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X, y=None):
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self

pipeline = Pipeline(
    [
        ("columnDropper", columnDropperTransformer(['unique_id','ds'])),
        ("scaler", MaxAbsScaler()),
        ("lgb", MultiOutputRegressor(LGBMRegressor())),
    ]
)

In [92]:
y_train.drop(columns=["unique_id", "ds"])

Unnamed: 0,lead1,lead2
122,87.955,88.184
123,14.271,18.225
124,622.824,607.308
125,64.000,60.500
126,511.794,520.318
...,...,...
1005921,39.949,37.058
1005922,120.978,110.621
1005923,212.626,188.167
1005924,33.624,31.484


In [14]:
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1005861, number of used features: 2
[LightGBM] [Info] Start training from score 460.951713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1005861, number of used features: 2
[LightGBM] [Info] Start training from score 460.951236


# Inference

In [15]:
X_test.iloc[:10]

Unnamed: 0,unique_id,ds,lag_0,lag_1
906217,26,2023-03-28 01:00:00,4.212,5.137
906284,26,2023-03-28 02:00:00,4.917,4.212
906351,26,2023-03-28 03:00:00,4.195,4.917
906418,26,2023-03-28 04:00:00,4.518,4.195
906485,26,2023-03-28 05:00:00,5.8,4.518
906552,26,2023-03-28 06:00:00,5.966,5.8
906619,26,2023-03-28 07:00:00,5.427,5.966
906686,26,2023-03-28 08:00:00,4.221,5.427
906753,26,2023-03-28 09:00:00,3.638,4.221
906820,26,2023-03-28 10:00:00,3.157,3.638


In [16]:
pd.DataFrame(pipeline.predict(X_test.head(10)), columns=y_test.columns)

Unnamed: 0,lead_1,lead_2
0,145.662004,234.531611
1,145.662004,234.531611
2,145.662004,234.531611
3,145.662004,234.531611
4,145.662004,234.531611
5,151.279857,234.531611
6,151.279857,234.531611
7,151.279857,234.531611
8,145.662004,234.531611
9,145.662004,230.267837


In [41]:
X_train[:2]

Unnamed: 0,unique_id,ds,lag_0,lag_1
1,1,2021-09-01,17.314,96.59
2,2,2021-09-01,656.859,17.314


In [25]:
data_json = X_train[:2].to_json(orient="records", indent=4)
print(data_json)

[
    {
        "unique_id":1,
        "ds":1630454400000,
        "lag_0":17.314,
        "lag_1":96.59
    },
    {
        "unique_id":2,
        "ds":1630454400000,
        "lag_0":656.859,
        "lag_1":17.314
    }
]


In [26]:
from io import StringIO

pd.read_json(StringIO(data_json))

Unnamed: 0,unique_id,ds,lag_0,lag_1
0,1,1630454400000,17.314,96.59
1,2,1630454400000,656.859,17.314


In [82]:
data_json = """
[
    {
        "unique_id":1,
        "ds":1630454400000,
        "values":[17.314, 96.59],
    },
    {
        "unique_id":2,
        "ds":1630454400000,
        "values":[656.859, null],
    }
]
"""

In [85]:
df_input = pd.read_json(StringIO(data_json)).explode("values")#.fillna(value=0)#.reset_index()
df_input["ds"] = pd.to_datetime(df_input["ds"], unit="ms")
df_input

Unnamed: 0,unique_id,ds,values
0,1,2021-09-01,17.314
0,1,2021-09-01,96.59
1,2,2021-09-01,656.859
1,2,2021-09-01,


In [88]:
df_input.fillna(value=np.nan)

  df_input.fillna(value=np.nan)


Unnamed: 0,unique_id,ds,values
0,1,2021-09-01,17.314
0,1,2021-09-01,96.59
1,2,2021-09-01,656.859
1,2,2021-09-01,


In [89]:
#pd.DataFrame(pipeline.predict(X_test.head(10)), columns=y_test.columns)
pipeline.predict(df_input)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- values
Feature names seen at fit time, yet now missing:
- lag_0
- lag_1


# Visualization

# Save model

In [213]:
import joblib

joblib.dump(model, 'model_test.joblib')

['model_test.joblib']

Load the model for test

In [214]:
with open('model_test.joblib', 'rb') as file:
    loaded_model = joblib.load(file)
    
loaded_model.predict(X_test.iloc[:10, 2:]) == model.predict(X_test.iloc[:10, 2:])

array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])

# App deployment

In [63]:
from flask import jsonify

In [None]:
import joblib
import json
from flask import Flask, request
import numpy as np
import pandas as pd

model = None

def load_model():
    global model
    # model variable refers to the global variable
    with open('model_test.joblib', 'rb') as file:
        model = joblib.load(file)
        
        
app = Flask(__name__)

