In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv(
    "../data/consumption.csv", usecols=["prediction_unit_id", "datetime", "target"]
)[["prediction_unit_id", "datetime", "target"]].rename(
    columns={"prediction_unit_id": "building_id", "target": "consumption"}
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.head()

Unnamed: 0,building_id,datetime,consumption
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [4]:
df["consumption"] = df["consumption"].interpolate(method="linear")
df.isna().sum()

building_id    0
datetime       0
consumption    0
dtype: int64

# Train/Test split

In [5]:
# taking the last 60 days for test
test_duration = 24 * 60
for i in df["building_id"].unique():
    if i == df["building_id"].unique()[0]:
        df_test = df[df["building_id"]==i][-test_duration:]
        continue
    df_test = pd.concat([df_test, df[df["building_id"]==i][-test_duration:]])
    
df_test.sort_index(inplace=True)
print(df_test.shape)
df_test.head()

(99360, 3)


Unnamed: 0,building_id,datetime,consumption
684254,68,2022-11-10 00:00:00,28.124
684323,68,2022-11-10 01:00:00,28.02
684392,68,2022-11-10 02:00:00,28.741
684461,68,2022-11-10 03:00:00,31.947
684530,68,2022-11-10 04:00:00,36.197


In [6]:
train_idx = [idx for idx in df.index if idx not in df_test.index]
df_train = df.loc[train_idx]
df_train.shape
df_train.head()

Unnamed: 0,building_id,datetime,consumption
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [7]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [8]:
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

test size : 9.80%


# Set features and target

In [10]:
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences

col_params = dict(id_col="building_id", time_col="datetime", target_col="consumption")

fcst = MLForecast(
    models=[],
    freq="h",
    # target_transforms=[Differences([24])],
    lags=[i + 1 for i in range(47)],
    lag_transforms={
        1: [ExpandingMean()],
        1: [RollingMean(window_size=24)],
        24: [RollingMean(window_size=24)],
        # 24: [RollingMean(window_size=48)],
    },
    date_features=["month", "dayofweek", "hour"],
)

def get_features(df):
    return fcst.preprocess(df, **col_params).rename(columns={"consumption": "lag0"})


def get_target(X, horizon=24):
    X, y = X.align(
        df.groupby("building_id")["consumption"].shift(-horizon).rename("lead24").dropna(),
        axis=0,
        join="inner",
    )
    y = pd.concat(
        [
            X[["building_id", "datetime"]],
            y
        ], axis=1
    )
    return X, y

In [11]:
X_train = get_features(df_train)
X_train, y_train = get_target(X_train)
display("X_train", X_train.head())
display("y_train", y_train.head())

'X_train'

Unnamed: 0,building_id,datetime,lag0,lag1,lag2,lag3,lag4,lag5,lag6,lag7,...,lag43,lag44,lag45,lag46,lag47,rolling_mean_lag1_window_size24,rolling_mean_lag24_window_size24,month,dayofweek,hour
2867,0,2021-09-02 23:00:00,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,...,88.184,87.955,91.594,77.691,96.59,87.588333,79.96975,9,3,23
2868,1,2021-09-02 23:00:00,19.43,21.577,24.309,27.201,25.419,18.06,16.228,10.614,...,18.225,14.271,16.51,15.872,17.314,15.725333,15.106667,9,3,23
2869,2,2021-09-02 23:00:00,748.504,920.535,1000.499,988.047,877.168,656.989,489.864,278.374,...,607.308,622.824,598.45,595.498,656.859,556.726667,497.7205,9,3,23
2870,3,2021-09-02 23:00:00,76.4,72.1,80.1,84.1,77.1,74.6,64.0,79.7,...,60.5,64.0,63.1,61.6,59.0,73.770833,62.166667,9,3,23
2871,4,2021-09-02 23:00:00,572.467,579.63,678.026,673.509,612.565,556.25,582.44,638.742,...,520.318,511.794,458.562,486.297,501.76,584.572583,563.044375,9,3,23


'y_train'

Unnamed: 0,building_id,datetime,lead24
2867,0,2021-09-02 23:00:00,139.929
2868,1,2021-09-02 23:00:00,27.217
2869,2,2021-09-02 23:00:00,898.365
2870,3,2021-09-02 23:00:00,74.8
2871,4,2021-09-02 23:00:00,601.735


In [12]:
X_train.shape, y_train.shape

((906573, 55), (906573, 3))

We verify that features have been correctly created :

In [13]:
df_merge = pd.merge(X_train, y_train, on=["building_id", "datetime"])[["building_id", "datetime", "lead24", "lag0", "lag1", "lag2"]]
df_merge[df_merge["building_id"]==0].head(25)

Unnamed: 0,building_id,datetime,lead24,lag0,lag1,lag2
0,0,2021-09-02 23:00:00,139.929,120.54,134.986,150.412
61,0,2021-09-03 00:00:00,121.942,107.129,120.54,134.986
122,0,2021-09-03 01:00:00,114.038,81.92,107.129,120.54
183,0,2021-09-03 02:00:00,105.102,96.193,81.92,107.129
244,0,2021-09-03 03:00:00,120.12,94.536,96.193,81.92
305,0,2021-09-03 04:00:00,117.384,99.585,94.536,96.193
366,0,2021-09-03 05:00:00,105.497,102.671,99.585,94.536
427,0,2021-09-03 06:00:00,112.545,109.17,102.671,99.585
488,0,2021-09-03 07:00:00,95.339,108.439,109.17,102.671
549,0,2021-09-03 08:00:00,80.138,95.669,108.439,109.17


Note : ``datetime`` is the date and time of the last measured value for ``consumption``

In [14]:
X_test = get_features(df_test)
X_test, y_test = get_target(X_test)
display("X_test", X_test.head())
display("y_test", y_test.head())

'X_test'

Unnamed: 0,building_id,datetime,lag0,lag1,lag2,lag3,lag4,lag5,lag6,lag7,...,lag43,lag44,lag45,lag46,lag47,rolling_mean_lag1_window_size24,rolling_mean_lag24_window_size24,month,dayofweek,hour
687497,68,2022-11-11 23:00:00,28.991,32.392,48.965,37.38,40.38,41.558,37.281,81.182,...,36.197,31.947,28.741,28.02,28.124,86.123375,82.340083,11,4,23
687566,68,2022-11-12 00:00:00,27.482,28.991,32.392,48.965,37.38,40.38,41.558,37.281,...,37.045,36.197,31.947,28.741,28.02,86.060333,82.334417,11,5,0
687635,68,2022-11-12 01:00:00,26.29,27.482,28.991,32.392,48.965,37.38,40.38,41.558,...,47.85,37.045,36.197,31.947,28.741,86.03925,82.412667,11,5,1
687704,68,2022-11-12 02:00:00,27.581,26.29,27.482,28.991,32.392,48.965,37.38,40.38,...,93.215,47.85,37.045,36.197,31.947,85.888917,82.781875,11,5,2
687773,68,2022-11-12 03:00:00,28.395,27.581,26.29,27.482,28.991,32.392,48.965,37.38,...,140.989,93.215,47.85,37.045,36.197,85.471375,82.916333,11,5,3


'y_test'

Unnamed: 0,building_id,datetime,lead24
687497,68,2022-11-11 23:00:00,27.33
687566,68,2022-11-12 00:00:00,28.096
687635,68,2022-11-12 01:00:00,29.718
687704,68,2022-11-12 02:00:00,26.952
687773,68,2022-11-12 03:00:00,28.515


In [15]:
X_test.shape, y_test.shape

((94461, 55), (94461, 3))

# Training

In [16]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from modules import columnDropperTransformer

In [17]:
pipeline = Pipeline(
    [
        # ("columnDropper", columnDropperTransformer(columns=['building_id','datetime'])),
        ("scaler", MaxAbsScaler()),
        # ("lgb", MultiOutputRegressor(LGBMRegressor())),
        ("lgb", LGBMRegressor()),
    ]
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.146667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12539
[LightGBM] [Info] Number of data points in the train set: 906573, number of used features: 52
[LightGBM] [Info] Start training from score 468.430628


MLForecast(models=[maxabs_lgbm], freq=h, lag_features=['lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'lag13', 'lag14', 'lag15', 'lag16', 'lag17', 'lag18', 'lag19', 'lag20', 'lag21', 'lag22', 'lag23', 'lag24', 'lag25', 'lag26', 'lag27', 'lag28', 'lag29', 'lag30', 'lag31', 'lag32', 'lag33', 'lag34', 'lag35', 'lag36', 'lag37', 'lag38', 'lag39', 'lag40', 'lag41', 'lag42', 'lag43', 'lag44', 'lag45', 'lag46', 'lag47', 'rolling_mean_lag1_window_size24', 'rolling_mean_lag24_window_size24'], date_features=['month', 'dayofweek', 'hour'], num_threads=1)

Unnamed: 0,building_id,datetime,maxabs_lgbm
133,66,2023-04-02 01:00:00,70.382392
134,67,2023-04-02 00:00:00,378.36302
135,67,2023-04-02 01:00:00,378.36302
136,68,2022-11-10 00:00:00,36.033647
137,68,2022-11-10 01:00:00,35.546412


Unnamed: 0,building_id,datetime,consumption
684254,68,2022-11-10 00:00:00,28.124
684323,68,2022-11-10 01:00:00,28.02
684392,68,2022-11-10 02:00:00,28.741
684461,68,2022-11-10 03:00:00,31.947
684530,68,2022-11-10 04:00:00,36.197


# Inference

## On DataFrame data

In [129]:
input_data = get_features(df_test.groupby("building_id").head(48))
predictions = pipeline.predict(input_data)
# predictions = pd.DataFrame(predictions, index=input_data.index, columns=["lead1", "lead2"])
predictions = pd.Series(predictions, index=input_data.index, name="lead24")
predictions

687497     32.491260
695767     17.217934
775202     20.205441
796091     10.255186
887069     10.255186
             ...    
917179     82.673105
917180    174.269732
917181    264.659947
917182     75.099487
917183    275.611528
Name: lead24, Length: 69, dtype: float64

In [130]:
pd.concat(
    [input_data.iloc[:, :2], predictions],
    axis=1
)

Unnamed: 0,building_id,datetime,lead24
687497,68,2022-11-11 23:00:00,32.491260
695767,47,2022-11-16 23:00:00,17.217934
775202,41,2023-01-04 23:00:00,20.205441
796091,26,2023-01-17 23:00:00,10.255186
887069,44,2023-03-15 23:00:00,10.255186
...,...,...,...
917179,57,2023-04-03 23:00:00,82.673105
917180,58,2023-04-03 23:00:00,174.269732
917181,64,2023-04-03 23:00:00,264.659947
917182,59,2023-04-03 23:00:00,75.099487


## On json data

Let's reformat the input data in json format from the DataFrame

In [131]:
input_data = X_test.groupby("building_id").head(1)
def combine_columns(row):
    return [row["lag%i"%i] for i in range(48)]
    # return row[["lag0", "lag1", "lag2"]].values.reshape(-1,).tolist()

# Apply the custom function to create a new column "Combined"
input_data["consumption_values"] = input_data.apply(combine_columns, axis=1)
input_data = input_data.drop(columns=input_data.columns.drop(["building_id", "datetime", "consumption_values"]))
input_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data["consumption_values"] = input_data.apply(combine_columns, axis=1)


Unnamed: 0,building_id,datetime,consumption_values
687497,68,2022-11-11 23:00:00,"[28.991, 32.392, 48.965, 37.38, 40.38, 41.558,..."
695767,47,2022-11-16 23:00:00,"[18.641, 28.545, 30.363, 30.462, 31.306, 20.04..."
775202,41,2023-01-04 23:00:00,"[18.939, 18.04, 18.64, 18.151, 18.568, 18.885,..."
796091,26,2023-01-17 23:00:00,"[4.738, 4.63, 6.656, 6.77, 6.742, 6.886, 5.184..."
887069,44,2023-03-15 23:00:00,"[3.317, 4.683, 5.702, 4.876, 4.096, 5.325, 6.1..."


In [132]:
input_data["datetime"] = input_data["datetime"].astype("string")
data_json = input_data.to_json(orient="records", indent=4)
print(data_json[:480])

[
    {
        "building_id":68,
        "datetime":"2022-11-11 23:00:00",
        "consumption_values":[
            28.991,
            32.392,
            48.965,
            37.38,
            40.38,
            41.558,
            37.281,
            81.182,
            127.549,
            148.603,
            164.361,
            161.418,
            177.639,
            158.718,
            203.076,
            177.424,
            150.11,
            48.37,
        


Create a ``data`` field that holds the data

In [133]:
import json

data_dict = {"data": json.loads(data_json)}
data_json = json.dumps(data_dict, indent=4)
print(data_json[:500])

{
    "data": [
        {
            "building_id": 68,
            "datetime": "2022-11-11 23:00:00",
            "consumption_values": [
                28.991,
                32.392,
                48.965,
                37.38,
                40.38,
                41.558,
                37.281,
                81.182,
                127.549,
                148.603,
                164.361,
                161.418,
                177.639,
                158.718,
                203.


In [134]:
input_data = pd.DataFrame(json.loads(data_json)["data"])
input_data

Unnamed: 0,building_id,datetime,consumption_values
0,68,2022-11-11 23:00:00,"[28.991, 32.392, 48.965, 37.38, 40.38, 41.558,..."
1,47,2022-11-16 23:00:00,"[18.641, 28.545, 30.363, 30.462, 31.306, 20.04..."
2,41,2023-01-04 23:00:00,"[18.939, 18.04, 18.64, 18.151, 18.568, 18.885,..."
3,26,2023-01-17 23:00:00,"[4.738, 4.63, 6.656, 6.77, 6.742, 6.886, 5.184..."
4,44,2023-03-15 23:00:00,"[3.317, 4.683, 5.702, 4.876, 4.096, 5.325, 6.1..."
...,...,...,...
64,57,2023-04-03 23:00:00,"[78.923, 84.31, 89.139, 96.076, 85.737, 79.575..."
65,58,2023-04-03 23:00:00,"[167.749, 172.708, 182.784, 200.79, 175.345, 1..."
66,64,2023-04-03 23:00:00,"[237.289, 242.667, 255.649, 291.508, 428.853, ..."
67,59,2023-04-03 23:00:00,"[73.374, 74.72, 80.581, 86.075, 72.534, 60.053..."


Save json data

In [135]:
with open("../deployment/data_json_test.json", "w") as file:
    json.dump(data_dict, file, indent=4)

In [136]:
with open("../deployment/data_json_test.json", "r") as file:
    loaded_data = json.load(file)

In [137]:
input_data = pd.DataFrame(loaded_data["data"])
input_data

Unnamed: 0,building_id,datetime,consumption_values
0,68,2022-11-11 23:00:00,"[28.991, 32.392, 48.965, 37.38, 40.38, 41.558,..."
1,47,2022-11-16 23:00:00,"[18.641, 28.545, 30.363, 30.462, 31.306, 20.04..."
2,41,2023-01-04 23:00:00,"[18.939, 18.04, 18.64, 18.151, 18.568, 18.885,..."
3,26,2023-01-17 23:00:00,"[4.738, 4.63, 6.656, 6.77, 6.742, 6.886, 5.184..."
4,44,2023-03-15 23:00:00,"[3.317, 4.683, 5.702, 4.876, 4.096, 5.325, 6.1..."
...,...,...,...
64,57,2023-04-03 23:00:00,"[78.923, 84.31, 89.139, 96.076, 85.737, 79.575..."
65,58,2023-04-03 23:00:00,"[167.749, 172.708, 182.784, 200.79, 175.345, 1..."
66,64,2023-04-03 23:00:00,"[237.289, 242.667, 255.649, 291.508, 428.853, ..."
67,59,2023-04-03 23:00:00,"[73.374, 74.72, 80.581, 86.075, 72.534, 60.053..."


set one column per value

In [101]:
input_data[["lag%i"%i for i in range(48)]] = pd.DataFrame(input_data["consumption_values"].tolist(), index= input_data.index)
input_data = input_data.drop(columns="consumption_values")
input_data

Unnamed: 0,building_id,datetime,lag0,lag1,lag2,lag3,lag4,lag5,lag6,lag7,...,lag38,lag39,lag40,lag41,lag42,lag43,lag44,lag45,lag46,lag47
0,68,2022-11-11 23:00:00,28.991,32.392,48.965,37.380,40.380,41.558,37.281,81.182,...,159.203,140.989,93.215,47.850,37.045,36.197,31.947,28.741,28.020,28.124
1,47,2022-11-16 23:00:00,18.641,28.545,30.363,30.462,31.306,20.040,19.225,24.856,...,24.540,24.070,17.992,15.463,10.327,10.044,9.585,10.244,10.453,10.509
2,41,2023-01-04 23:00:00,18.939,18.040,18.640,18.151,18.568,18.885,17.812,20.291,...,30.891,28.346,21.035,20.437,21.337,21.438,20.746,21.480,20.887,21.220
3,26,2023-01-17 23:00:00,4.738,4.630,6.656,6.770,6.742,6.886,5.184,5.347,...,3.039,2.884,3.549,2.501,2.944,2.671,3.151,2.108,3.123,2.436
4,44,2023-03-15 23:00:00,3.317,4.683,5.702,4.876,4.096,5.325,6.190,7.934,...,6.611,2.837,4.599,3.025,2.725,2.826,2.786,2.870,3.180,2.995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,57,2023-04-03 23:00:00,78.923,84.310,89.139,96.076,85.737,79.575,81.680,58.903,...,48.229,58.007,72.988,72.113,76.535,70.652,67.667,72.407,72.128,71.023
65,58,2023-04-03 23:00:00,167.749,172.708,182.784,200.790,175.345,154.706,130.652,103.447,...,56.686,101.708,141.269,166.805,157.064,172.144,176.501,166.167,154.896,175.131
66,64,2023-04-03 23:00:00,237.289,242.667,255.649,291.508,428.853,573.684,574.405,614.831,...,174.594,244.015,270.238,287.844,213.901,205.060,211.198,230.625,208.882,220.237
67,59,2023-04-03 23:00:00,73.374,74.720,80.581,86.075,72.534,60.053,75.455,76.091,...,21.092,31.158,49.032,61.115,74.041,70.558,60.427,61.077,73.703,68.070


In [25]:
# input_data["datetime"] = pd.to_datetime(input_data["datetime"], unit="ms")  # if in 'epoch' format
# input_data["datetime"] = pd.to_datetime(input_data["datetime"])  # if in 'iso'
# input_data

In [26]:
# # introduce nan values for testing
# input_data_sample = input_data_df.sample(frac=0.01, random_state=100)

# input_data["lag1"].loc[input_data_sample.index] = (
#     input_data["lag1"].loc[input_data_sample.index].apply(lambda row: np.nan)
# )

In [27]:
input_data.isna().sum()

building_id    0
datetime       0
consumption    0
lag1           0
lag2           0
dtype: int64

In [28]:
predictions = pd.DataFrame(pipeline.predict(input_data), index=input_data.index, columns=["lead1", "lead2"])
predictions = pd.concat(
    [input_data.iloc[:, :2], predictions],
    axis=1
)
predictions["forecasts"] = pd.Series(predictions[["lead1", "lead2"]].values.tolist())
predictions = predictions.drop(columns=["lead1", "lead2"])
predictions

Unnamed: 0,building_id,datetime,forecasts
0,26,2023-03-28 02:00:00,"[6.336468979987606, 7.797781515255318]"
1,41,2023-05-14 02:00:00,"[198.38977135090636, 200.17952187038023]"
2,47,2023-05-23 02:00:00,"[5.960210791260598, 7.312225515889457]"
3,44,2023-05-26 02:00:00,"[3.799105925739712, 5.1752025835049]"
4,0,2023-05-30 02:00:00,"[435.0949469854639, 429.74490503136434]"
...,...,...,...
64,57,2023-05-30 02:00:00,"[33.5668496665648, 35.2151870548145]"
65,58,2023-05-30 02:00:00,"[76.76305219379158, 79.55824767783817]"
66,64,2023-05-30 02:00:00,"[172.3848713280855, 174.2567693786622]"
67,59,2023-05-30 02:00:00,"[33.98936813299375, 35.38365440142835]"


In [29]:
predictions_json = predictions.to_json(orient="records", indent=4)
print(predictions_json[:480])

[
    {
        "building_id":26,
        "datetime":"2023-03-28 02:00:00",
        "forecasts":[
            6.33646898,
            7.7977815153
        ]
    },
    {
        "building_id":41,
        "datetime":"2023-05-14 02:00:00",
        "forecasts":[
            198.3897713509,
            200.1795218704
        ]
    },
    {
        "building_id":47,
        "datetime":"2023-05-23 02:00:00",
        "forecasts":[
            5.9602107913,
            7.3122255159
 


Create a ``prediction`` field that holdatetime the predictions

In [30]:
predictions_dict = {"predictions": json.loads(predictions_json)}
output = json.dumps(predictions_dict, indent=4)
print(output[:500])

{
    "predictions": [
        {
            "building_id": 26,
            "datetime": "2023-03-28 02:00:00",
            "forecasts": [
                6.33646898,
                7.7977815153
            ]
        },
        {
            "building_id": 41,
            "datetime": "2023-05-14 02:00:00",
            "forecasts": [
                198.3897713509,
                200.1795218704
            ]
        },
        {
            "building_id": 47,
            "datetime": "2023-05-23 


# Visualization

# Save model

In [31]:
import joblib
joblib.dump(pipeline, '../deployment/model_test.joblib')

['../deployment/model_test.joblib']

Load the model for test

In [32]:
import joblib

In [33]:
with open('../deployment/model_test.joblib', 'rb') as file:
    loaded_pipeline = joblib.load(file)
    
loaded_pipeline.predict(X_test.iloc[:10]) == pipeline.predict(X_test.iloc[:10])

array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])