In [34]:
import numpy as np
import pandas as pd

from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

import json
import joblib


In [35]:
df = pd.read_csv(
    "../data/consumption.csv", usecols=["prediction_unit_id", "datetime", "target"]
)[["prediction_unit_id", "datetime", "target"]].rename(
    columns={"prediction_unit_id": "building_id", "target": "consumption"}
)
df["datetime"] = pd.to_datetime(df["datetime"])
df.head()

Unnamed: 0,building_id,datetime,consumption
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [36]:
df["consumption"] = df["consumption"].interpolate(method="linear")
df.isna().sum()

building_id    0
datetime       0
consumption    0
dtype: int64

# Train/Test split

In [37]:
# taking the last 60 days for test
test_duration = 24 * 60
for i in df["building_id"].unique():
    if i == df["building_id"].unique()[0]:
        df_test = df[df["building_id"]==i][-test_duration:]
        continue
    df_test = pd.concat([df_test, df[df["building_id"]==i][-test_duration:]])
    
df_test.sort_index(inplace=True)
print(df_test.shape)
df_test.head()

(99360, 3)


Unnamed: 0,building_id,datetime,consumption
684254,68,2022-11-10 00:00:00,28.124
684323,68,2022-11-10 01:00:00,28.02
684392,68,2022-11-10 02:00:00,28.741
684461,68,2022-11-10 03:00:00,31.947
684530,68,2022-11-10 04:00:00,36.197


In [38]:
train_idx = [idx for idx in df.index if idx not in df_test.index]
df_train = df.loc[train_idx]
df_train.shape
df_train.head()

Unnamed: 0,building_id,datetime,consumption
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [39]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [40]:
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

In [41]:
test_size = df_test.shape[0] / (df_train.shape[0] + df_test.shape[0])
print(f"test size : {round(test_size, 3)*100}0%")

test size : 9.80%


# Training

In [42]:
lgbm_params = {
    'feature_fraction': 0.6857592714582879,
    'learning_rate': 0.08105638762049074,
    'max_depth': 15,
    'num_leaves': 138,
    'subsample': 0.11864728937294303
}

pipeline = Pipeline(
    [
        ("scaler", MaxAbsScaler()),
        ("lgb", LGBMRegressor(**lgbm_params)),
    ]
)

fcst = MLForecast(
    models={"model": pipeline},
    freq="h",
    # target_transforms=[Differences([24])],
    lags=[i + 1 for i in range(47)],
    lag_transforms={
        1: [ExpandingMean()],
        1: [RollingMean(window_size=24)],
        24: [RollingMean(window_size=24)],
        # 24: [RollingMean(window_size=48)],
    },
    date_features=["month", "dayofweek", "hour"],
)


In [43]:
col_params = dict(id_col="building_id", time_col="datetime", target_col="consumption")
fcst.fit(df_train, **col_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.135541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12539
[LightGBM] [Info] Number of data points in the train set: 906573, number of used features: 52
[LightGBM] [Info] Start training from score 468.430628


MLForecast(models=[model], freq=h, lag_features=['lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'lag13', 'lag14', 'lag15', 'lag16', 'lag17', 'lag18', 'lag19', 'lag20', 'lag21', 'lag22', 'lag23', 'lag24', 'lag25', 'lag26', 'lag27', 'lag28', 'lag29', 'lag30', 'lag31', 'lag32', 'lag33', 'lag34', 'lag35', 'lag36', 'lag37', 'lag38', 'lag39', 'lag40', 'lag41', 'lag42', 'lag43', 'lag44', 'lag45', 'lag46', 'lag47', 'rolling_mean_lag1_window_size24', 'rolling_mean_lag24_window_size24'], date_features=['month', 'dayofweek', 'hour'], num_threads=1)

# Save model

In [44]:
joblib.dump(fcst, '../deployment/mlforecast_model.joblib')

['../deployment/mlforecast_model.joblib']

Load the model for test

In [45]:
with open('../deployment/mlforecast_model.joblib', 'rb') as file:
    loaded_model = joblib.load(file)

loaded_model.predict(1) == fcst.predict(1)



Unnamed: 0,building_id,datetime,model
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
...,...,...,...
64,True,True,True
65,True,True,True
66,True,True,True
67,True,True,True


# Inference

## On DataFrame data

In [46]:
fcst.predict(24)



Unnamed: 0,building_id,datetime,model
0,0,2023-04-02 00:00:00,1018.648010
1,0,2023-04-02 01:00:00,982.513590
2,0,2023-04-02 02:00:00,974.014104
3,0,2023-04-02 03:00:00,965.600797
4,0,2023-04-02 04:00:00,960.029375
...,...,...,...
1651,68,2022-11-10 19:00:00,86.633929
1652,68,2022-11-10 20:00:00,78.661577
1653,68,2022-11-10 21:00:00,71.187151
1654,68,2022-11-10 22:00:00,59.474685


In [47]:
input_data = df_test.groupby("building_id").head(1)
fcst.update(input_data)
predictions = fcst.predict(24)
predictions



Unnamed: 0,building_id,datetime,model
0,0,2023-04-02 01:00:00,949.890258
1,0,2023-04-02 02:00:00,944.953531
2,0,2023-04-02 03:00:00,937.299366
3,0,2023-04-02 04:00:00,932.727657
4,0,2023-04-02 05:00:00,933.921204
...,...,...,...
1651,68,2022-11-10 20:00:00,78.661577
1652,68,2022-11-10 21:00:00,71.187151
1653,68,2022-11-10 22:00:00,59.474685
1654,68,2022-11-10 23:00:00,48.408205


In [48]:
input_data["datetime"] = input_data["datetime"].astype("string")
input_data.loc[input_data["building_id"] == 0, "datetime"].values[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data["datetime"] = input_data["datetime"].astype("string")


'2023-04-02'

In [49]:
output = []
for building_id in predictions["building_id"].unique():
    output.append(
        {
            "building_id": int(building_id),
            "datetime": input_data.loc[input_data["building_id"] == building_id, "datetime"].values[0],
            "forecasts": predictions.loc[predictions["building_id"] == 0, "model"].to_list(),
        }
    )

In [50]:
output_json = json.dumps(output, indent=4)
print(output_json[:500])

[
    {
        "building_id": 0,
        "datetime": "2023-04-02",
        "forecasts": [
            949.8902581149754,
            944.9535307816661,
            937.2993656442748,
            932.7276572865806,
            933.9212041167327,
            924.7042518995202,
            886.8520554726454,
            771.2090808192432,
            568.2658631644154,
            398.60904031591525,
            307.72055197078515,
            240.9340544559327,
            229.14627351564278,
   


In [51]:
# input_data.join(predictions.set_index("building_id"), on=["building_id"], rsuffix="_pred")

## On json data

Let's reformat the input data in json format from the DataFrame

In [52]:
input_data = df_test.groupby("building_id").head(2).groupby("building_id").tail(1)
input_data["datetime"] = input_data["datetime"].astype("string")
data_json = input_data.to_json(orient="records", indent=4)
print(data_json[:400])

[
    {
        "building_id":68,
        "datetime":"2022-11-10 01:00:00",
        "consumption":28.02
    },
    {
        "building_id":47,
        "datetime":"2022-11-15 01:00:00",
        "consumption":10.453
    },
    {
        "building_id":41,
        "datetime":"2023-01-03 01:00:00",
        "consumption":20.887
    },
    {
        "building_id":26,
        "datetime":"2023-01-16 01:00:


Create a ``data`` field that holds the data

In [53]:
data_dict = {"data": json.loads(data_json)}
data_json = json.dumps(data_dict, indent=4)
print(data_json[:500])

{
    "data": [
        {
            "building_id": 68,
            "datetime": "2022-11-10 01:00:00",
            "consumption": 28.02
        },
        {
            "building_id": 47,
            "datetime": "2022-11-15 01:00:00",
            "consumption": 10.453
        },
        {
            "building_id": 41,
            "datetime": "2023-01-03 01:00:00",
            "consumption": 20.887
        },
        {
            "building_id": 26,
            "datetime": "2023-01-16 01:00:00"


In [54]:
input_data = pd.DataFrame(json.loads(data_json)["data"])
input_data

Unnamed: 0,building_id,datetime,consumption
0,68,2022-11-10 01:00:00,28.020
1,47,2022-11-15 01:00:00,10.453
2,41,2023-01-03 01:00:00,20.887
3,26,2023-01-16 01:00:00,3.123
4,44,2023-03-14 01:00:00,3.180
...,...,...,...
64,57,2023-04-02 01:00:00,72.128
65,58,2023-04-02 01:00:00,154.896
66,64,2023-04-02 01:00:00,208.882
67,59,2023-04-02 01:00:00,73.703


Save json data

In [55]:
with open("../deployment/data_json_mlforecast.json", "w") as file:
    json.dump(data_dict, file, indent=4)

In [56]:
with open("../deployment/data_json_mlforecast.json", "r") as file:
    loaded_data = json.load(file)

In [57]:
input_data = pd.DataFrame(loaded_data["data"])
input_data

Unnamed: 0,building_id,datetime,consumption
0,68,2022-11-10 01:00:00,28.020
1,47,2022-11-15 01:00:00,10.453
2,41,2023-01-03 01:00:00,20.887
3,26,2023-01-16 01:00:00,3.123
4,44,2023-03-14 01:00:00,3.180
...,...,...,...
64,57,2023-04-02 01:00:00,72.128
65,58,2023-04-02 01:00:00,154.896
66,64,2023-04-02 01:00:00,208.882
67,59,2023-04-02 01:00:00,73.703


In [58]:
input_data.dtypes

building_id      int64
datetime        object
consumption    float64
dtype: object

In [59]:
# input_data["datetime"] = pd.to_datetime(input_data["datetime"], unit="ms")  # if in 'epoch' format
datetimes = input_data.drop(columns=["consumption"])
input_data["datetime"] = pd.to_datetime(input_data["datetime"])  # if in 'iso'
input_data
input_data.dtypes

building_id             int64
datetime       datetime64[ns]
consumption           float64
dtype: object

In [60]:
fcst.update(input_data)
predictions = fcst.predict(24)
predictions





Unnamed: 0,building_id,datetime,model
0,0,2023-04-02 02:00:00,934.808960
1,0,2023-04-02 03:00:00,927.154795
2,0,2023-04-02 04:00:00,918.283115
3,0,2023-04-02 05:00:00,919.476662
4,0,2023-04-02 06:00:00,910.259709
...,...,...,...
1651,68,2022-11-10 21:00:00,71.187151
1652,68,2022-11-10 22:00:00,59.474685
1653,68,2022-11-10 23:00:00,48.408205
1654,68,2022-11-11 00:00:00,41.683696


In [61]:
output = []
for building_id in predictions["building_id"].unique():
    output.append(
        {
            "building_id": int(building_id),
            "datetime": datetimes.loc[datetimes["building_id"] == building_id, "datetime"].values[0],
            "forecasts": predictions.loc[predictions["building_id"] == building_id, "model"].to_list(),
        }
    )

In [62]:
output_json = json.dumps(output, indent=4)
print(output_json[:1000])

[
    {
        "building_id": 0,
        "datetime": "2023-04-02 01:00:00",
        "forecasts": [
            934.8089598131903,
            927.1547946757989,
            918.2831147202858,
            919.4766615504379,
            910.2597093332254,
            871.7239108249272,
            744.8209197276753,
            553.5063776598045,
            392.29275367037565,
            307.2543318964081,
            242.79173472015745,
            231.78524241636885,
            240.92326520985213,
            278.5958106682852,
            351.7306320502672,
            500.3361883023582,
            761.3913397094235,
            958.5858283338631,
            1088.7908245165563,
            1130.0601012391849,
            1096.6522369491977,
            1028.7695985501118,
            987.8557784881514,
            976.7136764618655
        ]
    },
    {
        "building_id": 1,
        "datetime": "2023-04-02 01:00:00",
        "forecasts": [
            14.474451842668007,
  

Create a ``prediction`` field that holdatetime the predictions

In [63]:
output = {"predictions": output}
output = json.dumps(output, indent=4)
print(output[:500])

{
    "predictions": [
        {
            "building_id": 0,
            "datetime": "2023-04-02 01:00:00",
            "forecasts": [
                934.8089598131903,
                927.1547946757989,
                918.2831147202858,
                919.4766615504379,
                910.2597093332254,
                871.7239108249272,
                744.8209197276753,
                553.5063776598045,
                392.29275367037565,
                307.2543318964081,
            


# Visualization