In [1]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from catboost import CatBoostRegressor

from pycaret import regression

In [2]:
root = "../data/"

data_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_prices_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_prices_cols = ['forecast_date', 'euros_per_mwh']
forecast_weather_cols = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_weather_cols = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure','cloudcover_total','cloudcover_low','cloudcover_mid','cloudcover_high','windspeed_10m','winddirection_10m','shortwave_radiation','direct_solar_radiation','diffuse_radiation','latitude','longitude']
location_cols = ['longitude', 'latitude', 'county']
target_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime']

In [3]:
df_data = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas_prices = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_prices_cols, try_parse_dates=True)
df_electricity_prices = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_prices_cols, try_parse_dates=True)
df_forecast_weather = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_weather_cols, try_parse_dates=True)
df_historical_weather = pl.read_csv(os.path.join(root, "historical_weather.csv"), columns=historical_weather_cols, try_parse_dates=True)
df_weather_station_to_county_mapping = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)
df_target = df_data.select(target_cols)

schema_data = df_data.schema
schema_client = df_client.schema
schema_gas  = df_gas_prices.schema
schema_electricity = df_electricity_prices.schema
schema_forecast = df_forecast_weather.schema
schema_historical = df_historical_weather.schema
schema_target = df_target.schema

In [4]:
def generate_features(
        df_data, 
        df_client, 
        df_gas_prices, 
        df_electricity_prices, 
        df_forecast_weather, 
        df_historical_weather, 
        df_weather_station_to_county_mapping, 
        df_target
):
    df_data = (
        df_data
        .with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )
    )

    df_gas_prices = (
        df_gas_prices
        .rename({"forecast_date": "date"})
    )

    df_electricity_prices = (
        df_electricity_prices
        .rename({"forecast_date": "datetime"})
    )

    df_weather_station_to_county_mapping = (
        df_weather_station_to_county_mapping
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32)
        )
    )

    # sum of all product_type targets related to ["datetime", "county", "is_business", "is_consumption"]
    df_target_all_type_sum = (
        df_target
        .group_by(["datetime", "county", "is_business", "is_consumption"]).sum()
        .drop("product_type")
    )

    df_forecast_weather = (
        df_forecast_weather
        .rename({"forecast_datetime": "datetime"})
        .filter(pl.col("hours_ahead") >= 24) # we don't need forecast for today
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
            # datetime for forecast in a different timezone
#             pl.col('datetime').dt.convert_time_zone("Europe/Bucharest").dt.replace_time_zone(None).cast(pl.Datetime("us")),
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )

    df_historical_weather = (
        df_historical_weather
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
#             pl.col("datetime") + pl.duration(hours=37)
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )

    # creating average forecast characteristics for all weather stations
    df_forecast_weather_date = (
        df_forecast_weather
        .group_by("datetime").mean()
        .drop("county")
    )

    # creating average forecast characteristics for weather stations related to county
    df_forecast_weather_local = (
        df_forecast_weather
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )

    # creating average historical characteristics for all weather stations
    df_historical_weather_date = (
        df_historical_weather
        .group_by("datetime").mean()
        .drop("county")
    )

    # creating average historical characteristics for weather stations related to county
    df_historical_weather_local = (
        df_historical_weather
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )

    df_data = (
        df_data
        # pl.duration(days=1) shifts datetime to join lag features (usually we join last available values)
        .join(df_gas_prices.with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date)), on="date", how="left")
        .join(df_client.with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date)), on=["county", "is_business", "product_type", "date"], how="left")
        .join(df_electricity_prices.with_columns(pl.col("datetime") + pl.duration(days=1)), on="datetime", how="left")

        # lag forecast_weather features (24 hours * days)
        .join(df_forecast_weather_date, on="datetime", how="left", suffix="_fd")
        .join(df_forecast_weather_local, on=["county", "datetime"], how="left", suffix="_fl")
        .join(df_forecast_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_fd_7d")
        .join(df_forecast_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_fl_7d")

        # lag historical_weather features (24 hours * days)
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=2)), on="datetime", how="left", suffix="_hd_2d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=2)), on=["county", "datetime"], how="left", suffix="_hl_2d")
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_hd_7d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_hl_7d")

        # lag target features (24 hours * days)
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_1"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_2"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=4)).rename({"target": "target_3"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=5)).rename({"target": "target_4"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=6)).rename({"target": "target_5"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_6"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_7"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")

        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_1"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_2"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_6"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_7"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")


        .with_columns(
            pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
            pl.col("datetime").dt.hour().alias("hour"),
            pl.col("datetime").dt.day().alias("day"),
            pl.col("datetime").dt.weekday().alias("weekday"),
            pl.col("datetime").dt.month().alias("month"),
            pl.col("datetime").dt.year().alias("year"),
        )

        .with_columns(
            pl.concat_str("county", "is_business", "product_type", "is_consumption", separator="_").alias("segment"),
        )

        # cyclical features encoding https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
        .with_columns(
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        )

        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )

        .drop("date", "datetime", "hour", "dayofyear")
    )

    return df_data

In [5]:
def to_pandas(X, y=None):
    cat_cols = ["county", "is_business", "product_type", "is_consumption", "segment"]

    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()

    df = df.set_index("row_id")
    df[cat_cols] = df[cat_cols].astype("category")

    df["target_mean"] = df[[f"target_{i}" for i in range(1, 7)]].mean(1)
    df["target_std"] = df[[f"target_{i}" for i in range(1, 7)]].std(1)
    df["target_ratio"] = df["target_6"] / (df["target_7"] + 1e-3)

    return df

In [6]:
df_data, y = df_data.drop("target"), df_data.select("target")

df_train_features = generate_features(
    df_data,
    df_client,
    df_gas_prices,
    df_electricity_prices,
    df_forecast_weather,
    df_historical_weather,
    df_weather_station_to_county_mapping,
    df_target
)

df_train_features = to_pandas(df_train_features, y)
# a little proportion of target values are null
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [7]:
n = len(df_train_features)

df_train_features, df_test_features = df_train_features[:int(n * 0.009)], df_train_features[int(n * 0.009):int(n * 0.01)]

In [8]:
regression.setup(data=df_train_features, test_data=df_test_features, target='target')

Unnamed: 0,Description,Value
0,Session id,4795
1,Target,target
2,Target type,Regression
3,Original data shape,"(20178, 141)"
4,Transformed data shape,"(20178, 100)"
5,Transformed train set shape,"(18160, 100)"
6,Transformed test set shape,"(2018, 100)"
7,Ordinal features,2
8,Numeric features,135
9,Categorical features,5


<pycaret.regression.oop.RegressionExperiment at 0x2a0695ba0>

In [9]:
regression.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [10]:
model = regression.compare_models(include=[
    'lightgbm',
    'catboost',
    'dt',
    'huber',
    'br',
    'omp',
    'ridge',
    'xgboost'
], sort='MAE', fold=3, n_select=5, turbo=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,116.5573,136997.8902,326.5576,0.6726,1.7197,333.8036,2.3933
dt,Decision Tree Regressor,120.303,165493.4431,363.8852,0.6027,1.2403,165.223,0.6867
xgboost,Extreme Gradient Boosting,128.6751,143597.8893,335.158,0.6566,1.9264,397.4984,0.6667
lightgbm,Light Gradient Boosting Machine,129.2774,152730.5589,341.1179,0.6353,1.6871,382.3649,1.1467
huber,Huber Regressor,155.6766,176071.8776,376.306,0.5769,2.3178,699.434,0.2367
br,Bayesian Ridge,162.0681,155469.268,363.4369,0.6245,2.4488,766.4515,0.5067
omp,Orthogonal Matching Pursuit,167.3812,166523.2072,380.313,0.5969,2.5024,775.0752,0.1233
ridge,Ridge Regression,182.8392,166979.2875,376.9361,0.5968,2.5914,836.3159,0.1267


In [11]:
model

[<catboost.core.CatBoostRegressor at 0x2d0eb4880>,
 DecisionTreeRegressor(random_state=4795),
 XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, random_state=4795, ...),
 LGBMRegressor(n_jobs=-1, random_state=4795),
 HuberRegressor()]

In [12]:
tuned_model = [
    regression.tune_model(
        estimator=m,
        n_iter=5,
        optimize='MAE',
        search_library='scikit-optimize',
        search_algorithm='bayesian',
    ) for m in model
]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,165.9774,106603.99,326.5027,0.7634,2.685,812.7513
1,139.0953,45215.476,212.6393,0.8719,3.0483,1210.6492
2,175.2669,122516.2052,350.0231,0.7484,2.0692,235.114
3,134.8934,65078.2237,255.1043,0.8472,3.0517,1457.4413
4,138.7144,89246.1937,298.741,0.8068,2.499,548.1149
5,148.4575,105189.054,324.3286,0.6525,2.2514,293.1731
6,106.366,30687.3902,175.1782,0.8797,3.0407,1538.0108
7,134.7459,71525.0375,267.4417,0.7364,2.1043,522.4538
8,133.8079,95495.9707,309.0242,0.8051,2.7187,1140.4116
9,131.8635,105654.7038,325.0457,0.7918,2.858,968.0904


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,86.6472,43253.7154,207.9753,0.904,1.5693,47.6383
1,52.1402,24923.7494,157.8726,0.9294,1.2823,29.5973
2,100.0661,40415.2752,201.0355,0.917,2.3227,1.4465
3,46.5215,19573.5436,139.9055,0.9541,0.7874,20.0362
4,58.2521,20792.7647,144.197,0.955,0.7946,0.7086
5,108.32,156505.9265,395.6083,0.483,0.8239,1.7189
6,33.433,30013.6325,173.2444,0.8823,0.6651,1.3109
7,69.4267,28593.6439,169.0966,0.8946,0.97,2.3629
8,51.7105,23915.2388,154.6455,0.9512,0.5867,1.6824
9,45.8198,19081.0737,138.1343,0.9624,0.5808,1.2507


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,185.165,134007.5469,366.0704,0.7026,2.7606,811.1369
1,166.2867,71000.75,266.4597,0.7989,3.1478,1177.423
2,189.757,153464.4375,391.7454,0.6848,2.1162,254.2113
3,107.7612,49826.8867,223.2194,0.883,2.8128,1116.1724
4,94.6503,42787.6836,206.8518,0.9074,2.1825,332.7524
5,129.8639,103362.8984,321.501,0.6585,1.93,167.6554
6,81.1099,30954.1836,175.938,0.8786,2.681,936.1866
7,122.687,78001.9531,279.2883,0.7125,1.8183,328.7289
8,86.8082,42800.3945,206.8826,0.9126,2.3561,671.6638
9,87.0272,45287.1445,212.8078,0.9107,2.5062,595.6194


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,102.7575,65252.4147,255.4455,0.8552,2.1316,409.9679
1,78.463,25245.9991,158.8899,0.9285,2.4318,698.2025
2,110.006,74815.28,273.5238,0.8463,1.6025,88.4876
3,76.6315,29896.1607,172.9051,0.9298,2.3768,510.2787
4,76.5935,46715.161,216.1369,0.8989,1.8688,188.1326
5,101.8636,73584.302,271.2643,0.7569,1.673,85.6116
6,61.3684,24777.6003,157.409,0.9028,2.3147,475.8649
7,86.0623,37725.9235,194.2316,0.861,1.5753,193.5891
8,68.6627,41395.4052,203.4586,0.9155,2.0387,374.9995
9,73.2114,42286.2576,205.6362,0.9167,2.1512,317.9492


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10872
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10873
[LightGBM] [Info] Number of data points in the train set: 16344, number of used features: 93
[LightGBM] [Info] Number of data points in the train set: 16344, number of used features: 93
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11211
[LightGBM] [Info] Number of data p

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,111.5968,71085.4953,266.6186,0.8422,2.1434,735.0069
1,77.6846,25735.076,160.4216,0.9271,2.3278,914.3112
2,114.4018,73318.5509,270.774,0.8494,1.3978,92.2566
3,69.4468,25425.0842,159.4525,0.9403,2.0635,622.899
4,74.001,31114.7785,176.3938,0.9327,1.626,214.8776
5,143.0879,141351.0495,375.9668,0.533,1.6066,161.6591
6,67.7572,28084.2497,167.5836,0.8899,2.0301,808.3636
7,131.1622,95558.4737,309.1253,0.6478,1.6885,283.3919
8,55.8372,15586.2397,124.8449,0.9682,1.7535,403.339
9,53.7608,18486.8442,135.9663,0.9636,1.8438,351.3732


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [15]:
blended = regression.blend_models(
    estimator_list=model,
    fold=5,
    optimize='MAE'
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,83.7499,39743.9411,199.3588,0.9012,2.0707,491.5381
1,55.595,20194.043,142.1057,0.9559,1.3355,108.9718
2,71.3417,45619.1338,213.5864,0.8807,0.9332,35.7885
3,58.7735,28258.7552,168.1034,0.8929,1.2702,188.9099
4,61.5042,41008.5578,202.5057,0.9178,1.2527,116.3473
Mean,66.1929,34964.8862,185.132,0.9097,1.3725,188.3111
Std,10.2382,9338.3974,26.2874,0.026,0.3759,159.1751


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9936
[LightGBM] [Info] Number of data points in the train set: 14528, number of used features: 91
[LightGBM] [Info] Start training from score 211.423080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10416
[LightGBM] [Info] Number of data points in the train set: 14528, number of used features: 93
[LightGBM] [Info] Start training from score 219.685266
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 

In [16]:
final_model = regression.finalize_model(blended)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11605
[LightGBM] [Info] Number of data points in the train set: 20178, number of used features: 93
[LightGBM] [Info] Start training from score 215.240460


In [17]:
predicted = regression.predict_model(final_model, df_test_features)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,16.4441,1199.3381,34.6315,0.9982,0.6613,40.5498
