In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import shap
import lime
import os

# import dask.dataframe as dd


In [None]:
# Root folder for experiment artifacts (relative path, resolved against the
# current working directory).
exps_dir = "../../exps"
# Folder used by this notebook for its input CSVs and saved results.
save_dir = f"{exps_dir}/exp"

# os.makedirs creates every missing parent (so exps_dir is created too), and
# exist_ok=True makes the call a no-op when the folder is already present, so
# no separate os.path.exists() check is needed.
os.makedirs(save_dir, exist_ok=True)

In [5]:
# Lag (in days) that is embedded in the preprocessed file names.
ndays = 1


def _read_split(stem):
    """Read `<save_dir>/<stem>_lag<ndays>_pre_processing.csv` as a DataFrame."""
    return pd.read_csv(f'{save_dir}/{stem}_lag{ndays}_pre_processing.csv', index_col=None)


# Full training data (features / target).
X, y = _read_split('X_train'), _read_split('y_train')
# Training portion of the split.
X_train, y_train = _read_split('X_train_train'), _read_split('y_train_train')
# The "test_train" files are the ones used as the validation split here.
X_valid, y_valid = _read_split('X_test_train'), _read_split('y_test_train')


In [6]:
# Remove the 'Unnamed: 0' column from every frame.
# The frames are rebound to new objects instead of being mutated in place, and
# errors='ignore' skips frames that no longer have the column, so re-running
# this cell does not raise KeyError.
X, y, X_train, X_valid, y_train, y_valid = (
    frame.drop(columns=['Unnamed: 0'], errors='ignore')
    for frame in (X, y, X_train, X_valid, y_train, y_valid)
)
print(X.shape, y.shape, X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(1622156, 102) (1622156, 1) (1086844, 102) (1086844, 1) (535312, 102) (535312, 1)


### Build a Decision Tree and improve it on the validation set

In [7]:
# Baseline: a decision tree regressor with default hyperparameters and a fixed
# seed, fitted on the training split.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X=X_train, y=y_train)



In [8]:
# Score the baseline tree on the validation split (MAE and R^2).
dt_valid_predictions = dt_model.predict(X_valid)
dt_valid_mae, dt_valid_r2 = (
    metric(y_valid, dt_valid_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"Decision Tree - Validation MAE: {dt_valid_mae:.3f}, Validation R2: {dt_valid_r2:.3f}")


Decision Tree - Validation MAE: 0.108, Validation R2: 0.888


#### Hyperparameter tuning

In [9]:
# Search space for the tree: 4 * 3 * 3 = 36 candidate combinations.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search on the training split, scored by
# negated MAE and run on all available cores.
dt_grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid_dt,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=2,
)
dt_grid_search.fit(X_train, y_train)

print(f"Best parameters for Decision Tree: {dt_grid_search.best_params_}")

# Evaluate the best estimator found by the search on the validation split.
# NOTE: this rebinds dt_valid_predictions / dt_valid_mae / dt_valid_r2, so the
# baseline tree's values are no longer available afterwards.
dt_best_model = dt_grid_search.best_estimator_
dt_valid_predictions = dt_best_model.predict(X_valid)
dt_valid_mae, dt_valid_r2 = (
    mean_absolute_error(y_valid, dt_valid_predictions),
    r2_score(y_valid, dt_valid_predictions),
)
print(f"Decision Tree - Validation MAE: {dt_valid_mae:.3f}, Validation R2: {dt_valid_r2:.3f}")

Fitting 20 folds for each of 36 candidates, totalling 720 fits


KeyboardInterrupt: 

### Build an LGBM model and improve it on the validation set

In [None]:
# Baseline LightGBM regressor with explicitly spelled-out settings, fitted on
# the training split.
lgbm_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
)
lgbm_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21293
[LightGBM] [Info] Number of data points in the train set: 45368, number of used features: 93
[LightGBM] [Info] Start training from score -0.025777


In [None]:
# Score the baseline LightGBM model on the validation split (MAE and R^2).
lgbm_model_predictions = lgbm_model.predict(X_valid)
lgbm_model_mae, lgbm_model_r2 = (
    metric(y_valid, lgbm_model_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"LGBM model - Validation MAE: {lgbm_model_mae:.3f}, Validation R2: {lgbm_model_r2:.3f}")

LGBM model - Validation MAE: 0.115, Validation R2: 0.913


#### Hyperparameter tuning

In [None]:
# Search space for LightGBM: 2 * 2 * 2 = 8 candidate combinations.
param_grid_lgbm = dict(
    num_leaves=[31, 50],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200],
)

# Exhaustive 5-fold cross-validated search on the training split, scored by
# negated MAE and run on all available cores. The learning_rate / n_estimators
# given to the base estimator are also keys of the grid, so each candidate
# sets its own values for them.
lgb_grid_search = GridSearchCV(
    lgb.LGBMRegressor(objective='regression', learning_rate=0.1, n_estimators=100),
    param_grid_lgbm,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=2,
)
lgb_grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the validation split.
lgb_best_model = lgb_grid_search.best_estimator_
lgb_valid_predictions = lgb_best_model.predict(X_valid)
lgb_valid_mae, lgb_valid_r2 = (
    mean_absolute_error(y_valid, lgb_valid_predictions),
    r2_score(y_valid, lgb_valid_predictions),
)
print(f"LGBM - Validation MAE: {lgb_valid_mae:.3f}, Validation R2: {lgb_valid_r2:.3f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21293
[LightGBM] [Info] Number of data points in the train set: 45368, number of used features: 93
[LightGBM] [Info] Start training from score -0.025777
Random Forest - Validation MAE: 0.118, Validation R2: 0.904


In [None]:
# Report the best LightGBM configuration found by the grid search.
print(f"Best parameters for lgbm: {lgb_grid_search.best_params_}")

# Persist the best hyperparameters of both searches in one .npz archive, keyed
# by the name of the search object. Both grid searches must have completed
# fitting before this cell is run. The values are plain dicts, which NumPy
# stores as pickled object arrays, so reading them back needs allow_pickle=True.
np.savez(
    f'{save_dir}/best_params_.npz',
    dt_grid_search=dt_grid_search.best_params_,
    lgb_grid_search=lgb_grid_search.best_params_,
)


NameError: name 'lgb_grid_search' is not defined

In [None]:
# Read the saved best hyperparameters back from the .npz archive.
# np.load returns a lazy NpzFile that keeps the archive open; the `with` block
# closes it once every entry has been materialised into a regular dict.
# SECURITY: allow_pickle=True unpickles the stored objects, which can execute
# arbitrary code -- only load archives produced by this notebook / a trusted
# source.
with np.load(f'{save_dir}/best_params_.npz', allow_pickle=True) as saved_params:
    best_params = dict(saved_params)
best_params
