In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import shap
import lime

# import dask.dataframe as dd


In [10]:
# Directory that holds the pre-processed lag-1-day CSV exports.
save_dir = "../../exps"

# Read the six CSVs in a fixed order and bind them in one unpacking:
#   X, y             -> the "X_train" / "y_train" files (full training data)
#   X_train, y_train -> the "*_train_train" files (fitting split)
#   X_valid, y_valid -> the "*_test_train" files (held-out split used for validation)
# index_col=None: no CSV column is promoted to the DataFrame index.
X, y, X_train, X_valid, y_train, y_valid = (
    pd.read_csv(f'{save_dir}/{stem}_lag1_days_pre_processing.csv', index_col=None)
    for stem in (
        'X_train',
        'y_train',
        'X_train_train',
        'X_test_train',
        'y_train_train',
        'y_test_train',
    )
)


In [12]:
# Remove the 'Unnamed: 0' column from every frame.
# Re-assigning (instead of inplace=True) together with errors='ignore' makes
# this cell idempotent: running it a second time no longer raises KeyError
# because the column is already gone.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = y.drop(columns=['Unnamed: 0'], errors='ignore')
X_train = X_train.drop(columns=['Unnamed: 0'], errors='ignore')
X_valid = X_valid.drop(columns=['Unnamed: 0'], errors='ignore')
y_train = y_train.drop(columns=['Unnamed: 0'], errors='ignore')
y_valid = y_valid.drop(columns=['Unnamed: 0'], errors='ignore')
# Sanity check: feature/target row counts must match within each split.
print(X.shape, y.shape, X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(67714, 93) (67714, 1) (45368, 93) (45368, 1) (22346, 93) (22346, 1)


### Build a Decision Tree and improve it on the validation set

In [11]:
# Baseline decision tree: default hyperparameters, fixed seed, fitted on the
# training split. fit() returns the estimator itself, so it can be chained.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
# Keep the fitted estimator as the cell's displayed value.
dt_model



In [None]:
# Score the baseline tree on the held-out validation split.
dt_valid_predictions = dt_model.predict(X_valid)
# Evaluate MAE and R2 against the same predictions, in that order.
dt_valid_mae, dt_valid_r2 = (
    metric(y_valid, dt_valid_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"Decision Tree - Validation MAE: {dt_valid_mae:.3f}, Validation R2: {dt_valid_r2:.3f}")


#### Hyperparameter tuning

In [None]:
# Search space for the tree: 4 x 3 x 3 = 36 candidate settings.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search on the training split, ranked by (negated) MAE,
# using all available cores.
dt_grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid_dt,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
dt_grid_search.fit(X_train, y_train)

print(f"Best parameters for Decision Tree: {dt_grid_search.best_params_}")

# Evaluate the best estimator found by the search on the validation split.
# Note: this overwrites the baseline dt_valid_* values from the earlier cell.
dt_best_model = dt_grid_search.best_estimator_
dt_valid_predictions = dt_best_model.predict(X_valid)
dt_valid_mae, dt_valid_r2 = (
    metric(y_valid, dt_valid_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"Decision Tree - Validation MAE: {dt_valid_mae:.3f}, Validation R2: {dt_valid_r2:.3f}")

### Build LightGBM and improve it on the validation set

In [None]:
# Train LightGBM model
lgbm_model = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgbm_model.fit(X_train, y_train)

In [None]:
# Score the baseline LightGBM model on the held-out validation split.
lgbm_model_predictions = lgbm_model.predict(X_valid)
# Evaluate MAE and R2 against the same predictions, in that order.
lgbm_model_mae, lgbm_model_r2 = (
    metric(y_valid, lgbm_model_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"LGBM model - Validation MAE: {lgbm_model_mae:.3f}, Validation R2: {lgbm_model_r2:.3f}")

#### Hyperparameter tuning

In [None]:
# Search space for LightGBM: 2 x 2 x 2 = 8 candidate settings.
param_grid_lgbm = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}

# learning_rate and n_estimators are not set on the base estimator: the grid
# always overrides them, so values given here would never be used.
lgb_grid_search = GridSearchCV(estimator=lgb.LGBMRegressor(objective='regression'),
                              param_grid=param_grid_lgbm,
                              scoring='neg_mean_absolute_error',
                              cv=5, n_jobs=-1, verbose=2)

# Flatten the single-column target DataFrame to 1-D so the CV fits are not
# handed a column vector.
lgb_grid_search.fit(X_train, y_train.to_numpy().ravel())

# Report the winning configuration, as the Decision Tree tuning cell does.
print(f"Best parameters for LGBM: {lgb_grid_search.best_params_}")

# Evaluate the best estimator found by the search on the validation split.
lgb_best_model = lgb_grid_search.best_estimator_
lgb_valid_predictions = lgb_best_model.predict(X_valid)
lgb_valid_mae = mean_absolute_error(y_valid, lgb_valid_predictions)
lgb_valid_r2 = r2_score(y_valid, lgb_valid_predictions)
# Label fixed: these are LightGBM results, not Random Forest.
print(f"LGBM model - Validation MAE: {lgb_valid_mae:.3f}, Validation R2: {lgb_valid_r2:.3f}")


### Build a Random Forest and improve it on the validation set

In [None]:
# Baseline random forest: 100 trees, fixed seed.
# n_jobs=-1 builds the trees on all cores; with a fixed random_state the
# fitted forest does not depend on the number of workers.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# y_train is a single-column DataFrame (n, 1); passing it as-is makes
# scikit-learn emit a DataConversionWarning, so flatten it to shape (n,).
rf.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Score the baseline random forest on the held-out validation split.
rf_predictions = rf.predict(X_valid)
# Evaluate MAE and R2 against the same predictions, in that order.
rf_mae, rf_r2 = (
    metric(y_valid, rf_predictions)
    for metric in (mean_absolute_error, r2_score)
)
print(f"Random Forest- Validation MAE: {rf_mae:.3f}, Validation R2: {rf_r2:.3f}")

#### Hyperparameter tuning

In [None]:
# Search space for the forest: 2 x 3 x 2 x 2 = 24 candidate settings.
param_grid_rf = {
    'n_estimators': [50,100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4]
}
# Exhaustive 5-fold search ranked by (negated) MAE. Parallelism is applied at
# the search level (n_jobs=-1), so the inner forests are left single-threaded.
rf_grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                              param_grid=param_grid_rf,
                              scoring='neg_mean_absolute_error',
                              cv=5, n_jobs=-1, verbose=2)

# Flatten the single-column target DataFrame to 1-D; otherwise every one of
# the 24 x 5 CV fits emits scikit-learn's column-vector DataConversionWarning.
rf_grid_search.fit(X_train, y_train.to_numpy().ravel())

# Report the winning configuration, as the Decision Tree tuning cell does.
print(f"Best parameters for Random Forest: {rf_grid_search.best_params_}")

# Evaluate the best estimator found by the search on the validation split.
rf_best_model = rf_grid_search.best_estimator_
rf_valid_predictions = rf_best_model.predict(X_valid)
rf_valid_mae = mean_absolute_error(y_valid, rf_valid_predictions)
rf_valid_r2 = r2_score(y_valid, rf_valid_predictions)
print(f"Random Forest - Validation MAE: {rf_valid_mae:.3f}, Validation R2: {rf_valid_r2:.3f}")
