In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import shap
import lime
import lime.lime_tabular
import dask.dataframe as dd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# from dask.distributed import Client
# client = Client(memory_limit='10GB')  # Increase the memory limit
# # client.close()


In [3]:
# Directory (relative to this notebook) that holds the experiment artifacts.
save_dir = "../../exps"

In [4]:
# Read the pre-processed training CSV from the experiment directory.
train_csv_path = f'{save_dir}/train_lag1_pre_processing.csv'
df = pd.read_csv(train_csv_path, index_col=None)

# Alternative loader through dask, kept disabled:
# df = dd.read_csv(f'{save_dir}/train_lag1_pre_processing.csv')
# df = df.compute()

# Report whether at least one cell of the frame is missing.
has_null = df.isna().to_numpy().any()
print("DataFrame có chứa giá trị NaN không?", has_null)

# Row count is the cell's displayed value.
len(df)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Remove the listed index/id/date columns, then discard every row that still
# contains a missing value.
# `errors='ignore'` keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
# The chain rebinds `df` to a new frame instead of mutating in place.
columns_to_drop = ['Unnamed: 0', 'row_id', 'date', 'origin_date', 'origin_date_right']
df = (
    df
    .drop(columns=columns_to_drop, errors='ignore')
    .dropna()
)
# df.isnull().sum()
len(df)

1622156

In [None]:
# Emit every column name followed by a comma, all on a single line.
print(''.join(f"{column}," for column in df.columns), end='')

county,is_business,product_type,is_consumption,prediction_unit_id,lowest_price_per_mwh,highest_price_per_mwh,data_block_id,eic_count,installed_capacity,euros_per_mwh,data_block_id_right,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation,hours_ahead_fl,temperature_fl,dewpoint_fl,cloudcover_high_fl,cloudcover_low_fl,cloudcover_mid_fl,cloudcover_total_fl,10_metre_u_wind_component_fl,10_metre_v_wind_component_fl,direct_solar_radiation_fl,surface_solar_radiation_downwards_fl,snowfall_fl,total_precipitation_fl,hours_ahead_fd_7d,temperature_fd_7d,dewpoint_fd_7d,cloudcover_high_fd_7d,cloudcover_low_fd_7d,cloudcover_mid_fd_7d,cloudcover_total_fd_7d,10_metre_u_wind_component_fd_7d,10_metre_v_wind_component_fd_7d,direct_solar_radiation_fd_7d,surface_solar_radiation_downwards_fd_7d,snowfall_fd_7d,total_precipitation_f

In [None]:
# Separate the 'target' column (response) from the remaining columns (predictors).
y = df['target']
X = df.drop(columns='target')

In [None]:
# Hold out the last 33% of rows as the test set. With shuffle=False the row
# order is preserved, so random_state has no influence on the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=False,
)

In [None]:


# Cross-validation settings read by check_model().
num_folds = 40
seed = 7
scorers = {
    'mae': make_scorer(mean_absolute_error),
    'r2': make_scorer(r2_score),
}

# Defining models
# Each entry is a (short label, unfitted estimator) pair.
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('Lasso', Lasso(alpha=0.1)),
    ('SVR', SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('RF', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('GBR', GradientBoostingRegressor(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)),
    ('LGBM', lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.1, n_estimators=100)),
]

# Function to evaluate models
def check_model(name, model, X, y, scoring):
    """Cross-validate ``model`` on ``(X, y)`` and print a one-line summary.

    The folds come from a shuffled ``KFold`` configured with the notebook-level
    ``num_folds`` and ``seed`` globals.

    Parameters
    ----------
    name : label used in the printed summary.
    model : estimator forwarded to ``cross_val_score``.
    X, y : features and target forwarded to ``cross_val_score``.
    scoring : scoring identifier forwarded to ``cross_val_score`` and echoed
        in the summary line.

    Returns
    -------
    The per-fold score array produced by ``cross_val_score``.
    """
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    fold_scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
    print(f"{scoring} {name}: {fold_scores.mean():.3f} ({fold_scores.std():.3f})")
    return fold_scores

results_mae = []
names_mae = []
results_r2 = []
names_r2 = []

# Evaluating models
# A single cross_validate() pass per model scores both metrics on the same
# fitted folds. Calling cross_val_score once per metric would fit every fold
# twice, doubling the (already long) runtime for identical fold splits.
metric_names = ['neg_mean_absolute_error', 'r2']
for name, model in models:
    print(f"Evaluating {name}...")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_scores = cross_validate(model, X, y, cv=kfold, scoring=metric_names)

    # cross_validate exposes each metric under the key 'test_<metric name>'.
    mae_result = cv_scores['test_neg_mean_absolute_error']
    r2_result = cv_scores['test_r2']
    for metric, fold_scores in zip(metric_names, (mae_result, r2_result)):
        print(f"{metric} {name}: {fold_scores.mean():.3f} ({fold_scores.std():.3f})")

    # Scores are stored as returned by scikit-learn (MAE stays negated).
    results_mae.append(mae_result)
    names_mae.append(name)
    results_r2.append(r2_result)
    names_r2.append(name)

# Plotting MAE
# The stored scores come from the 'neg_mean_absolute_error' scorer, so they are
# negated MAE values (<= 0). Flip the sign for plotting so that the axis
# labelled 'MAE' shows the actual error magnitude.
mae_per_model = [-np.asarray(fold_scores) for fold_scores in results_mae]

fig_mae, ax_mae = plt.subplots()
fig_mae.suptitle('Comparison of MAE among models')
ax_mae.boxplot(mae_per_model)
ax_mae.set_xticklabels(names_mae)
ax_mae.set_ylabel('MAE')
plt.show()

# Plotting R2
# One box per model, built from that model's per-fold R2 scores.
fig_r2, ax_r2 = plt.subplots()
fig_r2.suptitle('Comparison of R2 among models')
ax_r2.boxplot(results_r2)
ax_r2.set_xticklabels(names_r2)
ax_r2.set_ylabel('R2 Score')
plt.show()


Evaluating LR...
neg_mean_absolute_error LR: -0.117 (0.000)
r2 LR: 0.880 (0.002)
Evaluating Ridge...
neg_mean_absolute_error Ridge: -0.117 (0.000)
r2 Ridge: 0.880 (0.002)
Evaluating Lasso...
neg_mean_absolute_error Lasso: -0.102 (0.001)


KeyboardInterrupt: 

In [None]:
# Train Decision Tree model
# fit() returns the estimator itself, so construction and fitting can be chained;
# the fitted estimator is the cell's displayed value.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_model



In [None]:
# Feature importances of the fitted tree, plus the feature positions ordered
# from most to least important (ascending argsort, reversed).
importances = dt_model.feature_importances_
ascending_order = np.argsort(importances)
indices = np.flip(ascending_order)

In [None]:
# List every training feature with its importance, highest first.
print("Feature ranking:")
for rank, col_idx in enumerate(indices[:X_train.shape[1]], start=1):
    print(f"{rank}. feature {X_train.columns[col_idx]} ({importances[col_idx]})")

# Visualize Decision Tree
# dt_model is grown without a depth limit, so drawing every node is neither
# readable nor practical; only the top levels are rendered (max_depth=3).
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
plot_tree(
    dt_model,
    max_depth=3,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    ax=ax_tree,
)
ax_tree.set_title("Decision Tree Structure")
plt.show()

In [None]:
# Train LightGBM model
# Hyper-parameters are collected in one mapping and unpacked into the regressor.
lgbm_params = {
    'objective': 'regression',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'n_estimators': 100,
}
lgbm_model = lgb.LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_train, y_train)

In [None]:
def _shap_explain(model):
    """Build a SHAP explainer for ``model`` on X_train and explain X_test.

    Returns the ``(explainer, shap_values)`` pair.
    """
    explainer = shap.Explainer(model, X_train)
    return explainer, explainer(X_test)


# Evaluating SHAP for Decision Tree
explainer_dt, shap_values_dt = _shap_explain(dt_model)

# Evaluating SHAP for LightGBM
explainer_lgbm, shap_values_lgbm = _shap_explain(lgbm_model)

# Evaluating LIME for Decision Tree
# `lime.lime_tabular` is a submodule: it is only reachable after an explicit
# `import lime.lime_tabular` (a bare `import lime` does not load it).
# LIME draws random perturbations around the instance, so a fixed random_state
# is passed to make the explanations reproducible across runs.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=list(X_train.columns),
    verbose=True,
    mode='regression',
    random_state=42,
)
# Both models are explained on the same instance: the first test row.
first_test_row = X_test.iloc[0].values
lime_exp_dt = lime_explainer.explain_instance(first_test_row, dt_model.predict)

# Evaluating LIME for LightGBM
lime_exp_lgbm = lime_explainer.explain_instance(first_test_row, lgbm_model.predict)

In [None]:


# SHAP summary (bar) plots: Decision Tree first, then LightGBM.
summary_inputs = (
    (shap_values_dt, "SHAP Decision Tree Feature Importance"),
    (shap_values_lgbm, "SHAP LightGBM Feature Importance"),
)
for model_shap_values, plot_title in summary_inputs:
    shap.summary_plot(model_shap_values, X_test, plot_type="bar", title=plot_title)

# Render the LIME explanations inline: Decision Tree first, then LightGBM.
for lime_explanation in (lime_exp_dt, lime_exp_lgbm):
    lime_explanation.show_in_notebook()


# Detailed force plot for one sample in the test set
# force_plot takes the SHAP values as a NumPy array; the explainers above
# return Explanation objects, so the raw `.values` of the first row are passed
# instead of the Explanation slice itself.
first_test_sample = X_test.iloc[0, :]
shap.force_plot(explainer_dt.expected_value, shap_values_dt[0].values, first_test_sample, matplotlib=True)
shap.force_plot(explainer_lgbm.expected_value, shap_values_lgbm[0].values, first_test_sample, matplotlib=True)