In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import shap
import lime
import os
# from joblib import dump
import joblib
from IPython.display import IFrame
import tempfile
import warnings
import statsmodels.api as sm

warnings.filterwarnings('ignore')
# import dask.dataframe as dd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations for experiment artifacts. The processed CSVs read by the cells
# below are looked up inside save_dir.
exps_dir = "../../exps"
save_dir = f"{exps_dir}/exp"

# Create the directory tree if it does not exist yet. os.makedirs creates any
# missing parent (exps_dir included) and exist_ok=True makes the call a no-op
# when the directory is already there, so no separate existence check is needed.
os.makedirs(save_dir, exist_ok=True)

# Value interpolated into the "lag{ndays}" part of the CSV file names.
ndays = 1


In [3]:
# Read the full feature table and target table, dropping the 'Unnamed: 0'
# column from each (presumably an index column written by an earlier to_csv
# call -- confirm against the pre-processing notebook).
X = pd.read_csv(
    f'{save_dir}/X_train_lag{ndays}_pre_processing.csv', index_col=None
).drop(columns=['Unnamed: 0'])
y = pd.read_csv(
    f'{save_dir}/y_train_lag{ndays}_pre_processing.csv', index_col=None
).drop(columns=['Unnamed: 0'])

In [4]:
# Read the four train/validation CSVs in one pass. Each file is named
# "<split>_train_lag{ndays}_pre_processing.csv" and has its 'Unnamed: 0'
# column removed right after loading.
X_train, X_valid, y_train, y_valid = (
    pd.read_csv(
        f'{save_dir}/{split}_train_lag{ndays}_pre_processing.csv', index_col=None
    ).drop(columns=['Unnamed: 0'])
    for split in ('X_train', 'X_valid', 'y_train', 'y_valid')
)

# Quick sanity check of the loaded shapes.
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(1086844, 102) (1086844, 1) (535312, 102) (535312, 1)


### Analyze the regression relationship

#### Use OLS regression

In [5]:
def analyze_OLS(x, y):
    """Fit an ordinary least squares regression of ``y`` on ``x``.

    ``x`` is passed through ``sm.add_constant`` before fitting, and the model
    is built with ``hasconst=True`` so statsmodels treats the design matrix as
    containing a constant term.

    Args:
        x: Explanatory variables accepted by ``sm.add_constant`` / ``sm.OLS``.
        y: Response variable accepted by ``sm.OLS``.

    Returns:
        The object produced by ``results.summary()`` for the fitted model.
    """
    design = sm.add_constant(x)
    fitted = sm.OLS(y, design, hasconst=True).fit()
    return fitted.summary()


print(analyze_OLS(X, y))

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.881
Model:                            OLS   Adj. R-squared:                  0.881
Method:                 Least Squares   F-statistic:                 1.208e+05
Date:                Tue, 09 Jul 2024   Prob (F-statistic):               0.00
Time:                        04:07:40   Log-Likelihood:            -1.1674e+07
No. Observations:             1622156   AIC:                         2.335e+07
Df Residuals:                 1622056   BIC:                         2.335e+07
Df Model:                          99                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

#### Use linear machine learning models

In [6]:
def print_sorted_coefficients(model, model_name, feature_names):
    """Print a fitted model's coefficients ordered by decreasing magnitude.

    Args:
        model: Object exposing a ``coef_`` array. If the array has more than
            one dimension, only its first row is reported.
        model_name: Label placed at the start of every printed line.
        feature_names: Sequence indexable by position, giving the name that
            belongs to each coefficient.

    Returns:
        None. One line per coefficient is written to stdout.
    """
    weights = model.coef_
    if weights.ndim > 1:
        weights = weights[0]

    # Negating the absolute values makes argsort yield the largest first.
    order = np.argsort(-np.abs(weights))
    for rank, idx in enumerate(order, start=1):
        print(f"{model_name} Coefficient {rank}: {feature_names[idx]}, Coefficient: {weights[idx]}")

In [7]:
# Plain least-squares baseline: fit on the training split, then predict the
# validation split (fit() returns the estimator itself, so the calls chain).
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_valid)

# Report the coefficients by decreasing magnitude, followed by the intercept.
print_sorted_coefficients(
    model=linear_model,
    model_name="Linear Regression",
    feature_names=X_train.columns,
)
print("Linear Regression intercept:", linear_model.intercept_)





Linear Regression Coefficient 1: shortwave_radiation, Coefficient: -19693.53618392226
Linear Regression Coefficient 2: direct_solar_radiation_hd_7d, Coefficient: 14079.636507876614
Linear Regression Coefficient 3: diffuse_radiation, Coefficient: 6611.894871276224
Linear Regression Coefficient 4: shortwave_radiation_hl_7d, Coefficient: -5651.0285514499565
Linear Regression Coefficient 5: direct_solar_radiation_hl_7d, Coefficient: 4126.837632583039
Linear Regression Coefficient 6: diffuse_radiation_hl_7d, Coefficient: 1975.0776332127566
Linear Regression Coefficient 7: installed_capacity, Coefficient: 0.11719447729762444
Linear Regression Coefficient 8: eic_count, Coefficient: -0.05212604255272758
Linear Regression Coefficient 9: is_consumption, Coefficient: 0.04988049807293459
Linear Regression Coefficient 10: dewpoint_hd_7d, Coefficient: 0.03411084986360514
Linear Regression Coefficient 11: temperature_hd_7d, Coefficient: -0.03383909793792379
Linear Regression Coefficient 12: surface_s

In [8]:
# Validation-set scores for the linear baseline.
# NOTE(review): the recorded output shows a strongly negative R² together with
# very large radiation coefficients in the cell above; possibly a symptom of
# collinear features -- confirm before relying on this model.
linear_r2 = r2_score(y_valid, y_pred_linear)
linear_mae = mean_absolute_error(y_valid, y_pred_linear)
print("Linear Regression R²:", linear_r2)
print("Linear Regression MAE:", linear_mae)

Linear Regression R²: -251257.6226528385
Linear Regression MAE: 304.61527718956324


In [9]:
# Ridge regression with alpha=1.0: fit on the training split and predict the
# validation split (fit() returns the estimator itself, so the calls chain).
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_valid)









In [10]:

# Report the ridge coefficients by decreasing magnitude, then the intercept.
print_sorted_coefficients(
    model=ridge_model,
    model_name="Ridge Regression",
    feature_names=X_train.columns,
)
print("Ridge Regression intercept:", ridge_model.intercept_)

Ridge Regression Coefficient 1: installed_capacity, Coefficient: 0.1171958256863419
Ridge Regression Coefficient 2: eic_count, Coefficient: -0.05212670141502252
Ridge Regression Coefficient 3: is_consumption, Coefficient: 0.04988112051838729
Ridge Regression Coefficient 4: temperature_hd_7d, Coefficient: -0.03402736901721808
Ridge Regression Coefficient 5: dewpoint_hd_7d, Coefficient: 0.03398798096716237
Ridge Regression Coefficient 6: surface_solar_radiation_downwards_fl, Coefficient: 0.03160523525814361
Ridge Regression Coefficient 7: surface_solar_radiation_downwards_fd_7d, Coefficient: -0.027105388447861624
Ridge Regression Coefficient 8: surface_solar_radiation_downwards, Coefficient: 0.025008360631364957
Ridge Regression Coefficient 9: dewpoint_fd_7d, Coefficient: -0.02156107510732211
Ridge Regression Coefficient 10: diffuse_radiation, Coefficient: -0.02013582165101051
Ridge Regression Coefficient 11: weekday, Coefficient: -0.01971406933229101
Ridge Regression Coefficient 12: tem