In [10]:
import shap
import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

np.random.seed(42)
features = pd.read_csv('../results/features/all_datasets_features_diff_xg.csv')
c = (features.model == 'GRU') & (features.data == 'aus')
features = features[~c]
c = (features.model == 'GRU') & (features.data == 'solar')
features = features[~c]
features = features[features.eb < 0.8]
features = features.dropna(axis=1)
arima_features = features[features.model == 'ARIMA']
trans_features = features[features.model == 'TRANSFORMER']
informer_features = features[features.model == 'INFORMER']
nbeats_features = features[features.model == 'NBEATS']
gboost_features = features[features.model == 'GBOOST']
linear_features = features[features.model == 'DLINEAR']
gru_features = features[features.model == 'GRU']

print(arima_features.shape)
print(trans_features.shape)

(194, 48)
(229, 48)


In [13]:
X_train = arima_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = arima_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train[X_train.columns] = x_scaler.fit_transform(X_train)
y_train[y_train.columns] = y_scaler.fit_transform(y_train)

gbm = GradientBoostingRegressor()
gbm.fit(X_train, y_train)
explainer = shap.TreeExplainer(gbm)
feature_importance = gbm.feature_importances_

features_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# Sort the features based on importance
features_df = features_df.sort_values(by='Importance', ascending=False)

train_predictions = gbm.predict(X_train)
mse_train = np.sqrt(mean_squared_error(y_train, train_predictions))
r2_train = r2_score(y_train, train_predictions)

print(f"Training MSE: {mse_train}")
print(f"Training R^2: {r2_train}")
display(features_df)

Training MSE: 0.10697657930790697
Training R^2: 0.988556011479579



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Unnamed: 0,Feature,Importance
1,max_level_shift,0.392464
2,seas_acf1,0.280029
4,unitroot_pp,0.184898
0,max_kl_shift,0.121703
3,max_var_shift,0.020905


In [15]:
X_train = trans_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = trans_features[['TFE']]

gbm = GradientBoostingRegressor()
gbm.fit(X_train, y_train)
feature_importance = gbm.feature_importances_

features_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

features_df = features_df.sort_values(by='Importance', ascending=False)
# Sort the features based on importance
train_predictions = gbm.predict(X_train)
mse_train = np.sqrt(mean_squared_error(y_train, train_predictions))
r2_train = r2_score(y_train, train_predictions)

print(f"Training MSE: {mse_train}")
print(f"Training R^2: {r2_train}")
display(features_df)

Training MSE: 0.025126666631229696
Training R^2: 0.9986974148763046


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Unnamed: 0,Feature,Importance
2,seas_acf1,0.42779
4,unitroot_pp,0.24574
1,max_level_shift,0.224084
3,max_var_shift,0.081567
0,max_kl_shift,0.020819


In [19]:
from sklearn.preprocessing import StandardScaler

X_train = informer_features[['te', 'max_kl_shift']]
y_train = informer_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
trans_model = sm.OLS(y_train, X_train).fit()


print(trans_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", trans_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.506
Model:                            OLS   Adj. R-squared:                  0.502
Method:                 Least Squares   F-statistic:                     115.7
Date:                Fri, 05 Jan 2024   Prob (F-statistic):           2.52e-35
Time:                        18:05:51   Log-Likelihood:                -244.21
No. Observations:                 229   AIC:                             494.4
Df Residuals:                     226   BIC:                             504.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.082e-17      0.047  -4.45e-16      1.0

In [20]:
X_train = arima_features[['te', 'max_kl_shift']]
y_train = arima_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
arima_model = sm.OLS(y_train, X_train).fit()


print(arima_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", arima_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.697
Model:                            OLS   Adj. R-squared:                  0.694
Method:                 Least Squares   F-statistic:                     220.0
Date:                Fri, 05 Jan 2024   Prob (F-statistic):           2.69e-50
Time:                        18:06:13   Log-Likelihood:                -159.34
No. Observations:                 194   AIC:                             324.7
Df Residuals:                     191   BIC:                             334.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.633e-17      0.040   1.92e-15      1.0

In [17]:
X_train = informer_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = informer_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
arima_model = sm.OLS(y_train, X_train).fit()


print(arima_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", arima_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.478
Model:                            OLS   Adj. R-squared:                  0.466
Method:                 Least Squares   F-statistic:                     40.84
Date:                Fri, 05 Jan 2024   Prob (F-statistic):           1.00e-29
Time:                        18:04:02   Log-Likelihood:                -250.51
No. Observations:                 229   AIC:                             513.0
Df Residuals:                     223   BIC:                             533.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.082e-17      0.048   -4.3e-16      1.0

In [53]:
X_train = nbeats_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = nbeats_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
arima_model = sm.OLS(y_train, X_train).fit()


print(arima_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", arima_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.425
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     32.99
Date:                Sat, 30 Dec 2023   Prob (F-statistic):           3.92e-25
Time:                        18:31:41   Log-Likelihood:                -261.54
No. Observations:                 229   AIC:                             535.1
Df Residuals:                     223   BIC:                             555.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.776e-17      0.051  -5.47e-16      1.0

In [18]:
X_train = gru_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = gru_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
arima_model = sm.OLS(y_train, X_train).fit()


print(arima_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", arima_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.347
Method:                 Least Squares   F-statistic:                     17.05
Date:                Fri, 05 Jan 2024   Prob (F-statistic):           2.93e-13
Time:                        18:05:02   Log-Likelihood:                -180.73
No. Observations:                 152   AIC:                             373.5
Df Residuals:                     146   BIC:                             391.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5.638e-17      0.066  -8.57e-16      1.0

In [55]:
X_train = gboost_features[['max_kl_shift', 'max_level_shift', 'seas_acf1', 'max_var_shift', 'unitroot_pp']]
y_train = gboost_features[['TFE']]

x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)

# Adding a constant to the model (intercept)
X_train = sm.add_constant(X_train)
# Create a model and fit it
arima_model = sm.OLS(y_train, X_train).fit()


print(arima_model.summary())

# For a more direct view of the coefficients:
print("Coefficients:\n", arima_model.params)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.421
Model:                            OLS   Adj. R-squared:                  0.407
Method:                 Least Squares   F-statistic:                     30.41
Date:                Sat, 30 Dec 2023   Prob (F-statistic):           3.55e-23
Time:                        18:32:52   Log-Likelihood:                -246.31
No. Observations:                 215   AIC:                             504.6
Df Residuals:                     209   BIC:                             524.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.388e-17      0.053   2.64e-16      1.0