In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

In [62]:
carbon_credits = pd.read_csv("../../data/mapped/mapped_cdp_carbon_credits_full_processed.csv", index_col=['id', 'year'])

In [63]:
ghg_change = pd.read_csv("../../data/mapped/mapped_cdp_ghg_change_processed.csv", index_col=['id', 'year'])

In [17]:
ghg_change = ghg_change[ghg_change['absent_cdp_ghg_change_processed.csv'] == 0].drop(columns=['absent_cdp_ghg_change_processed.csv'])

In [18]:
# left join ghg_change and carbon_credits
baseline_carbon_credits = ghg_change.join(carbon_credits, how='left', lsuffix='_ghg_change', rsuffix='_carbon_credits')

In [19]:
baseline_carbon_credits.ghg_change_real_next.describe()

count    18455.000000
mean        -4.244380
std          7.871529
min        -49.900000
25%         -5.400000
50%         -1.500000
75%          0.000000
max         45.000000
Name: ghg_change_real_next, dtype: float64

In [20]:
baseline_carbon_credits.drop(columns=['Unnamed: 0_ghg_change', 'isin_ghg_change', 'Unnamed: 0_carbon_credits', 'isin_carbon_credits'], inplace=True)

In [21]:
baseline_carbon_credits = baseline_carbon_credits.reset_index().drop(columns=['id'])

In [22]:
baseline_carbon_credits.columns

Index(['year', 'ghg_change_total', 'ghg_change_real', 'ghg_change_structure',
       'ghg_change_measure', 'ghg_change_othergroup',
       'ghg_change_nonzero_count', 'ghg_change_real_cat',
       'ghg_change_real_cat_next', 'ghg_change_real_next',
       'cdp_num_credits_clean_sum', 'cdp_num_credits_clean_count',
       'cdp_num_credits_riskadj_clean_sum',
       'cdp_num_credits_clean_missing_sum',
       'cdp_num_credits_riskadj_clean_missing_sum',
       'cdp_orig_or_purchase_clean_credit origination',
       'cdp_orig_or_purchase_clean_credit purchase',
       'cdp_credits_cancelled_clean_no', 'cdp_credits_cancelled_clean_yes',
       'cdp_credits_cancelled_clean_missing', 'cdp_purpose_clean_compliance',
       'cdp_purpose_clean_other', 'cdp_purpose_clean_voluntary offsetting',
       'cdp_purpose_clean_missing',
       'absent_cdp_carbon_credits_full_processed.csv'],
      dtype='object')

In [23]:
# Create train and test sets
train_data = baseline_carbon_credits[baseline_carbon_credits['year'] < 2021]
test_data = baseline_carbon_credits[baseline_carbon_credits['year'] == 2021]

# Define features and target variable
target = 'ghg_change_real_next'

X_train = train_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_train = train_data[target]
X_test = test_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_test = test_data[target]

# Standardize the features in both training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:

# Initialize the LassoCV model with cross-validation
lasso_cv_model = LassoCV(alphas=np.logspace(-4, 4, 100), cv=TimeSeriesSplit(n_splits=5))

# Fit the LassoCV model to the training data
lasso_cv_model.fit(X_train_scaled, y_train)

# Get the optimal alpha (regularization strength) selected by cross-validation
optimal_alpha = lasso_cv_model.alpha_

# Fit the Lasso model with the optimal alpha to the entire training dataset
lasso_model = Lasso(alpha=optimal_alpha)
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test_scaled)

# Evaluate the Lasso model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the results
print("Optimal Alpha:", optimal_alpha)
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)


Optimal Alpha: 0.09770099572992257
Test RMSE: 9.68225606978034
Test MAE: 6.487382190660172
Test R-squared: 0.06577029344177898


In [25]:
# Assuming you've already trained the Lasso model and named it 'lasso_model'

# Get the coefficients and feature names
coefficients = lasso_model.coef_
feature_names = X_train.columns

# Create a DataFrame to display the coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Print the coefficients
print("Lasso Coefficients:")
print(coefficients_df)

Lasso Coefficients:
                                          Feature  Coefficient
0                                            year    -0.491641
1                                ghg_change_total    -0.014200
2                                 ghg_change_real     1.709350
3                            ghg_change_structure    -0.001681
4                              ghg_change_measure    -0.000000
5                           ghg_change_othergroup    -0.000000
6                        ghg_change_nonzero_count    -0.107936
7                             ghg_change_real_cat    -0.188014
8                       cdp_num_credits_clean_sum    -0.000000
9                     cdp_num_credits_clean_count     0.000000
10              cdp_num_credits_riskadj_clean_sum    -0.024712
11              cdp_num_credits_clean_missing_sum     0.000000
12      cdp_num_credits_riskadj_clean_missing_sum     0.000000
13  cdp_orig_or_purchase_clean_credit origination     0.013157
14     cdp_orig_or_purchase_clean_c

In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the Random Forest regressor
rf_model = RandomForestRegressor(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform time series cross-validation with hyperparameter tuning
tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_n_estimators = grid_search.best_params_['n_estimators']
best_max_depth = grid_search.best_params_['max_depth']

print("Best n_estimators:", best_n_estimators)
print("Best max_depth:", best_max_depth)

# Fit the Random Forest model with the best hyperparameters to the entire training dataset
rf_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the test set performance metrics
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)

# Get feature importances from the trained Random Forest model
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort feature importances in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("\nFeature Importances:")
print(importance_df)


Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best n_estimators: 300
Best max_depth: 10
Test RMSE: 9.921121061989389
Test MAE: 6.541215061120822
Test R-squared: 0.019106080659304192

Feature Importances:
                                          Feature  Importance
2                                 ghg_change_real    0.378588
1                                ghg_change_total    0.108170
0                                            year    0.086306
5                           ghg_change_othergroup    0.085704
4                              ghg_change_measure    0.069188
3                            ghg_change_structure    0.056001
6                        ghg_change_nonzero_count    0.041541
8                       cdp_num_credits_clean_sum    0.040580
10              cdp_num_credits_riskadj_clean_sum    0.039928
20         cdp_purpose_clean_voluntary offsetting    0.015012
16                cdp_credits_cancelled_clean_yes    0.014087
14     cdp_orig_or_purchase_clean_credit purchase    0.013478
17            cdp_credits_cancelled_

In [27]:
baseline_carbon_credits.ghg_change_real_next.describe()

count    18455.000000
mean        -4.244380
std          7.871529
min        -49.900000
25%         -5.400000
50%         -1.500000
75%          0.000000
max         45.000000
Name: ghg_change_real_next, dtype: float64

**Windsorizing the train and re-running**

In [29]:
X_train = train_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_train = train_data[target]
X_test = test_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_test = test_data[target]

In [30]:

# Initialize the LassoCV model with cross-validation
lasso_cv_model = LassoCV(alphas=np.logspace(-4, 4, 100), cv=TimeSeriesSplit(n_splits=5))

# Fit the LassoCV model to the training data
lasso_cv_model.fit(X_train, y_train)

# Get the optimal alpha (regularization strength) selected by cross-validation
optimal_alpha = lasso_cv_model.alpha_

# Fit the Lasso model with the optimal alpha to the entire training dataset
lasso_model = Lasso(alpha=optimal_alpha)
lasso_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test)

# Evaluate the Lasso model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the results
print("Optimal Alpha:", optimal_alpha)
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)


Optimal Alpha: 0.06734150657750829
Test RMSE: 9.698290596747603
Test MAE: 6.418002846036193
Test R-squared: 0.06267342526383735


In [34]:
# Assuming you've already trained the Lasso model and named it 'lasso_model'
# Get the coefficients and feature names
coefficients = lasso_model.coef_
feature_names = X_train.columns

# Create a DataFrame to display the coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Print the coefficients
print("Lasso Coefficients:")
print(coefficients_df.sort_values(by='Coefficient', ascending=False))

Lasso Coefficients:
                                          Feature  Coefficient
2                                 ghg_change_real     0.308306
13  cdp_orig_or_purchase_clean_credit origination     0.184556
5                           ghg_change_othergroup     0.001100
11              cdp_num_credits_clean_missing_sum     0.000000
12      cdp_num_credits_riskadj_clean_missing_sum    -0.000000
21                      cdp_purpose_clean_missing     0.000000
19                        cdp_purpose_clean_other    -0.000000
18                   cdp_purpose_clean_compliance     0.000000
17            cdp_credits_cancelled_clean_missing    -0.000000
15                 cdp_credits_cancelled_clean_no     0.000000
22   absent_cdp_carbon_credits_full_processed.csv     0.000000
9                     cdp_num_credits_clean_count     0.000000
7                             ghg_change_real_cat    -0.000000
4                              ghg_change_measure     0.000000
3                            ghg_ch

In [32]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the Random Forest regressor
rf_model = RandomForestRegressor(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform time series cross-validation with hyperparameter tuning
tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_n_estimators = grid_search.best_params_['n_estimators']
best_max_depth = grid_search.best_params_['max_depth']

print("Best n_estimators:", best_n_estimators)
print("Best max_depth:", best_max_depth)

# Fit the Random Forest model with the best hyperparameters to the entire training dataset
rf_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the test set performance metrics
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)

# Get feature importances from the trained Random Forest model
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort feature importances in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("\nFeature Importances:")
print(importance_df)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best n_estimators: 300
Best max_depth: 10
Test RMSE: 9.690482331421025
Test MAE: 6.308054524633013
Test R-squared: 0.06418213408926476

Feature Importances:
                                          Feature  Importance
2                                 ghg_change_real    0.398576
1                                ghg_change_total    0.114629
0                                            year    0.081833
5                           ghg_change_othergroup    0.070407
4                              ghg_change_measure    0.063918
3                            ghg_change_structure    0.049026
6                        ghg_change_nonzero_count    0.043125
10              cdp_num_credits_riskadj_clean_sum    0.042041
8                       cdp_num_credits_clean_sum    0.040809
14     cdp_orig_or_purchase_clean_credit purchase    0.014771
20         cdp_purpose_clean_voluntary offsetting    0.013833
16                cdp_credits_cancelled_clean_yes    0.012334
17            cdp_credits_cancelled_c

Key takeaways: 
- Features that matter are: year, ghg_change_real, ghg_change_total, ghg_change_othergroup, ghg_change_measure, ghg_change_structure, ghg_change_nonzero_count, cdp_num_credits_riskadj_clean_sum, cdp_num_credits_clean_sum, cdp_orig_or_purchase_clean_credit purchase, cdp_purpose_clean_voluntary offsetting, cdp_credits_cancelled_clean_yes

In [44]:
# reading initiative
initiative = pd.read_csv("../../data/mapped/mapped_cdp_initiative_processed.csv", index_col=['id', 'year'])

In [45]:
# left join initiative and baseline_carbon_credits
baseline_initiative = ghg_change.join(initiative, how='left', lsuffix='_ghg_change', rsuffix='_initiative')

baseline_initiative.drop(columns=['Unnamed: 0_ghg_change', 'isin_ghg_change', 'Unnamed: 0_initiative', 'isin_initiative'], inplace=True)

baseline_initiative = baseline_initiative.reset_index().drop(columns=['id'])

baseline_initiative.columns

Index(['year', 'ghg_change_total', 'ghg_change_real', 'ghg_change_structure',
       'ghg_change_measure', 'ghg_change_othergroup',
       'ghg_change_nonzero_count', 'ghg_change_real_cat',
       'ghg_change_real_cat_next', 'ghg_change_real_next', 'scope1', 'scope2',
       'scope3', 'absent_cdp_initiative_processed.csv'],
      dtype='object')

In [46]:
baseline_initiative

Unnamed: 0,year,ghg_change_total,ghg_change_real,ghg_change_structure,ghg_change_measure,ghg_change_othergroup,ghg_change_nonzero_count,ghg_change_real_cat,ghg_change_real_cat_next,ghg_change_real_next,scope1,scope2,scope3,absent_cdp_initiative_processed.csv
0,2011.0,-11.00,-11.00,0.00,0.0,0.00,1.0,1.0,0.0,0.00,0.000000,0.000000,0.000000,0
1,2012.0,-25.00,0.00,0.00,0.0,-25.00,1.0,0.0,1.0,-1.00,0.000000,0.000000,0.000000,1
2,2013.0,-19.00,-1.00,0.00,-1.0,-17.00,3.0,1.0,0.0,0.00,0.000000,0.000000,0.000000,0
3,2014.0,-29.00,0.00,0.00,0.0,-29.00,1.0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0
4,2016.0,-4.00,0.00,0.00,0.0,-4.00,1.0,0.0,1.0,-39.70,0.000000,1.098612,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18450,2021.0,-3.54,-3.11,0.00,0.0,-0.24,3.0,1.0,1.0,-2.18,0.000000,0.693147,0.000000,0
18451,2021.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,1.0,-29.60,0.000000,0.000000,0.000000,0
18452,2021.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.00,1.098612,1.098612,0.000000,0
18453,2021.0,7.19,0.00,7.19,0.0,0.00,1.0,0.0,1.0,-23.50,0.000000,1.098612,0.000000,0


In [47]:
# Create train and test sets
train_data = baseline_initiative[baseline_initiative['year'] < 2021]
test_data = baseline_initiative[baseline_initiative['year'] == 2021]

# Define features and target variable
target = 'ghg_change_real_next'

X_train = train_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_train = train_data[target]
X_test = test_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_test = test_data[target]

# Standardize the features in both training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:

# Initialize the LassoCV model with cross-validation
lasso_cv_model = LassoCV(alphas=np.logspace(-4, 4, 100), cv=TimeSeriesSplit(n_splits=5))

# Fit the LassoCV model to the training data
lasso_cv_model.fit(X_train_scaled, y_train)

# Get the optimal alpha (regularization strength) selected by cross-validation
optimal_alpha = lasso_cv_model.alpha_

# Fit the Lasso model with the optimal alpha to the entire training dataset
lasso_model = Lasso(alpha=optimal_alpha)
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test_scaled)

# Evaluate the Lasso model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the results
print("Optimal Alpha:", optimal_alpha)
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)


Optimal Alpha: 0.08111308307896872
Test RMSE: 9.64973874570356
Test MAE: 6.410885020875105
Test R-squared: 0.07203487421707744


In [50]:
# Assuming you've already trained the Lasso model and named it 'lasso_model'
# Get the coefficients and feature names
coefficients = lasso_model.coef_
feature_names = X_train.columns

# Create a DataFrame to display the coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Print the coefficients
print("Lasso Coefficients:")
print(coefficients_df.sort_values(by='Coefficient', ascending=False))

Lasso Coefficients:
                                Feature  Coefficient
11  absent_cdp_initiative_processed.csv     0.547889
2                       ghg_change_real     0.270840
8                                scope1     0.114835
4                    ghg_change_measure     0.002382
5                 ghg_change_othergroup     0.001498
3                  ghg_change_structure    -0.003083
1                      ghg_change_total    -0.003989
6              ghg_change_nonzero_count    -0.097614
0                                  year    -0.147266
7                   ghg_change_real_cat    -0.216114
9                                scope2    -0.409644
10                               scope3    -0.415617


In [51]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the Random Forest regressor
rf_model = RandomForestRegressor(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform time series cross-validation with hyperparameter tuning
tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits as needed

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_n_estimators = grid_search.best_params_['n_estimators']
best_max_depth = grid_search.best_params_['max_depth']

print("Best n_estimators:", best_n_estimators)
print("Best max_depth:", best_max_depth)

# Fit the Random Forest model with the best hyperparameters to the entire training dataset
rf_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the test set performance metrics
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)

# Get feature importances from the trained Random Forest model
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort feature importances in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("\nFeature Importances:")
print(importance_df)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best n_estimators: 300
Best max_depth: 10
Test RMSE: 9.766071624697268
Test MAE: 6.497448288351847
Test R-squared: 0.049525752901597775

Feature Importances:
                                Feature  Importance
2                       ghg_change_real    0.372485
1                      ghg_change_total    0.126654
5                 ghg_change_othergroup    0.095041
0                                  year    0.080269
4                    ghg_change_measure    0.077921
9                                scope2    0.066010
3                  ghg_change_structure    0.056625
6              ghg_change_nonzero_count    0.045615
8                                scope1    0.038123
10                               scope3    0.035413
11  absent_cdp_initiative_processed.csv    0.004999
7                   ghg_change_real_cat    0.000844


In [52]:
# doing the same with incentives
incentives = pd.read_csv("../../data/mapped/mapped_cdp_incentives_processed.csv", index_col=['id', 'year'])

# left join initiative and baseline_carbon_credits

baseline_incentives = ghg_change.join(incentives, how='left', lsuffix='_ghg_change', rsuffix='_incentives')

baseline_incentives.drop(columns=['Unnamed: 0_ghg_change', 'isin_ghg_change', 'Unnamed: 0_incentives', 'isin_incentives'], inplace=True)

baseline_incentives = baseline_incentives.reset_index().drop(columns=['id'])

baseline_incentives.columns

Index(['year', 'ghg_change_total', 'ghg_change_real', 'ghg_change_structure',
       'ghg_change_measure', 'ghg_change_othergroup',
       'ghg_change_nonzero_count', 'ghg_change_real_cat',
       'ghg_change_real_cat_next', 'ghg_change_real_next',
       'cdp_boardoversight_i', 'cdp_incentivebinary_i',
       'cdp_boardoversight_i_na', 'cdp_incentivebinary_i_na',
       'absent_cdp_incentives_processed.csv'],
      dtype='object')

In [58]:
# Create train and test sets
train_data = baseline_incentives[baseline_incentives['year'] < 2021]
test_data = baseline_incentives[baseline_incentives['year'] == 2021]

# Define features and target variable
target = 'ghg_change_real_next'

X_train = train_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_train = train_data[target]
X_test = test_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_test = test_data[target]

# Standardize the features in both training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [59]:

# Initialize the LassoCV model with cross-validation
lasso_cv_model = LassoCV(alphas=np.logspace(-4, 4, 100), cv=TimeSeriesSplit(n_splits=5))

# Fit the LassoCV model to the training data
lasso_cv_model.fit(X_train_scaled, y_train)

# Get the optimal alpha (regularization strength) selected by cross-validation
optimal_alpha = lasso_cv_model.alpha_

# Fit the Lasso model with the optimal alpha to the entire training dataset
lasso_model = Lasso(alpha=optimal_alpha)
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test_scaled)

# Evaluate the Lasso model on the test set
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the results
print("Optimal Alpha:", optimal_alpha)
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)


Optimal Alpha: 0.09770099572992257
Test RMSE: 9.661211056267751
Test MAE: 6.456219720751737
Test R-squared: 0.06982709788061847


In [60]:
# Assuming you've already trained the Lasso model and named it 'lasso_model'
# Get the coefficients and feature names
coefficients = lasso_model.coef_
feature_names = X_train.columns

# Create a DataFrame to display the coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Print the coefficients
print("Lasso Coefficients:")
print(coefficients_df.sort_values(by='Coefficient', ascending=False))

Lasso Coefficients:
                                Feature  Coefficient
2                       ghg_change_real     1.730259
11             cdp_incentivebinary_i_na     0.003274
4                    ghg_change_measure    -0.000000
5                 ghg_change_othergroup    -0.000000
8                  cdp_boardoversight_i    -0.000000
10              cdp_boardoversight_i_na     0.000000
12  absent_cdp_incentives_processed.csv     0.000000
3                  ghg_change_structure    -0.002208
1                      ghg_change_total    -0.008261
6              ghg_change_nonzero_count    -0.092673
7                   ghg_change_real_cat    -0.119475
9                 cdp_incentivebinary_i    -0.314354
0                                  year    -0.468564
