In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [59]:
# Read the processed CDP carbon-credits CSV; 'id' and 'year' together form
# the (Multi)index so the table can later be joined on that key.
carbon_credits = pd.read_csv(
    "../../data/mapped/mapped_cdp_carbon_credits_full_processed.csv",
    index_col=["id", "year"],
)

In [60]:
# Read the processed CDP GHG-change CSV, indexed by the ('id', 'year') pair.
ghg_change = pd.read_csv(
    "../../data/mapped/mapped_cdp_ghg_change_processed.csv",
    index_col=["id", "year"],
)

In [61]:
# Keep only the rows whose absence flag equals 0, then discard the flag
# column since it is constant afterwards.
# NOTE(review): presumably a non-zero flag marks (id, year) pairs that were
# missing from the source file -- confirm against the mapping step.
ghg_change = ghg_change.loc[
    ghg_change['absent_cdp_ghg_change_processed.csv'].eq(0)
].drop(columns='absent_cdp_ghg_change_processed.csv')

In [62]:
# Left join on the shared ('id', 'year') index: every ghg_change row is kept
# and carbon-credit columns are attached where the key matches. Column names
# that occur in both frames receive the '_ghg_change' / '_carbon_credits'
# suffix respectively.
baseline_carbon_credits = ghg_change.merge(
    carbon_credits,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('_ghg_change', '_carbon_credits'),
)

In [63]:
# Remove, in place, the suffixed 'Unnamed: 0' and 'isin' columns that each
# side of the join contributed.
baseline_carbon_credits.drop(
    [
        'Unnamed: 0_ghg_change',
        'isin_ghg_change',
        'Unnamed: 0_carbon_credits',
        'isin_carbon_credits',
    ],
    axis='columns',
    inplace=True,
)

In [69]:
# Turn the 'year' index level into the first ordinary column, then throw away
# the remaining 'id' level in favour of a plain positional index.
baseline_carbon_credits = (
    baseline_carbon_credits
    .reset_index(level='year')
    .reset_index(drop=True)
)

In [71]:
# Display the column labels of the joined frame to check which features and
# target columns are available for modelling.
baseline_carbon_credits.columns

Index(['year', 'ghg_change_total', 'ghg_change_real', 'ghg_change_structure',
       'ghg_change_measure', 'ghg_change_othergroup',
       'ghg_change_nonzero_count', 'ghg_change_real_cat',
       'ghg_change_real_cat_next', 'ghg_change_real_next',
       'cdp_num_credits_clean_sum', 'cdp_num_credits_clean_count',
       'cdp_num_credits_riskadj_clean_sum',
       'cdp_num_credits_clean_missing_sum',
       'cdp_num_credits_riskadj_clean_missing_sum',
       'cdp_orig_or_purchase_clean_credit origination',
       'cdp_orig_or_purchase_clean_credit purchase',
       'cdp_credits_cancelled_clean_no', 'cdp_credits_cancelled_clean_yes',
       'cdp_credits_cancelled_clean_missing', 'cdp_purpose_clean_compliance',
       'cdp_purpose_clean_other', 'cdp_purpose_clean_voluntary offsetting',
       'cdp_purpose_clean_missing',
       'absent_cdp_carbon_credits_full_processed.csv'],
      dtype='object')

In [72]:
# Temporal hold-out split: every year before 2021 is used for training and
# 2021 alone is held out for testing.
#
# The training rows are stably sorted by 'year' so that they are in
# chronological order. Positional cross-validation splitters such as
# TimeSeriesSplit cut the data by row position, so without this ordering a
# validation fold could contain years earlier than those it was trained on.
# A stable sort keeps the original relative order of rows within each year.
train_data = (
    baseline_carbon_credits[baseline_carbon_credits['year'] < 2021]
    .sort_values('year', kind='stable')
)
test_data = baseline_carbon_credits[baseline_carbon_credits['year'] == 2021]

# Define features and target variable
target = 'ghg_change_real_next'

# Both "*_next" columns are removed from the feature matrices: one is the
# target itself and the other is dropped alongside it.
# NOTE(review): 'ghg_change_real_cat_next' looks like a categorical version of
# the target (which would leak it) -- confirm.
# NOTE(review): rows with missing feature or target values are not removed
# here; verify the frame is NaN-free before fitting estimators that reject NaN.
X_train = train_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_train = train_data[target]
X_test = test_data.drop(columns=['ghg_change_real_next', 'ghg_change_real_cat_next'])
y_test = test_data[target]

In [None]:

# Select the Lasso regularisation strength by cross-validation over 100
# log-spaced candidates between 1e-4 and 1e4.
# TimeSeriesSplit builds its folds from row positions, so X_train must be in
# chronological order for each validation fold to lie after its training fold.
lasso_cv_model = LassoCV(alphas=np.logspace(-4, 4, 100), cv=TimeSeriesSplit(n_splits=5))

# Fit the LassoCV model to the training data
lasso_cv_model.fit(X_train, y_train)

# Get the optimal alpha (regularization strength) selected by cross-validation
optimal_alpha = lasso_cv_model.alpha_

# After choosing alpha, LassoCV refits on the entire training set with that
# alpha, so the fitted estimator already is the final model. (LassoCV takes
# `alphas`, not `alpha`; constructing LassoCV(alpha=...) raises TypeError.)
lasso_model = lasso_cv_model

# Make predictions on the test set
y_pred = lasso_model.predict(X_test)

# Evaluate the Lasso model on the test set.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which is
# deprecated and no longer accepted by newer scikit-learn releases.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print the results
print("Optimal Alpha:", optimal_alpha)
print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R-squared:", test_r2)
