In [54]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [55]:
df = pd.read_csv("../R/final_model_selected.csv")

In [56]:
df.drop(columns=["Unnamed: 0"], inplace=True)

In [57]:
train_df = df.loc[df['Year'] < 10]
test_df = df.loc[df['Year'] == 10]

In [58]:
train_stats = {
    'Dataset': 'Train',
    'Number of Observations': train_df.shape[0],
    'Number of Features': train_df.shape[1] - 1,
    'Number of Unique Firms': train_df['Id'].nunique(),
    'Mean Next Year Decarbonization Rate': train_df['Ghg.Change.Real.Next'].mean(),
    'Standard Deviation Next Year Decarbonization Rate': train_df['Ghg.Change.Real.Next'].std(),
    '\% of Total Observations': f'{train_df.shape[0] / df.shape[0] * 100:.2f}\%'
}

test_stats = {
    'Dataset': 'Test',
    'Number of Observations': test_df.shape[0],
    'Number of Features': test_df.shape[1] - 1,
    'Number of Unique Firms': test_df['Id'].nunique(),
    'Mean Next Year Decarbonization Rate': test_df['Ghg.Change.Real.Next'].mean(),
    'Standard Deviation Next Year Decarbonization Rate': test_df['Ghg.Change.Real.Next'].std(),
    '\% of Total Observations': f'{test_df.shape[0] / df.shape[0] * 100:.2f}\%'
}

summary_df = pd.DataFrame([train_stats, test_stats])
# transpose
summary_df = summary_df.T

# make the first row the column names
summary_df.columns = summary_df.iloc[0]
summary_df = summary_df[1:]

  '\% of Total Observations': f'{train_df.shape[0] / df.shape[0] * 100:.2f}\%'
  '\% of Total Observations': f'{train_df.shape[0] / df.shape[0] * 100:.2f}\%'
  '\% of Total Observations': f'{test_df.shape[0] / df.shape[0] * 100:.2f}\%'
  '\% of Total Observations': f'{test_df.shape[0] / df.shape[0] * 100:.2f}\%'


In [59]:
summary_df

Dataset,Train,Test
Number of Observations,12411,1330
Number of Features,130,130
Number of Unique Firms,1870,1330
Mean Next Year Decarbonization Rate,-4.191514,-5.982805
Standard Deviation Next Year Decarbonization Rate,7.473937,10.128837
\% of Total Observations,90.32\%,9.68\%


In [60]:
summary_df

Dataset,Train,Test
Number of Observations,12411,1330
Number of Features,130,130
Number of Unique Firms,1870,1330
Mean Next Year Decarbonization Rate,-4.191514,-5.982805
Standard Deviation Next Year Decarbonization Rate,7.473937,10.128837
\% of Total Observations,90.32\%,9.68\%


In [61]:
summary_df.to_latex("../../thesis_tex/tables/summary_train_test.tex", caption="Summary Statistics for Training and Testing Data", label="tab:summary_stats", longtable=True, float_format="%.2f")

In [92]:
'''
# now let's create a baseline metrics dataset according to this 
The baseline metrics of the test set are calculated using the following methods:
\begin{itemize}
    \item Using previous year decarbonization rate to predict next year's decarbonization rate
    \item Using the mean decarbonization rate for each firm across all reported years
    \item Guessing zero for all firms as the next year's decarbonization rate
    \item Using the mean for all firms for each year as the prediction for the next year's decarbonization rate
\end{itemize}
'''

result_df = pd.DataFrame(columns=['Method', 'MSE', 'RMSE', 'MAE', 'R2'])

# previous year decarbonization rate
y_pred = test_df['Ghg.Change.Real']
y_true = test_df['Ghg.Change.Real.Next']
mse = mean_squared_error(y_true, y_pred).round(2)
rmse = mean_squared_error(y_true, y_pred, squared=False).round(2)
mae = mean_absolute_error(y_true, y_pred).round(2)
r2 = r2_score(y_true, y_pred).round(2)

new_row = pd.DataFrame({'Method': 'Current Year Rate', 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}, index=[0])
result_df = pd.concat([result_df, new_row], ignore_index=True)

test_df['Mean.From.Train'] = test_df['Id'].map(train_df.groupby('Id')['Ghg.Change.Real'].mean())

y_pred = test_df['Mean.From.Train']
y_true = test_df['Ghg.Change.Real.Next']


mse = mean_squared_error(y_true, y_pred).round(2)
rmse = mean_squared_error(y_true, y_pred, squared=False).round(2)
mae = mean_absolute_error(y_true, y_pred).round(2)
r2 = r2_score(y_true, y_pred).round(2)

new_row = pd.DataFrame({'Method': 'Previous Mean For Each Firm', 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}, index=[0])
result_df = pd.concat([result_df, new_row], ignore_index=True)

# guessing zero for all firms
y_pred = [0] * test_df.shape[0]
y_true = test_df['Ghg.Change.Real.Next']
mse = mean_squared_error(y_true, y_pred).round(2)
rmse = mean_squared_error(y_true, y_pred, squared=False).round(2)
mae = mean_absolute_error(y_true, y_pred).round(2)
r2 = r2_score(y_true, y_pred).round(2)

new_row = pd.DataFrame({'Method': 'Guessing Zero for All Firms', 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}, index=[0])
result_df = pd.concat([result_df, new_row], ignore_index=True)

# using the mean for all firms for the previous year
y_pred = test_df['Ghg.Change.Real'].mean()
y_true = test_df['Ghg.Change.Real.Next']
mse = mean_squared_error(y_true, [y_pred] * test_df.shape[0]).round(2)
rmse = mean_squared_error(y_true, [y_pred] * test_df.shape[0], squared=False).round(2)
mae = mean_absolute_error(y_true, [y_pred] * test_df.shape[0]).round(2)
r2 = r2_score(y_true, [y_pred] * test_df.shape[0]).round(2)

new_row = pd.DataFrame({'Method': 'Previous Year Mean for All Firms', 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}, index=[0])
result_df = pd.concat([result_df, new_row], ignore_index=True)

result_df


  '''
  result_df = pd.concat([result_df, new_row], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Mean.From.Train'] = test_df['Id'].map(train_df.groupby('Id')['Ghg.Change.Real'].mean())


Unnamed: 0,Method,MSE,RMSE,MAE,R2
0,Current Year Rate,148.06,12.17,7.2,-0.44
1,Previous Mean For Each Firm,109.16,10.45,6.41,-0.06
2,Guessing Zero for All Firms,138.31,11.76,6.99,-0.35
3,Previous Year Mean for All Firms,102.52,10.13,7.03,-0.0


In [93]:
result_df.index = result_df.index + 1

In [94]:
result_df

Unnamed: 0,Method,MSE,RMSE,MAE,R2
1,Current Year Rate,148.06,12.17,7.2,-0.44
2,Previous Mean For Each Firm,109.16,10.45,6.41,-0.06
3,Guessing Zero for All Firms,138.31,11.76,6.99,-0.35
4,Previous Year Mean for All Firms,102.52,10.13,7.03,-0.0


In [95]:
# show only 2 decimal points
result_df.to_latex("../../thesis_tex/tables/baseline_metrics.tex", caption="Baseline Metrics for Test Set", label="tab:baseline_metrics", longtable=True, float_format="%.2f", index = True)

In [65]:
data = df.loc[df['Year'] == 10]

In [66]:
data['Ghg.Change.Real.Next'].describe().to_frame()

Unnamed: 0,Ghg.Change.Real.Next
count,1330.0
mean,-5.982805
std,10.128837
min,-49.8
25%,-8.6925
50%,-2.5
75%,-0.1
max,43.72


In [67]:
(data['Ghg.Change.Real.Next'] == 0).mean() * 100

15.939849624060152

# Predicting Zero

In [68]:
(r2_score(data['Ghg.Change.Real.Next'], data['zeroes']), 
mean_absolute_error(data['Ghg.Change.Real.Next'], data['zeroes']),
mean_squared_error(data['Ghg.Change.Real.Next'], data['zeroes']))

KeyError: 'zeroes'

# Predicting the Previous Year

In [None]:
(r2_score(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real']), 
mean_absolute_error(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real']),
mean_squared_error(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real']))

(-0.44423200661062046, 7.199877072165203, 148.0571875764483)

# Predicting the average across all years

In [None]:
prevs = df.copy()
prevs = prevs.groupby('Id')['Ghg.Change.Real'].mean()
data = data.merge(prevs, how = 'left', on = 'Id')
(r2_score(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real_y']), 
mean_absolute_error(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real_y']),
mean_squared_error(data['Ghg.Change.Real.Next'], data['Ghg.Change.Real_y']))

(0.00901803210939589, 6.1922011044068235, 101.59171271185849)

# Predicting the average from the previous year

In [None]:
data['prev_avg'] = df.loc[df['Year'] == 10]['Ghg.Change.Real'].mean()
(r2_score(data['Ghg.Change.Real.Next'], data['prev_avg']), 
mean_absolute_error(data['Ghg.Change.Real.Next'], data['prev_avg']),
mean_squared_error(data['Ghg.Change.Real.Next'], data['prev_avg']))

(-6.788326320017468e-05, 7.031409947378317, 102.52316629443158)