In [69]:
import catboost as cb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

In [70]:
# Load the modelling dataset from CSV (the path is relative to the notebook's working directory).
data = pd.read_csv('../R/final_model_predictions.csv')

In [71]:
# Discard the 'Unnamed: 0' column (presumably the row index written by the CSV export — TODO confirm).
# Rebinding `data` instead of mutating it in place keeps the original frame object untouched.
data = data.drop(columns=['Unnamed: 0'])

In [72]:
# Remove the columns that are not used by any later cell in this notebook.
unused_columns = ['Companyname', 'Gind', 'Isin']
data = data.drop(columns=unused_columns)

In [73]:
# Names of all object-dtype columns. Note that an object column is not necessarily text:
# it can also hold numbers stored as Python objects.
text_columns = [col for col in data.columns if data[col].dtype == 'object']

In [74]:
# Chronological split on 'Year': everything before year 9 trains the models,
# year 9 is the validation slice and year 10 is the test slice.
year = data['Year']
train_set = data.loc[year < 9]
validation_set = data.loc[year == 9]
test_set = data.loc[year == 10]

In [75]:
# Columns excluded by name from the continuous feature list.
non_continuous = {'Id', 'Year', 'Ghg.Change.Real.Next', 'Industry', 'Country', 'Continent', 'Method.Ind', 'Type.Scope1'}

# Number of distinct non-null values per column, computed once for both lists below.
unique_counts = data.nunique()

# Continuous = not excluded by name and more than two distinct values; binary = exactly two distinct values.
continuous_features = [col for col in data.columns if col not in non_continuous and unique_counts[col] > 2]
binary_features = [col for col in data.columns if unique_counts[col] == 2]

# Mean and standard deviation of every continuous feature, estimated on the training rows only.
train_stats = train_set[continuous_features].agg(['mean', 'std'])

In [76]:

def standardize_data(df, stats, continuous_features, binary_features):
    """Return a z-scored copy of ``df`` using pre-computed statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize. It is not modified; a copy is returned.
    stats : pandas.DataFrame
        Frame indexed by ``'mean'`` and ``'std'`` with one column per feature
        (for example the result of ``frame[features].agg(['mean', 'std'])``).
    continuous_features : iterable of str
        Columns to standardize.
    binary_features : iterable of str
        Columns that are left untouched even if they are also listed in
        ``continuous_features``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where every standardized column holds
        ``(value - mean) / std``. Features whose ``std`` is zero or NaN carry
        no information and are set to 0.0. All remaining NaN values in the
        frame are replaced by 0.

    Raises
    ------
    KeyError
        If a feature to standardize is missing from ``stats`` or ``df``.
    """
    standardized_df = df.copy()
    skip = set(binary_features)
    for feature in continuous_features:
        if feature in skip:  # Only standardize continuous features
            continue
        mean = stats.loc['mean', feature]
        std = stats.loc['std', feature]
        if pd.isna(std) or std == 0:
            # Dividing by a zero std yields +/-inf for any value that differs from the
            # mean, and fillna() does not remove inf. A feature without variance in the
            # reference data is mapped to a constant 0 instead.
            standardized_df[feature] = 0.0
        else:
            standardized_df[feature] = (df[feature] - mean) / std
    # Replace NaN values with 0. NOTE: this applies to every column of the frame,
    # not only to the columns standardized above.
    standardized_df = standardized_df.fillna(0)
    return standardized_df


# Standardize the entire dataset using the mean/std estimated on the training rows (Year < 9),
# so that no validation or test information enters the scaling statistics.
df_standardized = standardize_data(data, train_stats, continuous_features, binary_features)


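# Year 10 (the last year) is held out for testing.

# NOTE: `df_standardized` is not consumed by the cells visible below; the CatBoost models are fit on the unscaled splits.
# If an LSTM is built from the standardized data instead, create its sequences from `df_standardized` and shape them according to that model's input requirements.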


In [77]:
# Columns handed to CatBoost as `cat_features` when fitting the classifier and the regressor.
categorical_features = ['Id', 'Industry', 'Country', 'Continent', 'Method.Ind', 'Type.Scope1']

In [78]:
# Both target columns are removed from the feature matrices; the categorical target is the class label.
target_columns = ['Ghg.Change.Real.Next', 'Ghg.Change.Real.Cat.Next']
class_label = 'Ghg.Change.Real.Cat.Next'

X_train_class, y_train_class = train_set.drop(columns=target_columns), train_set[class_label]

X_val_class, y_val_class = validation_set.drop(columns=target_columns), validation_set[class_label]

X_test_class, y_test_class = test_set.drop(columns=target_columns), test_set[class_label]

In [79]:
# The regressor only sees rows whose class label equals 1; filter each split once and
# derive the feature matrix and the continuous target from the same filtered frame.
train_reg_rows = train_set.loc[train_set['Ghg.Change.Real.Cat.Next'] == 1]
X_train_reg = train_reg_rows.drop(columns=['Ghg.Change.Real.Next', 'Ghg.Change.Real.Cat.Next'])
y_train_reg = train_reg_rows['Ghg.Change.Real.Next']

val_reg_rows = validation_set.loc[validation_set['Ghg.Change.Real.Cat.Next'] == 1]
X_val_reg = val_reg_rows.drop(columns=['Ghg.Change.Real.Next', 'Ghg.Change.Real.Cat.Next'])
y_val_reg = val_reg_rows['Ghg.Change.Real.Next']

test_reg_rows = test_set.loc[test_set['Ghg.Change.Real.Cat.Next'] == 1]
X_test_reg = test_reg_rows.drop(columns=['Ghg.Change.Real.Next', 'Ghg.Change.Real.Cat.Next'])
y_test_reg = test_reg_rows['Ghg.Change.Real.Next']

In [80]:
# Stage 1: CatBoost classifier for the categorical target, with a fixed seed and silent training.
clf = CatBoostClassifier(random_seed=42, verbose=0)

# Fit on the training rows only.
# NOTE(review): the validation split is not passed to fit() here.
clf.fit(X_train_class, y_train_class, cat_features=categorical_features)

# Class predictions for the held-out test rows.
y_pred_classify = clf.predict(X_test_class)

# Share of test rows whose class was predicted correctly.
accuracy = accuracy_score(y_true=y_test_class, y_pred=y_pred_classify)
print(f"Classification Accuracy: {accuracy}")

Classification Accuracy: 0.7954887218045112


In [152]:
# Majority-class baseline: accuracy obtained by always predicting the more frequent class.
# Relies on the labels being 0/1 so that the mean equals the share of class 1.
positive_share = y_test_class.mean()
baseline_accuracy = max(positive_share, 1 - positive_share)
print(f"Baseline Accuracy: {baseline_accuracy}")

Baseline Accuracy: 0.7744360902255639


In [81]:
# Stage 2: CatBoost regressor for the continuous target, with a fixed seed and silent training.
reg = CatBoostRegressor(random_seed=42, verbose=0)

# Fit on the training rows whose class label is 1.
reg.fit(X_train_reg, y_train_reg, cat_features=categorical_features)

# Predictions for the test rows whose true class label is 1.
y_pred_reg = reg.predict(X_test_reg)

# Root mean squared error on that subset.
test_mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(test_mse)
print(f"Regression RMSE: {rmse}")

# R^2 on the same subset, as reported by the model's own score().
r2 = reg.score(X_test_reg, y_test_reg)
print(f"Regression R2: {r2}")

Regression RMSE: 9.034888973002467
Regression R2: 0.14355876579085802


In [92]:
# Two-stage inference on the test set: classify every row first ...
classifications = clf.predict(X_test_class)

# ... then run the second model only on the rows that were classified as 1.
predicted_positive = classifications == 1
predictions = reg.predict(X_test_class.loc[predicted_positive])

In [121]:
# Wrap both prediction arrays in single-column DataFrames with a fresh 0..n-1 index.
# NOTE: this rebinds `classifications` and `predictions` from arrays to DataFrames,
# so the names mean different things before and after this cell.
classifications = pd.DataFrame(classifications, columns=['Ghg.Change.Real.Cat.Next'])
predictions = pd.DataFrame(predictions, columns=['Ghg.Change.Real.Next'])

In [122]:
# Merge both stages into one prediction per test row: rows classified as 0 keep the value 0,
# every other row receives the regression output, consumed in row order.
target_col = classifications.columns[0]
positive_mask = classifications[target_col].ne(0).to_numpy()

# The regressor was run exactly once per non-zero classification; a different count means the
# two frames are out of sync (e.g. a cell was re-run out of order) and values would be misaligned.
n_positive = int(positive_mask.sum())
if n_positive != len(predictions):
    raise ValueError(
        f"Expected {n_positive} regression predictions (one per non-zero classification), got {len(predictions)}"
    )

# Cast to float first so that writing regression outputs into the label column is not a
# dtype-incompatible assignment, then fill all positive rows in one vectorized step.
classifications[target_col] = classifications[target_col].astype(float)
classifications.loc[positive_mask, target_col] = predictions.iloc[:, 0].to_numpy()

In [123]:
# Inspect the regression outputs (one row per test row that was classified as 1).
predictions

Unnamed: 0,Ghg.Change.Real.Next
0,-6.181075
1,-2.360412
2,-5.563912
3,-10.583072
4,-2.739668
...,...
1203,-5.037881
1204,-2.691106
1205,-10.155822
1206,-3.199662


In [124]:
# RMSE of the combined two-stage predictions against the continuous test target.
final_mse = mean_squared_error(test_set['Ghg.Change.Real.Next'], classifications)
rmse = np.sqrt(final_mse)
print(f"Final RMSE: {rmse}")

Final RMSE: 9.837906266993384


In [140]:
# Inspect the continuous test target (still carrying the row labels of the full dataset).
test_set['Ghg.Change.Real.Next']

9       -15.50
28        3.30
49       -2.30
52        0.00
55       -7.90
         ...  
13728    -0.20
13731     0.00
13734     0.00
13737    -1.00
13740    -5.66
Name: Ghg.Change.Real.Next, Length: 1330, dtype: float64

In [141]:
# Inspect the combined predictions: 0 where the classifier predicted 0, the regression output otherwise.
classifications

Unnamed: 0,Ghg.Change.Real.Cat.Next
0,0.000000
1,-6.181075
2,-2.360412
3,0.000000
4,0.000000
...,...
1325,-2.691106
1326,-10.155822
1327,0.000000
1328,-3.199662


In [142]:
# Show the test target as a one-column frame re-indexed from 0, matching the index of `classifications`.
test_set['Ghg.Change.Real.Next'].reset_index(drop=True).to_frame()

Unnamed: 0,Ghg.Change.Real.Next
0,-15.50
1,3.30
2,-2.30
3,0.00
4,-7.90
...,...
1325,-0.20
1326,0.00
1327,0.00
1328,-1.00


In [143]:
# Re-index the test rows from 0 so they align with `classifications`, then attach the combined predictions.
test_set = test_set.reset_index(drop=True)
test_set['preds'] = classifications

In [148]:
# Correlation matrix between the combined predictions and the real values.
# Note: this is the (Pearson, by default) correlation coefficient r, not the coefficient of determination R^2.
test_set[['preds', 'Ghg.Change.Real.Next']].corr()

Unnamed: 0,preds,Ghg.Change.Real.Next
preds,1.0,0.302501
Ghg.Change.Real.Next,0.302501,1.0


In [153]:
# Cap the test target at 0: every positive 'Ghg.Change.Real.Next' value is replaced by 0.
# NOTE: this overwrites the ground-truth column, so every metric computed after this cell
# is measured against the capped target rather than the raw one.
test_set['Ghg.Change.Real.Next'] = test_set['Ghg.Change.Real.Next'].clip(upper=0)

In [154]:
# Coefficient of determination (R^2) of the combined two-stage predictions against the test target.
# `r2_score` is imported together with the other sklearn metrics in the notebook's import cell.
r2_score(test_set['Ghg.Change.Real.Next'], test_set['preds'])

0.09399659906249358

In [155]:
# RMSE of the combined predictions against the current contents of the test target column.
final_mse = mean_squared_error(test_set['Ghg.Change.Real.Next'], classifications)
rmse = np.sqrt(final_mse)
print(f"Final RMSE: {rmse}")

Final RMSE: 8.830452029855467


In [156]:
# Mean absolute error of the combined predictions against the current contents of the test target column.
# `mean_absolute_error` is imported together with the other sklearn metrics in the notebook's import cell.
mae = mean_absolute_error(test_set['Ghg.Change.Real.Next'], classifications)
print(f"Final MAE: {mae}")


Final MAE: 6.2184259007290725


In [157]:
# Show the test target and the combined predictions side by side (aligned on the shared 0..n-1 index).
# The prediction column keeps the name 'Ghg.Change.Real.Cat.Next' although it now holds regression values.
pd.concat([test_set['Ghg.Change.Real.Next'], classifications], axis=1)

Unnamed: 0,Ghg.Change.Real.Next,Ghg.Change.Real.Cat.Next
0,-15.50,0.000000
1,0.00,-6.181075
2,-2.30,-2.360412
3,0.00,0.000000
4,-7.90,0.000000
...,...,...
1325,-0.20,-2.691106
1326,0.00,-10.155822
1327,0.00,0.000000
1328,-1.00,-3.199662


In [None]:
# try