In [43]:
import pandas as pd
import numpy as np
import catboost as cb
from catboost import Pool, CatBoostRegressor, cv

In [44]:
# Load the modelling table from the CSV stored under ../R/ (relative to this notebook).
data = pd.read_csv(filepath_or_buffer='../R/final_model_predictions.csv')

In [45]:
# Discard the 'Unnamed: 0' column; rebinding `data` avoids an in-place mutation.
data = data.drop(columns=['Unnamed: 0'])

In [46]:
# Remove columns that are not used as model inputs in this notebook.
data = data.drop(
    columns=[
        'Companyname',
        'Gind',
        'Isin',
        'Ghg.Change.Real.Cat.Next',
    ]
)

In [47]:
# Collect every column stored with object dtype.
# Note: an object column is usually text, but it can also hold numbers.
text_columns = [col for col in data.columns if data[col].dtype == 'object']

    


In [48]:
# Display the object-dtype columns collected in the previous cell.
text_columns

['Country', 'Continent', 'Industry', 'Method.Ind', 'Type.Scope1']

In [49]:
# Chronological split on the raw frame: years below 9 train, year 9 validates, year 10 tests.
train_set = data[data['Year'].lt(9)]
validation_set = data[data['Year'].eq(9)]
test_set = data[data['Year'].eq(10)]

In [50]:
# Columns that are never z-scored: the identifier, the year, the target and the categoricals.
_not_continuous = (
    'Id', 'Year', 'Ghg.Change.Real.Next',
    'Industry', 'Country', 'Continent', 'Method.Ind', 'Type.Scope1',
)
# Number of distinct non-null values per column, computed once for the whole frame.
_distinct_counts = data.nunique()

# Continuous = more than two distinct values and not explicitly excluded above.
continuous_features = [
    col
    for col in data.columns
    if col not in _not_continuous and _distinct_counts[col] > 2
]
# Binary = exactly two distinct values (checked on every column, excluded or not).
binary_features = [col for col in data.columns if _distinct_counts[col] == 2]

# Mean and standard deviation of each continuous feature, from the training rows only.
train_stats = train_set[continuous_features].agg(['mean', 'std'])

In [51]:

# Function to standardize data (excluding binary features)
def standardize_data(df, stats, continuous_features, binary_features):
    """Return a z-scored copy of ``df`` using precomputed column statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is copied, never modified.
    stats : pandas.DataFrame
        Table indexed by ``'mean'`` and ``'std'`` with one column per feature,
        e.g. the result of ``frame[features].agg(['mean', 'std'])``.
    continuous_features : iterable of str
        Names of the columns to standardize.
    binary_features : collection of str
        Names of columns that are skipped even if they also appear in
        ``continuous_features``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each standardized column holds
        ``(value - mean) / std``. A feature whose ``std`` is zero or NaN
        carries no information and is set to 0.0 for every row; dividing by a
        zero ``std`` would otherwise produce +/-inf, which ``fillna`` does not
        remove. Finally every remaining NaN in the frame is replaced by 0.
        This applies to all columns, not only the standardized ones.
    """
    standardized_df = df.copy()
    for feature in continuous_features:
        if feature in binary_features:
            # Binary indicators keep their original values.
            continue
        mean = stats.loc['mean', feature]
        std = stats.loc['std', feature]
        if not (std > 0):
            # True for std == 0 and for std == NaN (comparisons with NaN are False).
            standardized_df[feature] = 0.0
        else:
            standardized_df[feature] = (df[feature] - mean) / std
    # Missing values (original or produced above) become 0 across the whole frame.
    return standardized_df.fillna(0)


# Standardize the entire dataset with the mean/std computed on the training rows (Year < 9),
# so that no validation or test statistics are used for the scaling.
df_standardized = standardize_data(
    df=data,
    stats=train_stats,
    continuous_features=continuous_features,
    binary_features=binary_features,
)


# Year 10 is the last year and is held out for testing.

# Next, df_standardized is re-split by Year into train, validation, and test sets.
# NOTE(review): this comment previously referred to building LSTM sequences from
# train_data_standardized / test_data_standardized; no such variables appear in the visible cells.


In [52]:
# Columns handed to CatBoost as categorical features ('Id' is treated as a category too).
categorical_features = [
    'Id',
    'Industry',
    'Country',
    'Continent',
    'Method.Ind',
    'Type.Scope1',
]

In [53]:
# Rebuild the chronological splits, this time from the standardized frame.
train_set = df_standardized[df_standardized['Year'].lt(9)]
validation_set = df_standardized[df_standardized['Year'].eq(9)]
test_set = df_standardized[df_standardized['Year'].eq(10)]

In [54]:
# Separate predictors (X_*) from the regression target 'Ghg.Change.Real.Next' (y_*)
# for the train, validation and test splits, in that order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split.drop(columns=['Ghg.Change.Real.Next']), split['Ghg.Change.Real.Next'])
    for split in (train_set, validation_set, test_set)
)

In [55]:
# Translate each categorical column name into its positional index in X_train.
# Names that are not columns of X_train are skipped; the order follows categorical_features.
categorical_features_indices = []
for col in categorical_features:
    if col in X_train.columns:
        categorical_features_indices.append(X_train.columns.get_loc(col))

In [56]:
from sklearn.metrics import mean_squared_error

# Inputs prepared in the cells above:
#   X_train, y_train -> training data
#   X_val, y_val     -> validation data
#   X_test, y_test   -> test data

# Wrap the train/validation splits in CatBoost pools, flagging the categorical columns.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
validation_pool = Pool(data=X_val, label=y_val, cat_features=categorical_features_indices)

# Baseline hyper-parameters.
model_params = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='RMSE',     # training objective; change to suit the problem
    eval_metric='RMSE',       # metric reported on the eval set
    random_seed=42,
    logging_level='Verbose',  # switch to 'Silent' for less output
)

# Fit with the validation pool as eval set; early_stopping_rounds=50 is passed to fit().
model = CatBoostRegressor(**model_params)
model.fit(train_pool, eval_set=validation_pool, early_stopping_rounds=50)

# Score the held-out test split.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Test MSE: {mse}")


0:	learn: 6.9839955	test: 10.4262051	best: 10.4262051 (0)	total: 22.8ms	remaining: 22.8s
1:	learn: 6.9653003	test: 10.3979004	best: 10.3979004 (1)	total: 31.6ms	remaining: 15.8s
2:	learn: 6.9462056	test: 10.3698569	best: 10.3698569 (2)	total: 41.7ms	remaining: 13.9s
3:	learn: 6.9290715	test: 10.3444704	best: 10.3444704 (3)	total: 50.8ms	remaining: 12.6s
4:	learn: 6.9116229	test: 10.3116043	best: 10.3116043 (4)	total: 58ms	remaining: 11.5s
5:	learn: 6.8950141	test: 10.2804955	best: 10.2804955 (5)	total: 64.4ms	remaining: 10.7s
6:	learn: 6.8794180	test: 10.2589154	best: 10.2589154 (6)	total: 78.3ms	remaining: 11.1s
7:	learn: 6.8645219	test: 10.2372027	best: 10.2372027 (7)	total: 84.9ms	remaining: 10.5s
8:	learn: 6.8468316	test: 10.2080995	best: 10.2080995 (8)	total: 91.1ms	remaining: 10s
9:	learn: 6.8324623	test: 10.1869758	best: 10.1869758 (9)	total: 97.3ms	remaining: 9.63s
10:	learn: 6.8205248	test: 10.1688278	best: 10.1688278 (10)	total: 105ms	remaining: 9.44s
11:	learn: 6.8074725	tes

In [57]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the current `predictions` against the test targets.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"Test MAE: {mae}")

Test MAE: 6.216770869852613


In [58]:
from sklearn.metrics import r2_score

# Coefficient of determination of the current `predictions` against the test targets.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(f"Test R2: {r2}")

Test R2: 0.09907985817544429


In [59]:
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

# Inputs prepared in earlier cells:
#   X_train, y_train             -> training data
#   X_val, y_val                 -> validation data
#   categorical_features_indices -> positions of the categorical columns

# Pools reused by every candidate model in the grid search.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
validation_pool = Pool(data=X_val, label=y_val, cat_features=categorical_features_indices)

# Hyper-parameter values to combine exhaustively (3 * 2 * 2 * 2 = 24 candidates).
param_grid = dict(
    depth=[6, 8, 10],
    iterations=[1000, 1500],
    learning_rate=[0.005, 0.01],
    l2_leaf_reg=[1, 3],
)


In [60]:
# Exhaustive grid search: fit one model per parameter combination on the training pool
# and keep the combination with the lowest RMSE on the validation split.
best_params, best_rmse = None, float('inf')

# Flatten the grid into a list of candidates; depth varies slowest and l2_leaf_reg fastest.
candidate_params = [
    {
        'depth': depth,
        'iterations': iterations,
        'learning_rate': learning_rate,
        'l2_leaf_reg': l2_leaf_reg,
    }
    for depth in param_grid['depth']
    for iterations in param_grid['iterations']
    for learning_rate in param_grid['learning_rate']
    for l2_leaf_reg in param_grid['l2_leaf_reg']
]

for candidate in candidate_params:
    # Model configured with the current candidate plus the fixed settings.
    model = CatBoostRegressor(
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=42,
        logging_level='Silent',  # Set to 'Verbose' to see training logs
        thread_count=8,
        **candidate
    )

    # Fit on the training pool only, then score on the validation pool.
    model.fit(train_pool)
    predictions = model.predict(validation_pool)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))

    # Strict '<' keeps the earliest candidate when two candidates tie.
    if rmse < best_rmse:
        best_rmse, best_params = rmse, dict(candidate)

print(f"Best parameters: {best_params}")
print(f"Best RMSE on validation set: {best_rmse}")

Best parameters: {'depth': 8, 'iterations': 1000, 'learning_rate': 0.01, 'l2_leaf_reg': 1}
Best RMSE on validation set: 9.496817592756415


In [61]:
# Refit a model with the winning grid-search parameters and score it on the test split.
# Only the four tuned hyper-parameters are read from best_params.
tuned_params = {
    key: best_params[key]
    for key in ('depth', 'iterations', 'learning_rate', 'l2_leaf_reg')
}
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    logging_level='Silent',
    **tuned_params
)
model.fit(train_pool)

predictions = model.predict(X_test)

# Root mean squared error on the test split
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE on test set with best model: {rmse}")

# Coefficient of determination on the test split
r2 = r2_score(y_test, predictions)
print(f"Test R2: {r2}")

# Mean absolute error on the test split
mae = mean_absolute_error(y_test, predictions)
print(f"Test MAE: {mae}")

RMSE on test set with best model: 9.584479554637436
Test R2: 0.10392462930171409
Test MAE: 6.188308086768396


In [62]:
# Print the model's feature importances computed on the training pool, largest first.
feature_importance = model.get_feature_importance(train_pool)
feature_names = X_train.columns

# Pair each score with its column name; sorting the (score, name) tuples in reverse
# puts the highest score first.
ranked_features = list(zip(feature_importance, feature_names))
ranked_features.sort(reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

Ghg.Change.Real: 20.711285897218744
Cdp.Targetamount.Mean: 5.942385736267044
Id: 5.569923703139661
Continent: 4.83317939208637
Ghg2Market: 4.1007382525607206
Industry: 3.710477388006951
Ghg1: 2.896831359694945
Type.Scope1: 2.52588862253826
Country: 2.3357611819568542
Cdp.Baseyear.Mean: 1.8534773394801574
Ghg.Change.Total: 1.6921727588354905
Net.Income.Over.Assets: 1.657066676248041
Ghg.Int.Change: 1.623848793385911
Ghg.Change.Output: 1.3941354534478079
Cdp.Targetduration.Mean: 1.315955307172771
Co2.Total.Log1P: 1.2909147660045894
Roe: 1.2711661212396137
Market.Cap: 1.2705103766821044
Ghg2Location: 1.2363042504986095
Investment.Total.Log1P: 1.2338784927047246
Method.Ind: 1.1969292412804249
Ghg.Int.Figure: 1.1636024864302972
Tot.Assets: 1.095874762228697
Ghg3.Count: 1.0344555182539434
Employees: 1.024429039225645
Year: 0.9872650819322244
Cdp.Timeprogress.Mean: 0.9328851447346047
Net.Income: 0.9231833327369626
Ghg.Change.Measure: 0.9079269562943177
Cdp.Targetscope.Percent.Mean: 0.89639800