In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load data

In [3]:
data = pd.read_csv('../../02_dataset/merged_df.csv')
data.drop(columns='Unnamed: 0', inplace=True)

In [4]:
feats = ['Year',
         'yearly_avg_rainfall',
         'rainfall_var_t',
         'rainfall_var_t_1',
         'mean_temp',
         'mean_temp_t',
         'mean_temp_t_1',
         'pt_gdp_agriculture',
         'oil_rent']

target = 'gdp_g'

subsaharan = ['AGO', 'BEN', 'BWA', 'CIV', 'CAF', 'CMR', 'COD', 'COG', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN', 'GMB', 'KEN', 'LSO', 'LBR', 'MDG', 'MLI', 'MRT', 'MUS', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SEN', 'SDN', 'SLE', 'SOM', 'SSD', 'SWZ', 'SYC', 'TCD', 'TGO', 'TZA', 'UGA', 'ZMB', 'ZWE']
sub_data = data[data['ISO3_code'].isin(subsaharan)]

In [5]:
from sklearn.model_selection import train_test_split


df_train, df_test = train_test_split(sub_data, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)
df_tt, df_val = train_test_split(df_train, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)

In [6]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import root_mean_squared_error

In [7]:


models = {
    'ols': LinearRegression(),
    'rf': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'ada': AdaBoostRegressor(),
    'xgbr': XGBRegressor(),
    'svr': SVR(),
    'grad': GradientBoostingRegressor(),
    'knr': KNeighborsRegressor(5)
}

for name, model in models.items():
    model.fit(df_tt[feats], df_tt[target])
    preds = model.predict(df_val[feats])
    rmse = root_mean_squared_error(df_val[target], preds)

    print(f"{name}, rmse: {rmse}")



ols, rmse: 0.06816955067761325
rf, rmse: 0.06442163036407415
extra_trees, rmse: 0.06419502662897779
ada, rmse: 0.07226835963294254
xgbr, rmse: 0.06492964204590143
svr, rmse: 0.07068494935702643
grad, rmse: 0.06665096480297061
knr, rmse: 0.07246734348724386


In [10]:
from sklearn.model_selection import GridSearchCV

grid_cv = GridSearchCV(ExtraTreesRegressor(),
                       param_grid={
                           'max_depth': [5, 10, 15, 20],
                           'n_estimators': [100, 500]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.05908560535511746 {'max_depth': 20, 'n_estimators': 500}


In [11]:
feature_importance_df = pd.DataFrame({
    'Feature': feats,  # List of feature names
    'Importance': grid_cv.best_estimator_.feature_importances_
})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Reset index for better readability
feature_importance_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(feature_importance_df)

               Feature  Importance
0                 Year    0.157880
1   pt_gdp_agriculture    0.144752
2            mean_temp    0.125086
3  yearly_avg_rainfall    0.116224
4        mean_temp_t_1    0.101142
5     rainfall_var_t_1    0.096893
6       rainfall_var_t    0.095629
7          mean_temp_t    0.092055
8             oil_rent    0.070339


### Feature importances with `f_regression`

In [15]:
from sklearn.feature_selection import f_regression

pp = f_regression(df_tt[feats], df_tt[target])[1]

for i, val in enumerate(pp):
    print(feats[i], val)

Year 0.02241580939016425
yearly_avg_rainfall 0.625058414793981
rainfall_var_t 0.6434453339655446
rainfall_var_t_1 0.8459305037759743
mean_temp 0.27945099669783546
mean_temp_t 0.1657856573940783
mean_temp_t_1 0.1952734273254414
pt_gdp_agriculture 0.5551019242079394
oil_rent 0.23665724164830593


In [16]:
import statsmodels.api as sm

ols = sm.OLS(df_tt[target], df_tt[feats])
ols_result = ols.fit()
ols_result.summary()

0,1,2,3
Dep. Variable:,gdp_g,R-squared (uncentered):,0.239
Model:,OLS,Adj. R-squared (uncentered):,0.234
Method:,Least Squares,F-statistic:,49.85
Date:,"Sun, 13 Apr 2025",Prob (F-statistic):,1.12e-78
Time:,17:38:29,Log-Likelihood:,1999.9
No. Observations:,1436,AIC:,-3982.0
Df Residuals:,1427,BIC:,-3934.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,2.516e-05,6.02e-06,4.182,0.000,1.34e-05,3.7e-05
yearly_avg_rainfall,-3.312e-06,3.34e-06,-0.990,0.322,-9.87e-06,3.25e-06
rainfall_var_t,0.0002,0.009,0.027,0.978,-0.017,0.017
rainfall_var_t_1,0.0004,0.008,0.043,0.966,-0.016,0.017
mean_temp,-0.0006,0.000,-1.322,0.186,-0.002,0.000
mean_temp_t,-0.1058,0.117,-0.902,0.367,-0.336,0.124
mean_temp_t_1,0.1064,0.117,0.909,0.363,-0.123,0.336
pt_gdp_agriculture,1.061e-05,9.7e-05,0.109,0.913,-0.000,0.000
oil_rent,0.0003,0.000,1.272,0.204,-0.000,0.001

0,1,2,3
Omnibus:,373.743,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7566.326
Skew:,-0.691,Prob(JB):,0.0
Kurtosis:,14.16,Cond. No.,182000.0


## Try all again with indicator variables

In [17]:
cols = []
for code in sub_data['ISO3_code'].unique():
    sub_data[code] = pd.get_dummies(sub_data['ISO3_code'])[code] * 1
    sub_data[f'{code}_year'] = sub_data[code] * sub_data['Year']
    cols.append(code)
    cols.append(f'{code}_year')
sub_data.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data[code] = pd.get_dummies(sub_data['ISO3_code'])[code] * 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data[f'{code}_year'] = sub_data[code] * sub_data['Year']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data[code] = pd.get_dummies(sub_data['ISO3_code'])[code] * 1
A value is try

Index(['ISO3_code', 'Year', 'pt_gdp_agriculture', 'oil_rent',
       'democracy_polity', 'gdp', 'country', 'ethnic_fractionation_index',
       'mean_temp', 'yearly_avg_rainfall',
       ...
       'TZA', 'TZA_year', 'TGO', 'TGO_year', 'UGA', 'UGA_year', 'ZMB',
       'ZMB_year', 'ZWE', 'ZWE_year'],
      dtype='object', length=103)

In [18]:
df_train, df_test = train_test_split(sub_data, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)
df_tt, df_val = train_test_split(df_train, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)

In [19]:

models = {
    'ols': LinearRegression(),
    'rf': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'ada': AdaBoostRegressor(),
    'xgbr': XGBRegressor(),
    'svr': SVR(),
    'grad': GradientBoostingRegressor(),
    'knr': KNeighborsRegressor(5)
}

for name, model in models.items():
    model.fit(df_tt[feats], df_tt[target])
    preds = model.predict(df_val[feats])
    rmse = root_mean_squared_error(df_val[target], preds)

    print(f"{name}, rmse: {rmse}")


ols, rmse: 0.06816955067761325
rf, rmse: 0.06462492970375812
extra_trees, rmse: 0.06465962127290463
ada, rmse: 0.0704119635300178
xgbr, rmse: 0.06492964204590143
svr, rmse: 0.07068494935702643
grad, rmse: 0.06679594827861181
knr, rmse: 0.07246734348724386


In [20]:
grid_cv = GridSearchCV(ExtraTreesRegressor(),
                       param_grid={
                           'max_depth': [5, 10, 15, 20],
                           'n_estimators': [100, 500]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.05888961694279289 {'max_depth': 15, 'n_estimators': 500}


In [21]:
feature_importance_df = pd.DataFrame({
    'Feature': feats,  # List of feature names
    'Importance': grid_cv.best_estimator_.feature_importances_
})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Reset index for better readability
feature_importance_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(feature_importance_df)

               Feature  Importance
0                 Year    0.163003
1   pt_gdp_agriculture    0.145501
2            mean_temp    0.125985
3  yearly_avg_rainfall    0.113712
4        mean_temp_t_1    0.099307
5     rainfall_var_t_1    0.096155
6       rainfall_var_t    0.093170
7          mean_temp_t    0.088921
8             oil_rent    0.074246


In [23]:
pp = f_regression(df_tt[feats], df_tt[target])[1]

for i, val in enumerate(pp):
    print(feats[i], val)

Year 0.02241580939016425
yearly_avg_rainfall 0.625058414793981
rainfall_var_t 0.6434453339655446
rainfall_var_t_1 0.8459305037759743
mean_temp 0.27945099669783546
mean_temp_t 0.1657856573940783
mean_temp_t_1 0.1952734273254414
pt_gdp_agriculture 0.5551019242079394
oil_rent 0.23665724164830593


In [24]:

ols = sm.OLS(df_tt[target], df_tt[feats])
ols_result = ols.fit()
ols_result.summary()

0,1,2,3
Dep. Variable:,gdp_g,R-squared (uncentered):,0.239
Model:,OLS,Adj. R-squared (uncentered):,0.234
Method:,Least Squares,F-statistic:,49.85
Date:,"Sun, 13 Apr 2025",Prob (F-statistic):,1.12e-78
Time:,17:43:17,Log-Likelihood:,1999.9
No. Observations:,1436,AIC:,-3982.0
Df Residuals:,1427,BIC:,-3934.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,2.516e-05,6.02e-06,4.182,0.000,1.34e-05,3.7e-05
yearly_avg_rainfall,-3.312e-06,3.34e-06,-0.990,0.322,-9.87e-06,3.25e-06
rainfall_var_t,0.0002,0.009,0.027,0.978,-0.017,0.017
rainfall_var_t_1,0.0004,0.008,0.043,0.966,-0.016,0.017
mean_temp,-0.0006,0.000,-1.322,0.186,-0.002,0.000
mean_temp_t,-0.1058,0.117,-0.902,0.367,-0.336,0.124
mean_temp_t_1,0.1064,0.117,0.909,0.363,-0.123,0.336
pt_gdp_agriculture,1.061e-05,9.7e-05,0.109,0.913,-0.000,0.000
oil_rent,0.0003,0.000,1.272,0.204,-0.000,0.001

0,1,2,3
Omnibus:,373.743,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7566.326
Skew:,-0.691,Prob(JB):,0.0
Kurtosis:,14.16,Cond. No.,182000.0
