In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../../02_dataset/merged_df.csv')
data.drop(columns='Unnamed: 0', inplace=True)
data.sample(15)

Unnamed: 0,ISO3_code,Year,pt_gdp_agriculture,oil_rent,democracy_polity,gdp,country,ethnic_fractionation_index,mean_temp,yearly_avg_rainfall,...,gdp_pchange,unemp_rate,unemp_rate_pchange,gini,gini_pchange,population,participants,number_events,yprotest,protest_fraction
7939,LKA,1966,28.682653,0.0,7.0,7777282000.0,Sri Lanka,0.436,27.1,1317.719298,...,5.023789,3.88,0.0,0.368815,0.0,,,,,
7011,ROU,1966,21.809069,0.183645,-7.0,92524250000.0,Romania,0.224,9.9,638.608333,...,0.0,3.912,0.0,0.339969,0.0,,,,,
6404,OMN,1969,16.1,14.994278,-10.0,6377742000.0,Oman,0.209,27.4,29.577044,...,25.666159,1.46,0.0,,,,,,,
9185,UZB,2015,29.220718,0.526676,-9.0,86196260000.0,Uzbekistan,0.462,14.0,218.639024,...,7.218766,5.15,1.178782,0.359525,0.0,30749346.0,0.0,0.0,0.0,0.0
3148,GMB,1967,32.26371,0.0,8.0,253504700.0,Gambia,0.754,27.2,922.916667,...,0.0,4.127,0.0,0.427606,0.0,,,,,
2791,EST,1961,5.051828,0.233222,-7.0,9855880000.0,Estonia,0.52,6.4,532.723214,...,0.0,1.468,0.0,0.328386,0.0,,,,,
8758,TKM,1975,33.333332,3.655335,-7.0,7852866000.0,Turkmenistan,0.464,15.7,92.289167,...,0.0,1.4,0.0,0.408069,0.0,,,,,
6832,POL,1979,5.73588,0.004001,-7.0,181131200000.0,Poland,0.026,7.3,494.030996,...,0.0,2.886,0.0,0.313952,0.0,,,,,
477,AUT,1978,4.468656,0.141544,10.0,178763600000.0,Austria,0.1,5.8,758.030488,...,-0.210661,3.42,0.0,0.301773,0.0,,,,,
6557,PAN,1994,6.46079,0.0,9.0,16816900000.0,Panama,0.569,25.9,1857.234848,...,2.850147,6.439,1.433522,0.533385,0.0,2669546.0,0.0,0.0,0.0,0.0


## Try modeling the sub-Saharan African countries

Choose features and target

In [3]:
feats = ['Year',
         'yearly_avg_rainfall',
         'rainfall_var_t',
         'rainfall_var_t_1',
         'mean_temp',
         'pt_gdp_agriculture',
         'oil_rent']

target = 'gdp_g'

In [4]:
subsaharan = ['AGO', 'BEN', 'BWA', 'CIV', 'CAF', 'CMR', 'COD', 'COG', 'CPV', 'DJI', 'ERI', 'ETH', 'GAB', 'GHA', 'GIN', 'GMB', 'KEN', 'LSO', 'LBR', 'MDG', 'MLI', 'MRT', 'MUS', 'MWI', 'NAM', 'NER', 'NGA', 'RWA', 'SEN', 'SDN', 'SLE', 'SOM', 'SSD', 'SWZ', 'SYC', 'TCD', 'TGO', 'TZA', 'UGA', 'ZMB', 'ZWE']
sub_data = data[data['ISO3_code'].isin(subsaharan)]

In [5]:
sub_data[feats]

Unnamed: 0,Year,yearly_avg_rainfall,rainfall_var_t,rainfall_var_t_1,mean_temp,pt_gdp_agriculture,oil_rent
192,1960,1018.500204,0.051549,0.051549,21.6,14.902152,4.094633
193,1961,1071.002852,0.051549,0.051549,21.3,14.902152,4.094633
194,1962,979.839650,-0.085119,0.051549,21.3,14.902152,4.094633
195,1963,967.382233,-0.012714,-0.085119,21.3,14.902152,4.094633
196,1964,833.767522,-0.138120,-0.012714,21.3,14.902152,4.094633
...,...,...,...,...,...,...,...
9317,2019,647.373718,0.176221,-0.316445,22.4,9.819262,0.050883
9318,2020,586.835256,-0.093514,0.176221,22.0,8.772859,0.029129
9319,2021,543.462821,-0.073909,-0.093514,22.0,8.849899,0.047769
9320,2022,635.176282,0.168758,-0.073909,21.9,7.170550,0.000000


In [6]:
from sklearn.model_selection import train_test_split


df_train, df_test = train_test_split(sub_data, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)
df_tt, df_val = train_test_split(df_train, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)


In [7]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import root_mean_squared_error

Initial model selection: just run a set of models, mostly ensemble methods, with default hyperparameters and on training data. We will choose the best one using RMSE as a metric and then further pick the best hyperparameters using cross validation.

In [8]:
models = {
    'ols': LinearRegression(),
    'rf': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'ada': AdaBoostRegressor(),
    'xgbr': XGBRegressor(),
    'svr': SVR(),
    'grad': GradientBoostingRegressor(),
    'knr': KNeighborsRegressor(5)
}

for name, model in models.items():
    model.fit(df_tt[feats], df_tt[target])
    preds = model.predict(df_val[feats])
    rmse = root_mean_squared_error(df_val[target], preds)

    print(f"{name}, rmse: {rmse}")

ols, rmse: 0.06827070330302251
rf, rmse: 0.06352216720052865
extra_trees, rmse: 0.06390534014317806
ada, rmse: 0.07135603502478294
xgbr, rmse: 0.06425829507127696
svr, rmse: 0.07068276556227954
grad, rmse: 0.06511321756741419
knr, rmse: 0.07246681820896303


In [None]:
from sklearn.model_selection import GridSearchCV

grid_cv = GridSearchCV(ExtraTreesRegressor(),
                       param_grid={
                           'max_depth': [5, 10],
                           'n_estimators': [100]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.05909959034575255 {'max_depth': 10, 'n_estimators': 100}


In [10]:
grid_cv = GridSearchCV(XGBRegressor(),
                       param_grid={
                           'max_depth': [1, 5,10],
                           'learning_rate': [0.01, 0.1, 1],
                           'n_estimators': [100, 500]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.059560477384699315 {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}


In [11]:
!pip install interpret

Collecting interpret
  Using cached interpret-0.6.10-py3-none-any.whl.metadata (1.2 kB)
Collecting interpret-core==0.6.10 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Using cached interpret_core-0.6.10-py3-none-any.whl.metadata (2.9 kB)
Collecting plotly>=3.8.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Using cached plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Using cached salib-1.5.1-py3-none-any.whl.metadata (11 kB)
Collecting shap>=0.28.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Using cached shap-0.47.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting dill>=0.2.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->inte

In [12]:
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show

In [13]:
ebm = ExplainableBoostingRegressor(interactions=0)
ebm.fit(df_train[feats], df_train[target])

In [14]:
show(ebm.explain_global())

In [15]:
ebm = ExplainableBoostingRegressor(interactions=10)
ebm.fit(df_train[feats], df_train[target])
show(ebm.explain_global())

## Try the same with with the dummy variables

In [16]:
cols = []
for code in sub_data['ISO3_code'].unique():
    sub_data[code] = pd.get_dummies(sub_data['ISO3_code'])[code]*1
    sub_data[f'{code}_year'] = sub_data[code] * sub_data['Year']
    cols.append(code)
    cols.append(f'{code}_year')
sub_data.columns



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Index(['ISO3_code', 'Year', 'pt_gdp_agriculture', 'oil_rent',
       'democracy_polity', 'gdp', 'country', 'ethnic_fractionation_index',
       'mean_temp', 'yearly_avg_rainfall',
       ...
       'TZA', 'TZA_year', 'TGO', 'TGO_year', 'UGA', 'UGA_year', 'ZMB',
       'ZMB_year', 'ZWE', 'ZWE_year'],
      dtype='object', length=103)

In [17]:
df_train, df_test = train_test_split(sub_data, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)
df_tt, df_val = train_test_split(df_train, 
                                                    shuffle=True,
                                                    random_state=216,
                                                    test_size=.2)

In [18]:
models = {
    'ols': LinearRegression(),
    'rf': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'ada': AdaBoostRegressor(),
    'xgbr': XGBRegressor(),
    'svr': SVR(),
    'grad': GradientBoostingRegressor(),
    'knr': KNeighborsRegressor(5)
}

for name, model in models.items():
    model.fit(df_tt[feats], df_tt[target])
    preds = model.predict(df_val[feats])
    rmse = root_mean_squared_error(df_val[target], preds)

    print(f"{name}, rmse: {rmse}")

ols, rmse: 0.06827070330302251
rf, rmse: 0.06354061944832952
extra_trees, rmse: 0.06337302482743114
ada, rmse: 0.07183026918258664
xgbr, rmse: 0.06425829507127696
svr, rmse: 0.07068276556227954
grad, rmse: 0.06510227641006888
knr, rmse: 0.07246681820896303


In [19]:
from sklearn.model_selection import GridSearchCV

grid_cv = GridSearchCV(ExtraTreesRegressor(),
                       param_grid={
                           'max_depth': [5, 10],
                           'n_estimators': [100]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.059029599482301 {'max_depth': 10, 'n_estimators': 100}


In [20]:
grid_cv = GridSearchCV(XGBRegressor(),
                       param_grid={
                           'max_depth': [1, 5,10],
                           'learning_rate': [0.01, 0.1, 1],
                           'n_estimators': [100, 500]
                       },
                       scoring='neg_root_mean_squared_error',
                       cv=5
                       )

grid_cv.fit(df_train[feats], df_train[target])

print(grid_cv.best_score_, grid_cv.best_params_)

-0.059560477384699315 {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}


In [21]:
ebm = ExplainableBoostingRegressor(interactions=0)
ebm.fit(df_train[feats], df_train[target])
show(ebm.explain_global())

In [22]:
ebm = ExplainableBoostingRegressor(interactions=10)
ebm.fit(df_train[feats], df_train[target])
show(ebm.explain_global())