In [4]:
import sys, os; sys.path.insert(0, os.path.dirname(os.getcwd()))
import pandas as pd
import utils.io
import utils.preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.linear_model
import xgboost
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.feature_selection import SequentialFeatureSelector

# Environment settings:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_column', None)




In [5]:
data = utils.io.retrieve_processed_dataset()

##### Keep only data for training

In [6]:
data = data.select_dtypes(include = 'number').dropna()
y = data['target']
X = data.drop(columns = 'target')


First we make a model with all features to see how it performs

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [6]:
model = xgboost.XGBRegressor()

"\ngbm_hyperparams = {\n    'n_estimators': 100,\n    'max_depth': 10,\n    'learning_rate': 0.05,\n    'loss': 'ls'\n}\nmodel = GradientBoostingRegressor(**gbm_hyperparams)\n"

In [7]:
weights_train = 1# + (X_train['Year'] - 1960)/50
weights_test = 1 #+ (X_test['Year'] - 1960)/50

In [8]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error
def mape(y, y_):
    return abs((y-y_)/y).mean()

In [10]:
y_ = model.predict(X_test)

In [11]:
print(f"{mape(y_test, y_) = }")
print(f"{mean_absolute_error(y_test, y_) = }")
print(f"{r2_score(y_test, y_) = }")

mape(y_test, y_) = inf
mean_absolute_error(y_test, y_) = 2.551737284145325
r2_score(y_test, y_) = 0.2303740563622051


##### Feature selection

In [10]:
X_non_corr = utils.preprocessing.drop_high_corr(X,0.8);
X_non_corr.shape


(12350, 124)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_non_corr, y, test_size = 0.2, random_state = 42)
model = xgboost.XGBRegressor()
model.fit(X_train, y_train)
y_ = model.predict(X_test)
print(f"{r2_score(y_test, y_) = }")

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

r2_score(y_test, y_) = 0.19293012554835343


In [43]:
# from mlxtend
sfs = SequentialFeatureSelector(model, k_features=10, forward=True, floating=False, verbose=2, scoring='r2', cv=5, n_jobs=1)
sfs.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 124 out of 124 | elapsed:  1.5min finished

[2020-12-13 14:08:27] Features: 1/10 -- score: 0.06653838023783734[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 123 out of 123 | elapsed:  1.6min finished

[2020-12-13 14:10:04] Features: 2/10 -- score: 0.1375788877075221[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 122 out of 122 | elapsed:  1.7min finished

[2020-12-13 14:11:47] Features: 3/10 -- score: 0.17373891980417844[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 121 out of 121 | elapsed:  1.8min finished

[2020-12-13 14:13:32] Features: 4/10 -- score: 0.20031341440868414[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.8min finished

[2020-12-13 14:15:22] Features: 5/10 -- score: 0.21313595284163403[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 119 out of 119 | elapsed:  2.0min finished

[2020-12-13 14:17:24] Features: 6/10 -- score: 0.2232537790586527[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 118 out of 118 | elapsed:  2.1min finished

[2020-12-13 14:19:29] Features: 7/10 -- score: 0.23200025804718086[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 117 out of 117 | elapsed:  2.2min finished

[2020-12-13 14:21:40] Features: 8/10 -- score: 0.24108098577339057[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 116 out of 116 | elapsed:  2.2min finished

[2020-12-13 14:23:55] Features: 9/10 -- score: 0.24392095634869387[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[Parallel(n_jobs=1)]: Done 115 out of 115 | elapsed:  2.3min finished

[2020-12-13 14:26:13] Features: 10/10 -- score: 0.2393512127830409

SequentialFeatureSelector(estimator=XGBRegressor(base_score=0.5,
                                                 booster='gbtree',
                                                 colsample_bylevel=1,
                                                 colsample_bynode=1,
                                                 colsample_bytree=1, gamma=0,
                                                 gpu_id=-1,
                                                 importance_type='gain',
                                                 interaction_constraints='',
                                                 learning_rate=0.300000012,
                                                 max_delta_step=0, max_depth=6,
                                                 min_child_weight=1,
                                                 missing=nan,
                                                 monotone_constraints='()',
                                                 n_estimators=100, n_jobs=4,
 

In [47]:
features = list(sfs.k_feature_names_)
features

['Year',
 'AG.LND.FRST.ZS',
 'ER.LND.PTLD.ZS',
 'ER.PTD.TOTL.ZS',
 'NY.GDP.PCAP.PP.CD',
 'SE.PRM.DURS',
 'SH.MMR.RISK',
 'Region_East Asia & Pacific',
 'Region_Latin America & Caribbean',
 'Region_South Asia']

In [48]:
model = xgboost.XGBRegressor()
model.fit(X_train[features], y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
y_ = model.predict(X_test[features])
print(f"{r2_score(y_test, y_) = }")

r2_score(y_test, y_) = 0.13844702520385976


There is too much overfitting