## The purpose of this notebook is to classify Nation State Failure events in the next 5 years, better than a baseline guess. Baseline guesses classify whether a state failure event will occur based on whether there were state failure events for a nation in the last 5 years.

In [37]:
import pandas as pd

In [38]:
train = pd.read_csv('https://raw.githubusercontent.com/ddodds42/DS_Unit_2_Build_DAVID_DODDS/master/train_github.csv')
val = pd.read_csv('https://raw.githubusercontent.com/ddodds42/DS_Unit_2_Build_DAVID_DODDS/master/val_github.csv')
test = pd.read_csv('https://raw.githubusercontent.com/ddodds42/DS_Unit_2_Build_DAVID_DODDS/master/test_github.csv')

## Columns named with a T_ are targets. C_ indicates a feature measuring state centralization, E_ indicates a feature measuring economic inclusion, and P_ indicates a feature measuring political inclusion.

In [39]:
train.columns

Index(['NS_code_from_gini', 'Nation_State', 'T_HDI_Mean_plus_5',
       'T_HDI_Trend_plus_5', 'T_State_Fail_yr_1', 'T_State_Fail_plus_5',
       'T_State_Fail_yr_5', 'year0', 'CWGI_Poli_Stabl_Mean_minus_4',
       'CWGI_Poli_Stabl_Trend_minus_4', 'CWGI_RoL_Mean_minus_4',
       'CWGI_RoL_Trend_minus_4', 'E_GINI_Mean_minus_4', 'E_GINI_Trend_minus_4',
       'PCPI_std_Mean_minus_4', 'PCPI_std_Trend_minus_4',
       'PWGI_Corrupt_Ctrl_Mean_minus_4', 'PWGI_Corrupt_Ctrl_Trend_minus_4',
       'PWGI_Voice_Acct_Mean_minus_4', 'PWGI_Voice_Acct_Trend_minus_4'],
      dtype='object')

In [40]:
features = ['NS_code_from_gini', 'year0', 'CWGI_Poli_Stabl_Mean_minus_4',
       'CWGI_Poli_Stabl_Trend_minus_4', 'CWGI_RoL_Mean_minus_4',
       'CWGI_RoL_Trend_minus_4', 'E_GINI_Mean_minus_4', 'E_GINI_Trend_minus_4',
       'PCPI_std_Mean_minus_4', 'PCPI_std_Trend_minus_4',
       'PWGI_Corrupt_Ctrl_Mean_minus_4', 'PWGI_Corrupt_Ctrl_Trend_minus_4',
       'PWGI_Voice_Acct_Mean_minus_4', 'PWGI_Voice_Acct_Trend_minus_4']
target = 'T_State_Fail_yr_5'

In [41]:
X_train = train[features]
X_val = val[features]
X_test = test[features]

y_train = train[target]
y_val = val[target]
y_test = test[target]

In [42]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [43]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    DecisionTreeClassifier(max_depth=13, random_state=42)
)

pipeline.fit(X_train, y_train)
y_predt = pipeline.predict(X_train)
y_predv = pipeline.predict(X_val)
print('Yr 5 Train F_Score', f1_score(y_train, y_predt))
print('Yr 5 Validation F_Score', f1_score(y_val, y_predv))
print('Yr 5 Validation roc_auc_score', roc_auc_score(y_val, y_predv))

Yr 5 Train F_Score 0.9469598965071151
Yr 5 Validation F_Score 0.6617647058823529
Yr 5 Validation roc_auc_score 0.816712615267611


## ^^ T_State_Fail_yr_5: MVP model beat Initial baseline by 17 points. It is 17 points better at predicting a state failure event 5 years from the present. ^^

In [44]:
target0 = 'T_State_Fail_plus_5'

y_train0 = train[target0]
y_val0 = val[target0]
y_test0 = test[target0]

In [45]:
pipeline0 = make_pipeline(
    ce.OrdinalEncoder(), 
    DecisionTreeClassifier(max_depth=13, random_state=42)
)

pipeline0.fit(X_train, y_train0)
y_predt = pipeline0.predict(X_train)
y_predv = pipeline0.predict(X_val)
print('Nx 5 Train F_Score', f1_score(y_train0, y_predt))
print('Nx 5 Validation F_Score', f1_score(y_val0, y_predv))

Nx 5 Train F_Score 0.9623493975903615
Nx 5 Validation F_Score 0.7481481481481481


## T_State_Fail_plus_5: MVP model beat Initial baseline by almost 8 points. It is almost 8 points better at predicting a state failure event in any of the next 5 years. ^^

In [46]:
target1 = 'T_State_Fail_yr_1'

y_train1 = train[target1]
y_val1 = val[target1]
y_test1 = test[target1]

In [47]:
pipeline1 = make_pipeline(
    ce.OrdinalEncoder(), 
    DecisionTreeClassifier(max_depth=17, random_state=42)
)

pipeline1.fit(X_train, y_train1)
y_predt = pipeline1.predict(X_train)
y_predv = pipeline1.predict(X_val)
print('Yr 1 Train F_Score', f1_score(y_train1, y_predt))
print('Yr 1 Validation F_Score', f1_score(y_val1, y_predv))

Yr 1 Train F_Score 0.9988109393579072
Yr 1 Validation F_Score 0.6413043478260869


## ^^ Decision Tree Classifier failed to beat baseline for T_State_Fail_yr_1. To beat baseline for predicting a state failure event next year, a model must have an F1 Score of 0.80457 or greater. ^^

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
features1 = ['year0', 'CWGI_Poli_Stabl_Mean_minus_4',
       'CWGI_Poli_Stabl_Trend_minus_4', 'CWGI_RoL_Mean_minus_4',
       'CWGI_RoL_Trend_minus_4', 'E_GINI_Mean_minus_4', 'E_GINI_Trend_minus_4',
       'PCPI_std_Mean_minus_4', 'PCPI_std_Trend_minus_4',
       'PWGI_Corrupt_Ctrl_Mean_minus_4', 'PWGI_Corrupt_Ctrl_Trend_minus_4',
       'PWGI_Voice_Acct_Mean_minus_4', 'PWGI_Voice_Acct_Trend_minus_4']

In [50]:
X_train1 = train[features1]
X_val1 = val[features1]
X_test1 = test[features1]

In [51]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train1, y_train1)
y_predt = model.predict(X_train1)
y_predv = model.predict(X_val1)
print('Yr 1 Train F_Score', f1_score(y_train1, y_predt))
print('Yr 1 Validation F_Score', f1_score(y_val1, y_predv))

Yr 1 Train F_Score 0.48233486943164366
Yr 1 Validation F_Score 0.4084507042253521


## ^^ For predicting state failure events next year, Logistic Regression has half the F1_score of baseline. ^^

In [52]:
from sklearn.linear_model import LogisticRegressionCV

In [53]:
explore = pd.read_csv('https://raw.githubusercontent.com/ddodds42/DS_Unit_2_Build_DAVID_DODDS/master/explore_github.csv')

In [54]:
X_explore = explore[features1]
y_explore = explore[target1]

In [55]:
model0 = LogisticRegressionCV(scoring='f1', max_iter=2000)
model0.fit(X_explore, y_explore)
print('Yr 1 Validation F_Score', model0.score(X_explore, y_explore))

Yr 1 Validation F_Score 0.46134020618556704


## ^^ Cross Validated Logistic Regression is not much better. ^^

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
pipeline2 = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
)

pipeline2.fit(X_train, y_train1)
y_predt = pipeline2.predict(X_train)
y_predv = pipeline2.predict(X_val)
print('Yr 1 Train F_Score', f1_score(y_train1, y_predt))
print('Yr 1 Validation F_Score', f1_score(y_val1, y_predv))

Yr 1 Train F_Score 1.0
Yr 1 Validation F_Score 0.675


## ^^ Random Forest Classifier has the best F1_score yet, ~0.67, but is still ~13 points worse than baseline. ^^

In [58]:
from xgboost import XGBClassifier

In [59]:
# from sklearn.metrics import make_scorer
import numpy as np
def f1eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-(f1_score(y_true, np.round(y_pred)))
    return 'f1_err', err

encoder = ce.OrdinalEncoder()
X_train_enc = encoder.fit_transform(X_train)
X_val_enc = encoder.transform(X_val)

model1 = XGBClassifier(
    n_estimators = 1000,
    max_depth = 8,
    learning_rate = 0.25,
    n_jobs = -1,
    disable_default_eval_metric = 1
)

evaluators = [(X_train_enc, y_train),
             (X_val_enc, y_val)]

model1.fit(X_train_enc, y_train,
         eval_set = evaluators,
         eval_metric = f1eval,
         early_stopping_rounds = 50)

[0]	validation_0-f1_err:0.435169	validation_1-f1_err:0.592233
Multiple eval metrics have been passed: 'validation_1-f1_err' will be used for early stopping.

Will train until validation_1-f1_err hasn't improved in 50 rounds.
[1]	validation_0-f1_err:0.275272	validation_1-f1_err:0.495935
[2]	validation_0-f1_err:0.261006	validation_1-f1_err:0.5
[3]	validation_0-f1_err:0.238095	validation_1-f1_err:0.5
[4]	validation_0-f1_err:0.1994	validation_1-f1_err:0.438017
[5]	validation_0-f1_err:0.181149	validation_1-f1_err:0.376
[6]	validation_0-f1_err:0.148629	validation_1-f1_err:0.398374
[7]	validation_0-f1_err:0.127119	validation_1-f1_err:0.333333
[8]	validation_0-f1_err:0.11831	validation_1-f1_err:0.306452
[9]	validation_0-f1_err:0.106145	validation_1-f1_err:0.307087
[10]	validation_0-f1_err:0.095436	validation_1-f1_err:0.286822
[11]	validation_0-f1_err:0.084699	validation_1-f1_err:0.282443
[12]	validation_0-f1_err:0.079235	validation_1-f1_err:0.28125
[13]	validation_0-f1_err:0.079019	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1,
              disable_default_eval_metric=1, gamma=0, learning_rate=0.25,
              max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
              n_estimators=1000, n_jobs=-1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

## XG Boost Classsifier is still closer, but is still worse than baseline by about 6 points for predicting state failure events next year (0.80457 is baseline F1). ^^

In [60]:
import eli5
from eli5.sklearn import PermutationImportance

In [61]:
transformer = ce.OrdinalEncoder()

X_train_trans = transformer.fit_transform(X_train)
X_val_trans = transformer.transform(X_val)

model2 = DecisionTreeClassifier(max_depth=13, random_state=42)
model2.fit(X_train_trans, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=13, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [62]:
permuter = PermutationImportance(
    model2,
    scoring='f1',
    n_iter=5,
    random_state=42
)

permuter.fit(X_val_trans, y_val)

PermutationImportance(cv='prefit',
                      estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=13,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                                                      

In [63]:
feature_list = X_val_trans.columns.tolist()

In [64]:
eli5.show_weights(
    permuter,
    top=None,
    feature_names=feature_list
)

Weight,Feature
0.3078  ± 0.0483,CWGI_Poli_Stabl_Mean_minus_4
0.2931  ± 0.0167,PWGI_Corrupt_Ctrl_Mean_minus_4
0.1878  ± 0.0507,CWGI_RoL_Mean_minus_4
0.1650  ± 0.0514,PWGI_Voice_Acct_Mean_minus_4
0.1319  ± 0.0164,E_GINI_Mean_minus_4
0.0993  ± 0.0437,year0
0.0934  ± 0.0175,PCPI_std_Mean_minus_4
0.0712  ± 0.0432,PWGI_Corrupt_Ctrl_Trend_minus_4
0.0571  ± 0.0219,NS_code_from_gini
0.0441  ± 0.0218,PCPI_std_Trend_minus_4


## ^^ Feature importances for predicting state failure events 5 years from the present. 5-year means are much more important than 5-year trend features. ^^

In [65]:
model3 = DecisionTreeClassifier(max_depth=13, random_state=42)
model3.fit(X_train_trans, y_train0)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=13, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [66]:
permuter0 = PermutationImportance(
    model3,
    scoring='f1',
    n_iter=5,
    random_state=42
)

permuter0.fit(X_val_trans, y_val0)

PermutationImportance(cv='prefit',
                      estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=13,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                                                      

In [67]:
eli5.show_weights(
    permuter0,
    top=None,
    feature_names=feature_list
)

Weight,Feature
0.3754  ± 0.0515,CWGI_Poli_Stabl_Mean_minus_4
0.2319  ± 0.0280,PWGI_Corrupt_Ctrl_Mean_minus_4
0.1857  ± 0.0449,CWGI_RoL_Mean_minus_4
0.0973  ± 0.0106,PCPI_std_Mean_minus_4
0.0752  ± 0.0205,PWGI_Voice_Acct_Mean_minus_4
0.0668  ± 0.0168,NS_code_from_gini
0.0603  ± 0.0155,CWGI_Poli_Stabl_Trend_minus_4
0.0495  ± 0.0275,E_GINI_Mean_minus_4
0.0196  ± 0.0080,year0
0.0138  ± 0.0242,PWGI_Corrupt_Ctrl_Trend_minus_4


## ^^ Feature importances for predicting state failure events in any of the next 5 years. Again, 5-year means are more important than 5-year trend features. ^^

In [68]:
permuter1 = PermutationImportance(
    model1,
    scoring='f1',
    n_iter=5,
    random_state=42
)

permuter1.fit(X_val_trans, y_val1)

PermutationImportance(cv='prefit',
                      estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              disable_default_eval_metric=1,
                                              gamma=0, learning_rate=0.25,
                                              max_delta_step=0, max_depth=8,
                                              min_child_weight=1, missing=None,
                                              n_estimators=1000, n_jobs=-1,
                                              nthread=None,
                                              objective='binary:logistic',
                                              random_state=0, reg_alpha=0,
                                              reg_lambda=1, scale_pos_weight=1,
           

In [69]:
eli5.show_weights(
    permuter1,
    top=None,
    feature_names=feature_list
)

Weight,Feature
0.2972  ± 0.0628,CWGI_Poli_Stabl_Mean_minus_4
0.2190  ± 0.0427,PWGI_Corrupt_Ctrl_Mean_minus_4
0.1118  ± 0.0323,PWGI_Voice_Acct_Mean_minus_4
0.1076  ± 0.0410,NS_code_from_gini
0.0961  ± 0.0423,CWGI_RoL_Mean_minus_4
0.0305  ± 0.0294,PCPI_std_Mean_minus_4
0.0304  ± 0.0207,PCPI_std_Trend_minus_4
0.0292  ± 0.0406,E_GINI_Mean_minus_4
0.0239  ± 0.0121,PWGI_Corrupt_Ctrl_Trend_minus_4
0.0232  ± 0.0063,CWGI_RoL_Trend_minus_4


## ^^ Feature importances for predicting state failure events next year. ^^