In [None]:
import warnings
from copy import deepcopy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import imblearn.over_sampling

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
# from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import average_precision_score
# from sklearn.metrics import roc_curve
# from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
# from sklearn.metrics import precision_recall_curve

In [5]:
# function cm_val creates an interactive confusion matrix on un-scaled data.
# function cm_val_scaled creates an interactive confusion matrix on scaled data.
from my_functions import cm_val
from my_functions import cm_val_scaled
# function y_pred_inverse extracts the predictive probability from predict_proba.
from my_functions import y_pred_inverse
# function plot_validation_curve_log plots a validation curve on a log scale.
# function plot_validation_curve_reg plots a validation curve on a default scale.
from my_functions import plot_validation_curve_log
from my_functions import plot_validation_curve_reg
# function plot_learning_curve_reg plots a learning curve on a default scale.
from my_functions import plot_learning_curve
# function plot_decision_tree uses graphviz to visualize the splits of a devision tree.
from my_functions import plot_decision_tree
# function train_and_calibrate_cv performs stratified shuffle split on a specified model,
# returning validation scores and roc/auc.
from my_functions import train_and_calibrate_cv

In [6]:
warnings.filterwarnings('ignore')

In [7]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# pd.set_option('display.max_rows', None)

In [47]:
np.random.seed(42)

In [48]:
df = pd.read_pickle('Pickles/df_modeling.pkl')

In [49]:
len(df)

8643

In [50]:
df[df['state_name'] == 'United States of America']['t_num_conflicts'].mean()

0.6932030139800149

In [51]:
df.groupby('conflict_present').mean()

Unnamed: 0_level_0,cow_code,year,num_trade_states,export_dollars,import_dollars,t_num_conflicts,military_expenditure,military_personnel,prim_energy_consumption,total_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_cum_duration,num_wars,ongoing_2010,revision_pct,avg_hostility_level,dominant_majority,num_leadership_changes,leader_tenure,age_govt,num_transitions_ever,mtco2,export_import_ratio,export_dollars_change,export_dollars_change_pct,import_dollars_change,import_dollars_change_pct,military_expenditure_change,military_expenditure_change_pct,military_personnel_change,military_personnel_change_pct,export_import_ratio_change,export_import_ratio_change_pct,prim_energy_consumption_change,prim_energy_consumption_change_pct,total_pop_change,total_pop_change_pct,mtco2_change,mtco2_change_pct,cinc_score_change,Americas,Asia,Europe,Oceania,Indirect election,Nonelective,No legislature exists,Non‐elective legislature,Appointed,Closed,Elected,All parties legally banned,Legally single party state,Multiple parties legally allowed,Multiple parties,No parties,One party,Multiple parties outside regime,No parties outside regime,One party outside regime,Legislature with multiple parties,No legislature or all nonpartisan,Only members from regime party,Rules rewritten unconstitutionally,collective_leadership,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,Civilian dictatorship,Military dictatorship,Mixed (semi‐presidential) democracy,Parliamentary democracy,Presidential democracy,Royal dictatorship,transition_to_democracy,transition_to_dictatorship,war_present,40s,50s,60s,70s,80s,90s
conflict_present,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1
0,421.456,1982.984,154.602,11737234659.996,12125059075.517,0.675,1167097758.26,67361.028,60062262410.143,14348604.007,0.003,13.332,0.009,11.637,11.121,0.175,10.002,11.385,0.0,0.0,0.0,0.0,0.0,1.0,0.978,0.217,7.8,30.566,0.333,51.728,3.578,948079280.29,1.644,997843164.025,3.927,33979424.948,1.444,-498.326,0.969,-0.369,1.151,1603754004.641,2.098,219412.634,1.019,1.179,1.141,-0.0,0.241,0.176,0.23,0.052,0.42,0.244,0.101,0.041,0.032,0.119,0.847,0.095,0.129,0.774,0.753,0.093,0.152,0.719,0.093,0.186,0.644,0.171,0.182,0.037,0.017,0.196,0.082,0.073,0.068,0.015,0.452,0.325,0.213,0.287,0.179,0.089,0.23,0.133,0.082,0.012,0.007,0.0,0.026,0.082,0.129,0.171,0.186,0.199
1,505.496,1981.391,151.663,32146323340.8,31326062890.463,0.729,9143170066.928,364667.723,294455186153.474,64625102.397,0.016,14.886,0.025,12.159,10.641,0.377,11.402,12.77,2.176,0.81,0.154,0.007,0.45,3.232,0.98,0.219,7.242,34.736,0.421,248.871,2.998,2806220610.438,1.288,2731601848.984,1.353,412176603.438,1.214,729.717,1.048,0.474,1.143,7875818687.969,1.299,980637.255,1.02,6.072,1.074,-0.0,0.142,0.438,0.186,0.016,0.379,0.338,0.133,0.06,0.052,0.156,0.786,0.119,0.1,0.777,0.741,0.1,0.156,0.674,0.1,0.223,0.592,0.223,0.18,0.027,0.002,0.284,0.074,0.12,0.101,0.017,0.383,0.284,0.159,0.316,0.226,0.052,0.225,0.107,0.074,0.012,0.008,0.087,0.018,0.116,0.146,0.152,0.182,0.216


In [52]:
# df['military_personnel'] = df['military_personnel']/df['total_pop']
# df['military_expenditure'] = df['military_expenditure']/df['total_pop']
# df['military_expenditure_change'] = df['military_expenditure_change']/df['total_pop']
# df['military_personnel_change'] = df['military_personnel_change']/df['total_pop']
# df['military_personnel_change_pct'] = df['military_personnel_change_pct']/df['total_pop']
# df['military_expenditure_change_pct'] = df['military_expenditure_change_pct']/df['total_pop']
# df['prim_energy_consumption_per_pop'] = df['prim_energy_consumption']/df['total_pop']
# df['prim_energy_consumption_change_per_pop'] = df['prim_energy_consumption_change']/df['total_pop']
# df['mtco2_per_pop'] = df['mtco2']/df['total_pop']
# df['mtco2_change_per_pop'] = df['mtco2_change']/df['total_pop']
# df['import_dollars_per_pop'] = df['import_dollars']/df['total_pop']
# df['export_dollars_per_pop'] = df['export_dollars']/df['total_pop']
# df['import_dollars_change_per_pop'] = df['import_dollars_change']/df['total_pop']
# df['export_dollars_change_per_pop'] = df['export_dollars_change']/df['total_pop']
# df.drop(['prim_energy_consumption', 'prim_energy_consumption_change',
#          'import_dollars_change', 'export_dollars_change', 'mtco2',
#          'import_dollars', 'export_dollars'], axis = 1, inplace = True)

In [53]:
drop_list = ['cow_code', 'year', 'state_name', 'avg_hostility_level', 'num_wars',
             'num_conflicts', 'num_wars', 'num_conflicts', 'war_present',
             'conflict_present', 'revision_pct', 'avg_cum_duration',
             'military_expenditure', 'military_expenditure_change',
             'military_expenditure_change_pct', 'military_personnel',
             'military_personnel_change', 'military_personnel_change_pct',
             'cinc_score', 'cinc_score_change', 'ongoing_2010', 't_num_conflicts']
X = df.drop(drop_list, axis = 1)
y = df['conflict_present']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [54]:
df['conflict_present'].value_counts()

0    5889
1    2754
Name: conflict_present, dtype: int64

In [55]:
continuous_columns = ['t_num_conflicts']

In [56]:
continuous_columns = ['num_trade_states',
                      'num_alliances', 'pre_1816_alliances', 'num_in_effect_1231_2012',
                      'defense_treaties', 'neutrality_treaties', 'nonaggression_treaties',
                      'entente_treaties', 'leader_tenure', 'age_govt', 'num_transitions_ever',
                      'num_leadership_changes', 'export_import_ratio', 'export_import_ratio_change',
                      'export_import_ratio_change_pct', 'export_dollars_change_pct',
                      'import_dollars_change_pct', 'prim_energy_consumption_change_pct',
                      'total_pop_change', 'total_pop_change_pct', 'mtco2_change_pct',
                      'prim_energy_consumption', 'prim_energy_consumption_change',
                      'import_dollars_change', 'export_dollars_change', 'mtco2',
                      'import_dollars', 'export_dollars', 'total_pop', 'mtco2_change']

In [57]:
X_cont = X[continuous_columns]
X_cont = X_cont.reset_index()
X_cont.drop(['index'], axis = 1, inplace = True)

X_dummy = X.drop(continuous_columns, axis = 1)
X_dummy = X_dummy.reset_index()
X_dummy.drop(['index'], axis = 1, inplace = True)

X_train_cont = X_train[continuous_columns]
X_train_cont = X_train_cont.reset_index()
X_train_cont.drop(['index'], axis = 1, inplace = True)

X_train_dummy = X_train.drop(continuous_columns, axis = 1)
X_train_dummy = X_train_dummy.reset_index()
X_train_dummy.drop(['index'], axis = 1, inplace = True)

X_test_cont = X_test[continuous_columns]
X_test_cont = X_test_cont.reset_index()
X_test_cont.drop(['index'], axis = 1, inplace = True)

X_test_dummy = X_test.drop(continuous_columns, axis = 1)
X_test_dummy = X_test_dummy.reset_index()
X_test_dummy.drop(['index'], axis = 1, inplace = True)

In [58]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(np.array(X_train).reshape(-1, 1))
X_test_scaled = scaler.transform(np.array(X_test).reshape(-1, 1))
X_scaled = scaler.transform(np.array(X).reshape(-1, 1))

In [59]:
scaler = MinMaxScaler()

X_train_cont_scaled = pd.DataFrame(scaler.fit_transform(X_train_cont))
X_train_cont_scaled = X_train_cont_scaled.reset_index()
X_train_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_train_cont_scaled.columns = continuous_columns

X_cont_scaled = pd.DataFrame(scaler.transform(X_cont))
X_cont_scaled = X_cont_scaled.reset_index()
X_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_cont_scaled.columns = continuous_columns

X_test_cont_scaled = pd.DataFrame(scaler.transform(X_test_cont))
X_test_cont_scaled = X_test_cont_scaled.reset_index()
X_test_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_test_cont_scaled.columns = continuous_columns

X_scaled = pd.concat([X_cont_scaled, X_dummy], axis = 1)
X_train_scaled = pd.concat([X_train_cont_scaled, X_train_dummy], axis = 1)
X_test_scaled = pd.concat([X_test_cont_scaled, X_test_dummy], axis = 1)

In [60]:
pd.DataFrame(y_train)['conflict_present'].value_counts()

0    4735
1    2179
Name: conflict_present, dtype: int64

In [61]:
smote = imblearn.over_sampling.SMOTE(ratio = {0: 4845, 1: 4845}, random_state = 101)
X_train_scaled, y_train = smote.fit_sample(X_train_scaled, y_train)

X_train_scaled = pd.DataFrame(X_train_scaled).reset_index()
X_train_scaled.drop(['index'], axis = 1, inplace = True)
X_train_scaled.columns = list(continuous_columns) + list(X_train_dummy.columns)

In [62]:
print(X_train_scaled.shape)
print(X_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)

(9690, 78)
(1729, 78)
(9690,)
(1729,)


In [63]:
pd.set_option('display.max_rows', None)
chi2_, pval = chi2(X_train_scaled, y_train)
p_val_df = pd.DataFrame(pval.tolist())
p_val_df.index = list(continuous_columns) + list(X_train_dummy.columns)
p_val_df.rename({0: 'p_val'}, axis = 1, inplace = True)
p_val_df = p_val_df.sort_values(by = 'p_val', ascending = False)
p_val_df

Unnamed: 0,p_val
export_import_ratio_change,0.989
total_pop_change_pct,0.976
export_import_ratio_change_pct,0.947
import_dollars_change,0.944
mtco2_change_pct,0.93
export_dollars_change,0.925
num_leadership_changes,0.832
total_pop_change,0.816
dominant_majority,0.786
import_dollars_change_pct,0.78


In [64]:
pd.reset_option("display.max_rows")

In [65]:
high_p_val_drop = list(p_val_df[p_val_df['p_val'] > 0.05].index)
print(high_p_val_drop)

['export_import_ratio_change', 'total_pop_change_pct', 'export_import_ratio_change_pct', 'import_dollars_change', 'mtco2_change_pct', 'export_dollars_change', 'num_leadership_changes', 'total_pop_change', 'dominant_majority', 'import_dollars_change_pct', 'mtco2_change', 'Multiple parties legally allowed', 'export_import_ratio', 'export_dollars_change_pct', 'prim_energy_consumption_change', 'Only members from regime party', '80s', 'prim_energy_consumption_change_pct', 'num_in_effect_1231_2012', 'Parliamentary democracy', 'defense_treaties', 'No parties', 'No parties outside regime', 'leader_died', 'transition_to_democracy', 'leader_tenure', 'Multiple parties', '90s', 'num_trade_states', 'transition_to_dictatorship']


In [66]:
X_scaled.drop(high_p_val_drop, axis = 1, inplace = True)
X_train_scaled.drop(high_p_val_drop, axis = 1, inplace = True)
X_test_scaled.drop(high_p_val_drop, axis = 1, inplace = True)

In [67]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)
param_grid = {'C': [1, 10, 20, 50, 75, 100],
             'penalty': ['l1', 'l2']}
grid = GridSearchCV(LogisticRegression(random_state = 101), param_grid = param_grid, cv = cv, scoring = 'roc_auc', refit = True)
grid.fit(X_train_scaled, y_train)

train_predictions = grid.predict(X_train_scaled)
test_predictions = grid.predict(X_test_scaled)
print(grid.cv_results_['mean_train_score'].mean())
print(grid.cv_results_['mean_test_score'].mean())
print('')
print(grid.best_params_)

0.7634105332472098
0.7574160162136659

{'C': 75, 'penalty': 'l1'}


In [68]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.701 

              precision    recall  f1-score   support

           0       0.68      0.75      0.71      4845
           1       0.72      0.66      0.69      4845

   micro avg       0.70      0.70      0.70      9690
   macro avg       0.70      0.70      0.70      9690
weighted avg       0.70      0.70      0.70      9690

[[3615 1230]
 [1670 3175]]


0.68 

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      1154
           1       0.53      0.64      0.58       575

   micro avg       0.69      0.69      0.69      1729
   macro avg       0.67      0.68      0.67      1729
weighted avg       0.71      0.69      0.70      1729

[[832 322]
 [207 368]]


In [69]:
logreg = LogisticRegression(penalty = 'l1', C = 75)
logreg.fit(X_train_scaled, y_train)

train_predictions = logreg.predict_proba(X_scaled)
test_predictions = logreg.predict(X_test_scaled)

In [70]:
y_predictions = y_pred_inverse(train_predictions)
prediction_df = pd.concat([pd.DataFrame(y_predictions), df[['state_name', 'year']]], axis = 1, join = 'outer')

In [71]:
aggregations = {
    0: 'mean'
    }

prediction_df_grouped = prediction_df.groupby(['state_name']).agg(aggregations).reset_index()
prediction_df_grouped.rename({0: 'conflict_probability'}, axis = 1, inplace = True)
prediction_df_grouped = prediction_df_grouped.sort_values(by = 'conflict_probability', ascending = False)

In [72]:
prediction_df_grouped

Unnamed: 0,state_name,conflict_probability
181,United States of America,0.990
34,China,0.984
77,India,0.948
140,Russia,0.896
169,Thailand,0.837
86,Japan,0.832
129,Pakistan,0.826
78,Indonesia,0.816
117,Myanmar,0.814
174,Turkey,0.802


In [74]:
logreg = LogisticRegression(penalty = 'l1', C = 75)
logreg.fit(X_train_scaled, y_train)

train_predictions = logreg.predict(X_train_scaled)
test_predictions = logreg.predict(X_test_scaled)

In [75]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.701 

              precision    recall  f1-score   support

           0       0.68      0.75      0.71      4845
           1       0.72      0.66      0.69      4845

   micro avg       0.70      0.70      0.70      9690
   macro avg       0.70      0.70      0.70      9690
weighted avg       0.70      0.70      0.70      9690

[[3615 1230]
 [1670 3175]]


0.68 

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      1154
           1       0.53      0.64      0.58       575

   micro avg       0.69      0.69      0.69      1729
   macro avg       0.67      0.68      0.67      1729
weighted avg       0.71      0.69      0.70      1729

[[832 322]
 [207 368]]


In [76]:
pd.set_option('display.max_rows', None)

lr_coef_df = pd.DataFrame(np.e**pd.DataFrame(logreg.coef_.tolist()[0]))
lr_coef_df.index = list(X_train_scaled.columns)
lr_coef_df.rename({0: 'coef_'}, axis = 1, inplace = True)
lr_coef_df = lr_coef_df.sort_values(by = 'coef_', ascending = False)
lr_coef_df

Unnamed: 0,coef_
prim_energy_consumption,2459.949
export_dollars,893.851
total_pop,68.176
num_transitions_ever,9.947
neutrality_treaties,6.624
entente_treaties,3.54
Asia,3.122
age_govt,2.512
military_leader,2.494
pre_1816_alliances,2.384


In [78]:
pd.reset_option("display.max_rows")

In [79]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)
param_grid = {'C': [1, 5, 10, 15, 20]}
grid = GridSearchCV(LinearSVC(random_state = 101), param_grid = param_grid, cv = cv, scoring = 'roc_auc', refit = True)
grid.fit(X_train_scaled, y_train)

train_predictions = grid.predict(X_train_scaled)
test_predictions = grid.predict(X_test_scaled)
print(grid.cv_results_['mean_train_score'].mean())
print(grid.cv_results_['mean_test_score'].mean())
print('')
print(grid.best_params_)

0.7617212882111184
0.7555696562477037

{'C': 10}


In [80]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.696 

              precision    recall  f1-score   support

           0       0.68      0.75      0.71      4845
           1       0.72      0.65      0.68      4845

   micro avg       0.70      0.70      0.70      9690
   macro avg       0.70      0.70      0.70      9690
weighted avg       0.70      0.70      0.70      9690

[[3617 1228]
 [1714 3131]]


0.677 

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      1154
           1       0.53      0.63      0.58       575

   micro avg       0.69      0.69      0.69      1729
   macro avg       0.66      0.68      0.67      1729
weighted avg       0.71      0.69      0.70      1729

[[835 319]
 [212 363]]


In [None]:
# cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)
# param_grid = {'n_estimators': [400, 500, 600, 700, 800, 900], 'max_depth': np.arange(3, 12), 'max_features': np.arange(3, 9), 'min_samples_split': np.arange(3, 7), 'min_samples_leaf': np.arange(2, 7)}
# rand = RandomizedSearchCV(RandomForestClassifier(random_state = 101), param_distributions = param_grid, cv = cv, scoring = 'roc_auc', refit = True, random_state = 101)
# rand.fit(X_train_scaled, y_train)

# train_predictions = rand.predict(X_train_scaled)
# test_predictions = rand.predict(X_test_scaled)
# print(rand.cv_results_['mean_train_score'].mean())
# print(rand.cv_results_['mean_test_score'].mean())
# print('')
# print(rand.best_params_)

In [None]:
# print(roc_auc_score(y_train, train_predictions).round(3), '\n')
# print(metrics.classification_report(y_train, train_predictions))
# print(metrics.confusion_matrix(y_train, train_predictions))
# print('\n')
# print(roc_auc_score(y_test, test_predictions).round(3), '\n')
# print(metrics.classification_report(y_test, test_predictions))
# print(metrics.confusion_matrix(y_test, test_predictions))

In [None]:
# rfc = RandomForestClassifier(n_estimators = 700, min_samples_split = 6, min_samples_leaf = 2, max_features = 6, max_depth = 11)
# rfc.fit(X_train_scaled, y_train)

# train_predictions = rfc.predict(X_train_scaled)
# test_predictions = rfc.predict(X_test_scaled)

In [None]:
# print(roc_auc_score(y_train, train_predictions).round(3), '\n')
# print(metrics.classification_report(y_train, train_predictions))
# print(metrics.confusion_matrix(y_train, train_predictions))
# print('\n')
# print(roc_auc_score(y_test, test_predictions).round(3), '\n')
# print(metrics.classification_report(y_test, test_predictions))
# print(metrics.confusion_matrix(y_test, test_predictions))

In [None]:
# importances = rfc.feature_importances_
# std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
#              axis=0)
# indices = np.argsort(importances)[::-1]

# # Print the feature ranking
# print("Feature ranking:")

# # for f in range(X_train_scaled.shape[1]):
# for i, feature in enumerate(X_train_scaled.columns):    
#     print(str(i + 1) + '. ' + feature + ' ' + str(importances[indices[i]]))

In [None]:
# cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)

# params_grid = {
#     'n_estimators': [700],
#     'learning_rate': [0.5],
#     'max_depth': [3],
#     'max_features': np.arange(3, 7),
#     'min_samples_split': np.arange(3, 4),
#     'min_samples_leaf': np.arange(2, 4)
# }

# params_fixed = {
#     'objective':'binary:logistic',
#     'silent': 1
# }

# grid = GridSearchCV(
#     estimator = XGBClassifier(**params_fixed, seed = 42),
#     param_grid = params_grid,
#     cv = cv,
#     scoring = 'roc_auc'
# )

In [None]:
# grid.fit(X_train_scaled, y_train)

In [None]:
# print("Best accuracy obtained {0}".format(grid.best_score_))
# print("Parameters:")
# for key, value in grid.best_params_.items():
#     print("\t{}: {}".format(key, value))

In [None]:
# train_predictions = grid.predict(X_train_scaled)
# test_predictions = grid.predict(X_test_scaled)
# print(grid.cv_results_['mean_train_score'].mean())
# print(grid.cv_results_['mean_test_score'].mean())

In [None]:
# print(roc_auc_score(y_train, train_predictions).round(3), '\n')
# print(metrics.classification_report(y_train, train_predictions))
# print(metrics.confusion_matrix(y_train, train_predictions))
# print('\n')
# print(roc_auc_score(y_test, test_predictions).round(3), '\n')
# print(metrics.classification_report(y_test, test_predictions))
# print(metrics.confusion_matrix(y_test, test_predictions))

In [None]:
# xgb = XGBClassifier(learning_rate = 0.5, max_depth = 3, max_features = 3, min_samples_leaf = 2, min_samples_split = 3, n_estimators = 700)
# xgb.fit(X_train_scaled, y_train)

In [None]:
# df[df['conflict_present'] == 1]['t_num_conflicts'].value_counts()

In [None]:
# y_pred_proba = pd.DataFrame(y_pred_inverse(xgb.predict_proba(X_scaled)))
# y_pred_proba.rename({0: 'prob'}, axis = 1, inplace = True)

In [None]:
# train_predictions = xgb.predict(X_train_scaled)
# test_predictions = xgb.predict(X_test_scaled)
# all_predictions = xgb.predict_proba(X_scaled)

In [None]:
# print(roc_auc_score(y_train, train_predictions).round(3), '\n')
# print(metrics.classification_report(y_train, train_predictions))
# print(metrics.confusion_matrix(y_train, train_predictions))
# print('\n')
# print(roc_auc_score(y_test, test_predictions).round(3), '\n')
# print(metrics.classification_report(y_test, test_predictions))
# print(metrics.confusion_matrix(y_test, test_predictions))
# print('\n')
# print(roc_auc_score(y, all_predictions).round(3), '\n')
# print(metrics.classification_report(y, all_predictions))
# print(metrics.confusion_matrix(y, all_predictions))

In [None]:
# df.columns

In [None]:
# plt.scatter((df['import_dollars'] + df['export_dollars'])/2, df['num_conflicts'])

In [None]:
# plt.scatter((df['import_dollars_per_pop'] + df['export_dollars_per_pop'])/2, df['num_conflicts'])

In [None]:
# plt.figure(figsize = (12, 6))

# ax = sns.barplot(x = X_train_scaled.columns,  y = xgb.feature_importances_)
# #                  data = _______, palette = 'Set2')
# ax.set_title('_______')
# ax.set_xlabel('Features')
# ax.set_ylabel('Feature Importance')
# ax.set_xticklabels(X_train_scaled.columns, 
#                    rotation = 45, ha = "right")

# plt.tight_layout()

In [None]:
# plt.figure(figsize = (12, 6))

# ax = sns.barplot(x = X_train_scaled.columns,  y = gbc.feature_importances_)
# #                  data = _______, palette = 'Set2')
# ax.set_title('_______')
# ax.set_xlabel('Features')
# ax.set_ylabel('Feature Importance')
# ax.set_xticklabels(X_train_scaled.columns, 
#                    rotation = 45, ha = "right")

# plt.tight_layout()

In [None]:
# print(xgb.feature_importances_)
# # plot
# plt.bar(X_train_scaled.columns, )
# # pyplot.show()

In [None]:
# comparing roc/auc from before and after hyperparameter tuning.
# fpr1, tpr1, _ = metrics.roc_curve(y_validation_2, y_pred_proba_1)
# roc1_df = pd.DataFrame(fpr1, tpr1).reset_index()
# roc1_df.rename({'index': 'fpr1', 0: 'tpr1'}, axis = 1, inplace = True)

# auc1 = metrics.roc_auc_score(y_validation_2, y_pred_proba_1)

# fpr2, tpr2, _ = metrics.roc_curve(y_validation_2, y_pred_proba_2)
# roc2_df = pd.DataFrame(fpr2, tpr2).reset_index()
# roc2_df.rename({'index': 'fpr2', 0: 'tpr2'}, axis = 1, inplace = True)

# auc2 = metrics.roc_auc_score(y_validation_2, y_pred_proba_2)

# sns.set_context(font_scale = 1.2)
# plt.plot('tpr1', 'fpr1', data = roc1_df, color = 'skyblue', linewidth = 10)
# plt.plot('tpr2', 'fpr2', data = roc2_df, color = 'olive', linewidth = 2, linestyle = 'dashed')
# plt.ylim(0, )
# plt.title("Found Guilty, New AUC = " + str(auc2.round(4)));