In [1]:
import warnings
from copy import deepcopy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import imblearn.over_sampling

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
# from sklearn.metrics import roc_curve
# from sklearn.metrics import auc
# from sklearn.metrics import precision_recall_curve

In [4]:
from xgboost import XGBClassifier

In [5]:
# from notebook_pbar import * # import my notebook_pbar.py file
# timelist = timelist # import the default variables timelist and then_time
# then_time = then_time

In [6]:
# function cm_val creates an interactive confusion matrix on un-scaled data.
# function cm_val_scaled creates an interactive confusion matrix on scaled data.
from my_functions import cm_val
from my_functions import cm_val_scaled
# function y_pred_inverse extracts the predictive probability from predict_proba.
from my_functions import y_pred_inverse
# function plot_validation_curve_log plots a validation curve on a log scale.
# function plot_validation_curve_reg plots a validation curve on a default scale.
from my_functions import plot_validation_curve_log
from my_functions import plot_validation_curve_reg
# function plot_learning_curve_reg plots a learning curve on a default scale.
from my_functions import plot_learning_curve
# function plot_decision_tree uses graphviz to visualize the splits of a devision tree.
from my_functions import plot_decision_tree
# function train_and_calibrate_cv performs stratified shuffle split on a specified model,
# returning validation scores and roc/auc.
from my_functions import train_and_calibrate_cv

In [7]:
warnings.filterwarnings('ignore')

In [8]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [9]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [10]:
np.random.seed(42)

In [11]:
df = pd.read_pickle('pickle/df_modeling.pkl')

In [12]:
len(df)

9115

In [13]:
df.drop(['cow_code', 'year', 'state_name', 'export_import_ratio', 'avg_hostility_level', 'military_expenditure', 'military_personnel', 'num_wars', 'num_conflicts'], axis = 1).columns

Index(['num_trade_states', 'export_dollars', 'import_dollars',
       'prim_energy_consumption', 'total_pop', 'cinc_score', 'num_alliances',
       'pre_1816_alliances', 'num_in_effect_1231_2012', 'defense_treaties',
       'neutrality_treaties', 'nonaggression_treaties', 'entente_treaties',
       'avg_cum_duration', 'ongoing_2010', 'revision_pct',
       'num_leadership_changes', 'leader_tenure', 'age_govt',
       'num_transitions_ever', 'Americas', 'Asia', 'Europe', 'Oceania',
       'Indirect election', 'Nonelective', 'No legislature exists',
       'Non‐elective legislature', 'Appointed', 'Closed', 'Elected',
       'All parties legally banned', 'Legally single party state',
       'Multiple parties legally allowed', 'Multiple parties', 'No parties',
       'One party', 'Multiple parties outside regime',
       'No parties outside regime', 'One party outside regime',
       'Legislature with multiple parties',
       'No legislature or all nonpartisan', 'Only members from regime 

In [61]:
df.head()

Unnamed: 0,cow_code,state_name,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,prim_energy_consumption,total_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_cum_duration,num_wars,ongoing_2010,revision_pct,avg_hostility_level,num_leadership_changes,leader_tenure,age_govt,num_transitions_ever,export_import_ratio,Americas,Asia,Europe,Oceania,Indirect election,Nonelective,No legislature exists,Non‐elective legislature,Appointed,Closed,Elected,All parties legally banned,Legally single party state,Multiple parties legally allowed,Multiple parties,No parties,One party,Multiple parties outside regime,No parties outside regime,One party outside regime,Legislature with multiple parties,No legislature or all nonpartisan,Only members from regime party,Rules rewritten unconstitutionally,collective_leadership,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,Civilian dictatorship,Military dictatorship,Mixed (semi‐presidential) democracy,Parliamentary democracy,Presidential democracy,Royal dictatorship,transition_to_democracy,transition_to_dictatorship,war_present,conflict_present,40s,50s,60s,70s,80s,90s
0,2,United States of America,1946,65.0,160000000.0,11343750000.0,45133984000.0,3030000.0,2376288000000.0,141389000.0,0.364,19.0,0.0,0.0,19.0,0.0,0.0,19.0,2.0,0.268,0.0,0.0,0.0,3.5,0,2,77.0,0,0.014,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0
1,651,Egypt,1946,65.0,603500000.0,580390000.0,31998000.0,50000.0,4046000000.0,18792000.0,0.006,7.0,0.0,0.0,0.0,0.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0,11,25.0,0,1.04,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,135,Peru,1946,65.0,163700000.0,175990000.0,26446000.0,38000.0,2166000000.0,7420000.0,0.002,19.0,0.0,0.0,19.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2,8.0,0,0.93,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,652,Syria,1946,65.0,150000000.0,43680000.0,14540000.0,10000.0,10000000.0,2978000.0,0.001,6.0,0.0,4.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0,4,1.0,0,3.434,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,660,Lebanon,1946,65.0,147799999.9,30600000.0,5111000.0,4000.0,16000000.0,1156000.0,0.0,6.0,0.0,4.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1,1.0,0,4.83,0,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0


In [14]:
df.drop(['cow_code', 'year', 'state_name', 'export_import_ratio', 'avg_hostility_level',
             'military_expenditure', 'military_personnel', 'num_wars', 'num_conflicts',
             'war_present', 'revision_pct', 'avg_cum_duration'], axis = 1).sum()

num_trade_states                               1395140.000
export_dollars                         158215503813141.281
import_dollars                         158243361464238.688
prim_energy_consumption               1176329088000000.000
total_pop                                 266477614000.000
cinc_score                                          63.399
num_alliances                                   120407.000
pre_1816_alliances                                 126.000
num_in_effect_1231_2012                         102323.000
defense_treaties                                 95462.000
neutrality_treaties                               2086.000
nonaggression_treaties                           90735.000
entente_treaties                                103066.000
ongoing_2010                                        19.850
num_leadership_changes                            2040.000
leader_tenure                                    68636.000
age_govt                                        285851.0

In [15]:
X = df.drop(['cow_code', 'year', 'state_name', 'export_import_ratio', 'avg_hostility_level',
             'military_expenditure', 'military_personnel', 'num_wars', 'num_conflicts',
             'war_present', 'conflict_present', 'revision_pct', 'avg_cum_duration', 'ongoing_2010'], axis = 1)

y = df['conflict_present']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [16]:
df['conflict_present'].value_counts()

0    6331
1    2784
Name: conflict_present, dtype: int64

In [17]:
continuous_columns = ['num_trade_states', 'export_dollars', 'import_dollars',
                      'prim_energy_consumption', 'total_pop', 'cinc_score',
                      'num_alliances', 'pre_1816_alliances', 'num_in_effect_1231_2012',
                      'defense_treaties', 'neutrality_treaties', 'nonaggression_treaties',
                      'entente_treaties', 'leader_tenure', 'age_govt',
                      'num_transitions_ever', 'num_leadership_changes']

X_train_cont = X_train[continuous_columns]
X_train_cont = X_train_cont.reset_index()
X_train_cont.drop(['index'], axis = 1, inplace = True)

X_train_dummy = X_train.drop(continuous_columns, axis = 1)
X_train_dummy = X_train_dummy.reset_index()
X_train_dummy.drop(['index'], axis = 1, inplace = True)

X_test_cont = X_test[continuous_columns]
X_test_cont = X_test_cont.reset_index()
X_test_cont.drop(['index'], axis = 1, inplace = True)

X_test_dummy = X_test.drop(continuous_columns, axis = 1)
X_test_dummy = X_test_dummy.reset_index()
X_test_dummy.drop(['index'], axis = 1, inplace = True)

In [18]:
scaler = MinMaxScaler()

X_train_cont_scaled = pd.DataFrame(scaler.fit_transform(X_train_cont))
X_train_cont_scaled = X_train_cont_scaled.reset_index()
X_train_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_train_cont_scaled.columns = continuous_columns

X_test_cont_scaled = pd.DataFrame(scaler.transform(X_test_cont))
X_test_cont_scaled = X_test_cont_scaled.reset_index()
X_test_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_test_cont_scaled.columns = continuous_columns

X_train_scaled = pd.concat([X_train_cont_scaled, X_train_dummy], axis = 1)
X_test_scaled = pd.concat([X_test_cont_scaled, X_test_dummy], axis = 1)

In [19]:
pd.DataFrame(y_train)['conflict_present'].value_counts()

0    5061
1    2231
Name: conflict_present, dtype: int64

In [21]:
smote = imblearn.over_sampling.SMOTE(ratio = {0: 5061, 1: (5061)}, random_state = 101)
X_train_scaled, y_train = smote.fit_sample(X_train_scaled, y_train)

X_train_scaled = pd.DataFrame(X_train_scaled)
X_train_scaled.columns = X.columns
y_train = pd.DataFrame(y_train)
y_train.columns = ['war_present']

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
print(X_train_scaled.shape)
# print(X_val_scaled.shape)
print(X_test_scaled.shape)
print(y_train.shape)
# print(y_val.shape)
print(y_test.shape)

(10122, 64)
(1823, 64)
(10122, 1)
(1823,)


In [24]:
lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train_scaled, y_train)
predictions = lr.predict(X_test_scaled)
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77      1270
           1       0.51      0.63      0.56       553

   micro avg       0.70      0.70      0.70      1823
   macro avg       0.66      0.68      0.67      1823
weighted avg       0.72      0.70      0.71      1823

[[929 341]
 [204 349]]


In [25]:
lr = LogisticRegression(penalty = 'l2')
lr.fit(X_train_scaled, y_train)
predictions = lr.predict(X_test_scaled)
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77      1270
           1       0.50      0.63      0.56       553

   micro avg       0.70      0.70      0.70      1823
   macro avg       0.66      0.68      0.67      1823
weighted avg       0.72      0.70      0.71      1823

[[921 349]
 [202 351]]


In [26]:
from sklearn.metrics import fbeta_score, make_scorer

In [27]:
fbeta = make_scorer(fbeta_score, pos_label = 1, average = 'binary', beta = 0.5)

In [28]:
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 101)
param_grid = {'n_estimators': [300], 'max_depth': np.arange(3, 10), 'max_features': np.arange(3, 7), 'min_samples_split': np.arange(6, 7), 'min_samples_leaf': np.arange(2, 7), 'bootstrap': [True, False]}
rand = RandomizedSearchCV(RandomForestClassifier(random_state = 101), param_distributions = param_grid, cv = cv, scoring = fbeta, refit = True, random_state = 101)
rand.fit(X_train_scaled, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=101, shuffle=True),
          error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=101, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': [300], 'max_depth': array([3, 4, 5, 6, 7, 8, 9]), 'max_features': array([3, 4, 5, 6]), 'min_samples_split': array([6]), 'min_samples_leaf': array([2, 3, 4, 5, 6]), 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=101, refit=True,
          return_train_score='warn',
          scoring=make_scorer(fbe

In [31]:
predictions = rand.predict(X_test_scaled)
print(rand.cv_results_['mean_train_score'].mean())
print(rand.cv_results_['mean_test_score'].mean())
print('')
print(rand.best_params_)
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))

0.7792948082107489
0.7546198117287491

{'n_estimators': 300, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 3, 'max_depth': 9, 'bootstrap': False}
              precision    recall  f1-score   support

           0       0.86      0.80      0.83      1270
           1       0.60      0.70      0.65       553

   micro avg       0.77      0.77      0.77      1823
   macro avg       0.73      0.75      0.74      1823
weighted avg       0.78      0.77      0.77      1823

[[1016  254]
 [ 167  386]]


In [62]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)
fbeta = make_scorer(fbeta_score, average = 'weighted', beta = 0.5)

params_grid = {
    'n_estimators': [400],
    'learning_rate': [0.5],
    'max_depth': [6, 7, 8],
    'max_features': np.arange(3, 7),
    'min_samples_split': np.arange(6, 7),
    'min_samples_leaf': np.arange(2, 7)
}

params_fixed = {
    'objective':'binary:logistic',
    'silent': 1
}

best_grid = GridSearchCV(
    estimator = XGBClassifier(**params_fixed, seed = 42),
    param_grid = params_grid,
    cv = cv,
    scoring = fbeta

)

In [None]:
best_grid.fit(X_train_scaled, y_train)

In [None]:
print("Best accuracy obtained {0}".format(best_grid.best_score_))
print("Parameters:")
for key, value in best_grid.best_params_.items():
    print("\t{}: {}".format(key, value))

In [None]:
predictions = best_grid.predict(X_test)

In [None]:
# created new confusion matrix for tuned model.
print('\n', metrics.classification_report(y_test, predictions))

df_cm = pd.DataFrame(metrics.confusion_matrix(y_test, predictions))
df_cm.rename({0: 'Peace', 1: 'War'}, axis = 1, inplace = True)
df_cm.rename(index = {0: 'Peace', 1: 'War'}, inplace = True)

fig, ax = plt.subplots(1, 1, figsize = (5, 5))
sns.set_context(font_scale = 1.2)
sns.heatmap(df_cm, annot = True, fmt = 'g', cbar = False, cmap = 'cividis')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label');
ax.xaxis.set_ticks_position('top') 
plt.tight_layout()