In [1]:
# import required libraries

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show up to 80 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 80)
# Use seaborn's "whitegrid" style for the figures
sns.set_style("whitegrid")
%matplotlib inline

# Seed NumPy's global random number generator
np.random.seed(0)
# Separate seed, passed as `random_state` to the train/test split and to the estimators
seed=42

In [2]:
# Load the sanctions dataset: the first CSV column becomes the index, "n.a." cells
# are read as missing values and "," is treated as the decimal separator.
base_df = pd.read_csv(
    "input/sanctions.csv.gz",
    index_col=0,
    na_values="n.a.",
    decimal=",",
)

## Renaming columns

In [3]:
# base_df.columns

In [4]:
# Readable column names, listed in the same order as the columns of the loaded file.
new_names = [
    'Sender1', 'Sender2', 'Sender3', 'Target', 'Goal', 'US case',
    'US unilateral case', 'Foreign policy goal category', 'First year',
    'Last year','Policy result', 'Sanctions contribution',
    'Success', 'Companion policies', 'International cooperation',
    'International assistance', 'Cooperating international organization',
    'International organization sender',
    'International organization sender & target members',
    'Length', 'Prior relations',
    'Regime Type1', 'Regime Type2',
    'Regime Type3', 'Political stability prior',
    "Political stability during", 'Cost to target',
    'Cost to target GNP pct', 'Cost to target per capita',
    'Trade linkage', 'GNP ratio','Health and stability',
    'Sanction type', 'Cost to sender',
    'GDP growth',
    'Inflation',
    'Target IMF code',
    'Country group']

# zip() silently stops at the shorter input, which would leave some columns with
# their old names (or shift every name by one) without any error. Fail loudly instead.
assert len(new_names) == len(base_df.columns), (
    f"expected {len(base_df.columns)} column names, got {len(new_names)}"
)

# Positional mapping: i-th existing column -> i-th readable name.
name_dict = dict(zip(base_df.columns, new_names))
base_df = base_df.rename(columns=name_dict)

## Dropping columns

In [5]:
# Columns removed from the modelling frame.
# NOTE(review): the name suggests these are values not known when a sanction
# episode starts, plus the two components of the outcome -- confirm with the author.
unknowns = [
    "Last year", "Length",
    "Political stability during",
    "Cost to target", "Cost to target GNP pct", "Cost to target per capita",
    "Cost to sender",
    "Target IMF code",
    "Policy result", "Sanctions contribution",
]

# base_df itself is left untouched; df is a new frame without those columns.
df = base_df.drop(columns=unknowns)

print(df.shape)
df.head(3)

(204, 28)


Unnamed: 0_level_0,Sender1,Sender2,Sender3,Target,Goal,US case,US unilateral case,Foreign policy goal category,First year,Success,Companion policies,International cooperation,International assistance,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,Country group
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
14-1,UNITED KINGDOM,,,GERMANY,Military victory,0,0,4,1914,12,R,4,A,--,0,0,1,5.0,2.0,2.0,0.0,9.0,1.0,3,"F,X,M",3.6,2.9,1
17-1,UNITED STATES,,,JAPAN,Shipping for Allies,1,1,5,1917,4,--,1,--,--,0,0,2,5.0,1.0,2.0,0.0,20.5,13.0,3,X,5.4,25.6,1
18-1,UNITED KINGDOM,,,RUSSIA,Destabilize Bolsheviks,0,0,2,1918,2,"R,Q",4,--,--,0,0,1,1.0,-1.0,2.0,0.1,18.5,1.0,1,"F,X,M",,,2


## Préparation Guillaume

In [6]:
# drop all not immediately useful columns

useless = ["Sender1", "Sender2", "Sender3", "Target", "Goal", "Country group"]

# Rebuild df from base_df instead of dropping from df itself: `df = df.drop(useless)`
# raises KeyError the second time this cell is run, because the columns are already
# gone. `unknowns` is the list of columns dropped earlier in the notebook, so the
# resulting frame is the same as before.
df = base_df.drop(columns=unknowns + useless)

In [7]:
# Create helper to get dummy columns for a given dimension

def replace_w_dummies(dataframe, col_name, prefx):
    """Return a copy of `dataframe` where `col_name` is replaced by dummy columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the categorical column. It is not modified.
    col_name : str
        Name of the column to one-hot encode.
    prefx : str
        Prefix given to the generated dummy columns (`<prefx>_<value>`).

    Returns
    -------
    pandas.DataFrame
        The remaining columns of `dataframe` in their original order, followed by
        one dummy column per distinct value of `col_name`.
    """
    # Encode the column of the frame that was passed in. The previous version read
    # the notebook-level `df` here, so the result depended on a global variable and
    # was wrong (or failed) whenever `dataframe` was not that exact frame.
    dummies = pd.get_dummies(dataframe[col_name], prefix=prefx)
    return pd.concat([dataframe.drop(columns=[col_name]), dummies], axis=1)

In [8]:
# One-hot encode the two single-valued categorical columns, one after the other.
df2 = (
    df
    .pipe(replace_w_dummies, 'Foreign policy goal category', 'frgn_pol_4')
    .pipe(replace_w_dummies, 'International assistance', 'international_assistance')
)
df2.head(3)

Unnamed: 0_level_0,US case,US unilateral case,First year,Success,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,1914,12,R,4,--,0,0,1,5.0,2.0,2.0,0.0,9.0,1.0,3,"F,X,M",3.6,2.9,0,0,0,1,0,0,1
17-1,1,1,1917,4,--,1,--,0,0,2,5.0,1.0,2.0,0.0,20.5,13.0,3,X,5.4,25.6,0,0,0,0,1,1,0
18-1,0,0,1918,2,"R,Q",4,--,0,0,1,1.0,-1.0,2.0,0.1,18.5,1.0,1,"F,X,M",,,0,1,0,0,0,1,0


In [9]:
# Transform years => era

def year_to_era(yr):
    """Map a year to an era code.

    The value is converted with `int()` first, then bucketed:
    1 for years before 1945, 2 for 1945 to 1990, 3 for 1991 onwards.
    """
    year = int(yr)
    if year >= 1991:
        return 3
    if year >= 1945:
        return 2
    return 1
    
# Build df3 without touching df2: the previous version added the era column to df2
# in place, silently changing a frame that an earlier cell had already displayed.
df3 = df2.drop(columns=["First year"])
# The era column is appended last, so df3 keeps the same column order as before.
df3["first-year_era"] = df2["First year"].apply(year_to_era)
df3.head(2)

Unnamed: 0_level_0,US case,US unilateral case,Success,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,12,R,4,--,0,0,1,5.0,2.0,2.0,0.0,9.0,1.0,3,"F,X,M",3.6,2.9,0,0,0,1,0,0,1,1
17-1,1,1,4,--,1,--,0,0,2,5.0,1.0,2.0,0.0,20.5,13.0,3,X,5.4,25.6,0,0,0,0,1,1,0,1


In [10]:
# Make "Success" a boolean figure

# Build df5 as a new frame. `df5 = df3` only created a second name for the same
# object, so the in-place drop of "Success" also removed it from df3 and this cell
# raised KeyError when run a second time.
df5 = df3.drop(columns=["Success"])
# A score of 9 or more is encoded as 1, anything lower as 0.
df5["bool_success"] = 1 * (df3["Success"] >= 9)
df5.head(2)

Unnamed: 0_level_0,US case,US unilateral case,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era,bool_success
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,R,4,--,0,0,1,5.0,2.0,2.0,0.0,9.0,1.0,3,"F,X,M",3.6,2.9,0,0,0,1,0,0,1,1,1
17-1,1,1,--,1,--,0,0,2,5.0,1.0,2.0,0.0,20.5,13.0,3,X,5.4,25.6,0,0,0,0,1,1,0,1,0


In [11]:
# Get dummies for the "Companion policies" and "Sanction type"
# Work on a copy. `df6 = df5` only created a second name for the same object, so the
# in-place edits below also altered df5 and this cell raised KeyError when re-run.
df6 = df5.copy()

# Both source columns hold comma-separated letter codes (e.g. "F,X,M"). Create one
# 0/1 indicator column per code: 1 when the letter appears in the cell, else 0.
multi_code_columns = [
    ("Companion policies", "comp_policies", ["J", "Q", "R"]),
    ("Sanction type", "Sanction_type", ["F", "X", "M"]),
]
for source_col, prefix, codes in multi_code_columns:
    for code in codes:
        # regex=False: plain substring test, same as the former `x.find(code) >= 0`
        df6[f"{prefix}_{code}"] = df6[source_col].str.contains(code, regex=False).astype(int)


# Make "Cooperating international organization" 1 or 0
# ("--" is encoded as 0, any other value as 1)
df6["bool_Cooperating international organization"] = (
    df6["Cooperating international organization"] != "--"
).astype(int)

# The raw text columns are no longer needed once their indicators exist.
df6 = df6.drop(
    columns=["Companion policies", "Sanction type", "Cooperating international organization"]
)
df6.head(20)

Unnamed: 0_level_0,US case,US unilateral case,International cooperation,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era,bool_success,comp_policies_J,comp_policies_Q,comp_policies_R,Sanction_type_F,Sanction_type_X,Sanction_type_M,bool_Cooperating international organization
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
14-1,0,0,4,0,0,1,5.0,2.0,2.0,0.0,9.0,1.0,3,3.6,2.9,0,0,0,1,0,0,1,1,1,0,0,1,1,1,1,0
17-1,1,1,1,0,0,2,5.0,1.0,2.0,0.0,20.5,13.0,3,5.4,25.6,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0
18-1,0,0,4,0,0,1,1.0,-1.0,2.0,0.1,18.5,1.0,1,,,0,1,0,0,0,1,0,1,0,0,1,1,1,1,1,0
21-1,0,0,4,1,1,2,3.0,0.0,2.0,0.4,26.5,37.0,2,-1.5,,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0
25-1,0,0,4,1,1,2,0.0,-6.0,1.0,0.9,36.0,56.0,2,3.3,48.8,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0
32-1,0,0,3,1,1,2,4.0,2.0,2.0,0.0,63.0,322.0,2,,,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
32-1,0,0,3,1,1,2,1.0,-3.0,2.0,0.0,59.5,735.0,2,,,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
33-1,0,0,1,0,0,1,0.0,-9.0,1.0,0.0,13.0,1.0,2,2.4,,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0
35-1,0,0,4,1,1,2,0.0,-9.0,1.0,0.3,16.0,6.0,3,-0.5,-4.4,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1,0
38-1,1,0,2,0,0,2,0.0,-6.0,1.0,0.1,69.5,75.0,3,7.4,2.1,1,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0


Question : que fait-on quand on a des "n.a." ? Je pense qu'il faut essayer de conserver le peu de données qu'on a :D

### Calculer le nombre de NAs

In [12]:
# Number of missing values in each column of df6
df6.isna().sum()

US case                                                0
US unilateral case                                     0
International cooperation                              0
International organization sender                      0
International organization sender & target members     0
Prior relations                                        0
Regime Type1                                           5
Regime Type2                                           5
Regime Type3                                           5
Political stability prior                              5
Trade linkage                                          1
GNP ratio                                              0
Health and stability                                   0
GDP growth                                            18
Inflation                                             40
frgn_pol_4_1                                           0
frgn_pol_4_2                                           0
frgn_pol_4_3                   

#### Regime Type1 is the value of the Polity IV variable called DEMOC, which measures the degree of democracy in the target in the 1st year of sanctions, ranging from 0 (no democracy) to 10 (strong democracy)

#### Regime Type2 is the value of the Polity2 variable from the Polity IV database that allows for a country to have varying degrees of both democracy and autocracy in the target, ranging from -10 (strongly autocratic) to 10 (strongly democratic)

#### Regime Type3 collapses the Polity2 data to a 3-point scale where 1 = autocracy, 2 = anocracy, and 3 = democracy.

In [13]:
# Replace every NaN by the mean of its column, rounded to one decimal.
# !!! TO BE DISCUSSED !!!
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split done further down, so test rows influence the imputed values.
column_means = df6.mean().round(1)
df7 = df6.fillna(column_means)

In [14]:
# Cast every column to float64 so the whole frame has a single numeric dtype.

df7 = df7.astype("float64")

### Split on train / test

In [15]:
# Target: the binary success flag. Features: every other column of df7.
target_col = 'bool_success'
X = df7.drop(columns=[target_col])
Y = df7[target_col]

# Hold out 33% of the rows as a test set; the split is stratified on the target
# and made reproducible through `seed`.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=seed,
    stratify=Y,
)

In [16]:
# Row counts of both subsets and the number of feature columns.
n_train, n_features = x_train.shape
n_test = x_test.shape[0]

print("Train size", n_train)
print("Test size", n_test)
print("Number of parameters", n_features)

Train size 136
Test size 68
Number of parameters 30


## Entraînement

In [17]:
# Candidate models, keyed by a short name. Every pipeline standardises the features
# before fitting the classifier, and every classifier uses the shared `seed`.
pipelines = {
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state = seed)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = seed)),
    'l1' : make_pipeline(StandardScaler(), LogisticRegression(penalty = 'l1', random_state=seed, solver='saga')),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression(penalty = 'l2', random_state=seed, solver='saga')),
}


# Grid-search spaces. Keys follow make_pipeline's `<lowercased class name>__<param>`
# naming so that GridSearchCV can route each value to the right pipeline step.
rf__hyperparameters = {
    'randomforestclassifier__n_estimators' : [100, 200],
    # 'auto' was removed from this grid: for a classifier it was an alias of 'sqrt'
    # (so it only duplicated that candidate) and recent scikit-learn releases
    # reject it with an error.
    'randomforestclassifier__max_features' : ['sqrt', 0.33],
    'randomforestclassifier__min_samples_leaf' : [1, 3, 5, 10]
}

gb__hyperparameters = {
    'gradientboostingclassifier__n_estimators' : [100, 200],
    'gradientboostingclassifier__learning_rate' : [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth' : [1, 3, 5, 10]
}


# Same grid of inverse regularisation strengths for both penalties.
l1__hyperparameters = {
    'logisticregression__C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
}

l2__hyperparameters = {
    'logisticregression__C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
}


# Search space of each pipeline, under the same keys as `pipelines`.
hyperparameters = {
    'rf' : rf__hyperparameters,
    'gb' : gb__hyperparameters,
    'l1' : l1__hyperparameters,
    'l2' : l2__hyperparameters,
}

In [20]:
# Fitted grid searches, keyed by the same short names as `pipelines`.
models = {}

for name, pipeline in pipelines.items() :

    # 5-fold cross-validated grid search using all available cores.
    # The former `iid=False` argument is no longer passed: the parameter was removed
    # from GridSearchCV, so passing it raises TypeError on current scikit-learn.
    # Averaging scores across folds without weighting (what iid=False asked for)
    # is the only behaviour left.
    model = GridSearchCV(pipeline, hyperparameters[name], cv=5, n_jobs=-1)
    model.fit(x_train, y_train)

    models[name] = model
    print(f'{name} has been fitted')

rf has been fitted
gb has been fitted
l1 has been fitted
l2 has been fitted


In [21]:
# Accuracy of every fitted search on the training set and on the held-out test set,
# rounded to two decimals.
for key, fitted_search in models.items():
    train_preds, test_preds = fitted_search.predict(x_train), fitted_search.predict(x_test)
    train_accuracy = np.round(metrics.accuracy_score(y_train, train_preds), 2)
    test_accuracy = np.round(metrics.accuracy_score(y_test, test_preds), 2)
    print("Accuracy for", key, "on train:", train_accuracy, "and on test:", test_accuracy)

Accuracy for rf on train: 0.93 and on test: 0.65
Accuracy for gb on train: 0.79 and on test: 0.62
Accuracy for l1 on train: 0.76 and on test: 0.66
Accuracy for l2 on train: 0.75 and on test: 0.68
