In [15]:
# import required libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Show up to 80 columns when a DataFrame is rendered, so wide frames are not elided.
pd.set_option('display.max_columns', 80)
# Use seaborn's "whitegrid" style for subsequent plots.
sns.set_style("whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global random generator so stochastic steps are repeatable.
np.random.seed(0)

In [16]:
# Load the gzip-compressed sanctions CSV; the first CSV column becomes the row index.
# NOTE(review): some cells (e.g. "GDP growth", "Inflation") show the literal text "n.a."
# in the outputs below, so those columns are presumably read as strings — confirm before modelling.
base_df = pd.read_csv("input/sanctions.csv.gz", index_col=0)

## Renaming columns

In [17]:
# base_df.columns

In [18]:
# Readable column names, paired positionally with base_df.columns below:
# the i-th entry replaces the i-th column of the loaded frame.
new_names = [
    'Sender1', 'Sender2', 'Sender3', 'Target', 'Goal', 'US case',
    'US unilateral case', 'Foreign policy goal category', 'First year',
    'Last year','Policy result', 'Sanctions contribution',
    'Success', 'Companion policies', 'International cooperation',
    'International assistance', 'Cooperating international organization',
    'International organization sender',
    'International organization sender & target members',
    'Length', 'Prior relations',
    'Regime Type1', 'Regime Type2',
    'Regime Type3', 'Political stability prior',
    "Political stability during", 'Cost to target',
    'Cost to target GNP pct', 'Cost to target per capita',
    'Trade linkage', 'GNP ratio','Health and stability',
    'Sanction type', 'Cost to sender',
    'GDP growth',
    'Inflation',
    'Target IMF code',
    'Country group']

# zip() stops at the shorter input, so a count mismatch would silently leave
# trailing columns un-renamed. Fail loudly instead of mislabelling the data.
if len(new_names) != len(base_df.columns):
    raise ValueError(
        "Expected %d columns in the sanctions data, found %d"
        % (len(new_names), len(base_df.columns))
    )

# Map each current column name to its replacement and apply the mapping.
name_dict = dict(zip(base_df.columns, new_names))
base_df = base_df.rename(columns=name_dict)

## Dropping columns

In [19]:
# Columns excluded from the working frame.
# NOTE(review): the name "unknowns" suggests these are values not known when a
# sanction starts (end year, duration, costs, outcome) — confirm with the author.
unknowns = [
    "Last year", "Length",
    "Political stability during",
    "Cost to target", "Cost to target GNP pct", "Cost to target per capita",
    "Cost to sender",
    "Target IMF code",
    "Policy result", "Sanctions contribution",
]

# base_df itself is left untouched; df is the reduced working copy.
df = base_df.drop(columns=unknowns)

print(df.shape)
df.head(3)

(204, 28)


Unnamed: 0_level_0,Sender1,Sender2,Sender3,Target,Goal,US case,US unilateral case,Foreign policy goal category,First year,Success,Companion policies,International cooperation,International assistance,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,Country group
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
14-1,UNITED KINGDOM,,,GERMANY,Military victory,0,0,4,1914,12,R,4,A,--,0,0,1,5,2,2,0,9,1,3,"F,X,M",36,29,1
17-1,UNITED STATES,,,JAPAN,Shipping for Allies,1,1,5,1917,4,--,1,--,--,0,0,2,5,1,2,0,205,13,3,X,54,256,1
18-1,UNITED KINGDOM,,,RUSSIA,Destabilize Bolsheviks,0,0,2,1918,2,"R,Q",4,--,--,0,0,1,1,-1,2,1,185,1,1,"F,X,M",n.a.,n.a.,2


## Préparation Guillaume

In [20]:
# Remove the columns that are not used in the steps that follow
# (sender/target identities, free-text goal, country group).
useless = [
    "Sender1",
    "Sender2",
    "Sender3",
    "Target",
    "Goal",
    "Country group",
]
df = df.drop(columns=useless)

In [21]:
# Create helper to get dummy columns for a given dimension

def replace_w_dummies(dataframe, col_name, prefx):
    """Return a copy of ``dataframe`` with ``col_name`` one-hot encoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    col_name : str
        Name of the categorical column to replace.
    prefx : str
        Prefix given to the generated indicator columns (``<prefx>_<value>``).

    Returns
    -------
    pandas.DataFrame
        All columns of ``dataframe`` except ``col_name``, followed by one
        indicator column per distinct value of ``col_name``.
    """
    # Encode the column of the frame that was passed in. The previous version
    # read from the notebook-global `df`, which ignored the argument and only
    # worked while that global happened to hold the same column.
    dummies = pd.get_dummies(dataframe[col_name], prefix=prefx)
    new_dataframe = pd.concat([dataframe, dummies], axis=1)
    return new_dataframe.drop(columns=[col_name])

In [22]:
# One-hot encode the two categorical columns, one after the other.
df2 = (
    df
    .pipe(replace_w_dummies, 'Foreign policy goal category', 'frgn_pol_4')
    .pipe(replace_w_dummies, 'International assistance', 'international_assistance')
)
df2.head(3)

Unnamed: 0_level_0,US case,US unilateral case,First year,Success,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,1914,12,R,4,--,0,0,1,5,2,2,0,9,1,3,"F,X,M",36,29,0,0,0,1,0,0,1
17-1,1,1,1917,4,--,1,--,0,0,2,5,1,2,0,205,13,3,X,54,256,0,0,0,0,1,1,0
18-1,0,0,1918,2,"R,Q",4,--,0,0,1,1,-1,2,1,185,1,1,"F,X,M",n.a.,n.a.,0,1,0,0,0,1,0


In [23]:
# Transform years => era

def year_to_era(yr):
    """Map a year to an era code.

    ``yr`` is converted with ``int()`` first. Returns 1 for years before 1945,
    2 for years from 1945 up to and including 1990, and 3 for 1991 onwards.
    """
    year = int(yr)
    if year >= 1991:
        return 3
    if year >= 1945:
        return 2
    return 1
    
# Add the era code to df2, then build df3 without the raw year column.
era_codes = df2["First year"].apply(year_to_era)
df2["first-year_era"] = era_codes
df3 = df2.drop(columns=["First year"])
df3.head(2)

Unnamed: 0_level_0,US case,US unilateral case,Success,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,12,R,4,--,0,0,1,5,2,2,0,9,1,3,"F,X,M",36,29,0,0,0,1,0,0,1,1
17-1,1,1,4,--,1,--,0,0,2,5,1,2,0,205,13,3,X,54,256,0,0,0,0,1,1,0,1


In [24]:
# Make "Success" a boolean figure

# Derive df5 as a new frame instead of aliasing df3. With `df5 = df3`, the
# in-place edits also rewrote df3, so re-running this cell raised KeyError
# because "Success" had already been dropped.
# bool_success is 1 when the "Success" score is at least 9, otherwise 0.
df5 = (
    df3
    .assign(bool_success=1 * (df3["Success"] >= 9))
    .drop(columns=["Success"])
)
df5.head(2)

Unnamed: 0_level_0,US case,US unilateral case,Companion policies,International cooperation,Cooperating international organization,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,Sanction type,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era,bool_success
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
14-1,0,0,R,4,--,0,0,1,5,2,2,0,9,1,3,"F,X,M",36,29,0,0,0,1,0,0,1,1,1
17-1,1,1,--,1,--,0,0,2,5,1,2,0,205,13,3,X,54,256,0,0,0,0,1,1,0,1,0


In [25]:
# Get dummies for the "Companion policies" and "Sanction type"
# Work on a copy so df5 keeps its columns and this cell can be re-run safely.
# `df6 = df5` was only an alias: the in-place drops removed the source columns
# from df5 as well, and a second run failed with KeyError.
df6 = df5.copy()

# "Companion policies" and "Sanction type" hold letter codes, possibly several
# per row (e.g. "R,Q" or "F,X,M"). Each code gets its own 0/1 indicator column.
for code in ("J", "Q", "R"):
    df6["comp_policies_" + code] = (
        df6["Companion policies"].str.contains(code, regex=False).astype(int)
    )
for code in ("F", "X", "M"):
    df6["Sanction_type_" + code] = (
        df6["Sanction type"].str.contains(code, regex=False).astype(int)
    )

# Make "Cooperating international organization" 1 or 0:
# the placeholder "--" maps to 0, any other value maps to 1.
df6["bool_Cooperating international organization"] = (
    df6["Cooperating international organization"].ne("--").astype(int)
)

# The raw text columns are no longer needed once their indicators exist.
df6 = df6.drop(
    columns=[
        "Companion policies",
        "Sanction type",
        "Cooperating international organization",
    ]
)
df6.head(20)

Unnamed: 0_level_0,US case,US unilateral case,International cooperation,International organization sender,International organization sender & target members,Prior relations,Regime Type1,Regime Type2,Regime Type3,Political stability prior,Trade linkage,GNP ratio,Health and stability,GDP growth,Inflation,frgn_pol_4_1,frgn_pol_4_2,frgn_pol_4_3,frgn_pol_4_4,frgn_pol_4_5,international_assistance_--,international_assistance_A,first-year_era,bool_success,comp_policies_J,comp_policies_Q,comp_policies_R,Sanction_type_F,Sanction_type_X,Sanction_type_M,bool_Cooperating international organization
Case no.b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
14-1,0,0,4,0,0,1,5,2,2,00,9,1,3,36,29,0,0,0,1,0,0,1,1,1,0,0,1,1,1,1,0
17-1,1,1,1,0,0,2,5,1,2,00,205,13,3,54,256,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0
18-1,0,0,4,0,0,1,1,-1,2,01,185,1,1,n.a.,n.a.,0,1,0,0,0,1,0,1,0,0,1,1,1,1,1,0
21-1,0,0,4,1,1,2,3,0,2,04,265,37,2,-15,n.a.,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0
25-1,0,0,4,1,1,2,0,-6,1,09,36,56,2,33,488,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0
32-1,0,0,3,1,1,2,4,2,2,00,63,322,2,n.a.,n.a.,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
32-1,0,0,3,1,1,2,1,-3,2,00,595,735,2,n.a.,n.a.,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
33-1,0,0,1,0,0,1,0,-9,1,00,13,1,2,24,n.a.,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0
35-1,0,0,4,1,1,2,0,-9,1,03,16,6,3,-05,-44,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1,0
38-1,1,0,2,0,0,2,0,-6,1,01,695,75,3,74,21,1,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0


Question : que fait-on quand on a des "n.a." ? Je pense qu'il faut essayer de conserver le peu de données qu'on a :D