In [404]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.plotly as ply
from datetime import date
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from collections import Counter
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [1052]:
pd.set_option('display.max_columns', 101)

## Data Filtering and Manipulation

#### By Disputes

In [405]:
disputes = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/MID-level/MIDA_4.01.csv")
dispute_part = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/MID-level/MIDB_4.01.csv")

In [406]:
mask = (disputes.HostLev>=3 ) & (disputes.Fatality > 0) 
major_events= disputes[mask]

In [407]:
military_df = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/NMC_5_0/NMC_5_0.csv")

In [408]:
military_df = military_df.rename(columns={"year": "StYear", "version":"Version"})
military_df.columns

Index(['stateabb', 'ccode', 'StYear', 'milex', 'milper', 'irst', 'pec', 'tpop',
       'upop', 'cinc', 'Version'],
      dtype='object')

In [409]:
participant_info = pd.merge(dispute_part, military_df, on=["ccode", "StYear"])
participant_info = participant_info.drop(["Version_x", "Version_y"], axis=1)

In [410]:
participant_info.columns

Index(['DispNum3', 'DispNum4', 'StAbb', 'ccode', 'StDay', 'StMon', 'StYear',
       'EndDay', 'EndMon', 'EndYear', 'SideA', 'RevState', 'RevType1',
       'RevType2', 'Fatality', 'FataPre', 'HiAct', 'HostLev', 'Orig',
       'stateabb', 'milex', 'milper', 'irst', 'pec', 'tpop', 'upop', 'cinc'],
      dtype='object')

In [411]:
dispute_side_info = participant_info.groupby(["DispNum3", "DispNum4"], as_index=False).agg({"milex": sum, 
                                                                                "milper": sum, "irst": sum,
                                                                                "tpop": sum, "upop":sum, "pec":sum})
#dispute_side_info

In [412]:
def is_major_event(fatality, hostility):
    if fatality > 0 and hostility > 2:
        return 1
    else:
        return 0
disputes["Major_Dispute"] = disputes.apply(lambda row: is_major_event(row["Fatality"], row["HostLev"]), axis=1)

In [413]:
all_sides = pd.merge(disputes, dispute_side_info, on=["DispNum3", "DispNum4"])
#all_sides

In [414]:
trade_df = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/COW_Trade_4.0/National_COW_4.0.csv")

In [415]:
# trade_df = trade_df[["ccode", "statename", "stateabb", "year", "imports", "exports"]]
# trade_df = trade_df.rename(columns={"year": "StYear"})
df = pd.merge(participant_info, trade_df, on=["ccode", "StYear"])
trade_df = df.groupby(["ccode", "StYear"], as_index=False).agg({"imports": sum, "exports": sum})

In [416]:
disputes_with_trade = dispute_part.merge(trade_df, on=["ccode", "StYear"])

In [417]:
total_trade = disputes_with_trade.groupby(["DispNum3", "DispNum4"], as_index=False).agg({"imports": sum, "exports": sum})

In [418]:

all_sides = pd.merge(all_sides, total_trade, on=["DispNum3", "DispNum4"])


In [419]:
all_sides["imports"] = all_sides.imports.fillna(0)
all_sides["exports"] = all_sides.exports.fillna(0)

In [420]:
side_count = disputes.groupby(["DispNum3", "DispNum4"], as_index=False).agg({"NumA": sum, "NumB": sum})
side_count["Country_Count"] = side_count.apply(lambda row: row["NumA"] + row["NumB"], axis=1)
side_count = side_count.drop(["NumA", "NumB"], axis=1)

In [421]:
all_sides = all_sides.merge(side_count, on=["DispNum3", "DispNum4"])

In [422]:
revisionist_df = dispute_part.groupby(["DispNum3", "DispNum4"], as_index=False).agg({"RevState": sum})
all_sides = all_sides.merge(revisionist_df, on=["DispNum3", "DispNum4"])
all_sides.columns

Index(['DispNum3', 'DispNum4', 'StDay', 'StMon', 'StYear', 'EndDay', 'EndMon',
       'EndYear', 'Outcome', 'Settle', 'Fatality', 'FatalPre', 'MaxDur',
       'MinDur', 'HiAct', 'HostLev', 'Recip', 'NumA', 'NumB', 'Link1', 'Link2',
       'Link3', 'Ongo2010', 'Version', 'Major_Dispute', 'milex', 'milper',
       'irst', 'tpop', 'upop', 'pec', 'imports', 'exports', 'Country_Count',
       'RevState'],
      dtype='object')

In [518]:
#print(all_sides.columns)
world_years = all_sides.groupby("StYear", as_index=False).agg({"milex": sum, "milper": sum,
                                                              "irst": sum, "tpop":sum, "pec": sum,
                                                              "imports": sum, "exports":sum, "EndMon": len})
x = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/NMC_5_0/NMC_5_0.csv")
x.drop("version",axis=1, inplace=True)
y = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/COW_Trade_4.0/National_COW_4.0.csv")
y.drop("version",axis=1, inplace=True)
country_info = pd.merge(x, y, on=["ccode", "year", "stateabb"])
#print(z.columns)
country_info = country_info.iloc[:,:-4]
since_1900 = country_info[country_info.year>1899]
dispute_part["Major_Dispute"] = dispute_part.apply(lambda row: is_major_event(row["Fatality"], row["HostLev"]), axis=1)

In [519]:
country_dispute_count = dispute_part.groupby(["ccode", "StYear"], as_index=False).agg({"StAbb":len, "Major_Dispute": sum})

In [520]:
country_dispute_count = country_dispute_count.rename(columns={"StYear": "year", "StAbb": "Dispute_Count"})

In [538]:
mask = (dispute_part.Major_Dispute ==1) & (dispute_part.ccode ==2)
dispute_part[mask]
since_1900_disp = country_dispute_count[country_dispute_count.year >1899]
modern_country_info = since_1900.merge(since_1900_disp, on=["ccode", "year"], how="outer")
modern_country_info["Dispute_Count"] = modern_country_info["Dispute_Count"].fillna(0)
modern_country_info["Major_Dispute"] = modern_country_info["Major_Dispute"].fillna(0)
modern_country_info["Dispute_Count"] = modern_country_info["Dispute_Count"].astype(int)
modern_country_info["Major_Dispute"] = modern_country_info["Major_Dispute"].astype(int)

In [544]:
def is_involved(num):
    if num > 0:
        return 1
    else:
        return 0
modern_country_info["Dispute_Involved"] = modern_country_info.apply(lambda row: is_involved(row["Dispute_Count"]), axis=1)
modern_country_info["Major_Involved"] = modern_country_info.apply(lambda row: is_involved(row["Major_Dispute"]), axis=1)

In [820]:
modern_country_info.Dispute_Count.value_counts()

0     8963
1     2059
2      680
3      247
4      111
5       48
6       25
7       16
8        8
10       6
9        5
11       3
12       2
19       1
26       1
14       1
16       1
15       1
Name: Dispute_Count, dtype: int64

In [1172]:
world_counts = world_years.drop("EndMon", axis=1)
world_counts = world_counts.rename(columns={"milex": "milex_world", "irst": "irst_world",
                                           "tpop": "tpop_world", "pec": "pec_world", "imports": "imports_world",
                                           "upop": "upop_world", "exports": "exports_world",
                                           "milper": "milper_world"})
#all_sides = all_sides.merge(world_counts, on="StYear")
print(all_sides.columns)
arr = ["milex", "milper", "irst", "pec", "tpop"]
for col in arr:
    world_str = col + "_world"
    pct_str = col + "_pct"
    all_sides[pct_str] = all_sides[col]/all_sides[world_str]
all_sides[["milex", "milex_world", "milex_pct"]]

Index(['DispNum3', 'DispNum4', 'StDay', 'StMon', 'StYear', 'EndDay', 'EndMon',
       'EndYear', 'Outcome', 'Settle', 'Fatality', 'FatalPre', 'MaxDur',
       'MinDur', 'HiAct', 'HostLev', 'Recip', 'NumA', 'NumB', 'Link1', 'Link2',
       'Link3', 'Ongo2010', 'Version', 'Major_Dispute', 'milex', 'milper',
       'irst', 'tpop', 'upop', 'pec', 'imports', 'exports', 'Country_Count',
       'RevState', 'Decade', 'milex_world', 'milper_world', 'irst_world',
       'tpop_world', 'pec_world', 'imports_world', 'exports_world',
       'milex_pct', 'milper_pct', 'irst_pct', 'pec_pct', 'tpop_pct'],
      dtype='object')


Unnamed: 0,milex,milex_world,milex_pct
0,133141,380700,0.349727
1,190061,380700,0.499241
2,2669,380700,0.007011
3,2508,380700,0.006588
4,503,380700,0.001321
5,51818,380700,0.136112
6,38707,394319,0.098162
7,67859,394319,0.172092
8,386,394319,0.000979
9,386,394319,0.000979


In [1168]:
# all_sides.drop(['milex_world_x', 'milper_world_x', 'irst_world_x',
#        'tpop_world_x', 'pec_world_x', 'imports_world_x', 'exports_world_x',
#        'milex_world_y', 'milper_world_y', 'irst_world_y', 'tpop_world_y',
#        'pec_world_y', 'imports_world_y', 'exports_world_y'], axis=1, inplace=True)

#### By Year

In [608]:
trade_year = trade_df.groupby("StYear", as_index=False).agg({"imports": sum, "exports": sum})

Unnamed: 0,StYear,imports,exports
0,1870,3.116697e+03,3.020625e+03
1,1871,2.930150e+03,2.705994e+03
2,1872,7.723072e+02,8.269905e+02
3,1873,2.715954e+03,2.759691e+03
4,1874,1.249786e+02,1.206544e+02
5,1875,1.511210e+03,1.498784e+03
6,1876,6.645248e+03,4.850083e+03
7,1877,2.863674e+03,2.023614e+03
8,1878,2.636309e+03,1.673197e+03
9,1879,2.713558e+02,2.749506e+02


In [618]:
military_year = military_df.groupby("StYear", as_index=False).agg({"milex":sum, "milper":sum,
                                                                  "irst": sum, "pec": sum, "tpop":sum,
                                                                  "upop": sum})
year_info = military_year.merge(trade_year, on="StYear")
year_info= year_info.rename(columns={"milex": "milex_world", "milper": "milper_world", "pec": "pec_world",
                                    "irst": "irst_world", "tpop": "tpop_world", "upop": "upop_world", 
                                    "imports": "imports_world", "exports": "exports_world"})

In [624]:
all_sides_world = all_sides.merge(year_info, on="StYear")
modern_country_info = modern_country_info.rename(columns={"year": "StYear"})
modern_country_world = modern_country_info.merge(year_info, on="StYear")

In [626]:
arr = ["milex", "milper", "pec", "irst", "tpop", "upop", "imports", "exports"]

In [627]:
for col in arr:
    world_str = col + "_world"
    pct_str = col + "_pct"
    modern_country_world[pct_str] = round(modern_country_world[col]/modern_country_world[world_str],4)
    all_sides_world[pct_str]= round(all_sides_world[col]/all_sides_world[world_str], 4)

#### By Country

In [None]:
countries = participant_info.groupby(["DispNum3","DispNum4","ccode", "StAbb","StYear"],as_index=False).agg(
    {"milex": mean, "milper": mean, "irst": mean, "pec": mean, "tpop": mean, "upop": mean})
is_major = major_disputes[["DispNum3", "DispNum4", "Major_Dispute"]]
countries = countries.merge(is_major, on=["DispNum3", "DispNum4"], how="outer")
countries["Major_Dispute"] = countries["Major_Dispute"].fillna(0)
countries = countries.groupby(["ccode", "StYear"], as_index=False).agg({"DispNum3": len,"Major_Dispute": sum})
#countries = countries.merge(military_df, on=["ccode", "StYear"], how="outer")
#countries = countries.merge(trade_df, on=["ccode", "StYear"], how="outer")
countries.to_csv("countries.csv")
countries
countries = countries.merge(military_df, on=["ccode", "StYear"], how="outer")
countries["DispNum3"] = countries["DispNum3"].fillna(0)
countries["Major_Dispute"] = countries["Major_Dispute"].fillna(0)
countries["Major_Dispute"] = countries["Major_Dispute"].astype(int)
countries["DispNum3"] = countries["DispNum3"].astype(int)
len(countries[countries.milex == -9])
arr = [ "milex", "milper", "irst", "pec", "tpop", "upop"]
for col in arr:
    mask = countries[col] != -9
    countries= countries[mask]
countries = countries.rename(columns={"DispNum3": "Dispute_Count"})
countries

In [None]:

by_year = countries.groupby("StYear", as_index=False).agg({"milex":sum, "milper":sum, "irst": sum, 
                                                          "pec": sum, "tpop":sum, "upop":sum})
by_year = by_year.rename(columns={"milex": "milex_world", "milper": "milper_world", "irst": "irst_world",
                                 "pec": "pec_world", "tpop": "tpop_world", "upop": "upop_world"})
countries_world = countries.merge(by_year, on="StYear")
countries_world["milex_pct"] = countries_world.apply(lambda row: round(row["milex"]/row["milex_world"], 4), axis=1)
countries_world["milper_pct"] = countries_world.apply(lambda row: round(row["milper"]/row["milper_world"], 4), axis=1)
countries_world["irst_pct"] = countries_world.apply(lambda row: round(row["irst"]/row["irst_world"], 4), axis=1)
countries_world["pec_pct"] = countries_world.apply(lambda row: round(row["pec"]/row["pec_world"], 4), axis=1)
countries_world["tpop_pct"] = countries_world.apply(lambda row: round(row["tpop"]/row["tpop_world"], 4), axis=1)
countries_world["upop_pct"] = countries_world.apply(lambda row: round(row["upop"]/row["upop_world"], 4), axis=1)

In [None]:
def major_involved(num):
    if int(num) > 0:
        return 1
    else:
        return 0

countries_world["Major_Involved"] = countries_world.apply(lambda row: major_involved(row["Major_Dispute"]), axis=1)
countries_world["Dispute_Involved"] = countries_world.apply(lambda row: major_involved(row["Dispute_Count"]), axis=1)
countries_world

## Modeling

In [1240]:
major_disputes= all_sides[all_sides.Major_Dispute == 1]
len(major_disputes)

2047    1
1274    1
1290    1
1286    1
1284    1
1280    1
1278    1
3325    1
3323    1
1272    1
4299    1
1270    1
1268    1
1262    1
4458    1
3301    1
4445    1
1250    1
1292    1
3341    1
3343    1
3345    1
4506    1
1316    1
3363    1
3361    1
1312    1
4408    1
1310    1
1308    1
1306    1
1304    1
3351    1
1302    1
3349    1
1300    1
4426    1
1246    1
1244    1
1240    1
1204    1
3249    1
1200    1
3247    1
1196    1
3243    1
1190    1
4342    1
1184    1
3231    1
       ..
2752    1
2750    1
4107    1
2748    1
2746    1
2744    1
2742    1
2740    1
2738    1
2736    1
2734    1
625     1
621     1
2594    1
2628    1
2626    1
2624    1
2622    1
2620    1
4411    1
2616    1
2612    1
2610    1
2608    1
2606    1
2604    1
2602    1
2600    1
2598    1
2596    1
4260    1
2630    1
619     1
2632    1
617     1
613     1
611     1
607     1
605     1
603     1
601     1
2648    1
2646    1
2644    1
2642    1
2640    1
2638    1
4490    1
2634    1


In [890]:
non_major_events = all_sides[all_sides.Major_Dispute == 0]
len(non_major_events)

2012

In [891]:
major_disp_x = major_disputes["Major_Dispute"]
major_disp_y = major_disputes[["irst", "milper", "upop", "imports", "exports"]]

In [892]:
non_major_x = non_major_events["Major_Dispute"]
non_major_y = non_major_events[["irst", "milper", "upop"]]

In [1189]:
data_known = all_sides[(all_sides.Fatality != -9) & (all_sides.StYear > 1899)]
df = data_known[["Major_Dispute","milper", "irst", "exports","imports", "Country_Count", 
                "Recip", "MaxDur", "milex", "RevState", "pec", "MinDur", "DispNum3", "DispNum4", "HostLev",
                "Fatality", "tpop", "milex_pct", "pec_pct", "tpop_pct", "milper_pct", "irst_pct"]]
train,test = train_test_split(df, test_size=.3, random_state=42, stratify=df["Major_Dispute"])
len(test), len(train)


(616, 1435)

In [1190]:
major_train = train[train.Major_Dispute == 1]
non_major_train = train[train.Major_Dispute == 0]

In [1191]:
len(major_train), len(non_major_train)

(285, 1150)

In [1201]:
# df_major_resampled = resample(major_train, replace=True, n_samples=1408, random_state=123)
# training_df = pd.concat([non_major_train, df_major_resampled])

df_non_major_resampled = resample(non_major_train, replace=False, n_samples=285, random_state=123)
training_df = pd.concat([df_non_major_resampled, major_train])


X_train = training_df[["milper_pct", "irst_pct", "Country_Count", "milex_pct", "exports", "RevState", 
                       "imports", "pec_pct", "Recip"]]
y_train = training_df["Major_Dispute"]
X_test = test[["milper_pct", "irst_pct", "Country_Count",  "milex_pct", "exports", "RevState", "imports", "pec_pct", 
               "Recip"]]
y_test = test["Major_Dispute"]


In [1241]:
rfc = RandomForestClassifier(min_samples_split=25)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print("Precision Score: {0:.3f}".format(metrics.precision_score(y_test, y_pred)))
print("Recall Score: {0:.3f}".format(metrics.recall_score(y_test, y_pred)))
#print(metrics.classification_report(y_test, y_pred))
print(Counter(y_pred))
len(y_pred)
print(metrics.classification_report(y_test, y_pred))
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_test.columns, rfc.feature_importances_):
    feats[feature] = importance
feats
test["Predicted"] = y_pred
a = test[test.Predicted != test.Major_Dispute]
b = test[test.Predicted == test.Major_Dispute]
#print(a[["DispNum3", "Major_Dispute", "Predicted"]])
#dispute_part[dispute_part.DispNum3 ==2938]
a = rfc.predict_proba(X_test)
major_prob = [i[1] for i in a]
test["Major_Prob"] = major_prob


Precision Score: 0.459
Recall Score: 0.911
Counter({0: 372, 1: 244})
             precision    recall  f1-score   support

          0       0.97      0.73      0.83       493
          1       0.46      0.91      0.61       123

avg / total       0.87      0.77      0.79       616





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



4095    1
1316    1
1324    1
3371    1
1322    1
4393    1
4391    1
4390    1
4389    1
407     1
3350    1
4054    1
349     1
2335    1
4382    1
1309    1
1308    1
2331    1
4398    1
3375    1
4400    1
1329    1
4421    1
2372    1
323     1
4418    1
4416    1
1343    1
4414    1
1339    1
1337    1
4408    1
311     1
2757    1
3381    1
4403    1
4402    1
2328    1
4087    1
1746    1
238     1
4343    1
246     1
1269    1
4340    1
242     1
4337    1
4335    1
3872    1
4370    1
       ..
2565    1
1293    1
2771    1
2770    1
2769    1
1742    1
4406    1
1425    1
2763    1
1737    1
1733    1
1344    1
2204    1
1677    1
2059    1
1724    1
608     1
602     1
4101    1
2643    1
2642    1
1776    1
2637    1
1611    1
1274    1
2632    1
3655    1
2052    1
4328    1
2628    1
1193    1
614     1
3724    1
2312    1
3721    1
4002    1
1669    1
3716    1
3714    1
2575    1
3352    1
634     1
1656    1
1654    1
1780    1
625     1
622     1
3971    1
1134    1


In [1262]:

a = test[["DispNum3", "Predicted", "Major_Prob"]]
i=dispute_part.merge(a, on="DispNum3")
j = disputes.merge(a, on="DispNum3")
missed_major = i[(i.Major_Prob < .5) & (i.Major_Dispute ==1)]
wrong_major = i[(i.Major_Prob > .5) & (i.Major_Dispute ==0)]
missed_major.StYear.value_counts()
# dispute_part[dispute_part.DispNum3 == 4529]
j["Decade"] = j.apply(lambda row: get_decade(row["StYear"]), axis=1)
missed_major = j[(j.Major_Prob < .5) & (j.Major_Dispute ==1)]
wrong_major = j[(j.Major_Prob > .5) & (j.Major_Dispute ==0)]
x = j[j.Predicted != j.Major_Dispute]
x

Unnamed: 0,DispNum3,DispNum4,StDay,StMon,StYear,EndDay,EndMon,EndYear,Outcome,Settle,Fatality,FatalPre,MaxDur,MinDur,HiAct,HostLev,Recip,NumA,NumB,Link1,Link2,Link3,Ongo2010,Version,Major_Dispute,Predicted,Major_Prob,Decade
1,12,-9,11,3,1938,30,9,1938,4,1,0,0,204,204,16,4,1,1,5,0,0,0.0,0,4.01,0,1,0.751384,1930
2,26,-9,21,3,1948,25,7,1949,5,3,0,0,492,492,16,4,1,1,3,0,0,0.0,0,4.01,0,1,0.835921,1940
16,156,-9,-9,2,1906,12,5,1906,3,3,0,0,101,74,14,4,1,1,1,0,0,0.0,0,4.01,0,1,0.678735,1900
17,159,-9,2,11,1903,25,1,1904,1,2,0,0,85,85,13,4,1,1,1,0,0,0.0,0,4.01,0,1,0.594096,1900
22,211,-9,2,4,1920,16,7,1920,2,2,0,0,106,106,14,4,1,1,2,0,0,0.0,0,4.01,0,1,0.746256,1920
24,242,-9,21,10,1904,5,11,1904,3,1,0,0,16,16,16,4,1,1,1,0,0,0.0,0,4.01,0,1,0.775381,1900
25,246,-9,22,4,1960,26,4,1961,5,3,0,0,370,370,16,4,1,2,1,0,0,0.0,0,4.01,0,1,0.760390,1960
37,400,-9,5,4,1917,11,11,1918,8,3,0,0,586,586,16,4,1,1,1,0,0,0.0,0,4.01,0,1,0.687944,1910
45,608,-9,7,11,1958,23,6,1959,5,3,0,0,229,229,15,4,1,1,3,0,0,0.0,0,4.01,0,1,0.781666,1950
51,1002,-9,26,4,1963,3,6,1963,5,3,0,0,39,39,16,4,1,1,4,0,0,0.0,0,4.01,0,1,0.838580,1960


In [711]:
y_pred = rfc.predict(X_train)
print("Precision Score: {0:.3f}".format(metrics.precision_score(y_train, y_pred)))
print("Recall Score: {0:.3f}".format(metrics.recall_score(y_train, y_pred)))
#print(metrics.classification_report(y_test, y_pred))
print(Counter(y_pred))
len(y_pred)

Precision Score: 0.799
Recall Score: 0.905
Counter({1: 334, 0: 256})


590

recall--how many of the disputes I catch
precision--guessed disputes/total disputes

In [None]:
y_pred = rfc.predict(X_test)

In [None]:
metrics.recall_score(y_test, y_pred)

In [None]:
X = all_sides[["milper", "irst", "Country_Count", "milex", "Recip"]]
y = all_sides["Major_Dispute"]

In [705]:
sv = SVC()
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)
metrics.recall_score(y_test, y_pred)

0.047619047619047616

#### Country Models

In [592]:
train, test =train_test_split(modern_country_info, test_size=.3, random_state=42, stratify=modern_country_info["Dispute_Involved"])

In [558]:
majority_class = train[train.Dispute_Involved == 0]
minority_class = train[train.Dispute_Involved == 1]
len(minority_class), len(majority_class)
#min_upsampled = resample(minority_class, replace=True, n_samples=)

(2250, 6274)

In [559]:
min_upsampled = resample(minority_class, replace=True, n_samples=6274, random_state=123)
maj_downsampled = resample(majority_class, replace=False, n_samples=2250, random_state=123)

In [570]:
training_df = pd.concat([min_upsampled, majority_class]).dropna()

In [589]:
X_train = training_df[["milex", "milper", "irst", "pec", "tpop", "upop", "imports", "exports"]]
y_train = training_df["Dispute_Involved"]
test = test.dropna()
X_test = test[["milex", "milper", "irst", "pec", "tpop", "upop", "imports", "exports"]]
y_test = test["Dispute_Involved"]
len(X_train), len(y_train)

(11585, 11585)

In [590]:
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
metrics.recall_score(y_test, y_pred)

0.54769921436588098

In [991]:
train4, test4 = train_test_split(modern_country_world, test_size=.3, random_state=42, stratify=modern_country_world["Dispute_Involved"])

In [992]:
majority_class4 = train4[train4.Dispute_Involved == 0]
minority_class4 = train4[train4.Dispute_Involved == 1]
len(majority_class4), len(minority_class4)

(6021, 2250)

In [993]:
maj4_downsampled = resample(majority_class4, replace=False, n_samples=2250, random_state=123)
min4_upsampled = resample(minority_class4, replace=True, n_samples=6021, random_state=123)
training_df4 = pd.concat([min4_upsampled, majority_class4])

In [994]:
training_df4 = training_df4.dropna()
test4 = test4.dropna()
X_train4 = training_df1[["milex_pct", "milper_pct", "irst_pct", "tpop_pct", "upop_pct", "imports_pct",
                       "exports_pct"]]

y_train4 = training_df1["Dispute_Involved"]
X_test4 = test4[["milex_pct", "milper_pct", "irst_pct", "tpop_pct", "upop_pct", "imports_pct",
                       "exports_pct"]]
y_test4 = test4["Dispute_Involved"]


In [995]:
rfc4 = RandomForestClassifier()
rfc4.fit(X_train4, y_train4)
y_pred4 = rfc1.predict(X_test4)
print(metrics.classification_report(y_test4, y_pred4))
feats4 = dict()
for feature, importance in zip(X_test5.columns, rfc5.feature_importances_):
    feats4[feature] = importance
feats4

             precision    recall  f1-score   support

          0       0.82      0.81      0.82      2376
          1       0.52      0.54      0.53       891

avg / total       0.74      0.74      0.74      3267



{'exports_pct': 0.13808609150969461,
 'imports_pct': 0.13594595212521574,
 'irst_pct': 0.065630085690540713,
 'milex_pct': 0.1373090141929687,
 'milper_pct': 0.21991895234784398,
 'tpop_pct': 0.16766065355583798,
 'upop_pct': 0.13544925057789825}

In [967]:
train5, test5 = train_test_split(modern_country_world, test_size=.3, random_state=42, stratify=modern_country_world["Major_Involved"])

In [968]:
train5.Major_Involved.value_counts()

0    7758
1     513
Name: Major_Involved, dtype: int64

In [969]:
minority_class5 = train5[train5.Major_Involved == 1]
majority_class5 = train5[train5.Major_Involved == 0]
len(majority_class5), len(minority_class5)

(7758, 513)

In [975]:
maj5_downsampled = resample(majority_class5, replace=False, n_samples=513, random_state=123)
min5_upsampled = resample(minority_class5, replace=True, n_samples = 7758, random_state=123)
training_df5 = pd.concat([maj5_downsampled, minority_class5])
training_df5 = training_df5.dropna()
test5 = test5.dropna()
test5.columns

Index(['stateabb', 'ccode', 'StYear', 'milex', 'milper', 'irst', 'pec', 'tpop',
       'upop', 'cinc', 'statename', 'imports', 'exports', 'Dispute_Count',
       'Major_Dispute', 'Dispute_Involved', 'Major_Involved', 'milex_world',
       'milper_world', 'irst_world', 'pec_world', 'tpop_world', 'upop_world',
       'imports_world', 'exports_world', 'milex_pct', 'milper_pct', 'pec_pct',
       'irst_pct', 'tpop_pct', 'upop_pct', 'imports_pct', 'exports_pct'],
      dtype='object')

In [1032]:
X_train5 = training_df5[["milex_pct", "milper_pct", "irst_pct", "tpop_pct", "upop_pct", "imports_pct",
                       "exports_pct"]]

y_train5 = training_df5["Major_Involved"]
X_test5 = test5[["milex_pct", "milper_pct", "irst_pct", "tpop_pct", "upop_pct", "imports_pct",
                       "exports_pct"]]
y_test5 = test5["Major_Involved"]
dispute_part.columns

Index(['DispNum3', 'DispNum4', 'StAbb', 'ccode', 'StDay', 'StMon', 'StYear',
       'EndDay', 'EndMon', 'EndYear', 'SideA', 'RevState', 'RevType1',
       'RevType2', 'Fatality', 'FataPre', 'HiAct', 'HostLev', 'Orig',
       'Version', 'Major_Dispute'],
      dtype='object')

In [1035]:
rfc5 = RandomForestClassifier()
rfc5.fit(X_train5, y_train5)
y_pred5 = rfc5.predict(X_test5)
print(metrics.classification_report(y_test5, y_pred5))
print (metrics.accuracy_score(y_test5, y_pred5))
rfc5.feature_importances_


             precision    recall  f1-score   support

          0       0.97      0.72      0.83      3066
          1       0.14      0.69      0.24       206

avg / total       0.92      0.72      0.79      3272

0.718215158924


array([ 0.12922956,  0.22610733,  0.06748153,  0.14678308,  0.15061256,
        0.14740272,  0.13238322])

In [1015]:
test5.iloc[10:15,:]

Unnamed: 0,stateabb,ccode,StYear,milex,milper,irst,pec,tpop,upop,cinc,...,imports_world,exports_world,milex_pct,milper_pct,pec_pct,irst_pct,tpop_pct,upop_pct,imports_pct,exports_pct
5855,MEX,70,1977,533541.0,100.0,5529.0,103035.0,62538.0,22722.0,0.010801,...,1055804.0,978921.8,0.0012,0.0039,0.0103,0.0082,0.0149,0.0263,0.0052,0.0043
205,SWD,380,1904,4352.0,95.0,333.0,3944.0,7515.0,469.0,0.009037,...,8503.279,6610.669,0.0113,0.0165,0.0046,0.0093,0.0066,0.0059,0.0184,0.0172
8393,TAJ,702,1992,202800.0,3.0,0.0,4974.0,5571.0,820.0,0.000407,...,4535944.0,4085878.0,0.0003,0.0001,0.0004,0.0,0.001,0.0006,0.0,0.0
11454,CUB,40,2009,19060000.0,49.0,267.0,17071.0,11289.0,2929.0,0.00328,...,10451250.0,9992981.0,0.013,0.0024,0.0009,0.0002,0.0017,0.0015,0.0008,0.0002
6053,NOR,385,1978,1307441.0,40.0,812.0,25940.0,4059.0,805.0,0.001603,...,1073708.0,1068034.0,0.0026,0.0015,0.0025,0.0011,0.001,0.0009,0.0107,0.0102


In [1012]:
rfc5.predict_proba(X_test5.iloc[10:15,:])

array([[ 0.9,  0.1],
       [ 0.5,  0.5],
       [ 0.3,  0.7],
       [ 0.4,  0.6],
       [ 0.8,  0.2]])

In [988]:
feats5 = dict()
for feature, importance in zip(X_test5.columns, rfc5.feature_importances_):
    feats5[feature] = importance
feats5

{'exports_pct': 0.14879163663702208,
 'imports_pct': 0.17700751844139001,
 'milper_pct': 0.2537040522465312,
 'tpop_pct': 0.24209933174662437,
 'upop_pct': 0.17839746092843228}

In [215]:
countries = participant_info.groupby(["DispNum3","DispNum4","ccode", "StAbb","StYear"],as_index=False).agg(
    {"milex": mean, "milper": mean, "irst": mean, "pec": mean, "tpop": mean, "upop": mean})
is_major = major_disputes[["DispNum3", "DispNum4", "Major_Dispute"]]
countries = countries.merge(is_major, on=["DispNum3", "DispNum4"], how="outer")
countries["Major_Dispute"] = countries["Major_Dispute"].fillna(0)
countries = countries.groupby(["ccode", "StYear"], as_index=False).agg({"DispNum3": len,"Major_Dispute": sum})
#countries = countries.merge(military_df, on=["ccode", "StYear"], how="outer")
#countries = countries.merge(trade_df, on=["ccode", "StYear"], how="outer")
countries.to_csv("countries.csv")
countries
countries = countries.merge(military_df, on=["ccode", "StYear"], how="outer")
countries["DispNum3"] = countries["DispNum3"].fillna(0)
countries["Major_Dispute"] = countries["Major_Dispute"].fillna(0)


#### Visuals

In [864]:
def get_decade(year):
    year = str(year)
    return int(year[:3] + "0")


1980

In [866]:
all_sides["Decade"] = all_sides.apply(lambda row: get_decade(row["StYear"]), axis=1)
all_sides.columns

Index(['DispNum3', 'DispNum4', 'StDay', 'StMon', 'StYear', 'EndDay', 'EndMon',
       'EndYear', 'Outcome', 'Settle', 'Fatality', 'FatalPre', 'MaxDur',
       'MinDur', 'HiAct', 'HostLev', 'Recip', 'NumA', 'NumB', 'Link1', 'Link2',
       'Link3', 'Ongo2010', 'Version', 'Major_Dispute', 'milex', 'milper',
       'irst', 'tpop', 'upop', 'pec', 'imports', 'exports', 'Country_Count',
       'RevState', 'Decade'],
      dtype='object')

In [882]:
decades = all_sides.groupby("Decade", as_index=False).agg({"DispNum3": len, "Major_Dispute": sum})

In [885]:
decades.index


Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')

In [887]:
post_1900_decades = post_1900_decades.rename(columns={"DispNum3": "Disputes", "Major_Dispute": "Major Disputes"})
post_1900_decades.to_csv("20thCenturyDecades.csv", index=False)


In [964]:
len(disputes)

2586