In [474]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.plotly as ply
from datetime import date
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### Datasets

Using pandas, read in the CSVs for disputes and incidents and make a master one for each category

In [475]:
dispute1 = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/MID-level/MIDA_4.01.csv")

In [476]:
dispute1.head()

Unnamed: 0,DispNum3,DispNum4,StDay,StMon,StYear,EndDay,EndMon,EndYear,Outcome,Settle,...,HiAct,HostLev,Recip,NumA,NumB,Link1,Link2,Link3,Ongo2010,Version
0,2,-9,-9,7,1902,24,1,1903,6,1,...,7,3,0,1,1,0,0,0.0,0,4.01
1,3,-9,2,5,1913,25,10,1913,4,3,...,8,3,0,1,1,0,0,0.0,0,4.01
2,4,-9,15,5,1946,13,11,1946,5,3,...,16,4,1,1,1,0,0,0.0,0,4.01
3,7,-9,13,10,1951,26,1,1952,1,3,...,17,4,1,1,1,0,0,0.0,0,4.01
4,8,-9,-9,7,1856,14,3,1857,1,2,...,20,5,1,1,1,0,0,0.0,0,4.01


In [477]:
dispute2 = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/MID-level/MIDB_4.01.csv")

In [478]:
dispute2.head()

Unnamed: 0,DispNum3,DispNum4,StAbb,ccode,StDay,StMon,StYear,EndDay,EndMon,EndYear,SideA,RevState,RevType1,RevType2,Fatality,FataPre,HiAct,HostLev,Orig,Version
0,2,-9,UKG,200,-9,7,1902,24,1,1903,0,1,1,-9,0,0,0,1,1,4.01
1,2,-9,USA,2,-9,7,1902,24,1,1903,1,1,1,-9,0,0,7,3,1,4.01
2,3,-9,YUG,345,2,5,1913,25,10,1913,0,0,0,-9,0,0,0,1,1,4.01
3,3,-9,AUH,300,2,5,1913,25,10,1913,1,1,2,-9,0,0,8,3,1,4.01
4,4,-9,ALB,339,15,5,1946,13,11,1946,1,0,0,-9,0,0,16,4,1,4.01


In [479]:
incident1 = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/Incident-level/MIDI_4.01.csv")
incident2 = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/Incident-level/MIDIP_4.01.csv")

In [480]:
incidents = pd.merge(incident1, incident2, on=["IncidNum3", "IncidNum4", "DispNum3", "DispNum4", "Version"], how="outer", suffixes=("_Inc", "_Part"))

In [481]:
disputes = pd.merge(dispute1, dispute2, on=["DispNum3", "DispNum4", "Version"], suffixes=("_Disp", "_Part"))

In [482]:
modern_incidents = incidents[incidents.StYear_Inc > 1990]

In [483]:
merged_df = pd.merge(incidents, disputes, on=["DispNum3", "DispNum4"], suffixes=("_Disp", "_Inc"))
merged_df.to_csv("merged.csv")

In [484]:
all_modern = pd.merge(modern_incidents, disputes, on=["DispNum3", "DispNum4"], suffixes=("_Disp", "_Inc"))

In [485]:
modern_disputes = disputes[disputes.StYear_Disp > 1991]

In [486]:
military_df = pd.read_csv("/Users/danielgilberg/data_science/metis/practice/mcnulty/data/NMC_5_0/NMC_5_0.csv")

#### Filters and Data Manipulation

In [487]:
def is_major_dispute(host, fatal):
    if host > 2 and fatal > 0:
        return 1
    else:
        return 0

In [488]:
def date_column(month, year):
    if not pd.isnull(month) and not pd.isnull(year):
        return date(month=int(month), year=int(year), day=1)
    else:
        return ""

In [489]:
modern_disputes["Major_Disp"] = modern_disputes.apply(lambda row: is_major_dispute(row["HostLev_Disp"], row["Fatality_Disp"]), axis=1)
major_dispute_df = modern_disputes[["DispNum3", "DispNum4", "Major_Disp"]].drop_duplicates()

major_disputes.DispNum3.value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



4095    1
4312    1
4319    1
4318    1
4317    1
4315    1
4314    1
4313    1
4311    1
4321    1
4310    1
4309    1
4308    1
4307    1
4306    1
4305    1
4320    1
4322    1
4303    1
4332    1
4338    1
4337    1
4336    1
4335    1
4334    1
4333    1
4331    1
4323    1
4330    1
4329    1
       ..
4497    1
4496    1
4495    1
4493    1
4502    1
4492    1
4491    1
4490    1
4489    1
4488    1
4487    1
4501    1
4503    1
4519    1
4512    1
4518    1
4517    1
4516    1
4515    1
4514    1
4513    1
4511    1
4504    1
4510    1
4509    1
4508    1
4507    1
4506    1
4505    1
4096    1
Name: DispNum3, Length: 575, dtype: int64

In [490]:
incidents = pd.merge(major_dispute_df, incidents, on=["DispNum3", "DispNum4"])
incidents["MonthYear"] = incidents.apply(lambda row: date_column(row["StMon_Part"], row["StYear_Part"]), axis=1)

In [491]:
def sequence_column(incid_num):
    num = str(incid_num)
    return num[-3:]

int(sequence_column(355055))

55

In [492]:
incidents["Disp_Sequence"] = incidents.apply(lambda row: sequence_column(row["IncidNum3"]), axis=1)

In [561]:
first_incidents = incidents[incidents.Disp_Sequence == "001"]
first_incidents.to_csv("first_incidents.csv")

x = first_incidents[["DispNum3", "DispNum4", "Action_Inc"]].drop_duplicates()
a = disputes[["DispNum3", "DispNum4", "HiAct_Disp"]]
b = pd.merge(a, x, on=["DispNum3", "DispNum4"])
b = b.drop_duplicates()
b[b.HiAct_Disp > b.Action_Inc]
len(dispute1[dispute1.StYear<=1991])
incidents.columns

Index(['DispNum3', 'DispNum4', 'Major_Disp', 'IncidNum3', 'IncidNum4',
       'StDay_Inc', 'StMon_Inc', 'StYear_Inc', 'EndDay_Inc', 'EndMon_Inc',
       'EndYear_Inc', 'Duration', 'TBI', 'Fatality_Inc', 'FatalPre_Inc',
       'Action_Inc', 'HostLev_Inc', 'NumA', 'RevType1_Inc', 'RevType2_Inc',
       'Version', 'StAbb', 'ccode', 'StDay_Part', 'StMon_Part', 'StYear_Part',
       'EndDay_Part', 'EndMon_Part', 'EndYear_Part', 'InSide A', 'SideA',
       'Fatality_Part', 'FatalPre_Part', 'Action_Part', 'HostLev_Part',
       'RevType1_Part', 'RevType2_Part', 'MonthYear', 'Disp_Sequence'],
      dtype='object')

In [564]:
disputes.head(50)
dispute1.columns

Index(['DispNum3', 'DispNum4', 'StDay', 'StMon', 'StYear', 'EndDay', 'EndMon',
       'EndYear', 'Outcome', 'Settle', 'Fatality', 'FatalPre', 'MaxDur',
       'MinDur', 'HiAct', 'HostLev', 'Recip', 'NumA', 'NumB', 'Link1', 'Link2',
       'Link3', 'Ongo2010', 'Version'],
      dtype='object')

#### Visuals

#### Models

In [221]:
y = incidents["Major_Disp"]
X = incidents[["EndYear_Inc", "RevType1_Inc", "RevType2_Inc", "Duration"]]
incidents.columns

Index(['DispNum3', 'DispNum4', 'Major_Disp', 'IncidNum3', 'IncidNum4',
       'StDay_Inc', 'StMon_Inc', 'StYear_Inc', 'EndDay_Inc', 'EndMon_Inc',
       'EndYear_Inc', 'Duration', 'TBI', 'Fatality_Inc', 'FatalPre_Inc',
       'Action_Inc', 'HostLev_Inc', 'NumA', 'RevType1_Inc', 'RevType2_Inc',
       'Version', 'StAbb', 'ccode', 'StDay_Part', 'StMon_Part', 'StYear_Part',
       'EndDay_Part', 'EndMon_Part', 'EndYear_Part', 'InSide A', 'SideA',
       'Fatality_Part', 'FatalPre_Part', 'Action_Part', 'HostLev_Part',
       'RevType1_Part', 'RevType2_Part'],
      dtype='object')

In [222]:
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

0.586698329133


In [229]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y, y_pred))

0.657528527693


In [230]:
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
scores.mean()

0.49138852591902549

In [502]:
rfc = RandomForestClassifier()
rfc.fit(X,y)
y_pred = rfc.predict(X)
metrics.accuracy_score(y, y_pred)
cross_val_score(rfc, X, y, cv=10, scoring='accuracy').mean()

0.51945051939762266