In [2]:
import os

import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Plot styling applied to every figure drawn after this cell.
plt.style.use('ggplot')
plt.rc('font', size=18)

# PostgreSQL connection settings. Each value can be overridden with an
# environment variable so the server address does not have to be edited in
# the notebook itself; the defaults are the values previously hard-coded here.
connection_args = {
    'host': os.environ.get('MID_DB_HOST', '3.86.206.29'),
    'user': os.environ.get('MID_DB_USER', 'ubuntu'),
    'dbname': os.environ.get('MID_DB_NAME', 'mid'),
    # Environment values arrive as strings, so normalise the port to an int.
    'port': int(os.environ.get('MID_DB_PORT', 5432)),
}
# Opens the connection as soon as the cell runs; nothing in this cell closes it.
connection = pg.connect(**connection_args)

def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Parameters
    ----------
    n : int
        Number to test.

    Returns
    -------
    bool
        False for every ``n`` below 2 (0, 1 and negatives are not prime),
        True only when ``n`` has no divisor other than 1 and itself.
    """
    if n < 2:
        return False
    if n < 4:
        # 2 and 3 are prime.
        return True
    if n % 2 == 0:
        return False
    # Only odd candidates up to sqrt(n) need checking: any larger divisor
    # would be paired with a smaller one that has already been tried.
    divisor = 3
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True

def largestPrimeFactor(n):
    """Return the largest prime factor of ``n``.

    Parameters
    ----------
    n : int
        Number to factor.

    Returns
    -------
    int or None
        The largest prime that divides ``n``; ``None`` when ``n`` is below 2,
        since such numbers have no prime factors.
    """
    if n < 2:
        return None

    largest = None
    remaining = n
    factor = 2
    # Strip out each factor completely before moving on, so every divisor
    # found here is prime and the loop only has to reach sqrt(remaining).
    while factor * factor <= remaining:
        if remaining % factor == 0:
            largest = factor
            while remaining % factor == 0:
                remaining //= factor
        factor += 1

    # Anything left above 1 is a single prime larger than all factors found.
    if remaining > 1:
        largest = remaining
    return largest

In [6]:
# Read the Stata file MID4/MIDB_4_3.dta into a DataFrame (path is relative to
# the notebook's working directory).
# NOTE(review): presumably the MID v4.3 participant-level table — confirm the source.
midb = pd.read_stata('MID4/MIDB_4_3.dta')

In [7]:
# Terse source column names -> descriptive names used in the rest of the
# notebook. The spelling of each target name is relied on by later cells
# (e.g. the merge keys), so they are kept exactly as-is.
midb_column_names = {
    'dispnum3': 'dispute_number_v3',
    'dispnum4': 'dispute_number_v4',
    'stabb': 'state_abbriviated',
    'ccode': 'country_code',
    'stday': 'start_day',
    'stmon': 'start_month',
    'styear': 'start_year',
    'endday': 'end_day',
    'endmon': 'end_month',
    'endyear': 'end_year',
    'sidea': 'is_side_a',
    'revstate': 'is_revisionist_state',
    'revtype1': 'revision_type_1',
    'revtype2': 'revision_type_2',
    'fatality': 'fatality_bucket',
    'fatalpre': 'fatalities_number',
    'hiact': 'highest_hostile_action[hostility_level]',
    'hostlev': 'hostility_level',
    'orig': 'dispute_originator',
    'version': 'dataset_version',
}
# Renames in place; midb keeps the same object identity.
midb.rename(columns=midb_column_names, inplace=True)

In [8]:
# Turn every -9 in the frame into NaN, in place.
# NOTE(review): this treats -9 as a missing-value code in all columns — confirm
# no column uses -9 as a genuine value.
midb.replace(-9, np.nan, inplace=True)

In [9]:
# Remove the columns this notebook does not use.
midb.drop(
    columns=[
        'dispute_number_v4',
        'revision_type_2',
        'fatalities_number',
        'dataset_version',
    ],
    inplace=True,
)

# Missing start/end days are filled with 15 so a full date can still be built.
# Assign the result back to the column: calling an in-place method on
# midb[col] operates on a column selection and does not update the frame
# when pandas Copy-on-Write is active.
midb['start_day'] = midb['start_day'].fillna(15)
midb['end_day'] = midb['end_day'].fillna(15)

# Combine the year/month/day parts into datetime columns.
midb['mid_start'] = pd.to_datetime(
    dict(year=midb.start_year, month=midb.start_month, day=midb.start_day)
)
midb['mid_end'] = pd.to_datetime(
    dict(year=midb.end_year, month=midb.end_month, day=midb.end_day)
)

# 999 is the sentinel used here for a missing fatality bucket.
midb['fatality_bucket'] = midb['fatality_bucket'].fillna(999)

In [10]:
# Preview the first rows of midb after renaming and cleaning.
midb.head()

Unnamed: 0,dispute_number_v3,state_abbriviated,country_code,start_day,start_month,start_year,end_day,end_month,end_year,is_side_a,is_revisionist_state,revision_type_1,fatality_bucket,highest_hostile_action[hostility_level],hostility_level,dispute_originator,mid_start,mid_end
0,2,UKG,200,15.0,7,1902,24.0,1,1903,0,1,1,0.0,0,1,1,1902-07-15,1903-01-24
1,2,USA,2,15.0,7,1902,24.0,1,1903,1,1,1,0.0,7,3,1,1902-07-15,1903-01-24
2,3,YUG,345,2.0,5,1913,25.0,10,1913,0,0,0,0.0,0,1,1,1913-05-02,1913-10-25
3,3,AUH,300,2.0,5,1913,25.0,10,1913,1,1,2,0.0,8,3,1,1913-05-02,1913-10-25
4,4,ALB,339,15.0,5,1946,13.0,11,1946,1,0,0,0.0,16,4,1,1946-05-15,1946-11-13


In [25]:
#query = "SELECT count(*) FROM mil_exp;"
#pd_sql.read_sql(query, connection)

In [26]:
#query = "SELECT count(*) FROM midb;"
#pd_sql.read_sql(query, connection)

In [24]:
#query = "SELECT b.*, e.mil_exp FROM midb as b JOIN mil_exp as e on b.state_abbriviated = e.code \
#WHERE b.start_year = e.year;"
#pd_sql.read_sql(query, connection)

In [3]:
# Read mil_exp_tall.csv (relative to the working directory) into a DataFrame.
# The info() output in this notebook lists its columns, including code, year
# and mil_exp, plus two "Unnamed" columns.
# NOTE(review): the "Unnamed" columns look like index columns written out by an
# earlier to_csv call — confirm before relying on them.
mil_exp = pd.read_csv('mil_exp_tall.csv')

In [17]:
# Show column names, dtypes and non-null counts for mil_exp.
mil_exp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15312 entries, 0 to 15311
Data columns (total 8 columns):
Unnamed: 0        15312 non-null int64
Unnamed: 0.1      15312 non-null int64
name              15312 non-null object
code              15312 non-null object
type              15312 non-null object
indicator_name    15312 non-null object
year              15312 non-null int64
mil_exp           9045 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 957.1+ KB


In [12]:
#midb.set_index(['state_abbriviated', 'start_year']).join(mil_exp.set_index(['code','year']))

In [35]:
# Left-join military expenditure onto every midb row, matching the state
# abbreviation and start year against mil_exp's code and year. Rows with no
# match keep NaN in the mil_exp columns.
# NOTE(review): the first rows shown by newdf.head() are all unmatched —
# confirm the two key pairs use the same coding scheme and year range.
newdf = midb.merge(
    mil_exp,
    how='left',
    left_on=['state_abbriviated', 'start_year'],
    right_on=['code', 'year'],
)

In [36]:
# Preview the first rows of the merged frame.
newdf.head()

Unnamed: 0.2,dispute_number_v3,state_abbriviated,country_code,start_day,start_month,start_year,end_day,end_month,end_year,is_side_a,...,mid_start,mid_end,Unnamed: 0,Unnamed: 0.1,name,code,type,indicator_name,year,mil_exp
0,2,UKG,200,15.0,7,1902,24.0,1,1903,0,...,1902-07-15,1903-01-24,,,,,,,,
1,2,USA,2,15.0,7,1902,24.0,1,1903,1,...,1902-07-15,1903-01-24,,,,,,,,
2,3,YUG,345,2.0,5,1913,25.0,10,1913,0,...,1913-05-02,1913-10-25,,,,,,,,
3,3,AUH,300,2.0,5,1913,25.0,10,1913,1,...,1913-05-02,1913-10-25,,,,,,,,
4,4,ALB,339,15.0,5,1946,13.0,11,1946,1,...,1946-05-15,1946-11-13,,,,,,,,


In [18]:
# Read MID4/gdp_hist.csv (relative to the working directory) into a DataFrame.
# The head() output in this notebook shows columns name, code, indicator_name,
# indicator_code, year and gdp.
gdp_hist = pd.read_csv('MID4/gdp_hist.csv')

In [32]:
# Preview the first rows of gdp_hist.
gdp_hist.head()

Unnamed: 0.1,Unnamed: 0,name,code,indicator_name,indicator_code,year,gdp
0,0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,1960,
1,1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,1960,537777800.0
2,2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,1960,
3,3,Albania,ALB,GDP (current US$),NY.GDP.MKTP.CD,1960,
4,4,Andorra,AND,GDP (current US$),NY.GDP.MKTP.CD,1960,


In [37]:
# Left-join GDP onto the running merged frame using the same key pairs
# (state abbreviation + start year against code + year).
# This rebinds newdf to the merge of its previous value, so re-running the
# cell merges gdp_hist a second time.
# NOTE(review): state_abbriviated values such as UKG / YUG / AUH do not look
# like the codes shown in gdp_hist (ABW, AFG, ...) — confirm the keys line up.
newdf = newdf.merge(
    gdp_hist,
    how='left',
    left_on=['state_abbriviated', 'start_year'],
    right_on=['code', 'year'],
)

In [22]:
#pd.merge(midb, gdp_hist, how='right', left_on=['state_abbriviated', 'start_year'], right_on=['code','year']).info()

In [23]:
# Read MID4/population_hist.csv (relative to the working directory) into a DataFrame.
# It is later merged on its code and year columns.
population_hist = pd.read_csv('MID4/population_hist.csv')

In [38]:
# Left-join population onto the running merged frame using the same key pairs
# (state abbreviation + start year against code + year).
# This rebinds newdf to the merge of its previous value, so re-running the
# cell merges population_hist a second time.
newdf = newdf.merge(
    population_hist,
    how='left',
    left_on=['state_abbriviated', 'start_year'],
    right_on=['code', 'year'],
)

In [39]:
# Show column names, dtypes and non-null counts for the fully merged frame.
newdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5558 entries, 0 to 5557
Data columns (total 40 columns):
dispute_number_v3                          5558 non-null int16
state_abbriviated                          5558 non-null object
country_code                               5558 non-null int16
start_day                                  5558 non-null float64
start_month                                5558 non-null int8
start_year                                 5558 non-null int16
end_day                                    5558 non-null float64
end_month                                  5558 non-null int8
end_year                                   5558 non-null int16
is_side_a                                  5558 non-null int8
is_revisionist_state                       5558 non-null int8
revision_type_1                            5558 non-null int8
fatality_bucket                            5558 non-null float64
highest_hostile_action[hostility_level]    5558 non-null int8
hostility_

In [42]:
#mil_exp.describe() 1960-2017

In [43]:
#midb.describe() 1980-2010