### Environment and Infrastructure

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
plt.style.use('ggplot')
plt.rc('font', size=18)

In [3]:
engine = create_engine('postgresql://ubuntu:5432@3.86.206.29/mid')

In [4]:
connection_args = {
    'host':'3.86.206.29',
    'user':'ubuntu',
    'dbname':'mid',
    'port':5432
}
connection = pg.connect(**connection_args)

In [5]:
!pwd

/Users/elena/Desktop/Metis/Project3_MID


In [6]:
def is_prime(n):
    for i in range(2,n):
        if n%i==0:
            return False
    return True

def largestPrimeFactor(n):
    for i in range(n,1,-1):
        if n%i==0:
            if is_prime(i):
                return i

Checking connection with mid database on AWS on existing tables

In [8]:
# exchage changetype2, changetype1
query = "SELECT changetype2, changetype1 FROM dyadic_mid LIMIT 5;"
pd_sql.read_sql(query, connection)

Unnamed: 0,changetype2,changetype1
0,0.0,
1,0.0,
2,0.0,
3,0.0,
4,0.0,


In [9]:
# exchage changetype2, changetype1
query = "SELECT changetype2, changetype1 FROM dyadic_mid LIMIT 5;"
pd_sql.read_sql(query, connection)

Unnamed: 0,changetype2,changetype1
0,0.0,
1,0.0,
2,0.0,
3,0.0,
4,0.0,


### Exploring and Cleaning MIDA

Dispute-level: one record per dispute.

In [10]:
MIDA = pd.read_stata('MID4/MIDA_4_3.dta')

In [11]:
MIDA.shape

(2315, 24)

In [12]:
MIDA.columns
#outcome: 1:Victory A, 2:Victory B, 3:Yield A, 4:Yield B, 5:Stalemate, 6:Compromise, 7:Released, 8:Unclear, 9:Joins ongoing war, -9:Missing
#settle: 1:Negotioated, 2:Imposed, 3:None, 4: Unclear, -9:Missing 
#fatality: 0:None, 1:1-25deaths, 2:26-100 deaths, 3:101-250 deaths, 4:251-500, 5:501-999, 6:>999 deaths, -9:missing
#highest hostility action: 0:No militarized action, 1:Threat to use force, 2:Threat to bolckade, 3:Threat to occupy territory, 
#########################4:Threat to declare a war, 5:Threat to use CBR weapons, 6:Threat to join the war, 7:Show of force,
#########################8:Alert, 9:Nuclear Alert, 10:Mobilization, 11: Fortify Border, 12:Border violation, 13:Blockade,
#########################14:Occupation of territory, 15:Seizure, 16:Attack, 17:Clash, 18:Declaration of war, 19:Use of CBBR weapons,
#########################20:Begin Interstate War, 21:Join Interstate war, -9: Missing
#hostility level: 1:no militarized action, 2:Threat to use force, 3:Display force, 4:Use of Force, 5: War



Index(['dispnum3', 'dispnum4', 'stday', 'stmon', 'styear', 'endday', 'endmon',
       'endyear', 'outcome', 'settle', 'fatality', 'fatalpre', 'maxdur',
       'mindur', 'hiact', 'hostlev', 'recip', 'numa', 'numb', 'link1', 'link2',
       'link3', 'ongo2010', 'version'],
      dtype='object')

In [13]:
MIDA.rename(columns={'dispnum3':'dispute_number_v3', 
                     'dispnum4':'dispute_number_v4', 
                     'stday':'start_day', 
                     'stmon':'start_month', 
                     'styear':'start_year', 
                     'endday':'end_day',
                     'endmon':'end_month',
                     'endyear':'end_year',
                     'fatality':'fatality_bucket',
                     'fatalpre':'fatalities_number',
                     'maxdur':'maximum_diration',
                     'mindur':'minimum_duration',
                     'hiact':'highest_hostile_action[hostility_level]',
                     'hostlev':'hostility_level',
                     'resip':'resiprocated_dispute',
                     'numa':'number_states_side_a',
                     'numb':'number_states_side_b',
                     'link1':'related_dispute_number_1',
                     'link2':'related_dispute_number_2',
                     'link3':'related_dispute_number_3',
                     'ongo2010':'ongoing_2010',
                     'version':'dataset_version'
                    }, inplace=True)

In [14]:
MIDA['related_dispute_number_1'][MIDA['related_dispute_number_1']=='147W'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
MIDA['related_dispute_number_2'][MIDA['related_dispute_number_2']=='166W'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
MIDA.to_csv('MID4/MIDA.csv')

In [7]:
MIDA_chunks = pd.read_csv('MID4/MIDA.csv', chunksize=463)

In [11]:
for chunk in MIDA_chunks:
    chunk.to_sql(name='mida', if_exists='replace', con=engine)

### Exploring and Cleaning MIDB

Participant-Dispute level: one record per participant per dispute.

In [None]:
MIDB = pd.read_stata('MID4/MIDB_4_3.dta')

In [None]:
MIDB.shape

In [None]:
MIDB.columns
#revision_type: 0:Not applicable, 1:Territory, 2:Policy, 3:Regime/government, 4:Other, -9:Missing

In [None]:
MIDB.rename(columns={'dispnum3':'dispute_number_v3', 
                     'dispnum4':'dispute_number_v4',
                     'stabb':'state_abbriviated',
                     'ccode':'country_code',
                     'stday':'start_day', 
                     'stmon':'start_month', 
                     'styear':'start_year', 
                     'endday':'end_day',
                     'endmon':'end_month',
                     'endyear':'end_year',
                     'sidea':'is_side_a',
                     'revstate':'is_revisionist_state',
                     'revtype1':'revision_type_1',
                     'revtype2':'revision_type_2',
                     'fatality':'fatality_bucket',
                     'fatalpre':'fatalities_number',
                     'hiact':'highest_hostile_action[hostility_level]',
                     'hostlev':'hostility_level',
                     'orig':'dispute_originator',
                     'version':'dataset_version'
                    }, inplace=True)

In [None]:
MIDB.reset_index(inplace=True)

In [None]:
MIDB.set_index('index', inplace=True)

In [None]:
MIDB.to_csv('MID4/MIDB.csv')

In [12]:
MIDB_chunks = pd.read_csv('MID4/MIDB.csv', chunksize=397)

In [13]:
for chunk in MIDB_chunks:
    chunk.to_sql(name='midb', if_exists='replace', con=engine)

### Exploring and Cleaning MIDI

Incident-level: one record per incident per participant.

In [None]:
MIDI = pd.read_stata('MID4/MIDI_4_3.dta')

In [None]:
MIDI.shape

In [None]:
MIDI.columns
#revision_type: 0:Not applicable, 1:Territory, 2:Policy, 3:Regime/government, 4:Other, -9:Missing

In [None]:
MIDI.rename(columns={'dispnum3':'dispute_number_v3', 
                     'dispnum4':'dispute_number_v4',
                     'incidnum3':'incident_number_v3', 
                     'incidnum4':'incident_number_v4',
                     'stday':'start_day_incident', 
                     'stmon':'start_month_incident', 
                     'styear':'start_year_incident', 
                     'endday':'end_day_incident',
                     'endmon':'end_month_incident',
                     'endyear':'end_year_incident',
                     'duration':'duration_incident',
                     'tbi':'days_btwn_consec_incidents_within_dispute',
                     'fatality':'fatality_bucket_incident',
                     'fatalpre':'fatalities_number_incident',
                     'action':'hostile_action_incident[hostility_level]',
                     'hostlev':'hostility_level_incident',
                     'numa':'states_number_a',
                     'revtype1':'revision_type_1',
                     'revtype2':'revision_type_2',
                     'version':'dataset_version'
                    }, inplace=True)

In [None]:
MIDI.reset_index(inplace=True)
MIDI.set_index('index', inplace=True)

In [None]:
MIDI.to_csv('MID4/MIDI.csv')

In [None]:
MIDI_chunks = pd.read_csv('MID4/MIDI.csv', chunksize=114)

In [None]:
for chunk in MIDI_chunks:
    chunk.to_sql(name='midi', if_exists='append', con=engine)

### Exploring and Cleaning MIDIP

Participant-Incident-level: one record per participant per incident. 

*cookbook is confusing here as it calls it 'incident' level, but clearly it is participant level for incident*

In [None]:
MIDIP = pd.read_stata('MID4/MIDIP_4_3.dta')

In [None]:
MIDIP.shape

In [None]:
MIDIP.columns
#revision_type: 0:Not applicable, 1:Territory, 2:Policy, 3:Regime/government, 4:Other, -9:Missing

In [None]:
MIDIP.rename(columns={'dispnum3':'dispute_number_v3', 
                     'dispnum4':'dispute_number_v4',
                     'incidnum3':'incident_number_v3', 
                     'incidnum4':'incident_number_v4',
                     'stabb':'state_abbriviated',
                     'ccode':'country_code',
                     'stday':'start_day_incident', 
                     'stmon':'start_month_incident', 
                     'styear':'start_year_incident', 
                     'endday':'end_day_incident',
                     'endmon':'end_month_incident',
                     'endyear':'end_year_incident',
                     'insidea':'is_incident_side_a',
                     'sidea':'is_side_a',
                     'fatality':'fatality_bucket_incident',
                     'fatalpre':'fatalities_number_incident',
                     'action':'hostile_action_incident[hostility_level]',
                     'hostlev':'hostility_level_incident',
                     'revtype1':'revision_type_1',
                     'revtype2':'revision_type_2',
                     'version':'dataset_version'
                    }, inplace=True)

In [None]:
MIDIP.reset_index(inplace=True)
MIDIP.set_index('index', inplace=True)

In [None]:
MIDIP.to_csv('MID4/MIDIP.csv')

In [None]:
MIDIP_chunks = pd.read_csv('MID4/MIDIP.csv', chunksize=1433)

In [None]:
for chunk in MIDIP_chunks:
    chunk.to_sql(name='midip', if_exists='replace', con=engine)

## Additional Data: Military Spending

https://www.kaggle.com/nitinsss/military-expenditure-of-countries-19602019

In [12]:
mil_exp = pd.read_csv('Military Expenditure.csv')

In [None]:
mil_exp = pd.melt(mil_exp, id_vars=['Name', 'Code', 'Type', 'Indicator Name'], 
        value_vars=[str(year) for year in np.arange(1960, 2018)],
        var_name='year',
        value_name='mil_exp')

In [None]:
mil_exp.info()

In [25]:
mil_exp.to_csv('mil_exp_tall.csv')

In [8]:
mil_exp=pd.read_csv('MID4/mil_exp_tall.csv')

In [21]:
mil_exp.rename(columns={'Name':'name', 
                         'Code':'code', 
                         'Type':'type', 
                         'Indicator Name':'indicator_name'},
                inplace=True)

In [25]:
mil_exp.to_csv('mil_exp_tall.csv')

In [7]:
mil_exp_chunks = pd.read_csv('MID4/mil_exp_tall.csv', chunksize=528)

In [8]:
for chunk in mil_exp_chunks:
    chunk.to_sql(name='mil_exp_', if_exists='append', con=engine)

In [10]:
query = "SELECT year, mil_exp FROM mil_exp_ LIMIT 5;"
pd_sql.read_sql(query, connection)

Unnamed: 0,year,mil_exp
0,1960,
1,1960,
2,1960,
3,1960,
4,1960,


In [11]:
query = "SELECT count(*) FROM mil_exp_;"
pd_sql.read_sql(query, connection)

Unnamed: 0,count
0,15312


## Additional Data: Macroeconomics

### Key metircs 2017

Form Kaggle, 'countries' dataset composed from UN Data.

https://www.kaggle.com/sudalairajkumar/undata-country-profiles

In [None]:
country_profile_un = pd.read_csv('country_profile_variables.csv')

In [None]:
country_profile_un.shape

In [None]:
country_profile_un.set_index('country', inplace=True)

In [None]:
country_profile_un.rename(columns={'Surface area (km2)':'surface_area_km2',
                                   'Region':'region',
                                  'Population in thousands (2017)':'population_K',
                                  'Population density (per km2, 2017)':'population_density_per_km2',
                                  'Sex ratio (m per 100 f, 2017)':'sex_ratio_m_per_100_f',
                                  'GDP: Gross domestic product (million current US$)':'gdp_mln_usd',
                                  'GDP growth rate (annual %, const. 2005 prices)':'gdp_growth_rate_annual',
                                  'GDP per capita (current US$)':'gdp_per_capita_usd',
                                  'Economy: Agriculture (% of GVA)':'economy_agriculture_perc_GVA',
                                  'Economy: Industry (% of GVA)':'economy_industry_perc_GVA',
                                  'Economy: Services and other activity (% of GVA)':'economy_services_other_perc_GVA',
                                  'Employment: Agriculture (% of employed)':'employment_agriculture_perc',
                                  'Employment: Industry (% of employed)':'employment_industry_perc',
                                  'Employment: Services (% of employed)':'employment_services_perc',
                                  'Unemployment (% of labour force)':'unemployment_perc',
                                  'Labour force participation (female/male pop. %)':'labour_participation_gender_f_to_m_perc',
                                  'Agricultural production index (2004-2006=100)':'agricultural_production_index',
                                  'Food production index (2004-2006=100)':'food_production_index',
                                  'International trade: Exports (million US$)':'intl_trade_exports_mln_usd',
                                  'International trade: Imports (million US$)':'intl_trade_imports_mln_usd',
                                  'International trade: Balance (million US$)':'intl_trade_balance_mln_usd',
                                  'Balance of payments, current account (million US$)':'balance_payments_current_mln_usd',
                                  'Population growth rate (average annual %)':'population_growth_ann_perc',
                                  'Urban population (% of total population)':'urban_population',
                                  'Urban population growth rate (average annual %)':'urban_population_growth_rate_ann_perc',
                                  'Fertility rate, total (live births per woman)':'fertility_rate_live_births_per_f',
                                  'Life expectancy at birth (females/males, years)':'life_expectancy',
                                  'Population age distribution (0-14 / 60+ years, %)':'age_distribution_0-14_60+_perc',
                                  'International migrant stock (000/% of total pop.)':'intl_migration_population_perc',
                                  'Refugees and others of concern to UNHCR (in thousands)':'refugees_K',
                                   'Infant mortality rate (per 1000 live births':'infant_mortality_rate_per_1000_live',
                                   'Health: Total expenditure (% of GDP)':'health_expenditure_gdp_perc',
                                   'Health: Physicians (per 1000 pop.)':'health_physicians_per_1000',
                                   'Education: Government expenditure (% of GDP)':'edu_gov_expenditure_gdp_perc',
                                   'Education: Primary gross enrol. ratio (f/m per 100 pop.)':'edu_primary_gross_enroll_ratio_f/m_per_100',
                                   'Education: Secondary gross enrol. ratio (f/m per 100 pop.)':'edu_secondary_gross_enroll_ratio_f/m_per_100',
                                   'Education: Tertiary gross enrol. ratio (f/m per 100 pop.)':'edu_tertiary_gross_enroll_ratio_f/m_per_100',
                                   'Seats held by women in national parliaments %':'female_seats_national_parliament_perc',
                                   'Mobile-cellular subscriptions (per 100 inhabitants)':'mobile_subscriptions_per_100',
                                   'Mobile-cellular subscriptions (per 100 inhabitants).1':'mobile_subscriptions_per_100_1',
                                   'Individuals using the Internet (per 100 inhabitants)':'internet_users_per_100',
                                   'Threatened species (number)':'threatened_species_n',
                                   'Forested area (% of land area)':'forest_area_perc', 
                                   'CO2 emission estimates (million tons/tons per capita)':'CO2_emiss_estimate_mln_ton_ton_per_capita',
                                   'Energy production, primary (Petajoules)':'energy_production_primary_petajoules',
                                   'Energy supply per capita (Gigajoules)':'energy_supply_per_capita_gigajoules',
                                   'Pop. using improved drinking water (urban/rural, %)':'population_improved_drinking_water_access_urban/rural_perc',
                                   'Pop. using improved sanitation facilities (urban/rural, %)':'population_improved_sanitation_facil_access_urban/rural_perc',
                                   'Net Official Development Assist. received (% of GNI)':'net_official_develop_assist_recieved_GNI_perc'}, inplace=True)

In [None]:
country_profile_un.to_csv('country_profile_un_rich_2017.csv')

In [None]:
country_profile_un = pd.read_csv('country_profile_un_rich_2017.csv')

In [None]:
country_profile_un.to_sql(name='country_profile_un_rich_2017', if_exists='replace', con=engine)

In [None]:
#!pip install datapackage
#from datapackage import Package
#package=Package('https://datahub.io/core/gdp/datapackage.json')
#print(package.resource_names)
#for resource in package.resources:
#    if resource.descriptor['datahub']['type'] == 'derived/csv':
#        gdp = resource.read()
#type(gdp)
#gdp = pd.DataFrame(gdp) 

### GDP

https://data.worldbank.org/indicator/NY.GDP.MKTP.CD

In [None]:
gdp_hist = pd.read_csv('gdp.csv')

In [None]:
gdp_hist = pd.melt(gdp_hist, id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], 
        value_vars=[str(year) for year in np.arange(1960, 2019)],
        var_name='year',
        value_name='gdp')

In [None]:
gdp_hist.rename(columns={'Country Name':'name', 
                         'Country Code':'code', 
                         'Indicator Name':'indicator_name', 
                         'Indicator Code':'indicator_code'},
                inplace=True)                        

In [None]:
gdp_hist.to_csv('gdp_hist.csv')

In [None]:
#gdp_hist.shape
#largestPrimeFactor(15576)
#gdp_hist.indicator_name.unique()

In [None]:
gdp_hist_chunks = pd.read_csv('gdp_hist.csv', chunksize=264)

In [None]:
for chunk in gdp_hist_chunks:
    chunk.to_sql(name='gdp_hist', if_exists='replace', con=engine)

### Population

https://data.worldbank.org/indicator/SP.POP.TOTL

In [None]:
population_hist = pd.read_csv('population.csv')

In [None]:
population_hist = population_hist.iloc[:,:-1]

In [None]:
population_hist = pd.melt(population_hist, id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], 
        value_vars=[str(year) for year in np.arange(1960, 2019)],
        var_name='year',
        value_name='population')

In [None]:
population_hist.rename(columns={'Country Name':'name', 
                         'Country Code':'code', 
                         'Indicator Name':'indicator_name', 
                         'Indicator Code':'indicator_code'},
                inplace=True)

In [None]:
population_hist.to_csv('population_hist.csv')

In [None]:
population_hist.shape
#largestPrimeFactor(15576)

In [None]:
population_hist_chunks = pd.read_csv('population_hist.csv', chunksize=264)

In [None]:
for chunk in population_hist_chunks:
    chunk.to_sql(name='population_hist', if_exists='replace', con=engine)

### Economy

https://data.worldbank.org/topic/economy-and-growth

In [None]:
economy_hist = pd.read_csv('economy.csv')

In [None]:
economy_hist['Indicator Name'].unique()

In [None]:
economy_hist = economy_hist.iloc[:,:-1]

In [None]:
economy_hist = pd.melt(economy_hist, id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], 
        value_vars=[str(year) for year in np.arange(1960, 2019)],
        var_name='year',
        value_name='value')

In [None]:
economy_hist.rename(columns={'Country Name':'name', 
                         'Country Code':'code', 
                         'Indicator Name':'indicator_name', 
                         'Indicator Code':'indicator_code'},
                inplace=True)

In [None]:
economy_hist.to_csv('economy_hist.csv')

In [None]:
economy_hist.shape
largestPrimeFactor(3816120)
3816120/(118*2*2*5)

In [None]:
economy_hist_chunks = pd.read_csv('economy_hist.csv', chunksize=1617)

In [1]:
#for chunk in economy_hist_chunks:
#    chunk.to_sql(name='economy_hist', if_exists='replace', con=engine)