In [64]:
import pandas as pd
import numpy as np
from collections import defaultdict

def load_data():
    disputes = pd.read_csv('data/MIDB_4.01.csv')
    states_by_year = pd.read_csv('data/system2011.csv') # State membership in each year of dataset

    # Get rid of columns we don't care about
    disputes = disputes.loc[:, ['DispNum3', 'StAbb', 'ccode', 'StYear', 'EndYear', 'Orig', 'Fatality', 'FataPre', 
                                'HiAct', 'HostLev']]
    states_by_year = states_by_year.loc[:, ['stateabb', 'ccode', 'year']]
    
    # Only include hostile acts where force was used, not just displayed or threatened
    disputes = disputes[disputes['HostLev'] >= 4]
    
    return disputes, states_by_year, country_codes

disputes, states_by_year, country_codes = load_data()

In [65]:
def get_membership_sets(states_by_year):
    years = states_by_year['year'].unique()

    # Make a dict to track state membership by year
    membership = defaultdict(set)

    # Add states that existed each year to dict
    for year in years:
        membership[year].update(states_by_year[states_by_year['year'] == year]['ccode'])
    
    return membership
        
membership = get_membership_sets(states_by_year)

In [67]:
def get_total_possible_conflicts(disputes, membership_by_year):
    
    # Add each "opportunity" for conflict, where two states existed and might have fought in a given year
    new_data = list()
    
    for year in membership_by_year:
        print('Processing year: {}'.format(year))
        yearly_disputes = disputes[disputes['StYear'] == year]
        for country in membership_by_year[year]:
            for potential_opponent in membership_by_year[year] - set([country]):
            
                # Find any conflicts the given country was in
                conflicts = set(yearly_disputes[disputes['ccode'] == country]['DispNum3'])

                # Find parties on the opposite side of the conflict
                actual_opponents = set(disputes[(disputes['DispNum3'].isin(conflicts)) & 
                                                (disputes['ccode'] != country)]['ccode'])

                # Create an entry for each pair of countries that could have had a conflict in the dataset
                new_data.append({
                    'ccode': country,
                    'ccode_potential_opponent': potential_opponent,
                    'fought_potential_opponent': potential_opponent in actual_opponents,
                    'year': year
                })
    
    return pd.DataFrame(new_data)

total_possible_conflicts = get_total_possible_conflicts(disputes, membership)

Processing year: 1816
Processing year: 1817
Processing year: 1818
Processing year: 1819
Processing year: 1820
Processing year: 1821
Processing year: 1822
Processing year: 1823
Processing year: 1824
Processing year: 1825
Processing year: 1826
Processing year: 1827
Processing year: 1828
Processing year: 1829
Processing year: 1830
Processing year: 1831
Processing year: 1832
Processing year: 1833
Processing year: 1834
Processing year: 1835
Processing year: 1836
Processing year: 1837
Processing year: 1838
Processing year: 1839
Processing year: 1840
Processing year: 1841
Processing year: 1842
Processing year: 1843
Processing year: 1844
Processing year: 1845
Processing year: 1846
Processing year: 1847
Processing year: 1848
Processing year: 1849
Processing year: 1850
Processing year: 1851
Processing year: 1852
Processing year: 1853
Processing year: 1854
Processing year: 1855
Processing year: 1856
Processing year: 1857
Processing year: 1858
Processing year: 1859
Processing year: 1860
Processing

In [68]:
# Save dyad pairs since constructing them is time intensive
total_possible_conflicts.to_csv('data/total_possible_conflicts.csv')

In [332]:
def add_data(conflicts):
    
    # Add materiel capabilities
    nmc = pd.read_csv('data/NMC_v4_0.csv').loc[:, ['stateabb', 'ccode', 'year', 'cinc']]
    conflicts = pd.merge(left=conflicts, right=nmc, left_on=['ccode', 'year'], right_on=['ccode', 'year'])
    conflicts = pd.merge(left=conflicts, right=nmc, left_on=['ccode_potential_opponent', 'year'],                 
                         right_on=['ccode', 'year'], suffixes=('', '_potential_opponent'))

    # Merge with country code names to get the full state name
    country_codes = pd.read_csv('data/COW country codes.csv').drop_duplicates()
    country_codes = country_codes.loc[:, ['StateAbb', 'StateNme']]
    conflicts = pd.merge(left=conflicts, right=country_codes, left_on='stateabb', right_on='StateAbb')
    conflicts = pd.merge(left=conflicts, right=country_codes, left_on='stateabb_potential_opponent', 
                         right_on='StateAbb', suffixes=('', '_potential_opponent'))
    
    # Drop duplicated columns
    conflicts = conflicts.loc[:, [not x for x in conflicts.columns.duplicated()]]
    
    
    # Add alliance data
    alliances = pd.read_csv('data/alliance_v4.1_by_directed_yearly.csv').loc[:, ['ccode1', 'ccode2', 'defense',
                                                                                 'neutrality', 'nonaggression',
                                                                                 'entente', 'year']]
    conflicts = pd.merge(left=conflicts, right=alliances, left_on=['ccode', 'ccode_potential_opponent', 'year'], 
                         right_on=['ccode1', 'ccode2', 'year'])
    
    # Add contiguity data
    contiguity_data = pd.read_csv('data/contdird.csv')
    conflicts = pd.merge(left=conflicts, right=contiguity_data, left_on=['ccode', 'ccode_potential_opponent', 'year'],
                         right_on=['state1no', 'state2no', 'year'], how="left")
    # Add a new categorical value to indicate countries with no contiguity relationship in dataset
    conflicts['conttype'] = conflicts['conttype'].fillna(6)
    
    # Add trade data
    trade_data = pd.read_csv('data/Dyadic_COW_4.0.csv').loc[:, ['ccode1', 'ccode2', 'year', 'flow1', 'flow2']]
    conflicts = pd.merge(left=conflicts, right=trade_data, left_on=['ccode', 'ccode_potential_opponent', 'year'],
                         right_on=['ccode1', 'ccode2', 'year'], suffixes=('', '_extra'))
    
    return conflicts.loc[:, ['ccode', 'ccode_potential_opponent', 'fought_potential_opponent', 'year', 'stateabb', 
                             'cinc', 'stateabb_potential_opponent', 'cinc_potential_opponent', 'StateNme',
                             'StateNme_potential_opponent', 'defense', 'neutrality', 'nonaggression', 'entente', 
                             'conttype', 'flow1', 'flow2']]

conflicts_with_data = add_data(total_possible_conflicts)

In [333]:
def clean_data(conflicts):
    
    # Since -9 marks missing values in dataset
    conflicts = conflicts[(conflicts_with_data.values != -9).any(axis=1)]
    conflicts = conflicts.dropna()
    conflicts = conflicts.drop_duplicates(subset=['ccode', 'ccode_potential_opponent', 'year'])
    
    # Dummify the contiguity data
    conflicts = pd.get_dummies(conflicts, columns=['conttype'])
    
    return conflicts

conflicts_cleaned = clean_data(conflicts_with_data)

In [334]:
def tally_alliance_power(conflicts, membership_by_year):
    """Returns a new dataframe with alliance power by year."""
    
    alliance_powers = list()
    
    # Tally the strength of alliances on each side and add to the data
    for year in membership_by_year:
        print('Processing year: {}'.format(year))
        yearly_data = conflicts[conflicts['year'] == year]
        for country in membership_by_year[year]:
            # Select using potential opponents since data is organized for the "receiver" of a deal
            # to be the country promised defense by the other
            alliance_power = yearly_data[(yearly_data['ccode_potential_opponent'] == country) & 
                                       (yearly_data['defense'] == 1)]['cinc'].sum()
            alliance_powers.append({
                'year': year,
                'ccode': country,
                'allies_power': alliance_power
            })
    
    return pd.DataFrame(alliance_powers)
            
alliance_powers_df = tally_alliance_power(conflicts_cleaned, membership)

Processing year: 1800
Processing year: 1801
Processing year: 1816
Processing year: 1817
Processing year: 1818
Processing year: 1819
Processing year: 1820
Processing year: 1821
Processing year: 1822
Processing year: 1823
Processing year: 1824
Processing year: 1825
Processing year: 1826
Processing year: 1827
Processing year: 1828
Processing year: 1829
Processing year: 1830
Processing year: 1831
Processing year: 1832
Processing year: 1833
Processing year: 1834
Processing year: 1835
Processing year: 1836
Processing year: 1837
Processing year: 1838
Processing year: 1839
Processing year: 1840
Processing year: 1841
Processing year: 1842
Processing year: 1843
Processing year: 1844
Processing year: 1845
Processing year: 1846
Processing year: 1847
Processing year: 1848
Processing year: 1849
Processing year: 1850
Processing year: 1851
Processing year: 1852
Processing year: 1853
Processing year: 1854
Processing year: 1855
Processing year: 1856
Processing year: 1857
Processing year: 1858
Processing

In [335]:
def add_alliance_power(conflicts, alliance_powers):

    # Merge alliance powers with existing conflict data
    conflicts = pd.merge(left=conflicts, right=alliance_powers, left_on=['ccode', 'year'],
                         right_on=['ccode', 'year'])
    conflicts = pd.merge(left=conflicts, right=alliance_powers, left_on=['ccode_potential_opponent', 'year'],
                     right_on=['ccode', 'year'], suffixes=('', '_potential_opponent'))
    
    # Remove extra columns
    conflicts = conflicts.loc[:, [not x for x in conflicts.columns.duplicated()]]
    return conflicts
    
potential_conflicts_prepped = add_alliance_power(conflicts_cleaned, alliance_powers_df)

In [355]:
import statsmodels.api as sm

# Run the regression per
# http://blog.yhat.com/posts/logistic-regression-python-rodeo.html

# Add intercept
potential_conflicts_prepped['intercept'] = 1

independent_vars = potential_conflicts_prepped.loc[:, ['cinc', 'cinc_potential_opponent', 'defense', 'neutrality', 
                                            'nonaggression', 'entente', 'flow1', 'flow2','conttype_1.0', 
                                            'conttype_2.0', 'conttype_3.0', 'conttype_4.0', 'conttype_5.0', 
                                            'allies_power', 'allies_power_potential_opponent',
                                            'intercept']]
logit = sm.Logit(potential_conflicts_prepped['fought_potential_opponent'], independent_vars)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.062100
         Iterations 9


0,1,2,3
Dep. Variable:,fought_potential_opponent,No. Observations:,51226.0
Model:,Logit,Df Residuals:,51210.0
Method:,MLE,Df Model:,15.0
Date:,"Sat, 11 Mar 2017",Pseudo R-squ.:,0.09718
Time:,16:31:22,Log-Likelihood:,-3181.1
converged:,True,LL-Null:,-3523.6
,,LLR p-value:,3.1729999999999998e-136

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
cinc,5.0679,0.635,7.979,0.000,3.823 6.313
cinc_potential_opponent,7.7708,1.091,7.122,0.000,5.632 9.909
defense,-0.5988,0.115,-5.206,0.000,-0.824 -0.373
neutrality,0.3321,0.158,2.104,0.035,0.023 0.641
nonaggression,-0.4187,0.088,-4.743,0.000,-0.592 -0.246
entente,-0.3567,0.110,-3.254,0.001,-0.572 -0.142
flow1,-4.528e-05,1.01e-05,-4.464,0.000,-6.52e-05 -2.54e-05
flow2,7.236e-05,1.2e-05,6.016,0.000,4.88e-05 9.59e-05
conttype_1.0,1.2513,0.094,13.352,0.000,1.068 1.435


In [352]:
def find_extra(x):
    try:
        return len(x) > 1
    except TypeError:
        return False

potential_conflicts_prepped[potential_conflicts_prepped['allies_power_potential_opponent'].apply(find_extra)]

Unnamed: 0,ccode,ccode_potential_opponent,fought_potential_opponent,year,stateabb,cinc,stateabb_potential_opponent,cinc_potential_opponent,StateNme,StateNme_potential_opponent,...,flow2,conttype_1.0,conttype_2.0,conttype_3.0,conttype_4.0,conttype_5.0,conttype_6.0,allies_power,allies_power_potential_opponent,intercept


In [328]:
potential_conflicts_prepped.columns

Index(['ccode', 'ccode_potential_opponent', 'fought_potential_opponent',
       'year', 'stateabb', 'cinc', 'stateabb_potential_opponent',
       'cinc_potential_opponent', 'StateNme', 'StateNme_potential_opponent',
       'defense', 'neutrality', 'nonaggression', 'entente', 'flow1', 'flow2',
       'conttype_1.0', 'conttype_2.0', 'conttype_3.0', 'conttype_4.0',
       'conttype_5.0', 'conttype_6.0', 'allies_power',
       'allies_power_potential_opponent', 'intercept'],
      dtype='object')

In [331]:
potential_conflicts_prepped.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False], dtype=bool)