In [40]:
import pandas as pd

# Dispute-level data and corresponding country codes into data frames
disputes = pd.read_csv('data/MIDB_4.01.csv')
country_codes = pd.read_csv('data/COW country codes.csv')
country_codes = country_codes.drop_duplicates()

# Get rid of columns we don't care about
disputes = disputes.loc[:, ['StAbb', 'ccode', 'StYear', 'EndYear', 'Orig', 'Fatality', 'FataPre', 'HiAct']]

# Only include hostile acts that involved at least a border violation
severity_filter = disputes['HiAct'].map(lambda x: x >= 12)
disputes = disputes[severity_filter]

In [41]:
# Get state membership by year
states_by_year = pd.read_csv('data/system2011.csv')

In [42]:
# Get years covered
years = states_by_year['year'].unique()

# Make a dict to track state membership by year
membership = dict()
for year in years:
    membership[year] = set()
    
# Iterate over the dataset, creating a set for each year with state membership by CCode
for row in states_by_year.iterrows():
    membership[row[1]['year']].add(row[1]['ccode'])

In [43]:
# Need to do a dataframe with all countries that existed in a year, whether attacked or not and with potential 
# explanatory variables (10 year average of CINC, international memberships, etc.).

# Create a new dataframe with the dispute data but filter out countries who were aggressor and add the variable columns
non_aggressors_filter = disputes['Orig'].map(lambda x: x == 0)
non_aggressors = disputes[non_aggressors_filter]

# New dataframe to store all countries, including those with no disputes in a given year
total_possible_conflicts = disputes

# Add the countries that existed in each year but did not have a conflict
for year in range(1816, 2011):
    
    # Find the countries that existed AND had a conflict for given year
    filter_criteria = disputes['StYear'].map(lambda x: x == year)
    
    # Find countries that existed but did not have a conflict
    had_conflict = set(disputes[filter_criteria]['ccode'])
    no_conflict = membership[year] - had_conflict
    
    for country in no_conflict:
        # Add to original data frame (conflict-related columns should be blank)
        new_line = pd.DataFrame({'ccode': country, 'no_conflict': 1, 'StYear': year, 'EndYear': year}, index=[0])
        total_possible_conflicts = total_possible_conflicts.append(new_line, ignore_index=True)
    

       EndYear  FataPre  Fatality  HiAct  Orig StAbb  StYear  ccode  \
15675     2010      NaN       NaN    NaN   NaN   NaN    2010    483   
15676     2010      NaN       NaN    NaN   NaN   NaN    2010    484   
15677     2010      NaN       NaN    NaN   NaN   NaN    2010    490   
15678     2010      NaN       NaN    NaN   NaN   NaN    2010    501   
15679     2010      NaN       NaN    NaN   NaN   NaN    2010    510   

       no_conflict  
15675            1  
15676            1  
15677            1  
15678            1  
15679            1  


In [44]:
# Add the variables investigating (NMC, etc.)
nmc = pd.read_csv('data/NMC_v4_0.csv')
total_possible_conflicts = pd.merge(left=total_possible_conflicts, right=nmc, left_on=['ccode', 'StYear'], 
                                    right_on=['ccode', 'year'])

# Fill out no_conflict for countries that had one (this is the variable we are testing).
total_possible_conflicts.no_conflict = total_possible_conflicts.no_conflict.fillna(value=0)

# Merge with country code names to get the full state name
total_possible_conflicts = pd.merge(left=total_possible_conflicts,right=country_codes, 
                                    left_on='ccode', right_on='CCode', how='inner')


Index(['EndYear', 'FataPre', 'Fatality', 'HiAct', 'Orig', 'StAbb', 'StYear',
       'ccode', 'no_conflict', 'stateabb', 'year', 'irst', 'milex', 'milper',
       'pec', 'tpop', 'upop', 'cinc', 'version', 'StateAbb', 'CCode',
       'StateNme'],
      dtype='object')