In [1]:
from warnings import filterwarnings

filterwarnings('ignore')

import pandas as pd
import numpy as np
from copy import deepcopy
from traceback import format_exc
from pprint import pprint
import the_networks_of_war_python_functions

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Descriptive Statistics for Each Country by Year
### Note: Applies to states/countries only.
### This will be joined to the participants of each war

In [3]:
alliance_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/alliance_v4.1_by_member_yearly.csv', encoding = 'latin-1')

# print('alliance_v4.1_by_member_yearly columns: \n')
# pprint(sorted(list(alliance_df.columns)))
# print('\n')

alliance_df.rename({'ccode': 'c_code',
                    'defense': 'defense_alliances',
                    'entente': 'entente_alliances',
                    'neutrality': 'neutrality_alliances',
                    'ss_type': 'alliances'}, axis = 1, inplace = True)

alliance_df['defense_alliances'] = alliance_df['defense_alliances'].astype(float)
alliance_df['entente_alliances'] = alliance_df['entente_alliances'].astype(float)
alliance_df['neutrality_alliances'] = alliance_df['neutrality_alliances'].astype(float)

aggregations = {
    'alliances': 'count',
    'defense_alliances': 'sum',
    'entente_alliances': 'sum',
    'neutrality_alliances': 'sum',
    }

alliance_df = deepcopy(alliance_df.groupby(['c_code', 'year']).agg(aggregations).reset_index())

In [4]:
## dyadic trade data that will need to be adjusted to be non-dyadic (by country, by year)
trade_df_1 = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Dyadic_COW_4.0.csv', encoding = 'utf8')
trade_df_1.rename({'ccode1': 'c_code_a',
                   'ccode2': 'c_code_b',
                   'flow2': 'money_flow_in_a',
                   ## money flow out
                   'flow1': 'money_flow_in_b'}, axis = 1, inplace = True)

# print('Dyadic_COW_4.0 columns: \n')
# pprint(sorted(list(trade_df_1.columns)))
# print('\n')

## need to union to take summations but won't need to dedupe because there are no duplicates between a and b.
# this means a can be summed on its own when it's combined with b.
switched_columns_list = ['c_code_a',
                         'c_code_b',
                         'money_flow_in_a',
                         'money_flow_in_b']
trade_df_1 = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(trade_df_1, switched_columns_list))
trade_df_1.rename({'money_flow_in_a': 'money_flow_in',
                   'money_flow_in_b': 'money_flow_out'}, axis = 1, inplace = True)

aggregations = {'money_flow_in': 'sum',
                'money_flow_out': 'sum'}
trade_df_1 = trade_df_1.groupby(['c_code_a', 'year']).agg(aggregations).reset_index()
trade_df_1.rename({'c_code_a':'c_code'}, axis = 1, inplace = True)

trade_df_2 = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/National_COW_4.0.csv', encoding = 'latin-1')
trade_df_2.rename({'ccode': 'c_code'}, axis = 1, inplace = True)

# print('National_COW_4.0 columns: \n')
# pprint(sorted(list(trade_df_2.columns)))
# print('\n')

trade_df_2 = deepcopy(trade_df_2[['c_code', 'year', 'imports', 'exports']])

trade_df = deepcopy(pd.merge(trade_df_1, trade_df_2, how = 'outer', on = ['c_code', 'year']))

In [5]:
# trade_df.head(3)

In [6]:
mil_cap_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/NMC_5_0-wsupplementary.csv', encoding = 'latin-1')

mil_cap_df.rename({'milex': 'military_expenditure',
                   'milper': 'military_personnel',
                   'irst': 'iron_steel_production',
                   'pec': 'prim_energy_consumption',
                   'tpop': 'total_population',
                   'upop': 'urban_population',
                   'upopgrowth': 'urban_pop_growth_rate',
                   'ccode': 'c_code',
                   'cinc': 'cinc_score'}, axis = 1, inplace = True)

# print('NMC_5_0-wsupplementary columns: \n')
# pprint(sorted(list(mil_cap_df.columns)))
# print('\n')

mil_cap_df = mil_cap_df.sort_values(by = 'year', ascending = True).reset_index(drop = True)
mil_cap_df = deepcopy(mil_cap_df[['c_code',
                                  'year',
                                  'military_expenditure',
                                  'military_personnel',
                                  'prim_energy_consumption',
                                  'iron_steel_production',
                                  'total_population',
                                  'urban_population',
                                  'cinc_score']])

In [7]:
descriptive_df_1 = deepcopy(pd.merge(trade_df, mil_cap_df, how = 'outer', on = ['c_code', 'year']))
descriptive_df_1 = deepcopy(pd.merge(descriptive_df_1, alliance_df, how = 'outer', on = ['c_code', 'year']))
descriptive_df_1['year'] = descriptive_df_1['year'].astype(float)

In [8]:
print('total rows of descriptive participant data: {}'.format(format(len(descriptive_df_1), ',d')))
descriptive_df_1.to_pickle('/Users/the_networks_of_war/data_sources/pickles/participant_descriptive_df.pkl')

total rows of descriptive participant data: 16,388


# Dyadic Descriptive Statistics by Year
### Note: Applies to states/countries only.
### This will be joined to the dyadic pairs for each war

In [9]:
## lot's to use in this dataset so I'll start with the basics
territory_dy_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/tc2018.csv', encoding = 'utf8')
## must be dyadic (two states per row)
territory_dy_df = deepcopy(territory_dy_df[territory_dy_df['loser']!=-9])
## starting with a binary representation of this dataset
## just checking which states appear in here at all.
## specific fields can be added later
territory_dy_df['territory_exchange'] = 1
territory_dy_df.rename({'gainer': 'c_code_a',
                        'loser': 'c_code_b'}, axis = 1, inplace = True)

# print('contcold columns: \n')
# pprint(sorted(list(colonial_contiguity_dy_df.columns)))
# print('\n')

territory_dy_df = deepcopy(territory_dy_df[['c_code_a',
                                            'c_code_b',
                                            'year',
                                            'territory_exchange']])

## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b']
territory_dy_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(territory_dy_df, switched_columns_list))

In [10]:
## contiguity dataframe for states of colonial dependencies
colonial_contiguity_dy_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/contcold.csv', encoding = 'utf8')
colonial_contiguity_dy_df.rename({'statelno': 'c_code_a',
                                  'statehno': 'c_code_b',
                                  'land': 'land_colonial_contiguity',
                                  'sea': 'sea_colonial_contiguity',
                                  'total': 'total_colonial_contiguity'}, axis = 1, inplace = True)

# print('contcold columns: \n')
# pprint(sorted(list(colonial_contiguity_dy_df.columns)))
# print('\n')

colonial_contiguity_dy_df = deepcopy(colonial_contiguity_dy_df[['c_code_a',
                                                                'c_code_b',
                                                                'year',
                                                                'land_colonial_contiguity',
                                                                'sea_colonial_contiguity',
                                                                'total_colonial_contiguity']])

## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b']
colonial_contiguity_dy_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(colonial_contiguity_dy_df, switched_columns_list))

In [11]:
contiguity_dy_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/contdird.csv', encoding = 'utf8')
contiguity_dy_df.rename({'state1no': 'c_code_a',
                             'state2no': 'c_code_b',
                             'conttype': 'contiguity_level'}, axis = 1, inplace = True)

# print('contdird columns: \n')
# pprint(sorted(list(contiguity_dy_df.columns)))
# print('\n')

# ## dictionary for the contiguity_level meanings
# ## obtained from Direct Contiguity Codebook.pdf
# contiguity_level_dic = {1: 'Separated by a land or river border',
#                         2: 'Separated by 12 miles of water or less',
#                         3: 'Separated by 24 miles of water or less (but more than 12 miles)',
#                         4: 'Separated by 150 miles of water or less (but more than 24 miles)',
#                         5: 'Separated by 400 miles of water or less (but more than 150 miles)'
#                        }

# for i, contiguity_level in enumerate(contiguity_level_dic.keys()):
#     ## specifying the subtypes of each war based on the documentation
#     contiguity_dy_df.loc[contiguity_dy_df['contiguity_level']==contiguity_level, 'contiguity_level_meaning'] = contiguity_level_dic[contiguity_level]

contiguity_dy_df = deepcopy(contiguity_dy_df[['c_code_a',
                                                      'c_code_b',
                                                      'year',
                                                      'contiguity_level'
#                                                       , 'contiguity_level_meaning'
                                                     ]])

## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b']
contiguity_dy_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(contiguity_dy_df, switched_columns_list))

In [12]:
alliance_dy_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/alliance_v4.1_by_dyad_yearly.csv', encoding = 'utf8')
alliance_dy_df.rename({'ccode1': 'c_code_a',
                       'ccode2': 'c_code_b',
                       'defense': 'defense_alliance',
                       'neutrality': 'neutrality_alliance',
                       'entente': 'entente_alliance'}, axis = 1, inplace = True)

# print('alliance_v4.1_by_dyad_yearly columns: \n')
# pprint(sorted(list(dyadic_alliance_df.columns)))
# print('\n')

alliance_dy_df = deepcopy(alliance_dy_df[['c_code_a',
                                          'c_code_b',
                                          'year',
                                          'defense_alliance',
                                          'neutrality_alliance',
                                          'entente_alliance']])

## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b',
                         'participant_a',
                         'participant_b']
alliance_dy_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(alliance_dy_df, switched_columns_list))

In [13]:
descriptive_df_2 = deepcopy(pd.merge(alliance_dy_df, contiguity_dy_df, how = 'outer', on = ['c_code_a', 'c_code_b', 'year']))
descriptive_df_2 = deepcopy(pd.merge(descriptive_df_2, colonial_contiguity_dy_df, how = 'outer', on = ['c_code_a', 'c_code_b', 'year']))
descriptive_df_2 = deepcopy(pd.merge(descriptive_df_2, territory_dy_df, how = 'outer', on = ['c_code_a', 'c_code_b', 'year']))
descriptive_df_2['year'] = descriptive_df_2['year'].astype(float)

In [14]:
print('total rows of descriptive dyadic data: {}'.format(format(len(descriptive_df_2), ',d')))
descriptive_df_2.to_pickle('/Users/the_networks_of_war/data_sources/pickles/dyadic_descriptive_df.pkl')

total rows of descriptive dyadic data: 314,336
