In [1]:
from warnings import filterwarnings

filterwarnings('ignore')

import pandas as pd
import numpy as np
from copy import deepcopy

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
c_code_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/COW country codes.csv', encoding = 'utf8')

c_code_df.rename({'CCode': 'c_code',
                  'StateNme': 'state_name'}, axis = 1, inplace = True)

c_code_df.drop(['StateAbb'], axis = 1, inplace = True)

## adjusting for state name consolidation across all sources
c_code_df.loc[c_code_df['c_code']==817, 'state_name'] = 'Republic of Vietnam'
c_code_df.loc[c_code_df['c_code']==365, 'state_name'] = 'Russia'

duplicate_list = ['c_code', 'state_name']
c_code_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
c_code_df = deepcopy(c_code_df.reset_index(drop = True))

c_code_dic = {}
for i, c_code in enumerate(c_code_df['c_code']):
    c_code_dic[c_code] = c_code_df.loc[i, 'state_name']
    
print(str(len(c_code_dic.keys())) + " total countries")

217 total countries


In [4]:
c_code_dic[732]

'South Korea'

# Participant DataFrames

## Inter-State Wars

In [5]:
df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Inter-StateWarData_v4.0.csv', encoding = 'latin-1')

In [6]:
participant_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Inter-StateWarData_v4.0.csv', encoding = 'latin-1')

print("Inter-StateWarData_v4.0 columns: \n")
print(sorted(list(participant_df.columns)))

participant_df.rename({'WarNum': 'war_num',
                       'WarName': 'war_name',
                       'WarType': 'war_type',
                       'ccode': 'c_code',
                       'StateName': 'state_name',
                       'Side': 'side',
                       'BatDeath': 'battle_deaths',
                       ## using only first start date and first end date for now
                       ## this will need to be fine-tuned later on
                       'StartYear1': 'start_year',
                       'StartMonth1': 'start_month',
                       'StartDay1': 'start_day',
                       'EndYear1': 'end_year',
                       'EndMonth1': 'end_month',
                       'EndDay1': 'end_day'}, axis = 1, inplace = True)

Inter-StateWarData_v4.0 columns: 

['BatDeath', 'EndDay1', 'EndDay2', 'EndMonth1', 'EndMonth2', 'EndYear1', 'EndYear2', 'Initiator', 'Outcome', 'Side', 'StartDay1', 'StartDay2', 'StartMonth1', 'StartMonth2', 'StartYear1', 'StartYear2', 'StateName', 'TransFrom', 'TransTo', 'Version', 'WarName', 'WarNum', 'WarType', 'WhereFought', 'ccode']


In [7]:
## figuring out how much before adding missing values in next cell
missing_values_length = deepcopy(len(participant_df))

# ## some values in dyad_df for a given war aren't in the participant_df
# ## these need to be added manually

# participant_df[participant_df['war_num']==184]
# dyad_df_test = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/directed_dyadic_war.csv', encoding = 'latin-1')
# dyad_df_test = dyad_df_test[(dyad_df_test['warnum']==184) & ((dyad_df_test['statea']==350) | (dyad_df_test['stateb']==350))]

# mid_test = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/dyadic MIDs 3.1.csv', encoding = 'latin-1')
# mid_test = deepcopy(mid_test[(mid_test['disno']==1293) & ((mid_test['statea']==350)|(mid_test['stateb']==350))])


## manually filling in values that are found in dyadic cow datasets but seem to be missing from country level sources.
## values have been obtained from dyadic data (directed_dyadic_war.csv' and dyadic MIDs 3.1.csv)

# df_length = deepcopy(len(dyad_df))

# dyad_df.loc[df_length, 'war_num'] = 163
# dyad_df.loc[df_length, 'c_code_a'] = 811
# dyad_df.loc[df_length, 'state_name_a'] = c_code_dic[811]
# dyad_df.loc[df_length, 'c_code_b'] = c_code_dic[816]
# dyad_df.loc[df_length, 'state_name_b'] = 816

# df_length = deepcopy(len(dyad_df))

# dyad_df.loc[df_length, 'war_num'] = 163
# dyad_df.loc[df_length, 'c_code_a'] = 811
# dyad_df.loc[df_length, 'state_name_a'] = c_code_dic[811]
# dyad_df.loc[df_length, 'c_code_b'] = c_code_dic[710]
# dyad_df.loc[df_length, 'c_code_b'] = 710

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 108
participant_df.loc[df_length, 'war_name'] = 'Latvian Liberation'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 200
participant_df.loc[df_length, 'state_name'] = c_code_dic[200]
participant_df.loc[df_length, 'side'] = 1
participant_df.loc[df_length, 'battle_deaths'] = 128
participant_df.loc[df_length, 'start_day'] = 12
participant_df.loc[df_length, 'start_month'] = 12
participant_df.loc[df_length, 'start_year'] = 1918
participant_df.loc[df_length, 'end_day'] = 1
participant_df.loc[df_length, 'end_month'] = 2
participant_df.loc[df_length, 'end_year'] = 1920

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 108
participant_df.loc[df_length, 'war_name'] = 'Latvian Liberation'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 290
participant_df.loc[df_length, 'state_name'] = c_code_dic[290]
participant_df.loc[df_length, 'side'] = 1
participant_df.loc[df_length, 'battle_deaths'] = 100
participant_df.loc[df_length, 'start_day'] = 3
participant_df.loc[df_length, 'start_month'] = 1
participant_df.loc[df_length, 'start_year'] = 1920
participant_df.loc[df_length, 'end_day'] = 1
participant_df.loc[df_length, 'end_month'] = 2
participant_df.loc[df_length, 'end_year'] = 1920

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 108
participant_df.loc[df_length, 'war_name'] = 'Latvian Liberation'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 220
participant_df.loc[df_length, 'state_name'] = c_code_dic[220]
## need to figure out how to determine this one
participant_df.loc[df_length, 'side'] = 1
participant_df.loc[df_length, 'battle_deaths'] = 0
participant_df.loc[df_length, 'start_day'] = 27
participant_df.loc[df_length, 'start_month'] = 3
participant_df.loc[df_length, 'start_year'] = 1919
participant_df.loc[df_length, 'end_day'] = 1
participant_df.loc[df_length, 'end_month'] = 2
participant_df.loc[df_length, 'end_year'] = 1920

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 139
participant_df.loc[df_length, 'war_name'] = 'World War II'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 230
participant_df.loc[df_length, 'state_name'] = c_code_dic[230]
participant_df.loc[df_length, 'side'] = 2
participant_df.loc[df_length, 'battle_deaths'] = 950
participant_df.loc[df_length, 'start_day'] = 26
participant_df.loc[df_length, 'start_month'] = 6
participant_df.loc[df_length, 'start_year'] = 1941
participant_df.loc[df_length, 'end_day'] = 20
participant_df.loc[df_length, 'end_month'] = 2
participant_df.loc[df_length, 'end_year'] = 1944

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 151
participant_df.loc[df_length, 'war_name'] = 'Korean'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 920
participant_df.loc[df_length, 'state_name'] = c_code_dic[920]
participant_df.loc[df_length, 'side'] = 1
participant_df.loc[df_length, 'battle_deaths'] = 23
participant_df.loc[df_length, 'start_day'] = 29
participant_df.loc[df_length, 'start_month'] = 6
participant_df.loc[df_length, 'start_year'] = 1950
participant_df.loc[df_length, 'end_day'] = 27
participant_df.loc[df_length, 'end_month'] = 7
participant_df.loc[df_length, 'end_year'] = 1953

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 169
participant_df.loc[df_length, 'war_name'] = 'Six Day War'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 645
participant_df.loc[df_length, 'state_name'] = c_code_dic[645]
participant_df.loc[df_length, 'side'] = 2
participant_df.loc[df_length, 'battle_deaths'] = 30
participant_df.loc[df_length, 'start_day'] = 17
participant_df.loc[df_length, 'start_month'] = 5
participant_df.loc[df_length, 'start_year'] = 1967
participant_df.loc[df_length, 'end_day'] = 10
participant_df.loc[df_length, 'end_month'] = 6
participant_df.loc[df_length, 'end_year'] = 1967

participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

df_length = deepcopy(len(participant_df))

participant_df.loc[df_length, 'war_num'] = 184
participant_df.loc[df_length, 'war_name'] = 'Turco-Cypriot'
participant_df.loc[df_length, 'war_type'] = 1
participant_df.loc[df_length, 'c_code'] = 350
participant_df.loc[df_length, 'state_name'] = c_code_dic[350]
participant_df.loc[df_length, 'side'] = 2
## value is 2, documented as 26-100 deaths
participant_df.loc[df_length, 'battle_deaths'] = 100
participant_df.loc[df_length, 'start_day'] = 2
participant_df.loc[df_length, 'start_month'] = 7
participant_df.loc[df_length, 'start_year'] = 1974
participant_df.loc[df_length, 'end_day'] = 16
participant_df.loc[df_length, 'end_month'] = 8
participant_df.loc[df_length, 'end_year'] = 1974

# participant_df.loc[df_length, 'end_date'] = pd.to_datetime(participant_df.loc[df_length, 'end_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'end_day'].astype(int).astype(str))
# participant_df.loc[df_length, 'start_date'] = pd.to_datetime(participant_df.loc[df_length, 'start_year'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_month'].astype(int).astype(str) + "-" + participant_df.loc[df_length, 'start_day'].astype(int).astype(str))

print(str(len(participant_df)) + " total values")
print("{} values before".format(missing_values_length))

344 total values
337 values before


In [8]:
# participant_df['start_date'] = pd.to_datetime(participant_df['start_year'].astype(str) + "-" + participant_df['start_month'].astype(str) + "-" + participant_df['start_day'].astype(str))
# participant_df['end_date'] = pd.to_datetime(participant_df['end_year'].astype(str) + "-" + participant_df['end_month'].astype(str) + "-" + participant_df['end_day'].astype(str))

In [9]:
## calculating null for all without valid start/end dates.
## those with invalid data will have null values.
## this will need to be improved
dates_found = 0
dates_not_found = 0

for i, date in enumerate(participant_df['c_code']):
    
    ## unsure why this occurs (.0 after integer)
    if '.' in str(participant_df.loc[i, 'start_day']):
        participant_df.loc[i, 'start_day'] = str(participant_df.loc[i, 'start_day']).split('.')[0]
    if '.' in str(participant_df.loc[i, 'start_month']):
        participant_df.loc[i, 'start_month'] = str(participant_df.loc[i, 'start_month']).split('.')[0]
    if '.' in str(participant_df.loc[i, 'start_year']):
        participant_df.loc[i, 'start_year'] = str(participant_df.loc[i, 'start_year']).split('.')[0]
    try:
        participant_df.loc[i, 'start_date'] = pd.to_datetime(participant_df.loc[i, 'start_year'] + "-" + participant_df.loc[i, 'start_month'] + "-" + participant_df.loc[i, 'start_day'])
        valid_start_date = 1
    except: 
        valid_start_date = 0
        participant_df.loc[i, 'start_date'] = None
        
    if '.' in str(participant_df.loc[i, 'end_day']):
        participant_df.loc[i, 'end_day'] = str(participant_df.loc[i, 'end_day']).split('.')[0]
    if '.' in str(participant_df.loc[i, 'end_month']):
        participant_df.loc[i, 'end_month'] = str(participant_df.loc[i, 'end_month']).split('.')[0]
    if '.' in str(participant_df.loc[i, 'end_year']):
        participant_df.loc[i, 'end_year'] = str(participant_df.loc[i, 'end_year']).split('.')[0]
        
    try:
        participant_df.loc[i, 'end_date'] = pd.to_datetime(participant_df.loc[i, 'end_year'] + "-" + participant_df.loc[i, 'end_month'] + "-" + participant_df.loc[i, 'end_day'])
        valid_end_date = 1
    except:
        valid_end_date = 0
        participant_df.loc[i, 'end_date'] = None
        
    if valid_start_date==1 and valid_end_date==1:
        participant_df.loc[i, 'days_at_war'] = participant_df.loc[i, 'end_date'] - participant_df.loc[i, 'start_date']
        participant_df.loc[i, 'days_at_war'] = int(str(participant_df.loc[i, 'days_at_war']).split(' ')[0])
    else:
        participant_df.loc[i, 'days_at_war'] = None
        
    if valid_start_date + valid_end_date == 2:
        dates_found += 1
    else:
        dates_not_found+=1
        
print("\ntotal dyads with both dates found {}".format(dates_found))
print("total dyads with at least one date not found {}".format(dates_not_found))


total dyads with both dates found 344
total dyads with at least one date not found 0


In [10]:
## accounting for all cases where countries have more than one side
aggregations = {
    ## they will become side 3
    'side': 'sum',
    'battle_deaths': 'sum',
    'start_date': 'min',
    'start_year': 'max',
    'end_date': 'max',
    'end_year': 'max'
    }

participant_df = deepcopy(participant_df.groupby(['war_num', 'war_name', 'war_type', 'c_code', 'state_name']).agg(aggregations).reset_index())

participant_df['days_at_war'] = participant_df['end_date'] - participant_df['start_date']

for i, country_code in enumerate(participant_df['c_code']):
    participant_df.loc[i, 'days_at_war'] = int(str(participant_df.loc[i, 'days_at_war']).split(' ')[0])

participant_df = deepcopy(participant_df[['war_num',
                                          'war_name',
                                          'war_type',
                                          'c_code',
                                          'state_name',
                                          'side',
                                          'battle_deaths',
                                          'start_date',
                                          'start_year',
                                          'end_date',
                                          'end_year',
                                          'days_at_war']])

In [11]:
# participant_df.tail()

## Intra-State Wars

In [12]:
## creating new dataframe to union to interstate wars participant_df
## note: the code below is very intefficient because something is off with integer formatting in this file.
## this did not occur in the interstate war file.
participant_df_2 = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/INTRA-STATE_State_participants v5.1.csv', encoding = 'latin-1')

print("INTRA-STATE_State_participants v5.1 columns: \n")
print(sorted(list(participant_df_2.columns)))

## either one of these a or b may not actually be states.
## this wil be fixed later on
participant_df_2.rename({'WarNum': 'war_num',
                         'WarName': 'war_name',
                         'WarType': 'war_type',
                         'StartDy1': 'start_day',
                         'StartMo1': 'start_month',
                         'StartYr1': 'start_year',
                         'EndDy1': 'end_day',
                         'EndMo1': 'end_month',
                         'EndYr1': 'end_year',
                         'CcodeA': 'c_code_a',
                         'SideA': 'state_name_a',
                         'CcodeB': 'c_code_b',
                         'SideB': 'state_name_b',
                         ## unsure if these are the same as battle deaths, or include civilians
                         'Deaths A': 'battle_deaths_a',
                         'Deaths B': 'battle_deaths_b',
                         ## according to documentation, this includes both sides
                         'TotalBDeaths': 'total_deaths_both_sides',
                         'WDuratDays': 'days_at_war',
                         'SideAPeakTotForces': 'side_a_peak_forces_available',
                         'SideBPeakTotForces': 'side_b_peak_forces_available',
                         'SideAPeak TheatForces': 'side_a_peak_battle_forces',
                         'SideBPeakTheatForces': 'side_b_peak_battle_forces',
                         'TransFrom': 'lagging_war',
                         'TransTo': 'leading_war'}, axis = 1, inplace = True)

## whoever is originally marked as side a is getting labelled as 1.
participant_df_2['side_a'] = 1
## whoever is originally marked as side b is getting labelled as 2.
participant_df_2['side_b'] = 2

## defining null values (missing data)
participant_df_2.loc[participant_df_2['start_day'] == -9, 'start_day'] = None
participant_df_2.loc[participant_df_2['start_month'] == -9, 'start_month'] = None
participant_df_2.loc[participant_df_2['start_year'] == -9, 'start_year'] = None
participant_df_2.loc[participant_df_2['end_day'] == -9, 'end_day'] = None
participant_df_2.loc[participant_df_2['end_month'] == -9, 'end_month'] = None
participant_df_2.loc[participant_df_2['end_year'] == -9, 'end_year'] = None

## adjusting for ongoing with wars with current date end date
participant_df_2.loc[participant_df_2['end_day'] == -7, 'end_day'] = str(pd.datetime.now().day)
participant_df_2.loc[participant_df_2['end_month'] == -7, 'end_month'] = str(pd.datetime.now().month)
participant_df_2.loc[participant_df_2['end_year'] == -7, 'end_year'] = str(pd.datetime.now().year)

## fixing for leap year issue
participant_df_2.loc[(participant_df_2['start_day'] == 29) & (participant_df_2['start_month'] == 2) & (participant_df_2['start_year'] == 1894), 'start_day'] = 28
    
for i, date in enumerate(participant_df_2['c_code_a']):
    
    ## unsure why this occurs (.0 after integer)
    if '.' in str(participant_df_2.loc[i, 'start_day']):
        participant_df_2.loc[i, 'start_day'] = str(participant_df_2.loc[i, 'start_day']).split('.')[0]
    if '.' in str(participant_df_2.loc[i, 'start_month']):
        participant_df_2.loc[i, 'start_month'] = str(participant_df_2.loc[i, 'start_month']).split('.')[0]
    if '.' in str(participant_df_2.loc[i, 'start_year']):
        participant_df_2.loc[i, 'start_year'] = str(participant_df_2.loc[i, 'start_year']).split('.')[0]
    try:
        participant_df_2.loc[i, 'start_date'] = pd.to_datetime(participant_df_2.loc[i, 'start_year'] + "-" + participant_df_2.loc[i, 'start_month'] + "-" + participant_df_2.loc[i, 'start_day'])
        valid_start_date = 1
    except: 
        valid_start_date = 0
        participant_df_2.loc[i, 'start_date'] = None
        
    if '.' in str(participant_df_2.loc[i, 'end_day']):
        participant_df_2.loc[i, 'end_day'] = str(participant_df_2.loc[i, 'end_day']).split('.')[0]
    if '.' in str(participant_df_2.loc[i, 'end_month']):
        participant_df_2.loc[i, 'end_month'] = str(participant_df_2.loc[i, 'end_month']).split('.')[0]
    if '.' in str(participant_df_2.loc[i, 'end_year']):
        participant_df_2.loc[i, 'end_year'] = str(participant_df_2.loc[i, 'end_year']).split('.')[0]
        
    try:
        participant_df_2.loc[i, 'end_date'] = pd.to_datetime(participant_df_2.loc[i, 'end_year'] + "-" + participant_df_2.loc[i, 'end_month'] + "-" + participant_df_2.loc[i, 'end_day'])
        valid_end_date = 1
    except:
        valid_end_date = 0
        participant_df_2.loc[i, 'end_date'] = None

INTRA-STATE_State_participants v5.1 columns: 

['CcodeA', 'CcodeB', 'Deaths A', 'Deaths B', 'EndDy1', 'EndDy2', 'EndDy3', 'EndDy4', 'EndMo1', 'EndMo2', 'EndMo3', 'EndMo4', 'EndYr1', 'EndYr2', 'EndYr3', 'EndYr4', 'Initiator', 'Intnl', 'Outcome', 'SideA', 'SideAPeak TheatForces', 'SideAPeakTotForces', 'SideB', 'SideBPeakTheatForces', 'SideBPeakTotForces', 'StartDy1', 'StartDy2', 'StartDy3', 'StartDy4', 'StartMo1', 'StartMo2', 'StartMo3', 'StartMo4', 'StartYr1', 'StartYr2', 'StartYr3', 'StartYr4', 'TotalBDeaths', 'TransFrom', 'TransTo', 'V5Region', 'Version', 'WDuratDays', 'WDuratMo', 'WarName', 'WarNum', 'WarType']


In [13]:
participant_union_df_2 = deepcopy(participant_df_2)

## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
participant_union_df_2.rename({'c_code_a': 'c_code_a_new',
                               'c_code_b': 'c_code_b_new',
                               'state_name_a': 'state_name_a_new',
                               'state_name_b': 'state_name_b_new',
                               'side_a': 'side_a_new',
                               'side_b': 'side_b_new',
                               'battle_deaths_a': 'battle_deaths_a_new',
                               'battle_deaths_b': 'battle_deaths_b_new',
                               'side_a_peak_forces_available': 'side_a_peak_forces_available_new',
                               'side_b_peak_forces_available': 'side_b_peak_forces_available_new',
                               'side_a_peak_battle_forces': 'side_a_peak_battle_forces_new',
                               'side_b_peak_battle_forces': 'side_b_peak_battle_forces_new'}, axis = 1, inplace = True)

participant_union_df_2.rename({'c_code_a_new': 'c_code_b',
                               'c_code_b_new': 'c_code_a',
                               'state_name_a_new': 'state_name_b',
                               'state_name_b_new': 'state_name_a',
                               'side_a_new': 'side_b',
                               'side_b_new': 'side_a',
                               'battle_deaths_a_new': 'battle_deaths_b',
                               'battle_deaths_b_new': 'battle_deaths_a',
                               'side_a_peak_forces_available_new': 'side_b_peak_forces_available',
                               'side_b_peak_forces_available_new': 'side_a_peak_forces_available',
                               'side_a_peak_battle_forces_new': 'side_b_peak_battle_forces',
                               'side_b_peak_battle_forces_new': 'side_a_peak_battle_forces'}, axis = 1, inplace = True)

participant_df_2 = deepcopy(pd.concat([participant_df_2, participant_union_df_2], ignore_index = True).reset_index(drop = True))

## making a copy before duplicates a taken out.
## this will be used below for dyadic data (since no dyadic files are available for intra-state wars)
dyad_df_2 = deepcopy(participant_df_2[['war_num', 'c_code_a', 'state_name_a', 'c_code_b', 'state_name_b', 'start_year']])
## this will be adjusted again later
dyad_df_2.rename({'start_year': 'year'}, axis = 1, inplace = True)

participant_df_2 = deepcopy(participant_df_2[['war_num',
                                              'war_name',
                                              'war_type',
                                              'c_code_a',
                                              'state_name_a',
                                              'side_a',
                                              'battle_deaths_a',
                                              'start_date',
                                              'start_year',
                                              'end_date',
                                              'end_year',
                                              'days_at_war',
                                              'total_deaths_both_sides',
                                              'lagging_war',
                                              'leading_war',
                                              'side_a_peak_forces_available',
                                              'side_a_peak_battle_forces']])

## defining null values (missing data)
## -8 is not applicable, -9 is unknown.
participant_df_2.loc[participant_df_2['battle_deaths_a'] == -9, 'battle_deaths_a'] = None
participant_df_2.loc[participant_df_2['battle_deaths_a'] == -8, 'battle_deaths_a'] = None
participant_df_2.loc[participant_df_2['side_a_peak_forces_available'] == -9, 'side_a_peak_forces_available'] = None
participant_df_2.loc[participant_df_2['side_a_peak_forces_available'] == -8, 'side_a_peak_forces_available'] = None
participant_df_2.loc[participant_df_2['total_deaths_both_sides'] == -9, 'total_deaths_both_sides'] = None
participant_df_2.loc[participant_df_2['total_deaths_both_sides'] == -9, 'total_deaths_both_sides'] = None
participant_df_2.loc[participant_df_2['side_a_peak_forces_available'] == -8, 'side_a_peak_forces_available'] = None
participant_df_2.loc[participant_df_2['side_a_peak_forces_available'] == -8, 'side_a_peak_forces_available'] = None
participant_df_2.loc[participant_df_2['side_a_peak_battle_forces'] == -8, 'side_a_peak_battle_forces'] = None
participant_df_2.loc[participant_df_2['side_a_peak_battle_forces'] == -8, 'side_a_peak_battle_forces'] = None

# keeping one state (or non-state) per war after duplicate removal
duplicate_list = ['war_num', 'c_code_a', 'state_name_a']
participant_df_2.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
participant_df_2 = deepcopy(participant_df_2.reset_index(drop = True))
participant_df_2.rename({'c_code_a': 'c_code',
                         'state_name_a': 'state_name',
                         'side_a': 'side',
                         'battle_deaths_a': 'battle_deaths',
                         'side_a_peak_forces_available': 'side_peak_forces_available',
                         'side_a_peak_battle_forces': 'side_peak_battle_forces'}, axis = 1, inplace = True)

## Extra State Wars

In [14]:
## creating new dataframe to union to extra-state wars participant_df
## inefficient pipeline from above was used to accomodate integer formatting
## unsure if that problem occurs for this one too though
participant_df_3 = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Extra-StateWarData_v4.0.csv', encoding = 'latin-1')

print("Extra-StateWarData_v4.0.csv columns: \n")
print(sorted(list(participant_df_3.columns)))

## either one of these a or b may not actually be states.
## this wil be fixed later on
participant_df_3.rename({'WarNum': 'war_num',
                         'WarName': 'war_name',
                         'WarType': 'war_type',
                         'StartDay1': 'start_day',
                         'StartMonth1': 'start_month',
                         'StartYear1': 'start_year',
                         'EndDay1': 'end_day',
                         'EndMonth1': 'end_month',
                         'EndYear1': 'end_year',
                         'ccode1': 'c_code_a',
                         'SideA': 'state_name_a',
                         'ccode2': 'c_code_b',
                         'SideB': 'state_name_b',
                         ## unsure if these are the same as battle deaths, or include civilians
                         'BatDeath': 'battle_deaths_a',
                         'NonStateDeaths': 'battle_deaths_b'}, axis = 1, inplace = True)

## whoever is originally marked as side a is getting labelled as 1.
participant_df_3['side_a'] = 1
## whoever is originally marked as side b is getting labelled as 2.
participant_df_3['side_b'] = 2

## defining null values (missing data)
participant_df_3.loc[participant_df_3['start_day'] == -8, 'start_day'] = None
participant_df_3.loc[participant_df_3['start_month'] == -8, 'start_month'] = None
participant_df_3.loc[participant_df_3['start_year'] == -8, 'start_year'] = None
participant_df_3.loc[participant_df_3['end_day'] == -8, 'end_day'] = None
participant_df_3.loc[participant_df_3['end_month'] == -8, 'end_month'] = None
participant_df_3.loc[participant_df_3['end_year'] == -8, 'end_year'] = None

## adjusting for ongoing with wars with current date end date
participant_df_3.loc[participant_df_3['end_day'] == -7, 'end_day'] = str(pd.datetime.now().day)
participant_df_3.loc[participant_df_3['end_month'] == -7, 'end_month'] = str(pd.datetime.now().month)
participant_df_3.loc[participant_df_3['end_year'] == -7, 'end_year'] = str(pd.datetime.now().year)

## calculating null for all without valid start/end dates.
## those with invalid data will have null values.
## this will need to be improved
dates_found = 0
dates_not_found = 0
    
for i, date in enumerate(participant_df_3['c_code_a']):
    
    ## unsure why this occurs (.0 after integer)
    if '.' in str(participant_df_3.loc[i, 'start_day']):
        participant_df_3.loc[i, 'start_day'] = str(participant_df_3.loc[i, 'start_day']).split('.')[0]
    if '.' in str(participant_df_3.loc[i, 'start_month']):
        participant_df_3.loc[i, 'start_month'] = str(participant_df_3.loc[i, 'start_month']).split('.')[0]
    if '.' in str(participant_df_3.loc[i, 'start_year']):
        participant_df_3.loc[i, 'start_year'] = str(participant_df_3.loc[i, 'start_year']).split('.')[0]
    try:
        participant_df_3.loc[i, 'start_date'] = pd.to_datetime(participant_df_3.loc[i, 'start_year'] + "-" + participant_df_3.loc[i, 'start_month'] + "-" + participant_df_3.loc[i, 'start_day'])
        valid_start_date = 1
    except: 
        valid_start_date = 0
        participant_df_3.loc[i, 'start_date'] = None
        
    if '.' in str(participant_df_3.loc[i, 'end_day']):
        participant_df_3.loc[i, 'end_day'] = str(participant_df_3.loc[i, 'end_day']).split('.')[0]
    if '.' in str(participant_df_3.loc[i, 'end_month']):
        participant_df_3.loc[i, 'end_month'] = str(participant_df_3.loc[i, 'end_month']).split('.')[0]
    if '.' in str(participant_df_3.loc[i, 'end_year']):
        participant_df_3.loc[i, 'end_year'] = str(participant_df_3.loc[i, 'end_year']).split('.')[0]
        
    try:
        participant_df_3.loc[i, 'end_date'] = pd.to_datetime(participant_df_3.loc[i, 'end_year'] + "-" + participant_df_3.loc[i, 'end_month'] + "-" + participant_df_3.loc[i, 'end_day'])
        valid_end_date = 1
    except:
        valid_end_date = 0
        participant_df_3.loc[i, 'end_date'] = None
        
    if valid_start_date==1 and valid_end_date==1:
        participant_df_3.loc[i, 'days_at_war'] = participant_df_3.loc[i, 'end_date'] - participant_df_3.loc[i, 'start_date']
        participant_df_3.loc[i, 'days_at_war'] = int(str(participant_df_3.loc[i, 'days_at_war']).split(' ')[0])
    else:
        participant_df_3.loc[i, 'days_at_war'] = None
        
    if valid_start_date + valid_end_date == 2:
        dates_found += 1
    else:
        dates_not_found+=1

print("\ntotal dyads with both dates found {}".format(dates_found))
print("total dyads with at least one date not found {}".format(dates_not_found))

Extra-StateWarData_v4.0.csv columns: 

['BatDeath', 'EndDay1', 'EndDay2 ', 'EndMonth1', 'EndMonth2', 'EndYear1', 'EndYear2', 'Initiator', 'Interven', 'NonStateDeaths', 'Outcome', 'SideA', 'SideB', 'StartDay1', 'StartDay2', 'StartMonth1', 'StartMonth2', 'StartYear1', 'StartYear2', 'TransFrom', 'TransTo', 'Version', 'WarName', 'WarNum', 'WarType', 'WhereFought', 'ccode1', 'ccode2']

total dyads with both dates found 149
total dyads with at least one date not found 49


In [15]:
participant_union_df_3 = deepcopy(participant_df_3)

## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
participant_union_df_3.rename({'c_code_a': 'c_code_a_new',
                               'c_code_b': 'c_code_b_new',
                               'state_name_a': 'state_name_a_new',
                               'state_name_b': 'state_name_b_new',
                               'side_a': 'side_a_new',
                               'side_b': 'side_b_new',
                               'battle_deaths_a': 'battle_deaths_a_new',
                               'battle_deaths_b': 'battle_deaths_b_new'}, axis = 1, inplace = True)

participant_union_df_3.rename({'c_code_a_new': 'c_code_b',
                               'c_code_b_new': 'c_code_a',
                               'state_name_a_new': 'state_name_b',
                               'state_name_b_new': 'state_name_a',
                               'side_a_new': 'side_b',
                               'side_b_new': 'side_a',
                               'battle_deaths_a_new': 'battle_deaths_b',
                               'battle_deaths_b_new': 'battle_deaths_a'}, axis = 1, inplace = True)

participant_df_3 = deepcopy(pd.concat([participant_df_3, participant_union_df_3], ignore_index = True).reset_index(drop = True))

## making a copy before duplicates a taken out.
## this will be used below for dyadic data (since no dyadic files are available for extra-state wars)
dyad_df_3 = deepcopy(participant_df_3[['war_num', 'c_code_a', 'state_name_a', 'c_code_b', 'state_name_b', 'start_year']])
## this will be adjusted again later
dyad_df_3.rename({'start_year': 'year'}, axis = 1, inplace = True)

participant_df_3 = deepcopy(participant_df_3[['war_num',
                                              'war_name',
                                              'war_type',
                                              'c_code_a',
                                              'state_name_a',
                                              'side_a',
                                              'battle_deaths_a',
                                              'start_date',
                                              'start_year',
                                              'end_date',
                                              'end_year',
                                              'days_at_war']])

## defining null values (missing data)
## -8 is not applicable, -9 is unknown.
participant_df_3.loc[participant_df_3['battle_deaths_a'] == -9, 'battle_deaths_a'] = None
participant_df_3.loc[participant_df_3['battle_deaths_a'] == -8, 'battle_deaths_a'] = None

# keeping one state (or non-state) per war after duplicate removal
duplicate_list = ['war_num', 'war_type', 'war_name', 'c_code_a', 'state_name_a']
participant_df_3.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
participant_df_3 = deepcopy(participant_df_3.reset_index(drop = True))
participant_df_3.rename({'c_code_a': 'c_code',
                         'state_name_a': 'state_name',
                         'side_a': 'side',
                         'battle_deaths_a': 'battle_deaths'}, axis = 1, inplace = True)

## Combining Participant Sources

In [16]:
## removing non applicable participants
participant_df = deepcopy(participant_df[participant_df['state_name']!="-8"]).reset_index(drop = True)
participant_df_2 = deepcopy(participant_df_2[participant_df_2['state_name']!="-8"]).reset_index(drop = True)
participant_df_3 = deepcopy(participant_df_3[participant_df_3['state_name']!="-8"]).reset_index(drop = True)

In [17]:
print("{} Total Inter-State War Participants".format(len(participant_df)))
print("{} Total Intra-State War Participants".format(len(participant_df_2)))
print("{} Total Extra-State War Participants".format(len(participant_df_3)))

339 Total Inter-State War Participants
1012 Total Intra-State War Participants
361 Total Extra-State War Participants


In [18]:
participant_df = deepcopy(pd.concat([participant_df, participant_df_2], ignore_index = True).reset_index(drop = True))
participant_df = deepcopy(pd.concat([participant_df, participant_df_3], ignore_index = True).reset_index(drop = True))

In [19]:
print("{} Total War Participants After Merge".format(len(participant_df)))

1712 Total War Participants After Merge


In [20]:
participant_df.rename({'war_type': 'war_type_code'}, axis = 1, inplace = True)

participant_df.loc[participant_df['war_type_code']==1, 'war_type'] = 'Inter-State War'
participant_df.loc[participant_df['war_type_code']==1, 'war_sub_type'] = ''
participant_df.loc[participant_df['war_type_code']==2, 'war_type'] = 'Extra-State War'
participant_df.loc[participant_df['war_type_code']==2, 'war_sub_type'] = 'Colonial (conflict with colony)'
participant_df.loc[participant_df['war_type_code']==3, 'war_type'] = 'Extra-State War'
participant_df.loc[participant_df['war_type_code']==3, 'war_sub_type'] = 'Imperial (state vs non-state)'
participant_df.loc[participant_df['war_type_code']==4, 'war_type'] = 'Intra-State War'
participant_df.loc[participant_df['war_type_code']==4, 'war_sub_type'] = 'Civil War (for central control)'
participant_df.loc[participant_df['war_type_code']==5, 'war_type'] = 'Intra-State War'
participant_df.loc[participant_df['war_type_code']==5, 'war_sub_type'] = 'Civil War (over local issues)'
participant_df.loc[participant_df['war_type_code']==6, 'war_type'] = 'Intra-State War'
participant_df.loc[participant_df['war_type_code']==6, 'war_sub_type'] = 'Regional/Internal'
participant_df.loc[participant_df['war_type_code']==7, 'war_type'] = 'Intra-State War'
participant_df.loc[participant_df['war_type_code']==7, 'war_sub)type'] = 'Intercommunal'
participant_df.loc[participant_df['war_type_code']==8, 'war_type'] = 'Non-State War'
participant_df.loc[participant_df['war_type_code']==8, 'war_sub_type'] = 'In Non-State Territory'
participant_df.loc[participant_df['war_type_code']==9, 'war_type'] = 'Non-State War'
participant_df.loc[participant_df['war_type_code']==9, 'war_sub)type'] = 'Across State Borders'

In [21]:
participant_df['war_type'].value_counts()

Intra-State War    1011
Extra-State War     361
Inter-State War     339
Name: war_type, dtype: int64

## Dyadic DataFrames

In [22]:
## battle deaths and start/end dates are in this file too, but it's more confusing than the participant_df.
## this will just be used to get the combinations of countries directly at war with each other.

dyad_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/directed_dyadic_war.csv', encoding = 'latin-1')
dyad_df.rename({'warnum': 'war_num', 'statea': 'c_code_a', 'stateb': 'c_code_b'}, axis = 1, inplace = True)

print("directed_dyadic_war columns: \n")
print(sorted(list(dyad_df.columns)))

dyad_df = deepcopy(dyad_df[['war_num', 'c_code_a', 'c_code_b', 'year']])

for i, c_code_a in enumerate(dyad_df['c_code_a']):
    dyad_df.loc[i, 'state_name_a'] = c_code_dic[c_code_a]
    dyad_df.loc[i, 'state_name_b'] = c_code_dic[dyad_df.loc[i, 'c_code_b']] 

directed_dyadic_war columns: 

['batdtha', 'batdthb', 'batdths', 'c_code_a', 'c_code_b', 'changes_1', 'changes_2', 'disno', 'durindx', 'dyindex', 'outcomea', 'war_num', 'wardyadrolea', 'wardyadroleb', 'warenday', 'warendmnth', 'warendyr', 'warolea', 'waroleb', 'warstrtday', 'warstrtmnth', 'warstrtyr', 'year']


In [23]:
dyad_union_df = deepcopy(dyad_df)

## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
dyad_union_df.rename({'c_code_a': 'c_code_a_new',
                      'c_code_b': 'c_code_b_new',
                      'state_name_a': 'state_name_a_new',
                      'state_name_b': 'state_name_b_new'}, axis = 1, inplace = True)

dyad_union_df.rename({'c_code_a_new': 'c_code_b',
                      'c_code_b_new': 'c_code_a',
                      'state_name_a_new': 'state_name_b',
                      'state_name_b_new': 'state_name_a'}, axis = 1, inplace = True)

dyad_df = deepcopy(pd.concat([dyad_df, dyad_union_df], ignore_index = True).reset_index(drop = True))

## Combining Dyadic Sources

In [24]:
## removing non applicable participants
## don't need to do this for inter-state war because all is applicable
dyad_df_2 = deepcopy(dyad_df_2[dyad_df_2['state_name_a']!="-8"]).reset_index(drop = True)
dyad_df_2 = deepcopy(dyad_df_2[dyad_df_2['state_name_b']!="-8"]).reset_index(drop = True)
dyad_df_3 = deepcopy(dyad_df_3[dyad_df_3['state_name_a']!="-8"]).reset_index(drop = True)
dyad_df_3 = deepcopy(dyad_df_3[dyad_df_3['state_name_b']!="-8"]).reset_index(drop = True)

In [25]:
print("{} Total Inter-State War Dyads".format(len(dyad_df)/2))
print("{} Total Intra-State War Dyads".format(len(dyad_df_2)/2))
print("{} Total Extra-State War Dyads".format(len(dyad_df_3)/2))

1364.0 Total Inter-State War Dyads
420.0 Total Intra-State War Dyads
164.0 Total Extra-State War Dyads


In [26]:
dyad_df = deepcopy(pd.concat([dyad_df, dyad_df_2], ignore_index = True).reset_index(drop = True))
dyad_df = deepcopy(pd.concat([dyad_df, dyad_df_3], ignore_index = True).reset_index(drop = True))

In [27]:
print("{} Total Dyads After Merge".format(len(dyad_df)/2))

1948.0 Total Dyads After Merge


In [28]:
participant_df_copy = deepcopy(participant_df)
participant_df_copy.rename({'c_code': 'total_countries'}, axis = 1, inplace = True)

participant_df_copy['war_num'] = participant_df_copy['war_num'].astype(float)
participant_df_copy['start_year'] = participant_df_copy['start_year'].astype(float)
participant_df_copy['end_year'] = participant_df_copy['end_year'].astype(float)

aggregations = {
    'total_countries': 'count',
    'start_year': 'min',
    'end_year': 'max',
    ## this will not be accurate if there are more than one lagging/leading wars per war.
    'lagging_war': 'min',
    'leading_war': 'max',
    }

war_df = deepcopy(participant_df_copy.groupby(['war_num', 'war_name', 'war_type_code', 'war_type', 'war_sub_type']).agg(aggregations).reset_index())

## removing duplicate war (multiple names, same war_num)
war_df = deepcopy(war_df[war_df['war_name']!='Xinjiang Revolt of 1932-1933'].reset_index(drop=True))
war_df = deepcopy(war_df[war_df['war_name']!='Boko Haram in Nigeria of 2013 - ongoing '].reset_index(drop=True))
war_df = deepcopy(war_df[war_df['war_name']!='Greek Civil War  round 2 of 1944-1945'].reset_index(drop=True))

war_df = deepcopy(war_df.sort_values(by = 'total_countries', ascending = False))

In [29]:
## need to figure out a way to add dyadic data when it's missing.
## these are clear cases where it should be added because one side on the war is only one country.
## it'll be trickier when each side isn't just one country.
for i, war_num in enumerate(war_df['war_num']):

    total_side_1 = len(participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==1)])
    total_side_2 = len(participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==2)])
    if total_side_1==1 and total_side_2!=1:
        side_1_name = participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==1)]['state_name'].values[0]
        side_1_code = participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==1)]['c_code'].values[0]
        participating_parties = sorted(list(set(list(participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==2)]['state_name']))))
        dyadic_parties = sorted(list(set(list(dyad_df[dyad_df['war_num']==war_num]['state_name_a']) + list(dyad_df[dyad_df['war_num']==war_num]['state_name_b']))))
        process_number = 0
        for i, party in enumerate(participating_parties):
            if party in dyadic_parties:
                pass
            else:
                process_number+=1
                if process_number==1:
                    print("\n")
                df_length = deepcopy(len(dyad_df))
                dyad_df.loc[df_length, 'war_num'] = war_num
                dyad_df.loc[df_length, 'c_code_a'] = participant_df[(participant_df['war_num']==war_num) & (participant_df['state_name']==party)]['c_code'].values[0]
                dyad_df.loc[df_length, 'state_name_a'] = party
                dyad_df.loc[df_length, 'year'] = participant_df[(participant_df['war_num']==war_num) & (participant_df['state_name']==party)]['start_year'].values[0]
                dyad_df.loc[df_length, 'c_code_b'] = side_1_code
                dyad_df.loc[df_length, 'state_name_b'] = side_1_name
                print("Added dyadic for {} in {}.".format(party, war_df[war_df['war_num']==war_num]['war_name'].values[0]))
    elif total_side_2==1 and total_side_1!=1:
        side_2_name = participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==2)]['state_name'].values[0]
        side_2_code = participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==2)]['c_code'].values[0]
        participating_parties = sorted(list(set(list(participant_df[(participant_df['war_num']==war_num) & (participant_df['side']==1)]['state_name']))))
        dyadic_parties = sorted(list(set(list(dyad_df[dyad_df['war_num']==war_num]['state_name_a']) + list(dyad_df[dyad_df['war_num']==war_num]['state_name_b']))))
        process_number = 0
        for i, party in enumerate(participating_parties):
            if party in dyadic_parties:
                pass
            else:
                process_number+=1
                if process_number==1:
                    print("\n")
                df_length = deepcopy(len(dyad_df))
                dyad_df.loc[df_length, 'war_num'] = war_num
                dyad_df.loc[df_length, 'c_code_a'] = participant_df[(participant_df['war_num']==war_num) & (participant_df['state_name']==party)]['c_code'].values[0]
                dyad_df.loc[df_length, 'state_name_a'] = party
                dyad_df.loc[df_length, 'year'] = participant_df[(participant_df['war_num']==war_num) & (participant_df['state_name']==party)]['start_year'].values[0]
                dyad_df.loc[df_length, 'c_code_b'] = side_2_code
                dyad_df.loc[df_length, 'state_name_b'] = side_2_name
                print("Added dyadic for {} in {}.".format(party, war_df[war_df['war_num']==war_num]['war_name'].values[0]))



Added dyadic for Kuwait in Gulf War.
Added dyadic for Morocco in Gulf War.
Added dyadic for Oman in Gulf War.
Added dyadic for Qatar in Gulf War.
Added dyadic for United Arab Emirates in Gulf War.


Added dyadic for Australia in Iraqi Resistance.
Added dyadic for Iraq in Iraqi Resistance.
Added dyadic for Italy in Iraqi Resistance.
Added dyadic for Netherlands in Iraqi Resistance.
Added dyadic for Poland in Iraqi Resistance.
Added dyadic for Republic of Korea in Iraqi Resistance.
Added dyadic for Spain in Iraqi Resistance.
Added dyadic for Ukraine in Iraqi Resistance.
Added dyadic for United Kingdom in Iraqi Resistance.


Added dyadic for Burundi in Somali-Al-Shabaab war of 2014-present.
Added dyadic for Djibouti in Somali-Al-Shabaab war of 2014-present.
Added dyadic for Ethiopia in Somali-Al-Shabaab war of 2014-present.
Added dyadic for Kenya in Somali-Al-Shabaab war of 2014-present.
Added dyadic for Uganda in Somali-Al-Shabaab war of 2014-present.
Added dyadic for United States in 



Added dyadic for United States in First Waziristan War of 2004-2006.


Added dyadic for France in Rif Rebellion.


Added dyadic for France in Chad-United Opposition War of 2005-2008.


Added dyadic for USSR in Mongolian Armed Uprising of 1932.


Added dyadic for Poland in Polish Ukrainians War of 1945-1947.


Added dyadic for United States in Second Waziristan War of 2007-present.


Added dyadic for United States of America in Iraqi Sunni Revolt of 2010-2014.


Added dyadic for France in Second Cote d'Ivoire War of 2011.


Added dyadic for Austria in Second Schleswig-Holstein.


Added dyadic for Chad in Central African Republic War of 2012-2013.


Added dyadic for Cameroon in Boko Haram in Nigeria of 2013 - present.
Added dyadic for Chad in Boko Haram in Nigeria of 2013 - present.
Added dyadic for Niger in Boko Haram in Nigeria of 2013 - present.


Added dyadic for Uganda in South Sudan War of 2013 to present.


Added dyadic for Sardinia/Piedmont in Italian Unification.


Added dyadi

In [30]:
## consolidating state names between participant_df and dyad_df
dyad_df.loc[dyad_df['c_code_a'].astype(float)==365, 'state_name_a'] = 'Russia'
dyad_df.loc[dyad_df['c_code_b'].astype(float)==365, 'state_name_b'] = 'Russia'
participant_df.loc[participant_df['c_code'].astype(float)==365, 'state_name'] = 'Russia'

dyad_df.loc[dyad_df['c_code_a'].astype(float)==817, 'state_name_a'] = 'Republic of Vietnam'
dyad_df.loc[dyad_df['c_code_b'].astype(float)==817, 'state_name_b'] = 'Republic of Vietnam'
participant_df.loc[participant_df['c_code'].astype(float)==817, 'state_name'] = 'Republic of Vietnam'

In [31]:
dyadic_borders_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/contcold.csv', encoding = 'utf8')
dyadic_borders_df.rename({'statelno': 'c_code_a',
                          'statehno': 'c_code_b',
                          'land': 'land_contiguity',
                          'sea': 'sea_contiguity',
                          'total': 'total_contiguity'}, axis = 1, inplace = True)

In [32]:
dyadic_borders_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/contcold.csv', encoding = 'utf8')
dyadic_borders_df.rename({'statelno': 'c_code_a',
                          'statehno': 'c_code_b',
                          'land': 'land_contiguity',
                          'sea': 'sea_contiguity',
                          'total': 'total_contiguity'}, axis = 1, inplace = True)
dyadic_borders_df = deepcopy(dyadic_borders_df[['c_code_a',
                                                'c_code_b',
                                                'year',
                                                'land_contiguity',
                                                'sea_contiguity',
                                                'total_contiguity']])

print("contcold columns: \n")
print(sorted(list(dyadic_borders_df.columns)))

dyadic_borders_union_df = deepcopy(dyadic_borders_df)

## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
dyadic_borders_union_df.rename({'c_code_a': 'c_code_a_new',
                                'c_code_b': 'c_code_b_new'}, axis = 1, inplace = True)

dyadic_borders_union_df.rename({'c_code_a_new': 'c_code_b',
                                'c_code_b_new': 'c_code_a'}, axis = 1, inplace = True)

dyadic_borders_df = deepcopy(pd.concat([dyadic_borders_df, dyadic_borders_union_df], ignore_index = True).reset_index(drop = True))

dyad_df = deepcopy(pd.merge(dyad_df, dyadic_borders_df, how = 'left', on = ['c_code_a', 'c_code_b', 'year']))

contcold columns: 

['c_code_a', 'c_code_b', 'land_contiguity', 'sea_contiguity', 'total_contiguity', 'year']


In [33]:
dyadic_alliance_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/alliance_v4.1_by_dyad_yearly.csv', encoding = 'utf8')
dyadic_alliance_df.rename({'ccode1': 'c_code_a',
                           'ccode2': 'c_code_b',
                           'defense': 'defense_alliance',
                           'neutrality': 'neutrality_alliance', 
                           'entente': 'entente_alliance'}, axis = 1, inplace = True)

dyadic_alliance_df = deepcopy(dyadic_alliance_df[['c_code_a',
                                                  'c_code_b',
                                                  'year',
                                                  'defense_alliance',
                                                  'neutrality_alliance',
                                                  'entente_alliance']])

print("alliance_v4.1_by_dyad_yearly columns: \n")
print(sorted(list(dyadic_alliance_df.columns)))

dyadic_alliance_union_df = deepcopy(dyadic_alliance_df)

## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
dyadic_alliance_union_df.rename({'c_code_a': 'c_code_a_new',
                                'c_code_b': 'c_code_b_new'}, axis = 1, inplace = True)

dyadic_alliance_union_df.rename({'c_code_a_new': 'c_code_b',
                                'c_code_b_new': 'c_code_a'}, axis = 1, inplace = True)

dyadic_alliance_df = deepcopy(pd.concat([dyadic_alliance_df, dyadic_alliance_union_df], ignore_index = True).reset_index(drop = True))

dyad_df = deepcopy(pd.merge(dyad_df, dyadic_alliance_df, how = 'left', on = ['c_code_a', 'c_code_b', 'year']))

alliance_v4.1_by_dyad_yearly columns: 

['c_code_a', 'c_code_b', 'defense_alliance', 'entente_alliance', 'neutrality_alliance', 'year']


In [34]:
for i, state_name_a in enumerate(dyad_df['state_name_a']):
    dyad_list = []
    dyad_list.append(state_name_a)
    dyad_list.append(dyad_df.loc[i, 'state_name_b'])
    dyad_list = str(sorted(dyad_list))
    dyad_df.loc[i, 'conflict_pair'] = dyad_list

dyad_df['year'] = dyad_df['year'].astype(int)
dyad_df.sort_values(by = 'year', ascending = True, inplace = True)

dyad_df = deepcopy(dyad_df[['war_num',
                            'c_code_a',
                            'state_name_a',
                            'c_code_b',
                            'state_name_b',
                            'year',
                            'land_contiguity',
                            'sea_contiguity',
                            'total_contiguity',
                            'defense_alliance',
                            'neutrality_alliance',
                            'entente_alliance',
                            'conflict_pair']])

dyad_df = deepcopy(dyad_df[(dyad_df['state_name_a'].isnull()==False) & (dyad_df['state_name_b'].isnull()==False)])
## need to dedupe across conflict pair so a vs b are never repeated interchangably
duplicate_list = ['war_num', 'conflict_pair']
dyad_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
dyad_df.rename({'year': 'first_year'}, axis = 1, inplace = True)

dyad_df = deepcopy(dyad_df.reset_index(drop = True))
dyad_df.drop('conflict_pair', axis = 1, inplace = True)

In [35]:
## filling in nulls with zeros
dyad_df.loc[dyad_df['defense_alliance'].isnull(), 'defense_alliance'] = 0
dyad_df.loc[dyad_df['neutrality_alliance'].isnull(), 'neutrality_alliance'] = 0
dyad_df.loc[dyad_df['entente_alliance'].isnull(), 'entente_alliance'] = 0
dyad_df.loc[dyad_df['land_contiguity'].isnull(), 'land_contiguity'] = 0
dyad_df.loc[dyad_df['sea_contiguity'].isnull(), 'sea_contiguity'] = 0
dyad_df.loc[dyad_df['total_contiguity'].isnull(), 'total_contiguity'] = 0

## Adding Descriptive Data for Dyads

# Check Between Dyads and Participants

In [36]:
war_list = list(set(list(dyad_df['war_num'])))
for war in war_list:
    participant_list = list(participant_df[participant_df['war_num']==war]['state_name'])
    dyad_list = list(set(list(dyad_df[dyad_df['war_num']==war]['state_name_a']) + list(dyad_df[dyad_df['war_num']==war]['state_name_b'])))
    for country in dyad_list:
        if country not in participant_list:
            war_name = participant_df[participant_df['war_num']==war]['war_name'].values[0]
            print(str(war)[:-2] + ", " + war_name + ": " + str(country))
        else:
            pass

4, First Russo-Turkish: Turkey
10, Austro-Sardinian: Austria-Hungary
13, First Schleswig-Holstein: Germany
16, Roman Republic: Austria-Hungary
28, Italian Unification: Italy
28, Italian Unification: Austria-Hungary
34, Italian-Roman: Italy
37, Neapolitan: Italy
46, Second Schleswig-Holstein: Austria-Hungary
55, Seven Weeks: Austria-Hungary
139, World War II: Thailand
159, Taiwan Straits: China
159, Taiwan Straits: Taiwan
160, Assam: China


# Descriptive Statistics for Each Country by Year

In [37]:
# df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Diplomatic_Exchange_2006v1.csv')

In [38]:
# # https://sites.google.com/site/joseantoniocheibub/datasets/democracy-and-dictatorship-revisited
# # non cow data-set
    
# gov_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/ddrevisited_data_v1.csv', encoding = 'latin-1')
# print(gov_df.columns)

# gov_df.rename({'chgterr': 'territory_change',
#                'ychgterr': 'territory_change_year',
#                'entryy': 'first_recorded_year',
#                'exity': 'last_recorded_year',
#                'bornyear': 'born_year',
#                'endyear': 'died_year',
#                'exselec': 'election_type',
#                'legselec': 'legislation_type',
#                'closed': 'legislature_status',
#                'dejure': 'party_legal_status',
#                'defacto': 'party_existance',
#                'defacto2': 'party_existance_outside_regime', 
#                'lparty': 'legislature_parties',
#                'incumb': 'incumbent_type',
#                'collect': 'collective_leadership',
#                'eheads': 'num_leadership_changes',
#                'ehead': 'leader_name',
#                'epost': 'post_name',
#                'edate': 'entrance_date',
#                'ageeh': 'leader_tenure',
#                'emil': 'military_leader',
#                'royal': 'royal_leader',
#                'comm': 'communist_leader',
#                'edeath': 'leader_died',
#                'democracy': 'democratic_regime',
#                'assconfid': 'cabinet_assembly',
#                'poppreselec': 'popular_election',
#                'regime': 'regime_type',
#                'ttd': 'transition_to_democracy',
#                'tta': 'transition_to_dictatorship',
#                'agedem': 'age_govt',
#                'stra': 'num_transitions_ever',
#                'cowcode': 'c_code_a',
#                'ctryname': 'state_name_a',
#                'headdiff': 'nominal_vs_eff_diff',
#                'un_region_name': 'un_region',
#                'un_continent_name': 'un_continent',
#                 'cowcode2': 'c_code_b'}, axis = 1, inplace = True)

In [39]:
# co-emissions-per-capita.csv

## borders
# pd.read_csv('/Users/charlieyaris/github/international_armed_conflict/Data Sources/contcold.csv', encoding = 'latin-1')

In [40]:
alliance_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/alliance_v4.1_by_member_yearly.csv', encoding = 'latin-1')

print("alliance_v4.1_by_member_yearly columns: \n")
print(sorted(list(alliance_df.columns)))

alliance_df.rename({'ccode': 'c_code', 'nonaggression': 'non_aggression_alliances', 'entente': 'entente_alliances', 'ss_type': 'alliances'}, axis = 1, inplace = True)

alliance_df['non_aggression_alliances'] = alliance_df['non_aggression_alliances'].astype(float)
alliance_df['entente_alliances'] = alliance_df['entente_alliances'].astype(float)

aggregations = {
    'alliances': 'count',
    'non_aggression_alliances': 'sum',
    'entente_alliances': 'sum'
    }

alliance_df = deepcopy(alliance_df.groupby(['c_code', 'year']).agg(aggregations).reset_index())

alliance_v4.1_by_member_yearly columns: 

['all_end_day', 'all_end_month', 'all_end_year', 'all_st_day', 'all_st_month', 'all_st_year', 'ccode', 'defense', 'entente', 'left_censor', 'mem_end_day', 'mem_end_month', 'mem_end_year', 'mem_st_day', 'mem_st_month', 'mem_st_year', 'neutrality', 'nonaggression', 'right_censor', 'ss_type', 'state_name', 'version', 'version4id', 'year']


In [41]:
dyadic_trade_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/Dyadic_COW_4.0.csv', encoding = 'utf8')

print("Dyadic_COW_4.0 columns: \n")
print(sorted(list(dyadic_trade_df.columns)))

dyadic_trade_df.rename({'ccode1': 'c_code_a',
                        'ccode2': 'c_code_b',
                        'flow1': 'money_flow_out_a',
                        'flow2': 'money_flow_in_a'}, axis = 1, inplace = True)

## flow is in units of millions of us dollars in todays currency.
dyadic_trade_df['money_flow_out_a'] = ([s * 1000000 for s in dyadic_trade_df['money_flow_out_a']])
dyadic_trade_df['money_flow_in_a'] = ([s * 1000000 for s in dyadic_trade_df['money_flow_in_a']])
# -9000000 is unknown value
dyadic_trade_df.loc[dyadic_trade_df['money_flow_out_a'] == -9000000, 'money_flow_out_a'] = None
dyadic_trade_df.loc[dyadic_trade_df['money_flow_in_a'] == -9000000, 'money_flow_in_a'] = None

dyadic_trade_df = deepcopy(dyadic_trade_df[['year',
                                            'c_code_a',
                                            'c_code_b',
                                            'money_flow_out_a',
                                            'money_flow_in_a']])

Dyadic_COW_4.0 columns: 

['bel_lux_alt_flow1', 'bel_lux_alt_flow2', 'ccode1', 'ccode2', 'china_alt_flow1', 'china_alt_flow2', 'dip1', 'dip2', 'flow1', 'flow2', 'importer1', 'importer2', 'smoothflow1', 'smoothflow2', 'smoothtotrade', 'source1', 'source2', 'spike1', 'spike2', 'tradedip', 'trdspike', 'version', 'year']


In [42]:
# dyadic_trade_df.head(3)

In [43]:
## need to union to take summations but won't need to dedupe because there are no duplicates between a and b.
# this means a can be summed on its own when it's combined with b.

dyadic_trade_union_df = deepcopy(dyadic_trade_df)
## doing these inefficient column name changes to fill in for a much needed sql union of mismatching column names
dyadic_trade_union_df.rename({'c_code_a': 'c_code_a_new',
                      'c_code_b': 'c_code_b_new',
                      'money_flow_out_a': 'money_flow_out_a_new',
                      'money_flow_in_a': 'money_flow_in_a_new'}, axis = 1, inplace = True)

dyadic_trade_union_df.rename({'c_code_a_new': 'c_code_b',
                      'c_code_b_new': 'c_code_a',
                      'money_flow_out_a_new': 'money_flow_in_a',
                      'money_flow_in_a_new': 'money_flow_out_a'}, axis = 1, inplace = True)

dyadic_trade_df = deepcopy(pd.concat([dyadic_trade_df, dyadic_trade_union_df], ignore_index = True).reset_index(drop = True))

In [44]:
# dyadic_trade_df.head(3)

In [45]:
aggregations = {
    'money_flow_in_a': 'sum',
    'money_flow_out_a': 'sum',
    }

trade_df_1 = dyadic_trade_df.groupby(['c_code_a', 'year']).agg(aggregations).reset_index()
trade_df_1.rename({'c_code_a':'c_code', 'money_flow_in_a': 'money_flow_in', 'money_flow_out_a': 'money_flow_out'}, axis = 1, inplace = True)

In [46]:
# trade_df_1.head(3)

In [47]:
trade_df_2 = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/National_COW_4.0.csv', encoding = 'latin-1')
trade_df_2.rename({'ccode': 'c_code'}, axis = 1, inplace = True)

print("National_COW_4.0 columns: \n")
print(sorted(list(trade_df_2.columns)))

trade_df_2 = deepcopy(trade_df_2[['c_code', 'year', 'imports', 'exports']])

trade_df = deepcopy(pd.merge(trade_df_1, trade_df_2, how = 'outer', on = ['c_code', 'year']))

National_COW_4.0 columns: 

['alt_exports', 'alt_imports', 'c_code', 'exports', 'imports', 'source1', 'source2', 'stateabb', 'statename', 'version', 'year']


In [48]:
# trade_df.head(3)

In [49]:
mil_cap_df = pd.read_csv('/Users/the_networks_of_war/data_sources/csvs/NMC_5_0-wsupplementary.csv', encoding = 'latin-1')

mil_cap_df.rename({'milex': 'military_expenditure',
                   'milper': 'military_personnel',
                   'irst': 'iron_steel_production',
                   'pec': 'prim_energy_consumption',
                   'tpop': 'total_population',
                   'upop': 'urban_population',
                   'upopgrowth': 'urban_pop_growth_rate',
                   'ccode': 'c_code',
#                    'statenme': 'state_name',
                   'cinc': 'cinc_score'}, axis = 1, inplace = True)

print("NMC_5_0-wsupplementary columns: \n")
print(sorted(list(mil_cap_df.columns)))

mil_cap_df['military_expenditure'] = ([s * 1000 for s in mil_cap_df['military_expenditure']])
mil_cap_df['military_personnel'] = ([s * 1000 for s in mil_cap_df['military_personnel']])
mil_cap_df['total_population'] = ([s * 1000 for s in mil_cap_df['total_population']])
mil_cap_df['urban_population'] = ([s * 1000 for s in mil_cap_df['urban_population']])
mil_cap_df['iron_steel_production'] = ([s * 2000000 for s in mil_cap_df['iron_steel_production']])
mil_cap_df['prim_energy_consumption'] = ([s * 2000000 for s in mil_cap_df['prim_energy_consumption']])

mil_cap_df.loc[mil_cap_df['military_expenditure'] == -9000, 'military_expenditure'] = 0
mil_cap_df.loc[mil_cap_df['military_personnel'] == -9000, 'military_personnel'] = 0
mil_cap_df.loc[mil_cap_df['total_population'] == -9000, 'total_population'] = 0
mil_cap_df.loc[mil_cap_df['urban_population'] == -9000, 'urban_population'] = 0
mil_cap_df.loc[mil_cap_df['iron_steel_production'] == -18000000 , 'iron_steel_production'] = 0
mil_cap_df.loc[mil_cap_df['prim_energy_consumption'] == -18000000 , 'prim_energy_consumption'] = 0

mil_cap_df = mil_cap_df.sort_values(by = 'year', ascending = True).reset_index(drop = True)

mil_cap_df = deepcopy(mil_cap_df[['c_code',
                                  'year',
                                  'military_expenditure',
                                  'military_personnel',
                                  'prim_energy_consumption',
                                  'iron_steel_production',
                                  'total_population',
                                  'urban_population',
                                  'cinc_score']])

NMC_5_0-wsupplementary columns: 

['c_code', 'cinc_score', 'iron_steel_production', 'irstanomalycode', 'irstnote', 'irstqualitycode', 'irstsource', 'milexnote', 'milexsource', 'military_expenditure', 'military_personnel', 'milpernote', 'milpersource', 'pecanomalycode', 'pecnote', 'pecqualitycode', 'pecsource', 'prim_energy_consumption', 'stateabb', 'statenme', 'total_population', 'tpopanomalycode', 'tpopnote', 'tpopqualitycode', 'tpopsource', 'upopanomalycode', 'upopgrowthsource', 'upopnote', 'upopqualitycode', 'upopsource', 'urban_pop_growth_rate', 'urban_population', 'version', 'year']


In [50]:
mil_cap_df.head(3)

Unnamed: 0,c_code,year,military_expenditure,military_personnel,prim_energy_consumption,iron_steel_production,total_population,urban_population,cinc_score
0,2,1816,3823000,17000,508000000,160000000,8659000.0,101000.0,0.0397
1,230,1816,6512000,125000,0,20000000,11073000.0,221000.0,0.04639
2,210,1816,2375000,26000,2284000000,100000000,5610000.0,337000.0,0.03991


## Merging Descriptive DataFrames

In [51]:
descriptive_df = deepcopy(pd.merge(trade_df, mil_cap_df, how = 'outer', on = ['c_code', 'year']))
descriptive_df = deepcopy(pd.merge(descriptive_df, alliance_df, how = 'outer', on = ['c_code', 'year']))
descriptive_df['year'] = descriptive_df['year'].astype(float)

descriptive_df.rename({'year': 'start_year'}, axis = 1, inplace = True)

participant_df['start_year'] = participant_df['start_year'].astype(float)
participant_df['end_year'] = participant_df['end_year'].astype(float)

participant_df = deepcopy(pd.merge(participant_df, descriptive_df, how = 'left', on = ['c_code', 'start_year']))
descriptive_df.rename({'start_year': 'end_year'}, axis = 1, inplace = True)
participant_df = deepcopy(pd.merge(participant_df, descriptive_df, how = 'left', on = ['c_code', 'end_year']))

print("participant_df columns: \n")
print(sorted(list(participant_df.columns)))

participant_df columns: 

['alliances_x', 'alliances_y', 'battle_deaths', 'c_code', 'cinc_score_x', 'cinc_score_y', 'days_at_war', 'end_date', 'end_year', 'entente_alliances_x', 'entente_alliances_y', 'exports_x', 'exports_y', 'imports_x', 'imports_y', 'iron_steel_production_x', 'iron_steel_production_y', 'lagging_war', 'leading_war', 'military_expenditure_x', 'military_expenditure_y', 'military_personnel_x', 'military_personnel_y', 'money_flow_in_x', 'money_flow_in_y', 'money_flow_out_x', 'money_flow_out_y', 'non_aggression_alliances_x', 'non_aggression_alliances_y', 'prim_energy_consumption_x', 'prim_energy_consumption_y', 'side', 'side_peak_battle_forces', 'side_peak_forces_available', 'start_date', 'start_year', 'state_name', 'total_deaths_both_sides', 'total_population_x', 'total_population_y', 'urban_population_x', 'urban_population_y', 'war_name', 'war_num', 'war_sub)type', 'war_sub_type', 'war_type', 'war_type_code']


In [52]:
## filling in N/A with None
participant_df.loc[participant_df['c_code'].astype(float)==-8, 'c_code'] = None
participant_df.loc[participant_df['lagging_war'].astype(float)==-8, 'lagging_war'] = None
participant_df.loc[participant_df['leading_war'].astype(float)==-8, 'leading_war'] = None

## filling in nulls with zeros
## these are ones that most likely mean zero if null (not due to missing data)
participant_df.loc[participant_df['alliances_x'].isnull(), 'alliances_x'] = 0
participant_df.loc[participant_df['alliances_y'].isnull(), 'alliances_y'] = 0
participant_df.loc[participant_df['alliances_x'].isnull(), 'alliances_x'] = 0
participant_df.loc[participant_df['alliances_y'].isnull(), 'alliances_y'] = 0
participant_df.loc[participant_df['entente_alliances_x'].isnull(), 'entente_alliances_x'] = 0
participant_df.loc[participant_df['entente_alliances_y'].isnull(), 'entente_alliances_y'] = 0
participant_df.loc[participant_df['non_aggression_alliances_x'].isnull(), 'non_aggression_alliances_x'] = 0
participant_df.loc[participant_df['non_aggression_alliances_y'].isnull(), 'non_aggression_alliances_y'] = 0

In [53]:
print("{} total participants.".format(len(participant_df)))
print("{} total conflicts.".format(len(dyad_df)))
print("{} total wars.".format(len(war_df)))

1712 total participants.
1055 total conflicts.
651 total wars.


In [54]:
descriptive_df.to_pickle('/Users/the_networks_of_war/data_sources/pickles/descriptive_df.pkl')
participant_df.to_pickle('/Users/the_networks_of_war/data_sources/pickles/participant_df.pkl')
dyad_df.to_pickle('/Users/the_networks_of_war/data_sources/pickles/dyad_df.pkl')
war_df.to_pickle('/Users/the_networks_of_war/data_sources/pickles/war_df.pkl')

In [55]:
descriptive_df = pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/descriptive_df.pkl')
participant_df = pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/participant_df.pkl')
dyad_df = pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/dyad_df.pkl')
war_df = pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/war_df.pkl')

In [56]:
# why would vietnam and republic of vietnam not appear in contiguity data?

In [None]:
file_df = pd.DataFrame()

for i, war in enumerate(war_df['war_num']):
    
    file_directory = './json_files_by_war/'
    file_name = 'war_num_' + str(war).replace('.', '_') + '.json'
    file_df.loc[i, 'file_name'] = file_name
    file_df.loc[i, 'war_type'] = war_df[war_df['war_num']==war]['war_type'].values[0]
    
    participant_df_copy = deepcopy(pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/participant_df.pkl'))
    dyad_df_copy = deepcopy(pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/dyad_df.pkl'))
    war_df_copy = deepcopy(pd.read_pickle('/Users/the_networks_of_war/data_sources/pickles/war_df.pkl'))

    war_df_copy = deepcopy(war_df_copy[war_df_copy['war_num']==war].reset_index(drop = True))
    participant_df_copy = deepcopy(participant_df_copy[participant_df_copy['war_num']==war].reset_index(drop = True))
    dyad_df_copy = deepcopy(dyad_df_copy[dyad_df_copy['war_num']==war].reset_index(drop = True))

    network_nodes = list(participant_df_copy['state_name'])

    graph_file = open(file_directory + file_name, 'w').close()
    graph_file = open(file_directory + file_name, 'w')

    json_dic = {}
    json_dic['nodes'] = {}
    json_dic['links'] = {}
    start_line = '{\n  "war": [\n'
    middle_line_1 = '  ],\n  "nodes": [\n'
    middle_line_2 = '  ],\n  "links": [\n'
    end_line = '\n  ]\n}'

    graph_file.write(start_line)

    add_line = ('    {"war_name": "' + str(war_df_copy.loc[0, 'war_name'])
                + '", "war_num": "' + str(war_df_copy.loc[0, 'war_num'])
                + '", "war_type_code": "' + str(war_df_copy.loc[0, 'war_type_code'])
                + '", "war_type": "' + str(war_df_copy.loc[0, 'war_type'])
                + '", "lagging_war": "' + str(war_df_copy.loc[0, 'lagging_war'])
                + '", "leading_war": "' + str(war_df_copy.loc[0, 'leading_war'])
                + '"}\n')

    graph_file.write(add_line)

    graph_file.write(middle_line_1)
    for i, node in enumerate(participant_df_copy['state_name']):
        add_line = (
            '    {"country": "' + str(participant_df_copy.loc[i, 'state_name'])
            + '", "country_code": "' + str(participant_df_copy.loc[i, 'c_code'])
            + '", "side": "' + str(participant_df_copy.loc[i, 'side'])
            + '", "battle_deaths": "' + str(participant_df_copy.loc[i, 'battle_deaths'])
            + '", "start_year": "' + str(participant_df_copy.loc[i, 'start_year'])
            + '", "start_date": "' + str(participant_df_copy.loc[i, 'start_date'])
            + '", "end_year": "' + str(participant_df_copy.loc[i, 'end_year'])
            + '", "end_date": "' + str(participant_df_copy.loc[i, 'end_date'])
            + '", "days_at_war": "' + str(participant_df_copy.loc[i, 'days_at_war'])
            + '", "alliances_x": "' + str(participant_df_copy.loc[i, 'alliances_x'])
            + '", "alliances_y": "' + str(participant_df_copy.loc[i, 'alliances_y'])
            + '", "non_aggression_alliances_x": "' + str(participant_df_copy.loc[i, 'non_aggression_alliances_x'])
            + '", "non_aggression_alliances_y": "' + str(participant_df_copy.loc[i, 'non_aggression_alliances_y'])
            + '", "entente_alliances_x": "' + str(participant_df_copy.loc[i, 'entente_alliances_x'])
            + '", "entente_alliances_y": "' + str(participant_df_copy.loc[i, 'entente_alliances_y'])
            + '", "money_flow_out_x": "' + str(participant_df_copy.loc[i, 'money_flow_out_x'])
            + '", "money_flow_out_y": "' + str(participant_df_copy.loc[i, 'money_flow_out_y'])
            + '", "money_flow_in_x": "' + str(participant_df_copy.loc[i, 'money_flow_in_x'])
            + '", "money_flow_in_y": "' + str(participant_df_copy.loc[i, 'money_flow_in_y'])
            + '", "imports_x": "' + str(participant_df_copy.loc[i, 'imports_x'])
            + '", "imports_y": "' + str(participant_df_copy.loc[i, 'imports_y'])
            + '", "exports_x": "' + str(participant_df_copy.loc[i, 'exports_x'])
            + '", "exports_y": "' + str(participant_df_copy.loc[i, 'exports_y'])
            + '", "military_expenditure_x": "' + str(participant_df_copy.loc[i, 'military_expenditure_x'])
            + '", "military_expenditure_y": "' + str(participant_df_copy.loc[i, 'military_expenditure_y'])
            + '", "military_personnel_x": "' + str(participant_df_copy.loc[i, 'military_personnel_x'])
            + '", "military_personnel_y": "' + str(participant_df_copy.loc[i, 'military_personnel_y'])
            + '", "prim_energy_consumption_x": "' + str(participant_df_copy.loc[i, 'prim_energy_consumption_x'])
            + '", "prim_energy_consumption_y": "' + str(participant_df_copy.loc[i, 'prim_energy_consumption_y'])
            + '", "iron_steel_production_x": "' + str(participant_df_copy.loc[i, 'iron_steel_production_x'])
            + '", "iron_steel_production_y": "' + str(participant_df_copy.loc[i, 'iron_steel_production_y'])
            + '", "total_population_x": "' + str(participant_df_copy.loc[i, 'total_population_x'])
            + '", "total_population_y": "' + str(participant_df_copy.loc[i, 'total_population_y'])
            + '", "urban_population_x": "' + str(participant_df_copy.loc[i, 'urban_population_x'])
            + '", "urban_population_y": "' + str(participant_df_copy.loc[i, 'urban_population_y'])
            + '", "cinc_score_x": "' + str(participant_df_copy.loc[i, 'cinc_score_x'])
            + '", "cinc_score_y": "' + str(participant_df_copy.loc[i, 'cinc_score_y'])
            + '"'
        )
        if node == network_nodes[-1]:
            add_line = add_line + '}\n'
        else:
            add_line = add_line + '},\n'
        graph_file.write(add_line)

    graph_file.write(middle_line_2)


    ## sometimes a country could be in the dyad and not in the participant df_copy.
    ## this is rare but has happened (see spain in WWII)
    ## for this reason, a full outer join or something like it must incorporate/address all that is in both dataframes

    for i, node_1 in enumerate(dyad_df_copy['state_name_a']):
        if node_1 not in network_nodes:
            print("{} is missing from participant_df for {}.".format(c_code_dic[dyad_df_copy.loc[i, 'c_code_a']], war_df_copy.loc[0, 'war_name']))
        elif dyad_df_copy.loc[i, 'state_name_b'] not in network_nodes:
            print("{} is missing from participant_df for {}.".format(c_code_dic[dyad_df_copy.loc[i, 'c_code_b']], war_df_copy.loc[0, 'war_name']))
        else:
            add_line = (
                '    {"source": ' + str(network_nodes.index(node_1))
                + ', "target": ' + str(network_nodes.index(dyad_df_copy.loc[i, 'state_name_b']))
                + ', "first_year": ' + str(dyad_df_copy.loc[i, 'first_year'].astype(float))
                + ', "defense_alliance": ' + str(dyad_df_copy.loc[i, 'defense_alliance'].astype(float))
                + ', "neutrality_alliance": ' + str(dyad_df_copy.loc[i, 'neutrality_alliance'].astype(float))
                + ', "entente_alliance": ' + str(dyad_df_copy.loc[i, 'entente_alliance'].astype(float))
                + ', "land_contiguity": ' + str(dyad_df_copy.loc[i, 'land_contiguity'].astype(int))
                + ', "sea_contiguity": ' + str(dyad_df_copy.loc[i, 'sea_contiguity'].astype(int))
                + ', "total_contiguity": ' + str(dyad_df_copy.loc[i, 'total_contiguity'].astype(int))
                + ', "bond": ' + str(1))
            if i+1 != len(dyad_df_copy['state_name_a']):
                add_line = add_line + '},\n'
            else:
                add_line = add_line + '}\n'
            graph_file.write(add_line)

    graph_file.write(end_line)



Thailand is missing from participant_df for World War II.
Thailand is missing from participant_df for World War II.
Austria-Hungary is missing from participant_df for Seven Weeks.
Austria-Hungary is missing from participant_df for Seven Weeks.
Austria-Hungary is missing from participant_df for Seven Weeks.
Austria-Hungary is missing from participant_df for Austro-Sardinian.
Austria-Hungary is missing from participant_df for Austro-Sardinian.
Austria-Hungary is missing from participant_df for Austro-Sardinian.
Austria-Hungary is missing from participant_df for Roman Republic.
Austria-Hungary is missing from participant_df for Second Schleswig-Holstein.
Italy is missing from participant_df for Italian Unification.
Austria-Hungary is missing from participant_df for Italian Unification.
Italy is missing from participant_df for Neapolitan.
Germany is missing from participant_df for First Schleswig-Holstein.
Italy is missing from participant_df for Italian-Roman.


In [None]:
file_df.to_csv('war_file_list.csv', index = None)