In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from traceback import format_exc
from pprint import pprint
import the_networks_of_war_python_functions

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Setup for Identifying Countries by Code
### Note: This is helpful for when different names are used for the same country.

In [3]:
csv_directory = '/Users/the_networks_of_war/data_sources/csvs/'
c_code_df = pd.read_csv(csv_directory + 'COW country codes.csv', encoding = 'utf8')

c_code_df.rename({'CCode': 'c_code',
                  'StateNme': 'country'}, axis=1, inplace = True)

c_code_df.drop(['StateAbb'], axis=1, inplace = True)

duplicate_list = ['c_code', 'country']
c_code_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
c_code_df = deepcopy(c_code_df.reset_index(drop = True))

c_code_dic = {}
for i, c_code in enumerate(c_code_df['c_code']):
    c_code_dic[c_code] = c_code_df.loc[i, 'country']
    
print('total countries: {}'.format(format(len(c_code_dic.keys()), ',d')))

total countries: 217


## Participant DataFrames
### Note: Only inter-state wars have different source files for dyadic and participant data.

### Inter-State Wars

In [4]:
part_df_1 = pd.read_csv(csv_directory + 'Inter-StateWarData_v4.0.csv', encoding = 'latin-1')

part_df_1.rename({'WarNum': 'war_num',
                  'WarName': 'war_name',
                  'WarType': 'war_type',
                  'ccode': 'c_code',
                  'StateName': 'participant',
                  'Side': 'side',
                  'BatDeath': 'battle_deaths',
                  'StartYear1': 'start_year',
                  'StartMonth1': 'start_month',
                  'StartDay1': 'start_day',
                  'EndYear1': 'end_year',
                  'EndMonth1': 'end_month',
                  'EndDay1': 'end_day'}, axis=1, inplace = True)

part_df_1 = deepcopy(the_networks_of_war_python_functions.start_and_end_dates(part_df_1))

total rows with both dates found: 337
total rows with at least one date not found: 0



In [5]:
aggregations = {
    ## accounting for all cases where countries have more than one side
    ## they will become side 3
    'side': 'sum',
    'battle_deaths': 'sum',
    'start_date': 'min',
    'start_year': 'max',
    'end_date': 'max',
    'end_year': 'max',
    'days_at_war': 'max',
    'ongoing_participation': 'max'
    }

part_df_1 = deepcopy(part_df_1.groupby(['war_num',
                                        'war_name',
                                        'war_type',
                                        'c_code',
                                        'participant']).agg(aggregations).reset_index())

part_df_1 = deepcopy(part_df_1[['war_num',
                                'war_name',
                                'war_type',
                                'c_code',
                                'participant',
                                'side',
                                'battle_deaths',
                                'start_date',
                                'start_year',
                                'end_date',
                                'end_year',
                                'days_at_war',
                                'ongoing_participation']])

In [6]:
# part_df.tail()

### Intra-State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [7]:
## creating new dataframe to union to interstate wars part_df
## note: the code below is very intefficient because something is off with integer formatting in this file.
## this did not occur in the interstate war file.
part_df_2 = pd.read_csv(csv_directory + 'INTRA-STATE_State_participants v5.1.csv', encoding = 'latin-1')

## either one of these a or b may not actually be states.
## this wil be fixed later on
part_df_2.rename({'WarNum': 'war_num',
                  'WarName': 'war_name',
                  'WarType': 'war_type',
                  'CcodeA': 'c_code_a',
                  'CcodeB': 'c_code_b',
                  'SideA': 'participant_a',
                  'SideB': 'participant_b',
                  'StartDy1': 'start_day',
                  'StartMo1': 'start_month',
                  'StartYr1': 'start_year',
                  'EndDy1': 'end_day',
                  'EndMo1': 'end_month',
                  'EndYr1': 'end_year',
                  ## unsure if these are the same as battle deaths, or include civilians
                  'Deaths A': 'battle_deaths_a',
                  'Deaths B': 'battle_deaths_b',
                  ## according to documentation, this includes both sides
                  'TotalBDeaths': 'total_deaths_both_sides',
                  'WDuratDays': 'total_days_in_war',
                  'SideAPeakTotForces': 'peak_forces_available_a',
                  'SideBPeakTotForces': 'peak_forces_available_b',
                  'SideAPeak TheatForces': 'peak_battle_forces_a',
                  'SideBPeakTheatForces': 'peak_battle_forces_b',
                  'TransFrom': 'lagging_war',
                  'TransTo': 'leading_war'}, axis=1, inplace = True)

## fixing for leap year issue
part_df_2.loc[(part_df_2['start_day']==29) & (part_df_2['start_month']==2) & (part_df_2['start_year']==1894), 'start_day'] = 28
## adjusting for wrong start year
## this needs to be automated (check for  'of ___' in war_name where start_year <> ___)
part_df_2.loc[part_df_2['war_num']==976, 'start_year'] = '2011'

## two syrian arab spring wars starting on the same date.
## I believe one of these is a data entry error.
## combining both war_nums into one war
part_df_2.loc[part_df_2['war_num']==977, 'war_num'] = 979

extra_switch_columns = ['peak_forces_available_a',
                        'peak_forces_available_b',
                        'peak_battle_forces_a',
                        'peak_battle_forces_b']
format_part_df_return = deepcopy(the_networks_of_war_python_functions.format_part_df_from_dyadic_data(part_df_2, extra_switch_columns))
part_df_2 = deepcopy(format_part_df_return[0])
dyad_df_2 = deepcopy(format_part_df_return[1])

total rows with both dates found: 452
total rows with at least one date not found: 141



### Extra State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [8]:
## creating new dataframe to union to extra-state wars part_df
## inefficient pipeline from above was used to accomodate integer formatting
## unsure if that problem occurs for this one too though
part_df_3 = pd.read_csv(csv_directory + 'Extra-StateWarData_v4.0.csv', encoding = 'latin-1')

## either one of these a or b may not actually be states.
## this wil be fixed later on
part_df_3.rename({'WarNum': 'war_num',
                  'WarName': 'war_name',
                  'WarType': 'war_type',
                  'ccode1': 'c_code_a',
                  'ccode2': 'c_code_b',
                  'SideA': 'participant_a',
                  'SideB': 'participant_b',
                  'StartDay1': 'start_day',
                  'StartMonth1': 'start_month',
                  'StartYear1': 'start_year',
                  'EndDay1': 'end_day',
                  'EndMonth1': 'end_month',
                  'EndYear1': 'end_year',
                  ## unsure if these are the same as battle deaths, or include civilians
                  'BatDeath': 'battle_deaths_a',
                  'NonStateDeaths': 'battle_deaths_b'}, axis=1, inplace = True)

format_part_df_return = deepcopy(the_networks_of_war_python_functions.format_part_df_from_dyadic_data(part_df_3, None))
part_df_3 = deepcopy(format_part_df_return[0])
dyad_df_3 = deepcopy(format_part_df_return[1])

total rows with both dates found: 124
total rows with at least one date not found: 74



### Combining Participant Sources

In [9]:
part_df = deepcopy(pd.concat([part_df_1, part_df_2], sort=True, ignore_index=True).reset_index(drop = True))
part_df = deepcopy(pd.concat([part_df, part_df_3], sort=True, ignore_index=True).reset_index(drop=True))

## keeping only essential columns
part_df = deepcopy(part_df[['war_num',
                            'war_name',
                            'war_type',
                            'c_code',
                            'participant',
                            'side',
                            'battle_deaths',
                            'start_date',
                            'start_year',
                            'end_date',
                            'end_year',
                            'days_at_war',
                            'lagging_war',
                            'leading_war',
                            'ongoing_participation',
                            'total_deaths_both_sides',
                            'peak_forces_available',
                            'peak_battle_forces']])

## removing non applicable participants
part_df = deepcopy(part_df[part_df['participant']!='-8']).reset_index(drop = True)

print('Total War Participants After Merging All War Types: {}'.format(format(len(part_df), ',d')))

Total War Participants After Merging All War Types: 1,705


In [10]:
part_df.rename({'war_type': 'war_type_code'}, axis=1, inplace = True)

## two lists, one for war_types and one for war_sub_types.
## the indexes for each must line up.
war_types = ['Inter-State War',
             'Extra-State War',
             'Extra-State War',
             'Intra-State War',
             'Intra-State War',
             'Intra-State War',
             'Intra-State War',
             'Non-State War',
             'Non-State War']

war_sub_types = ['',
                 'Colonial (conflict with colony)',
                 'Imperial (state vs non-state)',
                 'Civil War (for central control)',
                 'Civil War (over local issues)',
                 'Regional/Internal',
                 'Intercommunal',
                 'In Non-State Territory',
                 'Across State Borders']

for i, war_type in enumerate(war_types):
    ## specifying the subtypes of each war based on the documentation
    part_df.loc[part_df['war_type_code']==i+1, 'war_type'] = war_types[i]
    part_df.loc[part_df['war_type_code']==i+1, 'war_sub_type'] = war_sub_types[i]

In [11]:
## filling in non-applicable values with None
part_df = deepcopy(the_networks_of_war_python_functions.remaining_participant_null_values(part_df))

In [12]:
## creating war_dic that will be used to fill in missing values
war_dic = {}
for i, war_num in enumerate(part_df['war_num']):
    war_dic[war_num] = part_df.loc[i, 'war_name']

## Creating Inter-State Dyadic DataFrame

### Note: The other dyadic dataframes have already been defined above.

### Reading in the data and unioning each participant so they both appear as a and b

In [13]:
## battle deaths and start/end dates are in this file too, but it's more confusing than the part_df.
## this will just be used to get the combinations of countries directly at war with each other.

dyad_df_1 = pd.read_csv(csv_directory + 'directed_dyadic_war.csv', encoding = 'latin-1')

dyad_df_1.rename({'warnum': 'war_num',
                  'statea': 'c_code_a',
                  'stateb': 'c_code_b',
                  'batdtha': 'battle_deaths_a',
                  'batdthb': 'battle_deaths_b',
                  'batdths': 'total_battle_deaths'}, axis=1, inplace = True)

dyad_df_1 = deepcopy(dyad_df_1[['war_num',
                                'c_code_a',
                                'c_code_b',
                                'year',
                                'battle_deaths_a',
                                'battle_deaths_b',
                                'disno',
                                'total_battle_deaths']])

for i, c_code_a in enumerate(dyad_df_1['c_code_a']):
    dyad_df_1.loc[i, 'participant_a'] = c_code_dic[c_code_a]
    dyad_df_1.loc[i, 'participant_b'] = c_code_dic[dyad_df_1.loc[i, 'c_code_b']] 
    
## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b',
                         'participant_a',
                         'participant_b',
                         'battle_deaths_a',
                         'battle_deaths_b']
dyad_df_1 = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(dyad_df_1, switched_columns_list))

# # ## fixing data entry error
# dyad_df.loc[dyad_df['war_num']==106, 'war_end_year'] = 1918

### Checking for any missing dyads that can be extracted from MID data
### Adding in Dyads (Not Included) Marked as War==1 in MID Data

In [14]:
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding = 'latin-1')
## only including war conflicts
mid_df = deepcopy(mid_df[mid_df['war']==1])
mid_df.rename({'statea': 'c_code_a', 'stateb': 'c_code_b'}, axis=1, inplace = True)
mid_df = deepcopy(mid_df[['c_code_a', 'c_code_b', 'year']])
## unioning mismatching columns so each participant will get their own row
switched_columns_list = ['c_code_a',
                         'c_code_b']
mid_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(mid_df, switched_columns_list))

duplicate_list = ['c_code_a', 'c_code_b', 'year']
mid_df.sort_values(by = 'year', ascending = True, inplace = True)
## removing duplicates before merging
mid_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)

dyad_df_1 = deepcopy(pd.merge(dyad_df_1, mid_df, how = 'outer', on = ['c_code_a', 'c_code_b', 'year']))

In [15]:
## these have been manually reviewed to all be WWII dyads
total_dyads_added = deepcopy(len(dyad_df_1[dyad_df_1['war_num'].isnull()]))

for i, war in enumerate(dyad_df_1['war_num']):
    if str(dyad_df_1.loc[i, 'war_num'])=='nan':
        ## these have been manually reviewed to all be WWII dyads
        dyad_df_1.loc[i, 'war_num'] = 139
        dyad_df_1.loc[i, 'participant_a'] = c_code_dic[dyad_df_1.loc[i, 'c_code_a']]
        dyad_df_1.loc[i, 'participant_b'] = c_code_dic[dyad_df_1.loc[i, 'c_code_b']]
## should be at least 8
print('Total Dyads Added From MIDs: {}'.format(format(total_dyads_added, ',d')))
print('note: these have all been manually reviewed to be WWII dyads.')

Total Dyads Added From MIDs: 8
note: these have all been manually reviewed to be WWII dyads.


In [16]:
dyad_df_1.tail(total_dyads_added)

Unnamed: 0,battle_deaths_a,battle_deaths_b,c_code_a,c_code_b,disno,participant_a,participant_b,total_battle_deaths,war_num,year
2728,,,325,355,,Italy,Bulgaria,,139.0,1943
2729,,,355,325,,Bulgaria,Italy,,139.0,1943
2730,,,355,325,,Bulgaria,Italy,,139.0,1944
2731,,,325,355,,Italy,Bulgaria,,139.0,1944
2732,,,255,375,,Germany,Finland,,139.0,1944
2733,,,375,255,,Finland,Germany,,139.0,1944
2734,,,255,375,,Germany,Finland,,139.0,1945
2735,,,375,255,,Finland,Germany,,139.0,1945


## Combining Dyadic Sources
#### Second and third dyadic sources are defined above during processing for participant dataframes.

In [17]:
## removing non applicable participants
## don't need to do this for inter-state war because all is applicable
dyad_df_2 = deepcopy(dyad_df_2[dyad_df_2['participant_a']!='-8']).reset_index(drop=True)
dyad_df_2 = deepcopy(dyad_df_2[dyad_df_2['participant_b']!='-8']).reset_index(drop=True)
dyad_df_3 = deepcopy(dyad_df_3[dyad_df_3['participant_a']!='-8']).reset_index(drop=True)
dyad_df_3 = deepcopy(dyad_df_3[dyad_df_3['participant_b']!='-8']).reset_index(drop=True)

print('Total Inter-State War Dyads: {}'.format(format(int(len(dyad_df_1)/2), ',d')))
print('Total Intra-State War Dyads: {}'.format(format(int(len(dyad_df_2)/2), ',d')))
print('Total Extra-State War Dyads: {}'.format(format(int(len(dyad_df_3)/2), ',d')))

dyad_df = deepcopy(pd.concat([dyad_df_1, dyad_df_2], sort=True, ignore_index=True).reset_index(drop=True))
dyad_df = deepcopy(pd.concat([dyad_df, dyad_df_3], sort=True, ignore_index=True).reset_index(drop=True))

print('Total Dyads After Merging All War Types: {}'.format(format(int(len(dyad_df)/2), ',d')))

## saving this for process below
dyad_df_for_missing_values = deepcopy(dyad_df)
## dropping all columns that were only needed for adding missing values
dyad_df.drop(['disno',
              'battle_deaths_a',
              'battle_deaths_b',
              'total_battle_deaths'], axis=1, inplace=True)

Total Inter-State War Dyads: 1,368
Total Intra-State War Dyads: 420
Total Extra-State War Dyads: 164
Total Dyads After Merging All War Types: 1,952


## Final Data Adjustments

## Addressing Missing Values from both Participant and Dyadic Data

### Adding in Missing Dyads for Wars with Only One Possible Adverary

In [18]:
## need to figure out a way to add dyadic data when it's missing.
## these are clear cases where it should be added because one side on the war is only one country.
## or, both sides are only one country
## it'll be trickier when each side isn't just one country.
## that will lead to a floating noad that isn't grounded in the network analysis graph

total_dyads = deepcopy(len(dyad_df))
for i, war_num in enumerate(list(set(list(part_df['war_num'])))):
    total_side_1 = len(part_df[(part_df['war_num']==war_num) & (part_df['side']==1)])
    total_side_2 = len(part_df[(part_df['war_num']==war_num) & (part_df['side']==2)])
    if total_side_1==1 and total_side_2==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df, dyad_df, war_num, 1))
    elif total_side_1==1 and total_side_2!=1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df, dyad_df, war_num, 1))
    elif total_side_1!=1 and total_side_2==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df, dyad_df, war_num, 2))
    else:
        pass

dyads_added = deepcopy(len(dyad_df) - total_dyads)
print('Checking for floating participants with only one possible adversary.')
print('Total Dyads Added: {}'.format(format(dyads_added, ',d')))

Checking for floating participants with only one possible adversary.
Total Dyads Added: 188


### Adding in Missing Participants that Appear in Dyadic Data for War

In [19]:
print('Participants Added from Dyadic Data:\n')
war_list = list(set(list(dyad_df['war_num'])))

## filling in values below using MID data
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding = 'latin-1')
mid_df.rename({'statea': 'c_code_a',
               'stateb': 'c_code_b',
               'namea': 'participant_a',
               'nameb': 'participant_b'}, axis=1, inplace = True)

## giving this to the opposite side of the participant they fought against in the war (grabs first record)
## troublesome if they swithced sides but this would be very rare
opposing_side_dic = {1: 2, 2: 1, 3: 3}

original_part_df_length = deepcopy(len(part_df))
## the following checks for missing data across participant names and c_codes
## this is only possible (at the moment) for participants with c_codes, because the rest comes from dyadic data
for war_num in war_list:

    part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
    participant_list = list(part_df_copy['c_code'])
    dyad_df_copy = deepcopy(dyad_df[dyad_df['war_num']==war_num].reset_index(drop=True))
    dyad_list = list(set(list(dyad_df_copy['c_code_a']) + list(dyad_df_copy['c_code_b'])))
    try:
        dyad_list.remove(-8)
    except:
        pass
    for participant in dyad_list:
        if participant not in participant_list:
            war_num = part_df_copy['war_num'].values[0]
            print(str(war_num)[:-2] + ', ' + war_dic[war_num] + ': ' + str(participant) + ', ' + c_code_dic[participant])
            
            temp_df_1 = deepcopy(dyad_df_for_missing_values[(dyad_df_for_missing_values['war_num']==war_num) & ((dyad_df_for_missing_values['c_code_a']==participant) | (dyad_df_for_missing_values['c_code_b']==participant))].reset_index())
            dispute_numbers = list(set(list(temp_df_1['disno'].values)))
            ## should always be 1
            if len(dispute_numbers) > 1:
                print('missing participant has more than 1 dispute.')
                print('logic will need to be adjusted.')
                print('total disputes for missing participant: {}'.format(len(dispute_numbers)))
            dispute_number = dispute_numbers[0]
            
            temp_df_2 = deepcopy(mid_df[(mid_df['disno']==dispute_number) & ((mid_df['c_code_a']==participant)|(mid_df['c_code_b']==participant))].reset_index())
            temp_df_2.rename({'strtday': 'start_day',
                              'strtmnth': 'start_month',
                              'strtyr': 'start_year',
                              'endday': 'end_day',
                              'endmnth': 'end_month',
                              'endyear': 'end_year',
                             }, axis=1, inplace = True)
            temp_df_2['war_num'] = war_num
            temp_df_2 = deepcopy(the_networks_of_war_python_functions.start_and_end_dates(temp_df_2))
            aggregations = {
                'start_date': 'min',
                'end_date': 'max',
                'days_at_war': 'max'
                }
            temp_df_2 = deepcopy(temp_df_2.groupby(['war_num', 'c_code_a', 'participant_a', 'c_code_b', 'participant_b']).agg(aggregations).reset_index())

            ## manually filling in values that are found in dyadic cow datasets but seem to be missing from country level sources.
            ## values have been obtained from dyadic data (directed_dyadic_war.csv' and dyadic MIDs 3.1.csv)
            df_length = deepcopy(len(part_df))
            
            if len(temp_df_2[temp_df_2['c_code_a']==participant]) > 0:
                part_df.loc[df_length, 'c_code'] = temp_df_2[temp_df_2['c_code_a']==participant]['c_code_a'].values[0]
                ## assuming it's in c_code_dic
                ## otherwise, this breaks!
                part_df.loc[df_length, 'participant'] = c_code_dic[temp_df_2[temp_df_2['c_code_a']==participant]['c_code_a'].values[0]]
                opposite_participant = temp_df_2[temp_df_2['c_code_a']==participant]['c_code_b'].values[0]
            else:
                part_df.loc[df_length, 'c_code'] = temp_df_2[temp_df_2['c_code_b']==participant]['c_code_b'].values[0]
                ## assuming it's in c_code_dic
                ## otherwise, this breaks!
                part_df.loc[df_length, 'participant'] = c_code_dic[temp_df_2[temp_df_2['c_code_b']==participant]['c_code_a'].values[0]]
                opposite_participant = temp_df_2[temp_df_2['c_code_a']==participant]['c_code_b'].values[0]
             
            part_df.loc[df_length, 'war_num'] = war_num
            part_df.loc[df_length, 'war_name'] = war_dic[war_num]
            part_df.loc[df_length, 'war_type'] = part_df[part_df['war_num']==war_num]['war_type'].values[0]
            part_df.loc[df_length, 'war_type_code'] = part_df[part_df['war_num']==war_num]['war_type_code'].values[0]
            part_df.loc[df_length, 'war_sub_type'] = part_df[part_df['war_num']==war_num]['war_sub_type'].values[0]
            
            part_df.loc[df_length, 'side'] = opposing_side_dic[part_df[(part_df['war_num']==war_num) & (part_df['c_code']==opposite_participant)]['side'].values[0]]
            part_df.loc[df_length, 'battle_deaths'] = max(list(temp_df_1[temp_df_1['c_code_a']==participant]['battle_deaths_a']) + list(temp_df_1[temp_df_1['c_code_b']==participant]['battle_deaths_b']))
            
            part_df.loc[df_length, 'start_date'] = temp_df_2['start_date'].values[0]
            part_df.loc[df_length, 'start_year'] = float(str(temp_df_2['start_date'].values[0])[0:4])
            part_df.loc[df_length, 'end_date'] = pd.to_datetime(str(temp_df_2['end_date'].values[0])[0:11])
            part_df.loc[df_length, 'end_year'] = float(str(temp_df_2['end_date'].values[0])[0:4])
            ## no idea why days_at_war doesn't work in the function.
            ## adding it manually here
            ## finishing this over two lines to convert from string to int
            part_df.loc[df_length, 'days_at_war'] = part_df.loc[df_length, 'end_date'] - part_df.loc[df_length, 'start_date']
            part_df.loc[df_length, 'days_at_war'] = int(str(part_df.loc[df_length, 'days_at_war']).split(' ')[0])
            ## prevent duplication in for loop
            ## this may be an issue if they fought with more than one country
            participant_list.append(participant)
        else:
            pass
        
part_df['war_num'] = part_df['war_num'].astype(float)
part_df['start_year'] = part_df['start_year'].astype(float)
part_df['end_year'] = part_df['end_year'].astype(float)

Participants Added from Dyadic Data:

108, Latvian Liberation: 290.0, Poland
total rows with both dates found: 2
total rows with at least one date not found: 0

108, Latvian Liberation: 200.0, United Kingdom
total rows with both dates found: 6
total rows with at least one date not found: 0

108, Latvian Liberation: 220.0, France
total rows with both dates found: 4
total rows with at least one date not found: 0

139, World War II: 800.0, Thailand
total rows with both dates found: 24
total rows with at least one date not found: 0

139, World War II: 230.0, Spain
total rows with both dates found: 8
total rows with at least one date not found: 0

151, Korean: 920.0, New Zealand
total rows with both dates found: 16
total rows with at least one date not found: 0

169, Six Day War: 645.0, Iraq
total rows with both dates found: 2
total rows with at least one date not found: 0

184, Turco-Cypriot: 350.0, Greece
total rows with both dates found: 2
total rows with at least one date not found: 0



In [20]:
# missing value for thailand battle deaths in WWII obtained from wikipedia
# https://en.wikipedia.org/wiki/Thailand_in_World_War_II#:~:text=Thailand%20suffered%20about%205%2C569%20military,the%20brief%20Franco%2DThai%20War.
part_df.loc[(part_df['war_num']==139) & (part_df['participant']=='Thailand'), 'battle_deaths'] = 5569
# missing value for greece battle deaths in Turco Cypriot obtained from wikipedia
# https://en.wikipedia.org/wiki/Turkish_invasion_of_Cyprus#:~:text=The%20violence%20resulted%20in%20the,of%2025%2C000%E2%80%9330%2C000%20Turkish%20Cypriots.
part_df.loc[(part_df['war_num']==184) & (part_df['participant']=='Greece'), 'battle_deaths'] = 105

In [21]:
## check for all values that were just added
## making sure no fields are null that shouldn't be null
# part_df.tail(len(part_df)-original_part_df_length)

## Defining War DataFrame (One row for each war)

In [22]:
part_df_copy = deepcopy(part_df)
part_df_copy.rename({'participant': 'total_participants'}, axis=1, inplace = True)

## filling these dates in arbitrarily before taking aggregates
## high date for start_date because this will be min
## low date for end_date because this will be max
part_df_copy['start_date'].fillna(pd.to_datetime('2100-01-01'), inplace = True)
part_df_copy['end_date'].fillna(pd.to_datetime('1700-01-01'), inplace = True)
part_df_copy.rename({'ongoing_participation': 'ongoing_war'}, axis=1, inplace = True)

name_change_count = 0
for i, war in enumerate(part_df_copy['war_name']):
    if part_df_copy.loc[i, 'ongoing_war']==1:
        pass
    elif 'present' in part_df_copy.loc[i, 'war_name'].lower() or 'ongoing' in part_df_copy.loc[i, 'war_name'].lower():
        part_df_copy.loc[i, 'ongoing_war'] = 1
        name_change_count+=1
    if ' of 1' in part_df_copy.loc[i, 'war_name']:
        part_df_copy.loc[i, 'war_name'] = part_df_copy.loc[i, 'war_name'].split(' of 1')[0].replace('  ', ' ')
    elif ' of 2' in part_df_copy.loc[i, 'war_name']:
        part_df_copy.loc[i, 'war_name'] = part_df_copy.loc[i, 'war_name'].split(' of 2')[0].replace('  ', ' ')
        
print('Wars Changed to Ongoing: {}'.format(name_change_count))

Wars Changed to Ongoing: 19


In [23]:
aggregations = {
    'total_participants': 'count',
#     'potential_start_year': 'min',
    'start_year': 'min',
    'end_year': 'max',
    ## this will not be accurate if there are more than one lagging/leading wars per war.
    'lagging_war': 'min',
    'leading_war': 'max',
    'ongoing_war': 'max',
    'start_date': 'min',
    'end_date': 'max'
    ## not sure how to add this one just yet
#     'total_deaths_both_sides': 'max'
    }

war_df = deepcopy(part_df_copy.groupby(['war_num', 'war_name', 'war_type_code', 'war_type', 'war_sub_type']).agg(aggregations).reset_index())

## putting these back to none in case they made it through the aggregation
war_df.loc[war_df['start_date']==pd.to_datetime('2100-01-01'), 'start_date'] = None
war_df.loc[war_df['end_date']==pd.to_datetime('1700-01-01'), 'end_date'] = None
war_df['total_days_in_war'] = war_df['end_date'] - war_df['start_date']

for i, war in enumerate(war_df['war_name']):
    try:
        war_df.loc[i, 'total_days_in_war'] = int(str(war_df.loc[i, 'total_days_in_war']).split(' ')[0])
    except:
        war_df.loc[i, 'total_days_in_war'] = None
        
war_df = deepcopy(war_df.sort_values(by = ['start_year', 'end_year', 'war_name'], ascending = (False, True, True)))

## Integreating Descriptive Data (Defined in Outside Notebook)

### Merging Participant Data with Participant-Level Descriptive Data

In [24]:
pickle_directory = '/Users/the_networks_of_war/data_sources/pickles/'
descriptive_df_1 = pd.read_pickle(pickle_directory + 'participant_descriptive_df.pkl')

part_df['start_year'] = part_df['start_year'].astype(float)
part_df['end_year'] = part_df['end_year'].astype(float)
descriptive_df_1.rename({'year': 'start_year'}, axis=1, inplace = True)

part_df = deepcopy(pd.merge(part_df, descriptive_df_1, how = 'left', on = ['c_code', 'start_year']))
descriptive_df_1.rename({'start_year': 'end_year'}, axis=1, inplace = True)
part_df = deepcopy(pd.merge(part_df, descriptive_df_1, how = 'left', on = ['c_code', 'end_year']))

### Merging Dyadic Data with Dyadic-Level Descriptive Data

In [25]:
descriptive_df_2 = pd.read_pickle(pickle_directory + 'dyadic_descriptive_df.pkl')
dyad_df = deepcopy(pd.merge(dyad_df, descriptive_df_2, how = 'left', on = ['c_code_a', 'c_code_b', 'year']))

## Finalizing Participant and Dyadic Dataframes

### Keeping values for first and last year of each dyad
### Combining these into one row per dyad

In [26]:
## creating unique identifier "conflict_pair" for each dyad
## based on name
for i, participant_a in enumerate(dyad_df['participant_a']):
    dyad_list = []
    dyad_list.append(participant_a)
    dyad_list.append(dyad_df.loc[i, 'participant_b'])
    dyad_list = str(sorted(dyad_list))
    dyad_df.loc[i, 'conflict_pair'] = dyad_list

dyad_df_columns = ['war_num', 'year', 'participant_a', 'participant_b', 'conflict_pair']
for column in list(descriptive_df_2.columns):
    dyad_df_columns.append(column)
    dyad_df['year'] = dyad_df['year'].astype(int)
## not including any row without any participants
dyad_df = deepcopy(dyad_df[(dyad_df['participant_a'].isnull()==False) & (dyad_df['participant_b'].isnull()==False)])

## creating new dyad_df to get the max of each field during all the years of the dyad
max_dyad_df = deepcopy(dyad_df)
descriptive_columns =  list(descriptive_df_2.columns)
descriptive_columns.remove('year')
descriptive_columns.remove('c_code_a')
descriptive_columns.remove('c_code_b')
aggregations = {}
for column in descriptive_columns:
    max_dyad_df.rename({column: column + '_z'}, axis=1, inplace = True)
    aggregations[column + '_z'] = 'max'
max_dyad_df = deepcopy(max_dyad_df.groupby(['war_num', 'conflict_pair']).agg(aggregations).reset_index())

## need to dedupe across conflict pair so a vs b are never repeated interchangably
duplicate_list = ['war_num', 'conflict_pair']

first_year_dyad_df = deepcopy(dyad_df)
first_year_dyad_df.sort_values(by = 'year', ascending = True, inplace = True)
## keeping the values for only the first year of conflict within a given dyad
first_year_dyad_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)
first_year_dyad_df.rename({'year': 'first_year'}, axis=1, inplace = True)

last_year_dyad_df = deepcopy(dyad_df)
last_year_dyad_df.sort_values(by = 'year', ascending = True, inplace = True)
## keeping the values for only the last year of conflict within a given dyad
last_year_dyad_df.drop_duplicates(subset = duplicate_list, keep = 'last', inplace = True)
last_year_dyad_df.rename({'year': 'last_year'}, axis=1, inplace = True)

## combining first and last year dyads into one dataframe
dyad_df = deepcopy(pd.merge(first_year_dyad_df, last_year_dyad_df, how = 'left', on = ['conflict_pair', 'war_num']))
## combining the maximum df into the final version of the dataframe
dyad_df = deepcopy(pd.merge(dyad_df, max_dyad_df, how = 'left', on = ['conflict_pair', 'war_num']))
## changing column names and then dropping duplicates
## adding them into the join woudl fail (since not all have c_codes)
dyad_df.rename({'c_code_a_x':'c_code_a',
                'c_code_b_x': 'c_code_b',
                'participant_a_x':'participant_a',
                'participant_b_x': 'participant_b'}, axis=1, inplace = True)
dyad_df = deepcopy(dyad_df.reset_index(drop = True))
dyad_df.drop(['conflict_pair',
              'c_code_a_y',
              'c_code_b_y',
              'participant_a_y',
              'participant_b_y'], axis=1, inplace = True)

### Addressing in null values, missing data, and conversions for dyads and participants

In [27]:
conversion_dic = {'money_flow_in': 1000000,
                  'money_flow_out': 1000000,
                  'military_expenditure': 1000,
                  'military_personnel': 1000,
                  'total_population': 1000,
                  'urban_population': 1000,
                  ## these are thousands of tons
                  'iron_steel_production': 2000000,
                  'prim_energy_consumption': 2000000
                 }

part_df = deepcopy(the_networks_of_war_python_functions.column_fills_and_converions(part_df, conversion_dic))
dyad_df = deepcopy(the_networks_of_war_python_functions.column_fills_and_converions(dyad_df, None))


total columns adjusted: 30
total columns adjusted for conversion: 16
null values notated: 24,818
unknown values notated: 476

total columns adjusted: 52
total columns adjusted for conversion: 0
null values notated: 50,182
unknown values notated: 0


In [28]:
dyad_df['test'] = 1
aggregations = {}
for column in descriptive_columns:
    aggregations[column + '_z'] = 'sum'
dyad_df_z = deepcopy(dyad_df.groupby('test').agg(aggregations).reset_index())
for column in sorted(list(dyad_df_z.drop('test', axis=1).columns)):
    print('{}: {}'.format(column[:-2], format(int(dyad_df_z[column].values[0]), ',d')))

alliance: 40
both_collective_leadership: 0
both_communist_leaders: 4
both_democratic_regimes: 3
both_dictatorships: 31
both_military_leaders: 12
both_royal_leaders: 0
colonial_contiguity: 40
contiguity: 152
defense_cooperation_agreements: 24
diplomatic_exchange: 90
inter_governmental_organizations: 294
same_leader_type: 12
same_regime_type: 19
territory_exchange: 56
trade_relations: 243


### Saving the data

In [29]:
print('total participants: {}'.format(format(len(part_df), ',d')))
print('total dyadic combinations: {}'.format(format(len(dyad_df), ',d')))
print('total wars: {}'.format(format(len(war_df), ',d')))

part_df.to_pickle(pickle_directory + 'part_df.pkl')
dyad_df.to_pickle(pickle_directory + 'dyad_df.pkl')
war_df.to_pickle(pickle_directory + 'war_df.pkl')

total participants: 1,713
total dyadic combinations: 1,066
total wars: 678


In [30]:
# part_df = pd.read_pickle(pickle_directory + '/part_df.pkl')
# dyad_df = pd.read_pickle(pickle_directory + 'dyad_df.pkl')
# war_df = pd.read_pickle(pickle_directory + 'war_df.pkl')

## JSON Export for D3.js Processing

In [31]:
file_df = pd.DataFrame()

war_column_list = list(war_df.columns)
## dropping fields that won't be needed in the participant section of the json file
participant_column_list = list(part_df.drop(['war_num',
                                             'war_name',
                                             'war_type',
                                             'war_sub_type',
                                             'total_deaths_both_sides'], axis=1).columns)
## dropping fields that won't be needed in the dyad section of the json file
dyad_column_list = list(dyad_df.drop(['war_num',
                                      'c_code_a',
                                      'c_code_b',
                                      'participant_a',
                                      'participant_b'], axis=1).columns)

csv_directory = '../../assets/the_networks_of_war/json_files_by_war/'
print('json files to be rewritten: {}\n'.format(format(len(war_df), ',d')))
for i, war in enumerate(war_df['war_num']):
    
    file_name = 'war_num_' + str(war).replace('.', '_') + '.json'
    file_df.loc[i, 'file_name'] = file_name
    for column in war_column_list:
        file_df.loc[i, column] = war_df[war_df['war_num']==war][column].values[0]

    part_df_copy = deepcopy(pd.read_pickle(pickle_directory + 'part_df.pkl'))
    part_df_copy = deepcopy(part_df_copy[part_df_copy['war_num']==war].reset_index(drop = True))
    dyad_df_copy = deepcopy(pd.read_pickle(pickle_directory + 'dyad_df.pkl'))
    dyad_df_copy = deepcopy(dyad_df_copy[dyad_df_copy['war_num']==war].reset_index(drop = True))
    war_df_copy = deepcopy(pd.read_pickle(pickle_directory + 'war_df.pkl'))
    war_df_copy = deepcopy(war_df_copy[war_df_copy['war_num']==war].reset_index(drop = True))
    
    node_names = []
    for i, participant in enumerate(part_df_copy['participant']):
        c_code_input = part_df_copy.loc[i, 'c_code']
        if c_code_input==-8:
            node_names.append(participant)
        else:
            node_names.append(part_df_copy.loc[i, 'c_code'])

    ## rewriting all of the files each time
    graph_file = open(csv_directory + file_name, 'w').close()
    graph_file = open(csv_directory + file_name, 'w')
    graph_file.write('{\n  "war": [\n    {')
    war_line = ''
    for i, column in enumerate(war_column_list):
        if i==0:
            war_line = deepcopy(war_line + '"' + column + '": "' + str(war_df_copy.loc[0, column]))
        else:
            war_line = deepcopy(war_line + '", "' + column + '": "' + str(war_df_copy.loc[0, column]))

    war_line = deepcopy(war_line + '"}\n')
    graph_file.write(war_line)
    graph_file.write('  ],\n  "nodes": [\n')

    participant_line = ''
    for i, node in enumerate(part_df_copy['participant']):
        participant_line = deepcopy(participant_line + '    {"id": "' + str(i))
        for column in participant_column_list:
            participant_line = deepcopy(participant_line + '", "' + column + '": "' + str(part_df_copy.loc[i, column]))
        participant_line = deepcopy(participant_line + '"},\n')
        
    participant_line = deepcopy(participant_line[:-2] + '\n')
    graph_file.write(participant_line)
    graph_file.write('  ],\n  "links": [\n')

    link_line = ''
    dyad_inputs = []
    if len(dyad_df_copy)==0:
        print('No dyads for war_num {} ({}), {} participants'.format(war, war_df_copy.loc[0, 'war_name'], len(part_df_copy)))
    for i, node_1 in enumerate(dyad_df_copy['c_code_a']):
        if dyad_df_copy.loc[i, 'c_code_a']==-8:
            node_1_input = dyad_df_copy.loc[i, 'participant_a']
            dyad_inputs.append(node_1_input)
        else:
            node_1_input = node_1
            dyad_inputs.append(node_1_input)
        if dyad_df_copy.loc[i, 'c_code_b']==-8:
            node_2_input = dyad_df_copy.loc[i, 'participant_b']
            dyad_inputs.append(node_2_input)
        else:
            node_2_input = dyad_df_copy.loc[i, 'c_code_b']
            dyad_inputs.append(node_2_input)
        ## sometimes a country could be in the dyad and not in the participant df_copy.
        ## this is rare but has happened (see Spain in WWII)
        ## should be taken care of in processes above, so checking below to see if that process failed
        if node_1_input not in node_names:
            print("{} is still missing from part_df for {}.".format(dyad_df_copy.loc[i, 'participant_a'], war_df_copy.loc[0, 'war_name']))   
        elif node_2_input not in node_names:
            print("{} is still missing from part_df for {}.".format(dyad_df_copy.loc[i, 'participant_b'], war_df_copy.loc[0, 'war_name']))   
        else:
            link_line = deepcopy(link_line
                + '    {"source": "' + str(node_names.index(node_1_input))
                + '", "target": "' + str(node_names.index(node_2_input)))
            for column in dyad_column_list:
                link_line = deepcopy(link_line + '", "' + column + '": "' + str(dyad_df_copy.loc[i, column]))
            link_line = deepcopy(link_line + '"},\n')
                                
    link_line = deepcopy(link_line[:-2] + '\n')
    graph_file.write(link_line)
    graph_file.write('\n  ]\n}')
            
    for node in node_names:
        if node not in dyad_inputs:
            if int(str(node)[0]) in np.arange(0, 10):
                print('Floating Node in {}: {}'.format(war_df_copy.loc[0, 'war_name'], c_code_dic[node]))
            else:
                print('Floating Node in {}: {}'.format(war_df_copy.loc[0, 'war_name'], node))

json files to be rewritten: 678

Floating Node in Third Somalia War: United States of America
Floating Node in Third Somalia War: Uganda
Floating Node in Third Somalia War: Kenya
Floating Node in Third Somalia War: Burundi
Floating Node in Third Somalia War: Ethiopia
Floating Node in Third Somalia War: Eritrea
Floating Node in Africa's World War: Central African Republic
Floating Node in Africa's World War: Chad
Floating Node in Africa's World War: Angola
Floating Node in Africa's World War: Zimbabwe
Floating Node in Africa's World War: Namibia
Floating Node in Africa's World War: Sudan
Floating Node in Africa's World War: Rwanda
Floating Node in Africa's World War: Burundi
Floating Node in Africa's World War: Uganda
Floating Node in Fourth Lebanese War: United States of America
Floating Node in Fourth Lebanese War: France
Floating Node in Fourth Lebanese War: Iran
Floating Node in Fourth Lebanese War: Syria
Floating Node in Second South Sudan War: Libya
Floating Node in Second South S

In [32]:
file_df.to_csv('../../assets/the_networks_of_war/war_file_list.csv', index = None)