In [1]:
import os
from copy import deepcopy
from traceback import format_exc
# from pprint import pprint

import numpy as np
import pandas as pd

import the_networks_of_war_python_functions

In [2]:
## display settings: never truncate the column list, and render floats with five decimal places
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.5f}'.format

In [3]:
## directory that holds the source csv files read throughout this notebook.
## it can be overridden with the NETWORKS_OF_WAR_CSV_DIR environment variable so the notebook
## also runs on machines other than the original author's; the original path stays the default
## (an unset or empty variable falls back to it).
## os.path.join(..., '') guarantees a trailing separator, because file names are appended with `+` below.
csv_directory = os.path.join(
    os.environ.get('NETWORKS_OF_WAR_CSV_DIR')
    or '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/csvs/',
    '')

## Setup for Identifying Countries by Code
### Note: This is helpful for when different names are used for the same country.

In [4]:
## lookup keyed by numeric country code; later cells read a country's name with c_code_dic[c_code].
## the cell output shows the helper also reporting the total number of codes it defines.
c_code_dic = the_networks_of_war_python_functions.define_c_code_dic()

Total Country Codes: 217


## Participant DataFrames
### Note: Only inter-state wars have different source files for dyadic and participant data.

### Inter-State Wars

In [5]:
## raw inter-state column names mapped to the names used throughout this notebook.
## only the columns listed here are kept.
part_df_1_renaming = {
    'WarNum': 'war_num',
    'WarName': 'war_name',
    'WarType': 'war_type',
    'ccode': 'c_code',
    'StateName': 'participant',
    'Side': 'side',
    'BatDeath': 'battle_deaths',
    'StartYear1': 'start_year',
    'StartMonth1': 'start_month',
    'StartDay1': 'start_day',
    'EndYear1': 'end_year',
    'EndMonth1': 'end_month',
    'EndDay1': 'end_day',
}

part_df_1 = pd.read_csv(csv_directory + 'Inter-StateWarData_v4.0.csv', encoding='latin-1')
part_df_1 = part_df_1.rename(columns=part_df_1_renaming)
part_df_1 = part_df_1[list(part_df_1_renaming.values())].copy()

## the helper reports how many rows had both dates found and how many dates were estimated
## (presumably it builds the date fields from the day/month/year columns - confirm in the helper)
part_df_1 = deepcopy(the_networks_of_war_python_functions.start_and_end_dates(part_df_1))

Total Rows With Both Dates Found: 337
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0



In [6]:
def _combine_sides(sides):
    """Collapse every side code one participant has in one war into a single code.

    Returns the side itself when the participant only ever appears on one side
    (however many rows it has), 3 when it appears on more than one side, and 0
    when no side is recorded at all (the value a plain sum produced in that case).
    """
    distinct_sides = sides.dropna().unique()
    if len(distinct_sides) > 1:
        return 3
    if len(distinct_sides) == 1:
        return distinct_sides[0]
    return 0


## collapsing to one row per participant per war.
aggregations = {
    ## accounting for all cases where countries have more than one side: they become side 3.
    ## a plain sum only gives 3 for exactly one row on side 1 plus one row on side 2;
    ## two rows on the same side would have produced 2 or 4, so the sides are combined explicitly.
    'side': _combine_sides,
    'battle_deaths': 'sum',
    ## earliest start, latest end. start_year uses 'min' so that it agrees with start_date.
    'start_date': 'min',
    'start_year': 'min',
    'end_date': 'max',
    'end_year': 'max',
    'days_at_war': 'max',
    ## aggregation for estimation fields.
    ## this could fail for aggregations over multiple dates
    'start_date_estimated': 'max',
    'end_date_estimated': 'max',
    'ongoing_participation': 'max'
    }
part_df_1 = deepcopy(part_df_1.groupby(['war_num',
                                        'war_name',
                                        'war_type',
                                        'c_code',
                                        'participant']).agg(aggregations).reset_index())

In [7]:
## participants whose side in WWII (war_num 139) is manually set to 3, keyed by country code:
##   365 (USSR): they invaded Poland before fighting against Germany.
##   375 (Finland): they fought with Germany before fighting against Germany.
wwii_side_overrides = [
    (365, 'Manually changing USSR to side 3 for WWII based on dyadic data.'),
    (375, 'Manually changing Finland to side 3 for WWII based on dyadic data.'),
]

for override_c_code, override_message in wwii_side_overrides:
    print(override_message)
    is_override_row = (part_df_1['war_num'] == 139) & (part_df_1['c_code'] == override_c_code)
    part_df_1.loc[is_override_row, 'side'] = 3

Manually changing USSR to side 3 for WWII based on dyadic data.
Manually changing Finland to side 3 for WWII based on dyadic data.


### Intra-State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [8]:
## raw intra-state column names mapped to the names used in this notebook.
## either one of side a or side b may not actually be a state; this will be fixed later on.
dyad_df_2_renaming = {
    'WarNum': 'war_num',
    'WarName': 'war_name',
    'WarType': 'war_type',
    'CcodeA': 'c_code_a',
    'CcodeB': 'c_code_b',
    'SideA': 'participant_a',
    'SideB': 'participant_b',
    'StartDy1': 'start_day',
    'StartMo1': 'start_month',
    'StartYr1': 'start_year',
    'EndDy1': 'end_day',
    'EndMo1': 'end_month',
    'EndYr1': 'end_year',
    ## unsure if these are the same as battle deaths, or include civilians
    'Deaths A': 'battle_deaths_a',
    'Deaths B': 'battle_deaths_b',
    ## according to documentation, this includes both sides
    'TotalBDeaths': 'total_deaths_both_sides',
    'WDuratDays': 'total_days_in_war',
    'SideAPeakTotForces': 'peak_forces_available_a',
    'SideBPeakTotForces': 'peak_forces_available_b',
    'SideAPeak TheatForces': 'peak_battle_forces_a',
    'SideBPeakTheatForces': 'peak_battle_forces_b',
    'TransFrom': 'lagging_war',
    'TransTo': 'leading_war',
}

## this frame is later unioned with the inter-state participants.
## note: the processing is very inefficient because something is off with integer formatting in this file.
## this did not occur in the inter-state war file.
dyad_df_2 = pd.read_csv(csv_directory + 'INTRA-STATE_State_participants v5.1.csv', encoding='latin-1')
dyad_df_2 = dyad_df_2.rename(columns=dyad_df_2_renaming)
dyad_df_2 = dyad_df_2[list(dyad_df_2_renaming.values())].copy()

## 1894 was not a leap year, so a start date of 29 February 1894 is moved to the 28th
is_invalid_leap_day = ((dyad_df_2['start_year'] == 1894)
                       & (dyad_df_2['start_month'] == 2)
                       & (dyad_df_2['start_day'] == 29))
dyad_df_2.loc[is_invalid_leap_day, 'start_day'] = 28

## adjusting for wrong start year
## this needs to be automated (check for 'of ___' in war_name where start_year <> ___)
is_war_976 = dyad_df_2['war_num'] == 976
dyad_df_2.loc[is_war_976, 'start_year'] = 2011

## two syrian arab spring wars start on the same date.
## I believe one of these is a data entry error, so both war_nums are combined into one war.
is_war_977 = dyad_df_2['war_num'] == 977
dyad_df_2.loc[is_war_977, 'war_num'] = 979

## the helper returns the participant dataframe first and the dyadic dataframe second
format_part_df_return = deepcopy(the_networks_of_war_python_functions.format_part_df_from_dyadic_data(dyad_df_2))
part_df_2 = format_part_df_return[0].copy()
dyad_df_2 = format_part_df_return[1].copy()

Total Rows With Both Dates Found: 569
Total Rows With At Least One Date Not Found: 24
Total Estimated Start Dates: 80
Total Estimated End Dates: 70



### Extra State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [9]:
## raw extra-state column names mapped to the names used in this notebook.
## either one of side a or side b may not actually be a state; this will be fixed later on.
dyad_df_3_renaming = {
    'WarNum': 'war_num',
    'WarName': 'war_name',
    'WarType': 'war_type',
    'ccode1': 'c_code_a',
    'ccode2': 'c_code_b',
    'SideA': 'participant_a',
    'SideB': 'participant_b',
    'StartDay1': 'start_day',
    'StartMonth1': 'start_month',
    'StartYear1': 'start_year',
    'EndDay1': 'end_day',
    'EndMonth1': 'end_month',
    'EndYear1': 'end_year',
    ## unsure if these are the same as battle deaths, or include civilians
    'BatDeath': 'battle_deaths_a',
    'NonStateDeaths': 'battle_deaths_b',
}

## this frame is later unioned with the other participant dataframes.
## the same inefficient pipeline as for the intra-state file is used to accommodate integer formatting;
## unsure if that problem occurs for this file too though.
dyad_df_3 = pd.read_csv(csv_directory + 'Extra-StateWarData_v4.0.csv', encoding='latin-1')
dyad_df_3 = dyad_df_3.rename(columns=dyad_df_3_renaming)
dyad_df_3 = dyad_df_3[list(dyad_df_3_renaming.values())].copy()

## the helper returns the participant dataframe first and the dyadic dataframe second
format_part_df_return = deepcopy(the_networks_of_war_python_functions.format_part_df_from_dyadic_data(dyad_df_3))
part_df_3 = format_part_df_return[0].copy()
dyad_df_3 = format_part_df_return[1].copy()

Total Rows With Both Dates Found: 185
Total Rows With At Least One Date Not Found: 13
Total Estimated Start Dates: 46
Total Estimated End Dates: 38



### Combining Participant Sources

In [10]:
## stacking the participant dataframes of all three war sources, one source at a time
part_df = part_df_1
for next_part_df in (part_df_2, part_df_3):
    part_df = pd.concat([part_df, next_part_df], sort=True, ignore_index=True)

## one more check whether years were inputted correctly
part_df = deepcopy(the_networks_of_war_python_functions.final_date_formatting(part_df))

## keeping only essential columns
essential_participant_columns = [
    'war_num',
    'war_name',
    'war_type',
    'c_code',
    'participant',
    'side',
    'battle_deaths',
    'start_date',
    'start_year',
    'end_date',
    'end_year',
    'days_at_war',
    'lagging_war',
    'leading_war',
    'ongoing_participation',
    'total_deaths_both_sides',
    'peak_forces_available',
    'peak_battle_forces',
    'start_date_estimated',
    'end_date_estimated',
]
part_df = part_df[essential_participant_columns].copy()

total_participants_text = format(len(part_df), ',d')
print('Total War Participants After Merging All War Types: {}'.format(total_participants_text))

Start Years Reformatted: 1,373
End Years Reformatted: 0

Total War Participants After Merging All War Types: 1,705


In [11]:
## the source column holds a numeric code, so it is renamed before readable labels are added
part_df = part_df.rename(columns={'war_type': 'war_type_code'})

## two parallel lists, one for war_types and one for war_sub_types.
## position n (counting from 1) in both lists describes war_type_code n, so the indexes must line up.
war_types = [
    'Inter-State War',
    'Extra-State War',
    'Extra-State War',
    'Intra-State War',
    'Intra-State War',
    'Intra-State War',
    'Intra-State War',
    'Non-State War',
    'Non-State War',
]

war_sub_types = [
    '',
    'Colonial (conflict with colony)',
    'Imperial (state vs non-state)',
    'Civil War (for central control)',
    'Civil War (over local issues)',
    'Regional/Internal',
    'Intercommunal',
    'In Non-State Territory',
    'Across State Borders',
]

## specifying the type and subtype of each war based on the documentation
for type_code, (type_label, sub_type_label) in enumerate(zip(war_types, war_sub_types), start=1):
    has_type_code = part_df['war_type_code'] == type_code
    part_df.loc[has_type_code, 'war_type'] = type_label
    part_df.loc[has_type_code, 'war_sub_type'] = sub_type_label

In [12]:
## core identifying, side and date fields; every other column is handed to the null-value helper
core_participant_fields = [
    'war_num',
    'war_name',
    'war_type_code',
    'war_type',
    'war_sub_type',
    'c_code',
    'participant',
    'side',
    'start_year',
    'end_year',
    'start_date',
    'end_date',
    'days_at_war',
    'start_date_estimated',
    'end_date_estimated',
]
remaining_participant_fields = part_df.drop(columns=core_participant_fields).columns.tolist()

## filling in non-applicable values with None
part_df = deepcopy(the_networks_of_war_python_functions.remaining_participant_null_values(part_df, remaining_participant_fields))

### Creating Dictionary to Lookup Wars Individually

In [13]:
## creating war_dic that will be used to fill in missing values
## built from the 'war_num' and 'war_name' fields of part_df;
## later cells read a war's name with war_dic[war_num]
war_dic = the_networks_of_war_python_functions.dictionary_from_field(part_df, 'war_num', 'war_name')

## Creating Inter-State Dyadic DataFrame

### Note: The other dyadic dataframes have already been defined above.

### Reading in the data and unioning each participant so they both appear as a and b

In [14]:
## battle deaths and start/end dates are in this file too, but it's more confusing than the part_df.
## this file is only used to get the combinations of countries directly at war with each other.

## the mapping also lists columns that are needed later on but don't need a name change ('year', 'disno')
dyad_df_1_renaming = {
    'warnum': 'war_num',
    'statea': 'c_code_a',
    'stateb': 'c_code_b',
    'batdtha': 'battle_deaths_a',
    'batdthb': 'battle_deaths_b',
    'batdths': 'total_battle_deaths',
    'year': 'year',
    'disno': 'disno',
}

dyad_df_1 = pd.read_csv(csv_directory + 'directed_dyadic_war.csv', encoding='latin-1')
dyad_df_1 = dyad_df_1.rename(columns=dyad_df_1_renaming)
dyad_df_1 = dyad_df_1[list(dyad_df_1_renaming.values())].copy()

## naming both members of every dyad from their country codes.
## a code that is missing from c_code_dic raises a KeyError.
dyad_df_1['participant_a'] = [c_code_dic[c_code] for c_code in dyad_df_1['c_code_a']]
dyad_df_1['participant_b'] = [c_code_dic[c_code] for c_code in dyad_df_1['c_code_b']]

dyad_df_1 = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(dyad_df_1))

In [15]:
# # ## fixing data entry error
# dyad_df.loc[dyad_df['war_num']==106, 'war_end_year'] = 1918

### Checking for any missing dyads that can be extracted from MID data
### Adding in Dyads (Not Included) Marked as War==1 in MID Data

In [16]:
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding='latin-1')

## only including war conflicts, reduced to the two country codes and the year
mid_df = (mid_df.loc[mid_df['war'] == 1]
          .rename(columns={'statea': 'c_code_a', 'stateb': 'c_code_b'})
          .loc[:, ['c_code_a', 'c_code_b', 'year']]
          .copy())
mid_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(mid_df))

## the outer merge keeps MID dyad-years that have no counterpart in dyad_df_1
dyad_df_1 = dyad_df_1.merge(mid_df, how='outer', on=['c_code_a', 'c_code_b', 'year']).copy()

In [17]:
## rows without a war_num are the dyad-years that only exist in the MID data
rows_missing_war_num = dyad_df_1.index[dyad_df_1['war_num'].isnull()]
total_dyads_added = len(rows_missing_war_num)

for row_label in rows_missing_war_num:
    ## these have been manually reviewed to all be WWII dyads
    dyad_df_1.loc[row_label, 'war_num'] = 139
    dyad_df_1.loc[row_label, 'participant_a'] = c_code_dic[dyad_df_1.loc[row_label, 'c_code_a']]
    dyad_df_1.loc[row_label, 'participant_b'] = c_code_dic[dyad_df_1.loc[row_label, 'c_code_b']]

## should be 8; any other count means the manual review above no longer covers the data
if total_dyads_added != 8:
    raise Exception('There are usually 8 MIDs added here.')

print('Total Dyads Added From MIDs: {}'.format(format(total_dyads_added, ',d')))
print('Note: These have all been manually reviewed to be WWII dyads.')

Total Dyads Added From MIDs: 8
Note: These have all been manually reviewed to be WWII dyads.


In [18]:
## displaying the last total_dyads_added rows, which are expected to be the dyads added from the MID data
dyad_df_1.tail(total_dyads_added)

Unnamed: 0,battle_deaths_a,battle_deaths_b,c_code_a,c_code_b,disno,participant_a,participant_b,total_battle_deaths,war_num,year
1378,,,255,375,,Germany,Finland,,139.0,1944
1379,,,255,375,,Germany,Finland,,139.0,1945
1380,,,375,255,,Finland,Germany,,139.0,1944
1381,,,375,255,,Finland,Germany,,139.0,1945
1382,,,325,355,,Italy,Bulgaria,,139.0,1943
1383,,,325,355,,Italy,Bulgaria,,139.0,1944
1384,,,355,325,,Bulgaria,Italy,,139.0,1943
1385,,,355,325,,Bulgaria,Italy,,139.0,1944


## Combining Dyadic Sources
#### Second and third dyadic sources are defined above during processing for participant dataframes.

In [19]:
## stacking the dyadic dataframes of all three war sources, one source at a time.
## the full version (all columns) is saved for the missing-values process below.
dyad_df_for_missing_values = dyad_df_1
for next_dyad_df in (dyad_df_2, dyad_df_3):
    dyad_df_for_missing_values = pd.concat([dyad_df_for_missing_values, next_dyad_df],
                                           sort=True, ignore_index=True)

## dropping all columns that were only needed for adding missing values,
## then removing any fully duplicated rows that were present in the three dyadic sources
columns_only_for_missing_values = ['disno',
                                   'battle_deaths_a',
                                   'battle_deaths_b',
                                   'total_battle_deaths']
dyad_df = (dyad_df_for_missing_values
           .drop(columns=columns_only_for_missing_values)
           .drop_duplicates()
           .reset_index(drop=True))

## every dyad appears in both directions, so the row count is halved
print('Total Unique Dyads After Merging All War Types: {}'.format(format(len(dyad_df) // 2, ',d')))

Total Unique Dyads After Merging All War Types: 1,270


## Final Data Adjustments

### Adjusting participant names for part_df and dyad_df

In [20]:
## adjusting and consolidating participant names in both frames with the same helper.
## the second argument tells the helper which frame it receives
## (presumably so it knows which name column(s) to adjust - confirm in the helper)
part_df = deepcopy(the_networks_of_war_python_functions.adjustParticipantNames(part_df, 'participant'))
dyad_df = deepcopy(the_networks_of_war_python_functions.adjustParticipantNames(dyad_df, 'dyad'))

Adjusting and consolidating participant names for part_df.
Adjusting and consolidating participant names for dyad_df.


In [21]:
print('Displaying all participant names.\n')
## collecting names from the participant frame and from BOTH sides of every dyad.
## (participant_a used to be listed twice, so names that only occur as participant_b were never shown.)
## missing names are dropped because they cannot be sorted together with strings.
all_participant_names = pd.concat([part_df['participant'],
                                   dyad_df['participant_a'],
                                   dyad_df['participant_b']], ignore_index=True)
sorted(all_participant_names.dropna().unique())

Displaying all participant names.



['19th Route Army',
 'AFDL Coalition',
 'ALN',
 'ALiR',
 'ANC',
 'AQAP',
 'Abkhazia',
 'Aceh',
 'Afghanistan',
 'Agrarian League',
 'Al Masifu Sect',
 'Al-Shabaab ',
 'Albanians',
 'Albanians & Bosnians',
 'Aleppo',
 'Alfaristas',
 'Algeria',
 'Algerian forces',
 'Ali Pasha Loyalists',
 'An-Fu Faction',
 'Angola',
 'Ansar Allah',
 'Anti-Bolsheviks',
 'Anti-Khomeini Coalition',
 'Anti-Lumumba Coalition',
 'Anti-Reform Rebels',
 'Anti-Shah Coalition',
 'Anya Nya',
 'April 1 Movement',
 'Apristas',
 'Argentina',
 'Armenia',
 'Army of Islam',
 'Army of National Liberation',
 'Ashanti',
 'Ashanti-Ghana',
 'Asir & Yemen Rebels',
 'Asturian Miners',
 'Australia',
 'Austria',
 'Austria-Hungary',
 'Azerbaijan',
 'BRA',
 'Baden',
 'Bahia Sabinada',
 'Bahr el-Ghazal',
 'Bailundu of Angola',
 'Balaida Rebels',
 'Bali',
 'Baltic Partisans',
 'Baluchi Rebels',
 'Bangilima Militia',
 'Banyarwanda Militia',
 "Baron von Ungern-Sternberg's White army",
 'Basmachi',
 'Basuto',
 'Bavaria',
 'Belgium',
 'B

## Addressing Missing Values from both Participant and Dyadic Data

### Adding in Missing Dyads for Wars with Only One Possible Adversary

In [22]:
## need to figure out a way to add dyadic data when it's missing.
## these are the clear cases where it should be added because one side of the war is only one country,
## or both sides are only one country.
## it'll be trickier when each side isn't just one country:
## that will lead to a floating node that isn't grounded in the network analysis graph.

total_dyads = len(dyad_df)

for war_num in part_df['war_num'].unique():

    part_df_copy = part_df[part_df['war_num'] == war_num].reset_index(drop=True).copy()

    ## row masks for this war: membership of each side, and non-state participants (c_code of -8)
    on_side = {side_number: part_df_copy['side'] == side_number for side_number in (1, 2)}
    is_non_state = part_df_copy['c_code'] == -8

    ## distinct participants per side: overall, non-state only (by name) and state only (by c_code)
    participant_counts = {}
    non_state_counts = {}
    state_counts = {}
    for side_number in (1, 2):
        participant_counts[side_number] = len(part_df_copy.loc[on_side[side_number], 'participant'].unique())
        non_state_counts[side_number] = len(part_df_copy.loc[on_side[side_number] & is_non_state, 'participant'].unique())
        state_counts[side_number] = len(part_df_copy.loc[on_side[side_number] & ~is_non_state, 'c_code'].unique())

    ## each group below is an ordered list of (count, side, participant type);
    ## within a group only the first entry whose count is exactly 1 is applied.
    dyad_rule_groups = [
        ## a side made up of a single participant fought everybody on the other side.
        ## if that is not the case, at the very least all opposing parties can be linked to the
        ## non-state participant on the other side when there is only one of them: because this
        ## isn't an inter-state war, all members of the other side are assumed to have fought them.
        [(participant_counts[1], 1, 'all_participants'),
         (participant_counts[2], 2, 'all_participants'),
         (non_state_counts[1], 1, 'non-state'),
         (non_state_counts[2], 2, 'non-state')],
        ## the case where only one state participant was on a particular side in a non-state war.
        ## assuming that this country fought with all opposing parties.
        ## see Eritrea in Third Somalia War
        [(state_counts[1], 1, 'state'),
         (state_counts[2], 2, 'state')],
    ]

    for rule_group in dyad_rule_groups:
        for participant_count, side_number, participant_type in rule_group:
            if participant_count == 1:
                dyad_df = the_networks_of_war_python_functions.add_missing_dyads(
                    part_df_copy, dyad_df, war_num, side_number, participant_type).reset_index(drop=True)
                break

dyads_added = len(dyad_df) - total_dyads
print('Checking for floating participants with only one possible adversary, or one definite adversary.\n')
print('Total Dyads Added Overall: {}'.format(format(dyads_added, ',d')))
print('Total Particpants with Null Start Years: {}'.format(len(part_df[part_df['start_year'].isnull()])))
print('Total Particpants with Null End Years: {}'.format(len(part_df[part_df['end_year'].isnull()])))
print('Total Dyads with Null Years: {}'.format(len(dyad_df[dyad_df['year'].isnull()])))

## unioning to obtain all combinations of dyads that were added
total_dyads = len(dyad_df)
dyad_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(dyad_df))
dyads_added = len(dyad_df) - total_dyads
print('Total Dyads Unioned From Opposite Columns: {}'.format(format(dyads_added, ',d')))

Checking for floating participants with only one possible adversary, or one definite adversary.

Total Dyads Added Overall: 630
Total Particpants with Null Start Years: 0
Total Particpants with Null End Years: 49
Total Dyads with Null Years: 0
Total Dyads Unioned From Opposite Columns: 630


### Adding in Missing Participants that Appear in Dyadic Data for War

In [23]:
## for every war, any country code that appears in the dyadic data but not in the participant data
## gets a new row appended to part_df. dates come from the MID data, battle deaths from the dyadic data.
print('Participants Added from Dyadic Data:\n')
war_list = list(dyad_df['war_num'].unique())

## filling in values below using MID data
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding='latin-1')
mid_df.rename({'statea': 'c_code_a',
               'stateb': 'c_code_b',
               'namea': 'participant_a',
               'nameb': 'participant_b'}, axis=1, inplace=True)

## giving this to the opposite side of the participant they fought against in the war (grabs first record)
## troublesome if they switched sides but this would be very rare
opposing_side_dic = {1: 2,
                     2: 1,
                     3: 3}

original_part_df_length = deepcopy(len(part_df))
## the following checks for missing data across participant names and c_codes
## this is only possible (at the moment) for participants with c_codes, because the rest comes from dyadic data
for war_num in war_list:

    part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
    participant_list = list(part_df_copy['c_code'])
    
    dyad_df_copy = deepcopy(dyad_df[dyad_df['war_num']==war_num].reset_index(drop=True))
    dyad_list = list(set(list(dyad_df_copy['c_code_a']) + list(dyad_df_copy['c_code_b'])))
    
    ## -8 marks non-state participants, which cannot be looked up by country code.
    ## checked explicitly instead of swallowing every exception with a bare except.
    if -8 in dyad_list:
        dyad_list.remove(-8)

    for participant in dyad_list:
        if participant not in participant_list:
            
            war_num = part_df_copy['war_num'].values[0]

            print(str(war_num)[:-2] + ', ' + war_dic[war_num] + ': ' + str(int(participant)) + ', ' + c_code_dic[participant])
            
            ## this will overwrite the dyad_df_copy dataframe made just a few lines earlier.
            dyad_df_copy = deepcopy(dyad_df_for_missing_values[(dyad_df_for_missing_values['war_num']==war_num) & ((dyad_df_for_missing_values['c_code_a']==participant) | (dyad_df_for_missing_values['c_code_b']==participant))].reset_index(drop=True))
            dispute_numbers = list(dyad_df_copy['disno'].unique())
            dispute_number = dispute_numbers[0]
            
            ## should always be 1
            if len(dispute_numbers) > 1:
                print('Missing participant has more than 1 dispute.')
                print('Logic will need to be adjusted.')
                print('Total Disputes for Missing Participant: {}'.format(len(dispute_numbers)))
            
            mid_df_copy = deepcopy(mid_df[(mid_df['disno']==dispute_number) & ((mid_df['c_code_a']==participant) | (mid_df['c_code_b']==participant))].reset_index(drop=True))
            mid_df_copy.rename({'strtday': 'start_day',
                              'strtmnth': 'start_month',
                              'strtyr': 'start_year',
                              'endday': 'end_day',
                              'endmnth': 'end_month',
                              'endyear': 'end_year',
                             }, axis=1, inplace = True)
            mid_df_copy['war_num'] = war_num
            mid_df_copy = deepcopy(the_networks_of_war_python_functions.start_and_end_dates(mid_df_copy))
            aggregations = {
                'start_date': 'min',
                'end_date': 'max',
                'days_at_war': 'max',
                ## aggregation for estimation fields.
                ## this could fail for aggregations over multiple dates
                'start_date_estimated': 'max',
                'end_date_estimated': 'max'
                }
            mid_df_copy = deepcopy(mid_df_copy.groupby(['war_num', 'c_code_a', 'c_code_b', 'participant_a', 'participant_b']).agg(aggregations).reset_index())
            
            ## manually filling in values that are found in dyadic cow datasets but seem to be missing from country level sources.
            ## values have been obtained from dyadic data (directed_dyadic_war.csv' and dyadic MIDs 3.1.csv)
            df_length = deepcopy(len(part_df))
            
            part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
            
            ## the missing participant can sit on either side of the MID dyad; its own code and its
            ## opponent's code must be read from the SAME rows.
            ## (the b-side branch used to name the participant after its opponent and then read the
            ## opponent from the a-side rows, which are empty in that branch and raised an IndexError.)
            rows_as_a = mid_df_copy[mid_df_copy['c_code_a']==participant]
            rows_as_b = mid_df_copy[mid_df_copy['c_code_b']==participant]
            if len(rows_as_a) > 0:
                participant_c_code = rows_as_a['c_code_a'].values[0]
                opposite_participant = rows_as_a['c_code_b'].values[0]
            else:
                participant_c_code = rows_as_b['c_code_b'].values[0]
                opposite_participant = rows_as_b['c_code_a'].values[0]

            part_df.loc[df_length, 'c_code'] = participant_c_code
            ## assuming it's in c_code_dic
            ## otherwise, this breaks!
            part_df.loc[df_length, 'participant'] = c_code_dic[participant_c_code]
             
            part_df.loc[df_length, 'war_num'] = war_num
            part_df.loc[df_length, 'war_name'] = war_dic[war_num]
            part_df.loc[df_length, 'war_type'] = part_df_copy['war_type'].values[0]
            part_df.loc[df_length, 'war_type_code'] = part_df_copy['war_type_code'].values[0]
            part_df.loc[df_length, 'war_sub_type'] = part_df_copy['war_sub_type'].values[0]
            
            part_df.loc[df_length, 'side'] = opposing_side_dic[part_df_copy[part_df_copy['c_code']==opposite_participant]['side'].values[0]]
            part_df.loc[df_length, 'battle_deaths'] = max(list(dyad_df_copy[dyad_df_copy['c_code_a']==participant]['battle_deaths_a']) + list(dyad_df_copy[dyad_df_copy['c_code_b']==participant]['battle_deaths_b']))
            
            part_df.loc[df_length, 'start_date'] = mid_df_copy['start_date'].values[0]
            part_df.loc[df_length, 'start_year'] = float(str(mid_df_copy['start_date'].values[0])[0:4])
            part_df.loc[df_length, 'end_date'] = pd.to_datetime(str(mid_df_copy['end_date'].values[0])[0:11])
            part_df.loc[df_length, 'end_year'] = float(str(mid_df_copy['end_date'].values[0])[0:4])
            part_df.loc[df_length, 'days_at_war'] = float(mid_df_copy['days_at_war'].values[0])
            part_df.loc[df_length, 'start_date_estimated'] = float(mid_df_copy['start_date_estimated'].values[0])
            part_df.loc[df_length, 'end_date_estimated'] = float(mid_df_copy['end_date_estimated'].values[0])
            ## prevent duplication in for loop
            ## this may be an issue if they fought with more than one country
            participant_list.append(participant)
        
part_df['war_num'] = part_df['war_num'].astype(float)
part_df['start_year'] = part_df['start_year'].astype(float)
part_df['end_year'] = part_df['end_year'].astype(float)

participants_added = deepcopy(len(part_df) - original_part_df_length)
print('\nTotal Participants Added from Dyadic Data: {}'.format(format(participants_added, ',d')))

Participants Added from Dyadic Data:

108, Latvian Liberation: 290, Poland
Total Rows With Both Dates Found: 2
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0

108, Latvian Liberation: 200, United Kingdom
Total Rows With Both Dates Found: 6
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0

108, Latvian Liberation: 220, France
Total Rows With Both Dates Found: 4
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0

139, World War II: 800, Thailand
Total Rows With Both Dates Found: 24
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0

139, World War II: 230, Spain
Total Rows With Both Dates Found: 8
Total Rows With At Least One Date Not Found: 0
Total Estimated Start Dates: 0
Total Estimated End Dates: 0

151, Korean: 920, New Zealand
Total Rows With Both Dates Fo

In [24]:
# battle deaths that are missing from the source data and were filled in manually from wikipedia.
# each entry: (war_num, participant, battle_deaths, message printed when the value is applied)
#
# thailand in WWII:
# https://en.wikipedia.org/wiki/Thailand_in_World_War_II#:~:text=Thailand%20suffered%20about%205%2C569%20military,the%20brief%20Franco%2DThai%20War.
# greece in Turco Cypriot:
# https://en.wikipedia.org/wiki/Turkish_invasion_of_Cyprus#:~:text=The%20violence%20resulted%20in%20the,of%2025%2C000%E2%80%9330%2C000%20Turkish%20Cypriots.
manual_battle_deaths = [
    (139, 'Thailand', 5569, 'Manually adding battle deaths from Wikipedia for Thailand during WWII.'),
    (184, 'Greece', 105, 'Manually adding battle deaths from Wikipedia for Greece during Turco Cypriot.'),
]

for manual_war_num, manual_participant, manual_deaths, manual_message in manual_battle_deaths:
    print(manual_message)
    is_manual_row = (part_df['war_num'] == manual_war_num) & (part_df['participant'] == manual_participant)
    part_df.loc[is_manual_row, 'battle_deaths'] = manual_deaths

Manually adding battle deaths from Wikipedia for Thailand during WWII.
Manually adding battle deaths from Wikipedia for Greece during Turco Cypriot.


In [25]:
# # check for all values that were just added
# # making sure no fields are null that shouldn't be null
# part_df.tail(len(part_df)-original_part_df_length)

## Defining War DataFrame (One row for each war)

In [26]:
## part_df_copy will be used to calculate war_df.
## it is an independent copy, so nothing below modifies part_df itself.
part_df_copy = deepcopy(part_df)
part_df_copy = part_df_copy.rename({'participant': 'total_participants',
                                    'ongoing_participation': 'ongoing_war'}, axis=1)

## filling these dates in arbitrarily before taking aggregates
## high date for start_date because this will be min
## low date for end_date because this will be max
## (assigned back to the column: calling fillna(inplace=True) on a column selection
##  is a chained in-place operation that newer pandas versions warn about or ignore)
part_df_copy['start_date'] = part_df_copy['start_date'].fillna(pd.to_datetime('2100-01-01'))
part_df_copy['end_date'] = part_df_copy['end_date'].fillna(pd.to_datetime('1700-01-01'))


def _name_implies_ongoing(name):
    """Return True when a war name contains 'present' or 'ongoing' (case-insensitive)."""
    if not isinstance(name, str):
        return False
    lowered = name.lower()
    return 'present' in lowered or 'ongoing' in lowered


def _clean_war_name(name):
    """Drop a trailing ' of 1...' / ' of 2...' (year) suffix and one leading space.

    Everything from the first ' of 1' (or, failing that, ' of 2') onwards is removed,
    double spaces in the remainder are collapsed, and a single leading space is
    stripped. Non-string values are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    for year_marker in (' of 1', ' of 2'):
        if year_marker in name:
            name = name.split(year_marker)[0].replace('  ', ' ')
            break
    ## name[:1] instead of name[0] so an empty name does not raise
    if name[:1]==' ':
        name = name[1:]
    return name


## the ongoing check looks at the names as they are *before* cleaning
original_war_names = part_df_copy['war_name']

## wars that are not flagged as ongoing yet, but whose name says they are
named_as_ongoing = original_war_names.map(_name_implies_ongoing).astype(bool)
newly_ongoing = named_as_ongoing & (part_df_copy['ongoing_war']!=1)
wars_changed_to_ongoing = part_df_copy.loc[newly_ongoing, 'war_num'].tolist()
part_df_copy.loc[newly_ongoing, 'ongoing_war'] = 1

## cleaned names, plus a lookup of original name -> cleaned name for every name that changed
cleaned_war_names = original_war_names.map(_clean_war_name)
war_name_changes = {old_name: new_name
                    for old_name, new_name in zip(original_war_names, cleaned_war_names)
                    if isinstance(old_name, str) and old_name!=new_name}
part_df_copy['war_name'] = cleaned_war_names

print('Total Wars Manually Changed to Ongoing: {}'.format(format(len(list(set(wars_changed_to_ongoing))), ',d')))
print('Total Wars Marked as Ongoing: {}'.format(format(len(list(part_df_copy[part_df_copy['ongoing_war']==1]['war_num'].unique())), ',d')))
print('Total Wars With Name Adjustments: {}'.format(format(len(war_name_changes.keys()), ',d')))
# pprint(war_name_changes)

Total Wars Manually Changed to Ongoing: 5
Total Wars Marked as Ongoing: 12
Total Wars With Name Adjustments: 422


In [27]:
### Creating Dictionaries to Look Up Estimated Dates That Have Affected Days_At_War
## participant rows whose start date is flagged as estimated
start_is_estimated = part_df_copy['start_date_estimated']==1
estimated_start_df = deepcopy(part_df_copy.loc[start_is_estimated, ['war_num', 'start_date']])
## NOTE(review): dictionary_from_field is defined in the helper module; it presumably
## returns one start_date per war_num -- confirm which row wins when a war has several.
estimated_start_dates = the_networks_of_war_python_functions.dictionary_from_field(estimated_start_df, 'war_num', 'start_date')

## participant rows whose end date is flagged as estimated, leaving out ongoing wars
end_is_estimated = (part_df_copy['end_date_estimated']==1) & (part_df_copy['ongoing_war']==0)
estimated_end_df = deepcopy(part_df_copy.loc[end_is_estimated, ['war_num', 'end_date']])
estimated_end_dates = the_networks_of_war_python_functions.dictionary_from_field(estimated_end_df, 'war_num', 'end_date')

## no longer needed in part_df once the dictionaries are created
## (part_df_copy keeps its own copies of these columns)
## however, they could still potentially have been used to generate days_at_war_by_participant
## errors='ignore' keeps this cell safe to re-run after the columns are already gone
part_df.drop(['start_date_estimated',
              'end_date_estimated'], axis=1, inplace=True, errors='ignore')
## needed to create war_df (from part_df_copy) but not in part_df itself
part_df.drop(['lagging_war',
              'leading_war'], axis=1, inplace=True, errors='ignore')

print('Total Wars with Estimated Start Dates: {}'.format(len(list(estimated_start_df['war_num'].unique()))))
print('Total (Non-Ongoing) Wars with Estimated End Dates: {}'.format(len(list(estimated_end_df['war_num'].unique()))))

Total Wars with Estimated Start Dates: 109
Total (Non-Ongoing) Wars with Estimated End Dates: 95


In [28]:
## how each participant-level column collapses into one value per war
aggregations = {
    'total_participants': 'count',
    'start_year': 'min',
    'end_year': 'max',
    ## this will not be accurate if there are more than one lagging/leading wars per war.
    'lagging_war': 'min',
    'leading_war': 'max',
    'ongoing_war': 'max',
    'start_date': 'min',
    'end_date': 'max'
    ## not sure how to add this one just yet
#     'total_deaths_both_sides': 'max'
    }

## one row per war
## NOTE(review): groupby leaves out rows with a null in any of these key columns by
## default -- confirm none of the keys can be null, otherwise those wars are lost here.
war_key_columns = ['war_num',
                   'war_name',
                   'war_type_code',
                   'war_type',
                   'war_sub_type']
war_df = deepcopy(part_df_copy.groupby(war_key_columns).agg(aggregations).reset_index())

## putting these back to none in case they made it through the aggregation
## (they are the placeholder dates that stood in for missing start/end dates)
war_df.loc[war_df['start_date']==pd.to_datetime('2100-01-01'), 'start_date'] = None
war_df.loc[war_df['end_date']==pd.to_datetime('1700-01-01'), 'end_date'] = None

## inclusive number of days between start and end (same-day war counts as 1).
## computed for the whole column at once; wars missing either date get a null
## instead of relying on a bare except around per-row string parsing.
war_duration = pd.to_datetime(war_df['end_date']) - pd.to_datetime(war_df['start_date'])
war_df['total_days_in_war'] = war_duration.dt.days + 1

## most recent wars first; ongoing wars ahead of finished ones within the same start year
war_df = deepcopy(war_df.sort_values(by=['start_year', 'ongoing_war', 'end_year', 'start_date', 'war_name'],
                                     ascending=[False, False, False, False, True]))

print('Total Wars with Null Start Years: {}'.format(format(len(war_df[war_df['start_year'].isnull()]), ',d')))
print('Total (Non-Ongoing) Wars with Null End Years: {}'.format(format(len(war_df[(war_df['end_year'].isnull()) & (war_df['ongoing_war']==0)]), ',d')))
print('Total Ongoing Wars: {}'.format(format(len(war_df[war_df['ongoing_war']==1]), ',d')))


Total Wars with Null Start Years: 0
Total (Non-Ongoing) Wars with Null End Years: 0
Total Ongoing Wars: 12


In [29]:
print('Evaluating for each war, whether the start/end date is based on an estimation.')


def _estimated_flags(war_nums, war_dates, estimated_dates):
    """Return one 0/1 flag per war: 1 when the war's date equals its estimated date.

    war_nums and war_dates are walked in parallel (row order). A war is flagged only
    if its war_num is a key of estimated_dates AND the stored date equals the war's
    date, i.e. the estimated participant date is the one that ended up as the war's
    date. A missing war date never compares equal, so it is flagged 0.
    """
    return [1 if war_num in estimated_dates and estimated_dates[war_num]==war_date else 0
            for war_num, war_date in zip(war_nums, war_dates)]


## assigning whole columns (instead of setting cells inside a loop) means:
##  - both columns always exist, even when no war matches
##  - the result does not depend on war_df's index labels, which are no longer in
##    row order after the sort in the previous step
##  - re-running the cell cannot leave stale flags behind
## the flags are stored as integers 0/1.
war_df['start_date_estimated'] = _estimated_flags(war_df['war_num'], war_df['start_date'], estimated_start_dates)
war_df['end_date_estimated'] = _estimated_flags(war_df['war_num'], war_df['end_date'], estimated_end_dates)

print("Total Estimated Start Dates: {}".format(format(len(war_df[war_df['start_date_estimated']==1]), ',d')))
print("Total Estimated End Dates: {}".format(format(len(war_df[war_df['end_date_estimated']==1]), ',d')))

Evaluating for each war, whether the start/end date is based on an estimation.
Total Estimated Start Dates: 88
Total Estimated End Dates: 90


In [30]:
## the three finished dataframes: (summary line, dataframe, pickle file name)
initial_outputs = (
    ('Total Participants: {}', part_df, 'initial_part_df.pkl'),
    ('Total Dyadic Combinations: {}', dyad_df, 'initial_dyad_df.pkl'),
    ('Total Wars: {}', war_df, 'initial_war_df.pkl'),
)

## row counts first, with thousands separators
for summary_line, output_df, _ in initial_outputs:
    print(summary_line.format(format(len(output_df), ',d')))

pickle_directory = '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/pickles/'

## then each dataframe is written to its own pickle file in that directory
for _, output_df, pickle_name in initial_outputs:
    output_df.to_pickle(pickle_directory + pickle_name)

Total Participants: 1,713
Total Dyadic Combinations: 3,800
Total Wars: 678
