In [19]:
import pandas as pd
import numpy as np
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
from copy import deepcopy
from traceback import format_exc
# from pprint import pprint
import the_networks_of_war_python_functions

In [20]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [21]:
csv_directory = '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/csvs/'

In [22]:
monthly_max_df = deepcopy(pd.read_csv('monthly_max_days.csv'))
war_type_df = deepcopy(pd.read_csv('war_types.csv'))

## Setup for Identifying Countries by Code
### Note: This is helpful for when different names are used for the same country.

In [23]:
# c_code_dic = the_networks_of_war_python_functions.define_c_code_dic()

In [24]:
c_code_df = pd.read_csv(csv_directory + 'COW country codes.csv', encoding='latin-1')

query_text = """

select
    ccode as c_code,
    statenme as state_name,
    stateabb as state_name_abbreviation
from c_code_df
group by 1, 2, 3

"""

c_code_df = deepcopy(pysqldf(query_text))

In [25]:
part_query_template = """
    
select
    war_num,
    war_name,
    war_type_code,
    war_type,
    war_subtype,
    C_CODE as c_code,
    PARTICIPANT as participant,
    case when min(SIDE) = 1 and max(SIDE) = 2 then 3
        else max(SIDE) end as side,
    sum(BATTLE_DEATHS) as battle_deaths,
    cast(strftime('%Y', min(start_date)) as integer) as start_year,
    cast(strftime('%m', min(start_date)) as integer) as start_month,
    cast(strftime('%d', min(start_date)) as integer) as start_day,
    min(start_date) as start_date,
    cast(strftime('%Y', max(end_date)) as integer) as end_year,
    cast(strftime('%m', max(end_date)) as integer) as end_month,
    cast(strftime('%d', max(end_date)) as integer) as end_day,
    max(end_date) as end_date,
    max(ongoing_participation) as ongoing_participation,
    max(start_date_estimated) as start_date_estimated,
    max(end_date_estimated) as end_date_estimated,
    max(LAGGING_WAR) as lagging_war,
    max(LEADING_WAR) as leading_war,
    sum(TOTAL_DEATHS_BOTH_SIDES) as total_deaths_both_sides,
    sum(TOTAL_DAYS_IN_WAR) as total_days_in_war,
    sum(PEAK_FORCES_AVAILABLE) as peak_forces_available,
    sum(PEAK_BATTLE_FORCES) as peak_battle_forces
from SOURCE_TABLE
group by 1, 2, 3, 4, 5, 6, 7

"""

dyad_join_query_template = """

select
    cast(a.warnum as integer) as war_num,
    a.warname as war_name,
    cast(a.wartype as integer) as war_type_code,
    wt.war_type,
    coalesce(wt.war_subtype, wt.war_type) as war_subtype,
    a.C_CODE_A as c_code_a,
    a.C_CODE_B as c_code_b,
    coalesce(cca.state_name, a.PARTICIPANT_A) as participant_a,
    coalesce(ccb.state_name, a.PARTICIPANT_B) as participant_b,
    0 as side_a,
    0 as side_b,
    max(coalesce(a.BATTLE_DEATHS_A, 0), 0) as battle_deaths_a,
    max(coalesce(a.BATTLE_DEATHS_B, 0), 0) as battle_deaths_b,
    max(coalesce(a.START_YEAR, 1), 1) as start_year,
    max(coalesce(a.START_MONTH, 1), 1) as start_month,
    max(coalesce(a.START_DAY, 1), 1) as start_day,
    case when coalesce(a.END_YEAR, -1) < 0 then cast(strftime('%Y', date('now')) as integer)
        else a.END_YEAR end as end_year,
    case when coalesce(a.END_YEAR, -1) < 0 then cast(strftime('%m', date('now')) as integer)
    when a.END_YEAR > 0 and a.END_MONTH < 0 then 12
        else a.END_MONTH end as end_month,
    case when coalesce(a.END_YEAR, -1) < 0 then cast(strftime('%d', date('now')) as integer)
    when a.END_MONTH > 0 and a.END_DAY < 0 then mm.max_days
    when a.END_MONTH < 0 and a.END_DAY < 0 then 31
        else a.END_DAY end as end_day,
    case when coalesce(a.END_YEAR, -1) < 0 then 1
        else 0 end as ongoing_participation,
    case when coalesce(a.START_YEAR, -1) < 0 or coalesce(a.START_MONTH, -1) < 0 or coalesce(a.START_DAY, -1) < 0 then 1
        else 0 end as start_date_estimated,
    case when coalesce(a.END_YEAR, -1) < 0 or coalesce(a.END_MONTH, -1) < 0 or coalesce(a.END_DAY, -1) < 0 then 1
        else 0 end as end_date_estimated,
    case when a.transfrom >= 0 then a.transfrom else null end as lagging_war,
    case when a.transto >= 0 then a.transto else null end as leading_war,
    max(coalesce(a.totalbdeaths, 0), 0) as total_deaths_both_sides,
    a.wduratdays as total_days_in_war,
    max(coalesce(a.sideapeaktotforces, 0), 0) as peak_forces_available_a,
    max(coalesce(a.sidebpeaktotforces, 0), 0) as peak_forces_available_b,
    max(coalesce(a.sideapeak_theatforces, 0), 0) as peak_battle_forces_a,
    max(coalesce(a.sidebpeaktheatforces, 0), 0) as peak_battle_forces_b
from SOURCE_TABLE a
left join c_code_df cca on a.C_CODE_A = cca.c_code
left join c_code_df ccb on a.C_CODE_B = ccb.c_code
left join war_type_df wt on a.wartype = wt.war_type_code
left join monthly_max_df mm on case when coalesce(a.END_YEAR, -1) < 0 then cast(strftime('%m', date('now')) as integer) when a.END_YEAR > 0 and a.END_MONTH < 0 then 12 else a.END_MONTH end = mm.month
group by 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20

"""

dyad_query_template = """
    
select
    war_num,
    war_name,
    war_type_code,
    war_type,
    war_subtype,
    c_code_a,
    c_code_b,
    participant_a,
    participant_b,
    case when min(side_a) = 1 and max(side_a) = 2 then 3
        else max(side_a) end as side_a,
    case when min(side_b) = 1 and max(side_b) = 2 then 3
        else max(side_b) end as side_b,
    sum(battle_deaths_a) as battle_deaths_a,
    sum(battle_deaths_b) as battle_deaths_b,
    cast(strftime('%Y', min(start_date)) as integer) as start_year,
    cast(strftime('%m', min(start_date)) as integer) as start_month,
    cast(strftime('%d', min(start_date)) as integer) as start_day,
    min(start_date) as start_date,
    cast(strftime('%Y', max(end_date)) as integer) as end_year,
    cast(strftime('%m', max(end_date)) as integer) as end_month,
    cast(strftime('%d', max(end_date)) as integer) as end_day,
    max(end_date) as end_date,
    max(ongoing_participation) as ongoing_participation,
    max(start_date_estimated) as start_date_estimated,
    max(end_date_estimated) as end_date_estimated,
    max(lagging_war) as lagging_war,
    max(leading_war) as leading_war,
    sum(total_deaths_both_sides) as total_deaths_both_sides,
    sum(total_days_in_war) as total_days_in_war,
    sum(peak_forces_available_a) as peak_forces_available_a,
    sum(peak_forces_available_b) as peak_forces_available_b,
    sum(peak_battle_forces_a) as peak_battle_forces_a,
    sum(peak_battle_forces_b) as peak_battle_forces_b
from SOURCE_TABLE
group by 1, 2, 3, 4, 5, 6, 7, 8, 9

"""

## Participant DataFrames
### Note: Only inter-state wars have different source files for dyadic and participant data.

### Inter-State Wars

In [26]:
inter_part_df = pd.read_csv(csv_directory + 'Inter-StateWarData_v4.0.csv', encoding='latin-1')

query_text = """

select
    cast(a.warnum as integer) as war_num,
    a.warname as war_name,
    cast(a.wartype as integer) as war_type_code,
    wt.war_type,
    coalesce(wt.war_subtype, wt.war_type) as war_subtype,
    a.ccode as c_code,
    coalesce(cc.state_name, a.statename) as participant,
    a.side,
    max(coalesce(a.batdeath, 0), 0) as battle_deaths,
    max(coalesce(a.startyear1, 1), 1) as start_year,
    max(coalesce(a.startmonth1, 1), 1) as start_month,
    max(coalesce(a.startday1, 1), 1) as start_day,
    case when coalesce(a.endyear1, -1) < 0 then cast(strftime('%Y', date('now')) as integer)
        else a.endyear1 end as end_year,
    case when coalesce(a.endyear1, -1) < 0 then cast(strftime('%m', date('now')) as integer)
    when a.endyear1 > 0 and a.endmonth1 < 0 then 12
        else a.endmonth1 end as end_month,
    case when coalesce(a.endyear1, -1) < 0 then cast(strftime('%d', date('now')) as integer)
    when a.endmonth1 > 0 and a.endday1 < 0 then mm.max_days
    when a.endmonth1 < 0 and a.endday1 < 0 then 31
        else a.endday1 end as end_day,
    case when coalesce(a.endyear1, -1) < 0 then 1
        else 0 end as ongoing_participation,
    case when coalesce(a.startyear1, -1) < 0 or coalesce(a.startmonth1, -1) < 0 or coalesce(a.startday1, -1) < 0 then 1
        else 0 end as start_date_estimated,
    case when coalesce(a.endyear1, -1) < 0 or coalesce(a.endmonth1, -1) < 0 or coalesce(a.endday1, -1) < 0 then 1
        else 0 end as end_date_estimated
from inter_part_df a
left join c_code_df cc on a.ccode = cc.c_code
left join war_type_df wt on a.wartype = wt.war_type_code
left join monthly_max_df mm on case when coalesce(a.endyear1, -1) < 0 then cast(strftime('%m', date('now')) as integer) when a.endyear1 > 0 and a.endmonth1 < 0 then 12 else a.endmonth1 end = mm.month
group by 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18

"""

inter_part_df = deepcopy(pysqldf(query_text))

## fulfilling start and dates (in the same manner for all source tables).
inter_part_df['start_date'] = pd.to_datetime(inter_part_df['start_year'].astype(str) + '-' + inter_part_df['start_month'].astype(str) + '-' + inter_part_df['start_day'].astype(str)).dt.date
inter_part_df['end_date'] = pd.to_datetime(inter_part_df['end_year'].astype(str) + '-' + inter_part_df['end_month'].astype(str) + '-' + inter_part_df['end_day'].astype(str)).dt.date

template_replacement_dic = {
    
    'SOURCE_TABLE': 'inter_part_df',
    'C_CODE': 'c_code',
    'PARTICIPANT': 'participant',
    'BATTLE_DEATHS': 'battle_deaths',
    'LAGGING_WAR': 'null',
    'LEADING_WAR': 'null',
    'TOTAL_DEATHS_BOTH_SIDES': 'null',
    'TOTAL_DAYS_IN_WAR': 'null',
    'PEAK_FORCES_AVAILABLE': 'null',
    'PEAK_BATTLE_FORCES': 'null',
    'SIDE': 'side'
}

query_text = deepcopy(part_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
inter_part_df = deepcopy(pysqldf(query_text))

print('Manually changing USSR to side 3 for WWII based on dyadic data.')
## manually changing USSR to side 3 for WWII because they invaded Poland before fighting against Germany.
inter_part_df.loc[(inter_part_df['war_num']==139) & (inter_part_df['c_code']==365), 'side'] = 3

print('Manually changing Finland to side 3 for WWII based on dyadic data.')
## manually changing Finald to side 3 for WWII because they fought with Germany before Fighting against Germany.
inter_part_df.loc[(inter_part_df['war_num']==139) & (inter_part_df['c_code']==375), 'side'] = 3

Manually changing USSR to side 3 for WWII based on dyadic data.
Manually changing Finland to side 3 for WWII based on dyadic data.


### Intra-State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [9]:
## creating new dataframe for intra-state wars to feed into both participant and dyadic tables.
intra_dyad_df = deepcopy(pd.read_csv(csv_directory + 'INTRA-STATE_State_participants v5.1.csv', encoding='latin-1'))
intra_dyad_df.rename({'Deaths A': 'deaths_a', 'SideAPeak TheatForces': 'SideAPeak_TheatForces', 'Deaths B': 'deaths_b'}, axis=1, inplace=True)

## filling in null wartype from other values in the dataset
intra_dyad_df.loc[intra_dyad_df['WarNum']==762, 'WarType'] = 4

template_replacement_dic = {
    
    'SOURCE_TABLE': 'intra_dyad_df',
    'C_CODE_A': 'ccodea',
    'C_CODE_B': 'ccodeb',
    'PARTICIPANT_A': 'sidea',
    'PARTICIPANT_B': 'sideb',
    '0 as side_a': '1 as side_a',
    '0 as side_b': '2 as side_b',
    'BATTLE_DEATHS_A': 'deaths_a',
    'BATTLE_DEATHS_B': 'deaths_b',
    'START_YEAR': 'startyr1',
    'START_MONTH': 'startmo1',
    'START_DAY': 'startdy1',
    'END_YEAR': 'endyr1',
    'END_MONTH': 'endmo1',
    'END_DAY': 'enddy1'
}

query_text = deepcopy(dyad_join_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
intra_dyad_union_df_1 = deepcopy(pysqldf(query_text))

template_replacement_dic = {
    
    'SOURCE_TABLE': 'intra_dyad_df',
    'C_CODE_A': 'ccodeb',
    'C_CODE_B': 'ccodea',
    'PARTICIPANT_A': 'sideb',
    'PARTICIPANT_B': 'sidea',
    '0 as side_a': '2 as side_a',
    '0 as side_b': '1 as side_b',
    'BATTLE_DEATHS_A': 'deaths_b',
    'BATTLE_DEATHS_B': 'deaths_a',
    'START_YEAR': 'startyr1',
    'START_MONTH': 'startmo1',
    'START_DAY': 'startdy1',
    'END_YEAR': 'endyr1',
    'END_MONTH': 'endmo1',
    'END_DAY': 'enddy1'
}

query_text = deepcopy(dyad_join_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
intra_dyad_union_df_2 = deepcopy(pysqldf(query_text))

query_text = """

select
    *
from intra_dyad_union_df_1
union
select
    *
from intra_dyad_union_df_2

"""

intra_dyad_df = deepcopy(pysqldf(query_text))

## adjusting for wrong start year
## this needs to be automated (check for  'of ___' in war_name where start_year <> ___)
intra_dyad_df.loc[intra_dyad_df['war_num']==976, 'start_year'] = 2011

## fixing for leap year issues below caused by date being filled in as final day for month.
intra_dyad_df.loc[(intra_dyad_df['start_day']==29) & (intra_dyad_df['start_month']==2) & (intra_dyad_df['start_year']==1894), 'start_day'] = 28
intra_dyad_df.loc[(intra_dyad_df['end_day']==29) & (intra_dyad_df['end_month']==2) & (intra_dyad_df['end_year']==1865), 'end_day'] = 28
intra_dyad_df.loc[(intra_dyad_df['end_day']==29) & (intra_dyad_df['end_month']==2) & (intra_dyad_df['end_year']==1867), 'end_day'] = 28
intra_dyad_df.loc[(intra_dyad_df['end_day']==29) & (intra_dyad_df['end_month']==2) & (intra_dyad_df['end_year']==1910), 'end_day'] = 28
intra_dyad_df.loc[(intra_dyad_df['end_day']==29) & (intra_dyad_df['end_month']==2) & (intra_dyad_df['end_year']==1934), 'end_day'] = 28
intra_dyad_df.loc[(intra_dyad_df['end_day']==29) & (intra_dyad_df['end_month']==2) & (intra_dyad_df['end_year']==1979), 'end_day'] = 28

## two syrian arab spring wars starting on the same date.
## I believe one of these is a data entry error.
## combining both war_nums into one war
intra_dyad_df.loc[intra_dyad_df['war_num']==977, 'war_num'] = 979

## fulfilling start and dates (in the same manner for all source tables).
intra_dyad_df['start_date'] = pd.to_datetime(intra_dyad_df['start_year'].astype(str) + '-' + intra_dyad_df['start_month'].astype(str) + '-' + intra_dyad_df['start_day'].astype(str)).dt.date
intra_dyad_df['end_date'] = pd.to_datetime(intra_dyad_df['end_year'].astype(str) + '-' + intra_dyad_df['end_month'].astype(str) + '-' + intra_dyad_df['end_day'].astype(str)).dt.date

## aggregation below needed for multiple rows for same dyad.
query_text = deepcopy(dyad_query_template.replace('SOURCE_TABLE', 'intra_dyad_df'))
intra_dyad_df = deepcopy(pysqldf(query_text))


### Extra State Wars
#### Using the same file to create the participant dataframe and dyadic dataframe.

In [12]:
## creating new dataframe for extra-state wars to feed into both participant and dayadic tables.
extra_dyad_df = pd.read_csv(csv_directory + 'Extra-StateWarData_v4.0.csv', encoding='latin-1')

template_replacement_dic = {
    
    'SOURCE_TABLE': 'extra_dyad_df',
    'C_CODE_A': 'ccode1',
    'C_CODE_B': 'ccode2',
    'PARTICIPANT_A': 'sidea',
    'PARTICIPANT_B': 'sideb',
    '0 as side_a': '1 as side_a',
    '0 as side_b': '2 as side_b',
    'BATTLE_DEATHS_A': 'batdeath',
    'BATTLE_DEATHS_B': 'nonstatedeaths',
    'START_YEAR': 'startyear1',
    'START_MONTH': 'startmonth1',
    'START_DAY': 'startday1',
    'END_YEAR': 'endyear1',
    'END_MONTH': 'endmonth1',
    'END_DAY': 'endday1',
    'a.totalbdeaths': 'null',
    'a.wduratdays': 'null',
    'a.sideapeaktotforces': 'null',
    'a.sidebpeaktotforces': 'null',
    'a.sideapeak_theatforces': 'null',
    'a.sidebpeaktheatforces': 'null'
}

query_text = deepcopy(dyad_join_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
extra_dyad_union_df_1 = deepcopy(pysqldf(query_text))

template_replacement_dic = {
    
    'SOURCE_TABLE': 'extra_dyad_df',
    'C_CODE_A': 'ccode1',
    'C_CODE_B': 'ccode2',
    'PARTICIPANT_A': 'sideb',
    'PARTICIPANT_B': 'sidea',
    '0 as side_a': '2 as side_a',
    '0 as side_b': '1 as side_b',
    'BATTLE_DEATHS_A': 'nonstatedeaths',
    'BATTLE_DEATHS_B': 'batdeath',
    'START_YEAR': 'startyear1',
    'START_MONTH': 'startmonth1',
    'START_DAY': 'startday1',
    'END_YEAR': 'endyear1',
    'END_MONTH': 'endmonth1',
    'END_DAY': 'endday1',
    'a.totalbdeaths': 'null',
    'a.wduratdays': 'null',
    'a.sideapeaktotforces': 'null',
    'a.sidebpeaktotforces': 'null',
    'a.sideapeak_theatforces': 'null',
    'a.sidebpeaktheatforces': 'null'
}

query_text = deepcopy(dyad_join_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
extra_dyad_union_df_2 = deepcopy(pysqldf(query_text))

query_text = """

select
    *
from extra_dyad_union_df_1
union
select
    *
from extra_dyad_union_df_2

"""

extra_dyad_df = deepcopy(pysqldf(query_text))

## fixing for leap year issues below caused by date being filled in as final day for month.
extra_dyad_df.loc[(extra_dyad_df['end_day']==29) & (extra_dyad_df['end_month']==2) & (extra_dyad_df['end_year']==1894), 'end_day'] = 28
extra_dyad_df.loc[(extra_dyad_df['end_day']==29) & (extra_dyad_df['end_month']==2) & (extra_dyad_df['end_year']==1922), 'end_day'] = 28

## fulfilling start and dates (in the same manner for all source tables).
extra_dyad_df['start_date'] = pd.to_datetime(extra_dyad_df['start_year'].astype(str) + '-' + extra_dyad_df['start_month'].astype(str) + '-' + extra_dyad_df['start_day'].astype(str)).dt.date
extra_dyad_df['end_date'] = pd.to_datetime(extra_dyad_df['end_year'].astype(str) + '-' + extra_dyad_df['end_month'].astype(str) + '-' + extra_dyad_df['end_day'].astype(str)).dt.date

## aggregation below needed for multiple rows for same dyad.
## aggregation below needed for multiple rows for same dyad.
query_text = deepcopy(dyad_query_template.replace('SOURCE_TABLE', 'extra_dyad_df'))
extra_dyad_df = deepcopy(pysqldf(query_text))

In [27]:
template_replacement_dic = {
    
    'SOURCE_TABLE': 'intra_dyad_df',
    'C_CODE': 'c_code_a',
    'PARTICIPANT': 'participant_a',
    'BATTLE_DEATHS': 'battle_deaths_a',
    'LAGGING_WAR': 'lagging_war',
    'LEADING_WAR': 'leading_war',
    'TOTAL_DEATHS_BOTH_SIDES': 'total_deaths_both_sides',
    'TOTAL_DAYS_IN_WAR': 'total_days_in_war',
    'PEAK_FORCES_AVAILABLE': 'peak_forces_available_a',
    'PEAK_BATTLE_FORCES': 'peak_battle_forces_a',
    'SIDE': 'side_a'
}

query_text = deepcopy(part_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
intra_part_df = deepcopy(pysqldf(query_text))

template_replacement_dic['SOURCE_TABLE'] = 'extra_dyad_df'
query_text = deepcopy(part_query_template)
for replacement in template_replacement_dic.keys():
    query_text = deepcopy(query_text.replace(replacement, template_replacement_dic[replacement]))
extra_part_df = deepcopy(pysqldf(query_text))

query_text = """

select
    *
from inter_part_df
union
select
    *
from intra_part_df
where
    participant != '-8'
union
select
    *
from extra_part_df
where
    participant != '-8'

"""

part_df = deepcopy(pysqldf(query_text))

In [28]:
query_text = """

select
    war_num,
    war_name,
    war_type_code,
    war_type,
    war_subtype,
    min(start_year) as start_year,
    max(end_year) as end_year,
    max(ongoing_participation) as ongoing_participation,
    count(distinct participant) as participants,
    count(distinct case when c_code > 0 then c_code else null end) as state_participants
from part_df
group by 1, 2, 3, 4, 5

"""

war_df = deepcopy(pysqldf(query_text))

In [29]:
pd.set_option('display.max_rows', 1000)

query_text = """

select
    count(case when start_date is null then 1 else null end) as null_start_date,
    count(case when end_date is null then 1 else null end) as null_end_date,
    sum(start_date_estimated) as start_date_estimated,
    sum(end_date_estimated) as end_date_estimated,
    sum(ongoing_participation) as ongoing_participation,
    min(start_year) as min_start_year,
    min(end_year) as min_end_year,
    max(start_year) as max_start_year,
    max(end_year) as max_end_year,
    count(*) as war_participants
from part_df

"""

# print(part_df.columns)
pysqldf(query_text)

Unnamed: 0,null_start_date,null_end_date,start_date_estimated,end_date_estimated,ongoing_participation,min_start_year,min_end_year,max_start_year,max_end_year,war_participants
0,0,0,178,209,47,1816,1816,2014,2021,1542


In [18]:
pd.set_option('display.max_rows', 1000)

query_text = """

select
    war_num, war_name, c_code, participant, count(*)
from part_df
group by 1, 2, 3, 4
order by 5 desc

"""

# print(part_df.columns)
pysqldf(query_text)

Unnamed: 0,war_num,war_name,c_code,participant,count(*)
0,1,Franco-Spanish War,220,France,1
1,1,Franco-Spanish War,230,Spain,1
2,4,First Russo-Turkish,365,Russia,1
3,4,First Russo-Turkish,640,Turkey,1
4,7,Mexican-American,2,United States of America,1
...,...,...,...,...,...
1537,993,Donbas War of 2014-present,369,Ukraine,1
1538,994,Second Libyan Civil War of 2014-present,-8,Libyan Dawn,1
1539,994,Second Libyan Civil War of 2014-present,620,Libya,1
1540,997,Rada'a War of 2014-present,-8,AQAP,1


### Combining Participant Sources

In [None]:
# part_df = deepcopy(pd.concat([part_df_1, part_df_2], sort=True, ignore_index=True).reset_index(drop=True))
# part_df = deepcopy(pd.concat([part_df, part_df_3], sort=True, ignore_index=True).reset_index(drop=True))

# ## one more check whether years were inputted correctly
# part_df = deepcopy(the_networks_of_war_python_functions.final_date_formatting(part_df))

# ## keeping only essential columns
# part_df = deepcopy(part_df[['war_num',
#                             'war_name',
#                             'war_type',
#                             'c_code',
#                             'participant',
#                             'side',
#                             'battle_deaths',
#                             'start_date',
#                             'start_year',
#                             'end_date',
#                             'end_year',
#                             'days_at_war',
#                             'lagging_war',
#                             'leading_war',
#                             'ongoing_participation',
#                             'total_deaths_both_sides',
#                             'peak_forces_available',
#                             'peak_battle_forces',
#                             'start_date_estimated',
#                             'end_date_estimated']])

# print('Total War Participants After Merging All War Types: {}'.format(format(len(part_df), ',d')))

In [None]:
# part_df.rename({'war_type': 'war_type_code'}, axis=1, inplace=True)

# ## two lists, one for war_types and one for war_sub_types.
# ## the indexes for each must line up.
# war_types = ['Inter-State War',
#              'Extra-State War',
#              'Extra-State War',
#              'Intra-State War',
#              'Intra-State War',
#              'Intra-State War',
#              'Intra-State War',
#              'Non-State War',
#              'Non-State War']

# war_sub_types = ['',
#                  'Colonial (conflict with colony)',
#                  'Imperial (state vs non-state)',
#                  'Civil War (for central control)',
#                  'Civil War (over local issues)',
#                  'Regional/Internal',
#                  'Intercommunal',
#                  'In Non-State Territory',
#                  'Across State Borders']

# for i, war_type in enumerate(war_types):
#     ## specifying the subtypes of each war based on the documentation
#     part_df.loc[part_df['war_type_code']==i+1, 'war_type'] = war_types[i]
#     part_df.loc[part_df['war_type_code']==i+1, 'war_sub_type'] = war_sub_types[i]

In [None]:
# remaining_participant_fields = list(part_df.drop(['war_num',
#                                                   'war_name',
#                                                   'war_type_code',
#                                                   'war_type',
#                                                   'war_sub_type',
#                                                   'c_code',
#                                                   'participant',
#                                                   'side',
#                                                   'start_year',
#                                                   'end_year',
#                                                   'start_date',
#                                                   'end_date',
#                                                   'days_at_war',
#                                                   'start_date_estimated',
#                                                   'end_date_estimated'], axis=1).columns)
# ## filling in non-applicable values with None
# part_df = deepcopy(the_networks_of_war_python_functions.remaining_participant_null_values(part_df, remaining_participant_fields))

### Creating Dictionary to Lookup Wars Individually

In [None]:
## creating war_dic that will be used to fill in missing values
war_dic = the_networks_of_war_python_functions.dictionary_from_field(part_df, 'war_num', 'war_name')

## Creating Inter-State Dyadic DataFrame

### Note: The other dyadic dataframes have already been defined above.

### Reading in the data and unioning each participant so they both appear as a and b

In [None]:
## battle deaths and start/end dates are in this file too, but it's more confusing than the part_df.
## this will just be used to get the combinations of countries directly at war with each other.
dyad_df_1 = pd.read_csv(csv_directory + 'directed_dyadic_war.csv', encoding='latin-1')

## including columns that need to be included later on (that don't need name changes)
dyad_df_1_renaming = {'warnum': 'war_num',
                      'statea': 'c_code_a',
                      'stateb': 'c_code_b',
                      'batdtha': 'battle_deaths_a',
                      'batdthb': 'battle_deaths_b',
                      'batdths': 'total_battle_deaths',
                      'year': 'year',
                      'disno': 'disno'}

dyad_df_1.rename(dyad_df_1_renaming, axis=1, inplace = True)
dyad_df_1 = deepcopy(dyad_df_1[list(dyad_df_1_renaming.values())])

for i, c_code_a in enumerate(dyad_df_1['c_code_a']):
    dyad_df_1.loc[i, 'participant_a'] = c_code_dic[c_code_a]
    dyad_df_1.loc[i, 'participant_b'] = c_code_dic[dyad_df_1.loc[i, 'c_code_b']] 
    
dyad_df_1 = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(dyad_df_1))

In [None]:
# # ## fixing data entry error
# dyad_df.loc[dyad_df['war_num']==106, 'war_end_year'] = 1918

### Checking for any missing dyads that can be extracted from MID data
### Adding in Dyads (Not Included) Marked as War==1 in MID Data

In [None]:
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding='latin-1')
## only including war conflicts
mid_df = deepcopy(mid_df[mid_df['war']==1])
mid_df.rename({'statea': 'c_code_a',
               'stateb': 'c_code_b'}, axis=1, inplace=True)
mid_df = deepcopy(mid_df[['c_code_a',
                          'c_code_b',
                          'year']])
mid_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(mid_df))
dyad_df_1 = deepcopy(pd.merge(dyad_df_1, mid_df, how='outer', on=['c_code_a', 'c_code_b', 'year']))

In [None]:
## these have been manually reviewed to all be WWII dyads
total_dyads_added = deepcopy(len(dyad_df_1[dyad_df_1['war_num'].isnull()]))

for i, war in enumerate(dyad_df_1['war_num']):
    if str(dyad_df_1.loc[i, 'war_num'])=='nan':
        ## these have been manually reviewed to all be WWII dyads
        dyad_df_1.loc[i, 'war_num'] = 139
        dyad_df_1.loc[i, 'participant_a'] = c_code_dic[dyad_df_1.loc[i, 'c_code_a']]
        dyad_df_1.loc[i, 'participant_b'] = c_code_dic[dyad_df_1.loc[i, 'c_code_b']]

## should be 8
if total_dyads_added!=8:
    raise Exception('There are usually 8 MIDs added here.')
    
print('Total Dyads Added From MIDs: {}'.format(format(total_dyads_added, ',d')))
print('Note: These have all been manually reviewed to be WWII dyads.')

In [None]:
dyad_df_1.tail(total_dyads_added)

## Combining Dyadic Sources
#### Second and third dyadic sources are defined above during processing for participant dataframes.

In [None]:
dyad_df = deepcopy(pd.concat([dyad_df_1, dyad_df_2], sort=True, ignore_index=True).reset_index(drop=True))
dyad_df = deepcopy(pd.concat([dyad_df, dyad_df_3], sort=True, ignore_index=True).reset_index(drop=True))

## saving this for process below
dyad_df_for_missing_values = deepcopy(dyad_df)

## dropping all columns that were only needed for adding missing values
dyad_df.drop(['disno',
              'battle_deaths_a',
              'battle_deaths_b',
              'total_battle_deaths'], axis=1, inplace=True)

## removing any duplicates that were present in the three dyadic sources
dyad_df.drop_duplicates(subset=list(dyad_df.columns), keep='first', inplace=True)
dyad_df = deepcopy(dyad_df.reset_index(drop=True))

print('Total Unique Dyads After Merging All War Types: {}'.format(format(int(len(dyad_df)/2), ',d')))

## Final Data Adjustments

### Adjusting participant names for part_df and dyad_df

In [None]:
part_df = deepcopy(the_networks_of_war_python_functions.adjustParticipantNames(part_df, 'participant'))
dyad_df = deepcopy(the_networks_of_war_python_functions.adjustParticipantNames(dyad_df, 'dyad'))

In [None]:
# print('Displaying all participant names.\n')
# sorted(list(set(list(part_df['participant'].unique()) + list(dyad_df['participant_a'].unique()) + list(dyad_df['participant_a'].unique()))))

## Addressing Missing Values from both Participant and Dyadic Data

### Adding in Missing Dyads for Wars with Only One Possible Adverary

In [None]:
## need to figure out a way to add dyadic data when it's missing.
## these are clear cases where it should be added because one side on the war is only one country.
## or, both sides are only one country
## it'll be trickier when each side isn't just one country.
## that will lead to a floating noad that isn't grounded in the network analysis graph

total_dyads = deepcopy(len(dyad_df))

for war_num in list(part_df['war_num'].unique()):
    
    part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
    total_side_1 = len(list(part_df_copy[part_df_copy['side']==1]['participant'].unique()))
    total_side_2 = len(list(part_df_copy[part_df_copy['side']==2]['participant'].unique()))
    ## non-state only
    total_side_1_non_state = len(list(part_df_copy[(part_df_copy['side']==1) & (part_df_copy['c_code']==-8)]['participant'].unique()))
    total_side_2_non_state = len(list(part_df_copy[(part_df_copy['side']==2) & (part_df_copy['c_code']==-8)]['participant'].unique()))

    ## state only
    total_side_1_state = len(list(part_df_copy[(part_df_copy['side']==1) & (part_df_copy['c_code']!=-8)]['c_code'].unique()))
    total_side_2_state = len(list(part_df_copy[(part_df_copy['side']==2) & (part_df_copy['c_code']!=-8)]['c_code'].unique()))
    
    if total_side_1==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 1, 'all_participants').reset_index(drop=True))
    elif total_side_2==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 2, 'all_participants').reset_index(drop=True))
    ## if the above processes fail, at the very least we can link all opposing parties to the non-state participant on the other side
    ## if there is only on non-state participant on the other side, we know that all opposing forces fought against them
    ## the thought here is that if there is only one non-state participant on a particular side, we can assume all members of the other side fought against them because this isn't an inter-state war
    elif total_side_1_non_state==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 1, 'non-state').reset_index(drop=True))
    elif total_side_2_non_state==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 2, 'non-state').reset_index(drop=True))
    else:
        pass

    ## below is the case where only one state participant was on a particular side in a non-state war.
    ## assuming that this country fought with all opposing parties.
    ## filling in any networks with only one state participant on a particular side
    ## see Eritrea in Third Somalia War
    if total_side_1_state==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 1, 'state').reset_index(drop=True))
    elif total_side_2_state==1:
        dyad_df = deepcopy(the_networks_of_war_python_functions.add_missing_dyads(part_df_copy, dyad_df, war_num, 2, 'state').reset_index(drop=True))
    else:
        pass

dyads_added = deepcopy(len(dyad_df) - total_dyads)
print('Checking for floating participants with only one possible adversary, or one definite adversary.\n')
print('Total Dyads Added Overall: {}'.format(format(dyads_added, ',d')))
print('Total Particpants with Null Start Years: {}'.format(len(part_df[part_df['start_year'].isnull()])))
print('Total Particpants with Null End Years: {}'.format(len(part_df[part_df['end_year'].isnull()])))
print('Total Dyads with Null Years: {}'.format(len(dyad_df[dyad_df['year'].isnull()])))

total_dyads = deepcopy(len(dyad_df))
## unioning to obtain all combinations of dyads that were added
dyad_df = deepcopy(the_networks_of_war_python_functions.union_opposite_columns(dyad_df))
dyads_added = deepcopy(len(dyad_df) - total_dyads)
print('Total Dyads Unioned From Opposite Columns: {}'.format(format(dyads_added, ',d')))

### Adding in Missing Participants that Appear in Dyadic Data for War

In [None]:
print('Participants Added from Dyadic Data:\n')
war_list = list(dyad_df['war_num'].unique())

## filling in values below using MID data
mid_df = pd.read_csv(csv_directory + 'dyadic MIDs 3.1.csv', encoding='latin-1')
mid_df.rename({'statea': 'c_code_a',
               'stateb': 'c_code_b',
               'namea': 'participant_a',
               'nameb': 'participant_b'}, axis=1, inplace=True)

## giving this to the opposite side of the participant they fought against in the war (grabs first record)
## troublesome if they swithced sides but this would be very rare
opposing_side_dic = {1: 2,
                     2: 1,
                     3: 3}

original_part_df_length = deepcopy(len(part_df))
## the following checks for missing data across participant names and c_codes
## this is only possible (at the moment) for participants with c_codes, because the rest comes from dyadic data
for war_num in war_list:

    part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
    participant_list = list(part_df_copy['c_code'])
    
    dyad_df_copy = deepcopy(dyad_df[dyad_df['war_num']==war_num].reset_index(drop=True))
    dyad_list = list(set(list(dyad_df_copy['c_code_a']) + list(dyad_df_copy['c_code_b'])))
    
    try:
        dyad_list.remove(-8)
    except:
        pass
    for participant in dyad_list:
        if participant not in participant_list:
            
            war_num = part_df_copy['war_num'].values[0]

            print(str(war_num)[:-2] + ', ' + war_dic[war_num] + ': ' + str(int(participant)) + ', ' + c_code_dic[participant])
            
            ## this will overwrite the dyad_df_copy dataframe made just a few lines earlier.
            dyad_df_copy = deepcopy(dyad_df_for_missing_values[(dyad_df_for_missing_values['war_num']==war_num) & ((dyad_df_for_missing_values['c_code_a']==participant) | (dyad_df_for_missing_values['c_code_b']==participant))].reset_index(drop=True))
            dispute_numbers = list(dyad_df_copy['disno'].unique())
            dispute_number = dispute_numbers[0]
            
            ## should always be 1
            if len(dispute_numbers) > 1:
                print('Missing participant has more than 1 dispute.')
                print('Logic will need to be adjusted.')
                print('Total Disputes for Missing Participant: {}'.format(len(dispute_numbers)))
            
            mid_df_copy = deepcopy(mid_df[(mid_df['disno']==dispute_number) & ((mid_df['c_code_a']==participant) | (mid_df['c_code_b']==participant))].reset_index(drop=True))
            mid_df_copy.rename({'strtday': 'start_day',
                              'strtmnth': 'start_month',
                              'strtyr': 'start_year',
                              'endday': 'end_day',
                              'endmnth': 'end_month',
                              'endyear': 'end_year',
                             }, axis=1, inplace = True)
            mid_df_copy['war_num'] = war_num
            mid_df_copy = deepcopy(the_networks_of_war_python_functions.start_and_end_dates(mid_df_copy))
            aggregations = {
                'start_date': 'min',
                'end_date': 'max',
                'days_at_war': 'max',
                ## aggregation for estimation fields.
                ## this could fail for aggregations over multiple dates
                'start_date_estimated': 'max',
                'end_date_estimated': 'max'
                }
            mid_df_copy = deepcopy(mid_df_copy.groupby(['war_num', 'c_code_a', 'c_code_b', 'participant_a', 'participant_b']).agg(aggregations).reset_index())
            
            ## manually filling in values that are found in dyadic cow datasets but seem to be missing from country level sources.
            ## values have been obtained from dyadic data (directed_dyadic_war.csv' and dyadic MIDs 3.1.csv)
            df_length = deepcopy(len(part_df))
            
            part_df_copy = deepcopy(part_df[part_df['war_num']==war_num].reset_index(drop=True))
            
            if len(mid_df_copy[mid_df_copy['c_code_a']==participant]) > 0:
                part_df.loc[df_length, 'c_code'] = mid_df_copy[mid_df_copy['c_code_a']==participant]['c_code_a'].values[0]
                ## assuming it's in c_code_dic
                ## otherwise, this breaks!
                part_df.loc[df_length, 'participant'] = c_code_dic[mid_df_copy[mid_df_copy['c_code_a']==participant]['c_code_a'].values[0]]
                opposite_participant = mid_df_copy[mid_df_copy['c_code_a']==participant]['c_code_b'].values[0]
            else:
                part_df.loc[df_length, 'c_code'] = mid_df_copy[mid_df_copy['c_code_b']==participant]['c_code_b'].values[0]
                ## assuming it's in c_code_dic
                ## otherwise, this breaks!
                part_df.loc[df_length, 'participant'] = c_code_dic[mid_df_copy[mid_df_copy['c_code_b']==participant]['c_code_a'].values[0]]
                opposite_participant = mid_df_copy[mid_df_copy['c_code_a']==participant]['c_code_b'].values[0]
             
            part_df.loc[df_length, 'war_num'] = war_num
            part_df.loc[df_length, 'war_name'] = war_dic[war_num]
            part_df.loc[df_length, 'war_type'] = part_df_copy['war_type'].values[0]
            part_df.loc[df_length, 'war_type_code'] = part_df_copy['war_type_code'].values[0]
            part_df.loc[df_length, 'war_sub_type'] = part_df_copy['war_sub_type'].values[0]
            
            part_df.loc[df_length, 'side'] = opposing_side_dic[part_df_copy[part_df_copy['c_code']==opposite_participant]['side'].values[0]]
            part_df.loc[df_length, 'battle_deaths'] = max(list(dyad_df_copy[dyad_df_copy['c_code_a']==participant]['battle_deaths_a']) + list(dyad_df_copy[dyad_df_copy['c_code_b']==participant]['battle_deaths_b']))
            
            part_df.loc[df_length, 'start_date'] = mid_df_copy['start_date'].values[0]
            part_df.loc[df_length, 'start_year'] = float(str(mid_df_copy['start_date'].values[0])[0:4])
            part_df.loc[df_length, 'end_date'] = pd.to_datetime(str(mid_df_copy['end_date'].values[0])[0:11])
            part_df.loc[df_length, 'end_year'] = float(str(mid_df_copy['end_date'].values[0])[0:4])
            part_df.loc[df_length, 'days_at_war'] = float(mid_df_copy['days_at_war'].values[0])
            part_df.loc[df_length, 'start_date_estimated'] = float(mid_df_copy['start_date_estimated'].values[0])
            part_df.loc[df_length, 'end_date_estimated'] = float(mid_df_copy['end_date_estimated'].values[0])
            ## prevent duplication in for loop
            ## this may be an issue if they fought with more than one country
            participant_list.append(participant)
        else:
            pass
        
part_df['war_num'] = part_df['war_num'].astype(float)
part_df['start_year'] = part_df['start_year'].astype(float)
part_df['end_year'] = part_df['end_year'].astype(float)

participants_added = deepcopy(len(part_df) - original_part_df_length)
print('\nTotal Participants Added from Dyadic Data: {}'.format(format(participants_added, ',d')))

In [None]:
# missing value for thailand battle deaths in WWII obtained from wikipedia
# https://en.wikipedia.org/wiki/Thailand_in_World_War_II#:~:text=Thailand%20suffered%20about%205%2C569%20military,the%20brief%20Franco%2DThai%20War.
print('Manually adding battle deaths from Wikipedia for Thailand during WWII.')
part_df.loc[(part_df['war_num']==139) & (part_df['participant']=='Thailand'), 'battle_deaths'] = 5569

# missing value for greece battle deaths in Turco Cypriot obtained from wikipedia
# https://en.wikipedia.org/wiki/Turkish_invasion_of_Cyprus#:~:text=The%20violence%20resulted%20in%20the,of%2025%2C000%E2%80%9330%2C000%20Turkish%20Cypriots.
print('Manually adding battle deaths from Wikipedia for Greece during Turco Cypriot.')
part_df.loc[(part_df['war_num']==184) & (part_df['participant']=='Greece'), 'battle_deaths'] = 105

In [None]:
# # check for all values that were just added
# # making sure no fields are null that shouldn't be null
# part_df.tail(len(part_df)-original_part_df_length)

## Defining War DataFrame (One row for each war)

In [None]:
## part_df_copy will be used to calculate war_df.
part_df_copy = deepcopy(part_df)
part_df_copy.rename({'participant': 'total_participants'}, axis=1, inplace=True)

## filling these dates in arbitrarily before taking aggregates
## high date for start_date because this will be min
## low date for end_date because this will be max
part_df_copy['start_date'].fillna(pd.to_datetime('2100-01-01'), inplace=True)
part_df_copy['end_date'].fillna(pd.to_datetime('1700-01-01'), inplace=True)
part_df_copy.rename({'ongoing_participation': 'ongoing_war'}, axis=1, inplace=True)

war_name_changes = {}
wars_changed_to_ongoing = []
for i, war in enumerate(part_df_copy['war_name']):
    original_war_name = part_df_copy.loc[i, 'war_name']
    if part_df_copy.loc[i, 'ongoing_war']==1:
        pass
    elif 'present' in part_df_copy.loc[i, 'war_name'].lower() or 'ongoing' in part_df_copy.loc[i, 'war_name'].lower():
        part_df_copy.loc[i, 'ongoing_war'] = 1
        wars_changed_to_ongoing.append(part_df_copy.loc[i, 'war_num'])
    if ' of 1' in part_df_copy.loc[i, 'war_name']:
        part_df_copy.loc[i, 'war_name'] = part_df_copy.loc[i, 'war_name'].split(' of 1')[0].replace('  ', ' ')
    elif ' of 2' in part_df_copy.loc[i, 'war_name']:
        part_df_copy.loc[i, 'war_name'] = part_df_copy.loc[i, 'war_name'].split(' of 2')[0].replace('  ', ' ')
    if part_df_copy.loc[i, 'war_name'][0]==' ':
        part_df_copy.loc[i, 'war_name'] = part_df_copy.loc[i, 'war_name'][1:]
    if original_war_name!=part_df_copy.loc[i, 'war_name']:
        war_name_changes[original_war_name] = part_df_copy.loc[i, 'war_name']
        
print('Total Wars Manually Changed to Ongoing: {}'.format(format(len(list(set(wars_changed_to_ongoing))), ',d')))
print('Total Wars Marked as Ongoing: {}'.format(format(len(list(part_df_copy[part_df_copy['ongoing_war']==1]['war_num'].unique())), ',d')))
print('Total Wars With Name Adjustments: {}'.format(format(len(war_name_changes.keys()), ',d')))
# pprint(war_name_changes)

In [None]:
### Creating Dictionary to Lookup Estimated Dates That Have Affected Days_At_War
estimated_start_df = deepcopy(part_df_copy[part_df_copy['start_date_estimated']==1][['war_num', 'start_date']])
estimated_start_dates = the_networks_of_war_python_functions.dictionary_from_field(estimated_start_df, 'war_num', 'start_date')

estimated_end_df = deepcopy(part_df_copy[(part_df_copy['end_date_estimated']==1) & (part_df_copy['ongoing_war']==0)][['war_num', 'end_date']])
estimated_end_dates = the_networks_of_war_python_functions.dictionary_from_field(estimated_end_df, 'war_num', 'end_date')

## not longer needed after the dictionaries are created
## however, they could still potentially have been used to generate days_at_war_by_participant
part_df.drop(['start_date_estimated',
              'end_date_estimated'], axis=1, inplace=True)
## needed for to create war_df but not individually part_df
part_df.drop(['lagging_war',
              'leading_war'], axis=1, inplace=True)

print('Total Wars with Estimated Start Dates: {}'.format(len(list(estimated_start_df['war_num'].unique()))))
print('Total (Non-Ongoing) Wars with Estimated End Dates: {}'.format(len(list(estimated_end_df['war_num'].unique()))))

In [None]:
aggregations = {
    'total_participants': 'count',
    'start_year': 'min',
    'end_year': 'max',
    ## this will not be accurate if there are more than one lagging/leading wars per war.
    'lagging_war': 'min',
    'leading_war': 'max',
    'ongoing_war': 'max',
    'start_date': 'min',
    'end_date': 'max'
    ## not sure how to add this one just yet
#     'total_deaths_both_sides': 'max'
    }
war_df = deepcopy(part_df_copy.groupby(['war_num',
                                        'war_name',
                                        'war_type_code',
                                        'war_type',
                                        'war_sub_type']).agg(aggregations).reset_index())

## putting these back to none in case they made it through the aggregation
war_df.loc[war_df['start_date']==pd.to_datetime('2100-01-01'), 'start_date'] = None
war_df.loc[war_df['end_date']==pd.to_datetime('1700-01-01'), 'end_date'] = None

for i, war in enumerate(war_df['war_name']):
    try:
        war_df.loc[i, 'total_days_in_war'] = war_df.loc[i, 'end_date'] - war_df.loc[i, 'start_date']
        war_df.loc[i, 'total_days_in_war'] = int(str(war_df.loc[i, 'total_days_in_war']).split(' ')[0]) + 1
    except:
        war_df.loc[i, 'total_days_in_war'] = None
        
war_df = deepcopy(war_df.sort_values(by=['start_year', 'ongoing_war', 'end_year', 'start_date', 'war_name'], ascending=(False, False, False, False, True)))

print('Total Wars with Null Start Years: {}'.format(format(int(len(war_df[war_df['start_year'].isnull()])), ',d')))
print('Total (Non-Ongoing) Wars with Null End Years: {}'.format(format(int(len(war_df[(war_df['end_year'].isnull()) & (war_df['ongoing_war']==0)])), ',d')))
print('Total Ongoing Wars: {}'.format(format(int(len(war_df[war_df['ongoing_war']==1])), ',d')))


In [None]:
print('Evaluating for each war, whether the start/end date is based on an estimation.')

for i, war in enumerate(war_df['war_num']):
    if war_df.loc[i, 'war_num'] in list(estimated_start_dates.keys()) and estimated_start_dates[war_df.loc[i, 'war_num']]==war_df.loc[i, 'start_date']:
        war_df.loc[i, 'start_date_estimated'] = 1
    if war_df.loc[i, 'war_num'] in list(estimated_end_dates.keys()) and estimated_end_dates[war_df.loc[i, 'war_num']]==war_df.loc[i, 'end_date']:
        war_df.loc[i, 'end_date_estimated'] = 1
        
war_df.loc[war_df['start_date_estimated'].isnull(), 'start_date_estimated'] = 0
war_df.loc[war_df['end_date_estimated'].isnull(), 'end_date_estimated'] = 0

print("Total Estimated Start Dates: {}".format(format(len(war_df[war_df['start_date_estimated']==1]), ',d')))
print("Total Estimated End Dates: {}".format(format(len(war_df[war_df['end_date_estimated']==1]), ',d')))

In [None]:
print('Total Participants: {}'.format(format(len(part_df), ',d')))
print('Total Dyadic Combinations: {}'.format(format(len(dyad_df), ',d')))
print('Total Wars: {}'.format(format(len(war_df), ',d')))

pickle_directory = '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/pickles/'

part_df.to_pickle(pickle_directory + 'initial_part_df.pkl')
dyad_df.to_pickle(pickle_directory + 'initial_dyad_df.pkl')
war_df.to_pickle(pickle_directory + 'initial_war_df.pkl')