In [1]:
import json
from copy import deepcopy
from traceback import format_exc
# from pprint import pprint

import numpy as np
import pandas as pd

import the_networks_of_war_python_functions

In [2]:
def _five_decimal_places(value):
    """Render a number with exactly five digits after the decimal point."""
    return '%.5f' % value


## floats are displayed with a fixed five decimals and no dataframe column is hidden
pd.set_option('display.float_format', _five_decimal_places)
pd.set_option('display.max_columns', None)

In [1]:
pwd  # IPython automagic for %pwd: displays the current working directory of the kernel

'/Users/charlieyaris/Personal/git/the_networks_of_war'

In [3]:
## machine-specific root folder that holds both the csv and the pickle folders
data_source_directory = '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/'
csv_directory = data_source_directory + 'csvs/'
pickle_directory = data_source_directory + 'pickles/'
## relative path: resolved against the working directory of the kernel
csv_output_directory = '../../assets/csv/the_networks_of_war/'

## Setup for Identifying Countries by Code
### Note: This is helpful for when different names are used for the same country.

In [4]:
## c_code_dic is keyed by country code: later cells test `c_code in c_code_dic.keys()`
## and print c_code_dic[c_code] as the label of a participant.
## define_c_code_dic lives in the project module and is not visible in this notebook;
## the recorded "Total Country Codes" line presumably comes from inside it (TODO confirm).
c_code_dic = the_networks_of_war_python_functions.define_c_code_dic()

Total Country Codes: 217


## Integrating Descriptive Data (Defined in Outside Notebook)

### Merging Participant Data with Participant-Level Descriptive Data

In [5]:
## loading the participant, dyad and war dataframes from their "initial" pickles
part_df, dyad_df, war_df = [pd.read_pickle(pickle_directory + pickle_name)
                            for pickle_name in ('initial_part_df.pkl',
                                                'initial_dyad_df.pkl',
                                                'initial_war_df.pkl')]

In [6]:
descriptive_df_1 = pd.read_pickle(pickle_directory + 'participant_descriptive_df.pkl')

## first pass: the descriptive year is matched against the participant's start_year
descriptive_df_1 = descriptive_df_1.rename(columns={'year': 'start_year'})
part_df = part_df.merge(descriptive_df_1, how='left', on=['c_code', 'start_year'])

## second pass: the same descriptive rows are matched against the participant's end_year
## (descriptive columns shared by both passes receive the default _x / _y merge suffixes)
descriptive_df_1 = descriptive_df_1.rename(columns={'start_year': 'end_year'})
part_df = part_df.merge(descriptive_df_1, how='left', on=['c_code', 'end_year'])

### Merging Dyadic Data with Dyadic-Level Descriptive Data

In [7]:
## remembering which columns existed before the merge so that only the new ones get counted
initial_descriptive_columns = set(dyad_df.columns)

descriptive_df_2 = pd.read_pickle(pickle_directory + 'dyadic_descriptive_df.pkl')
dyad_df = dyad_df.merge(descriptive_df_2, how='left', on=['c_code_a', 'c_code_b', 'year'])

print('Counting Total Dyadic War and Year Combinations by Descriptive Field\n')
the_networks_of_war_python_functions.print_new_fields(dyad_df, initial_descriptive_columns, None)

Counting Total Dyadic War and Year Combinations by Descriptive Field

        inter_governmental_organizations  967
                         trade_relations  854
                                   mtops  562
                              contiguity  445
                    elective_legislature  272
                           dictatorships  183
                multi_party_state_exists  178
                     diplomatic_exchange  152
                                    atop  133
          defense_cooperation_agreements  130
                     colonial_contiguity  123
                                alliance  102
                     non_elected_leaders   90
                        same_leader_type   73
                      territory_exchange   71
             no_non_regime_parties_exist   45
           multi_party_legislature_legal   37
           no_partisan_legislature_legal   37
                        military_leaders   34
                       indirect_election   33
 no_non_re

## Finalizing Participant and Dyadic Dataframes

### Keeping values for first and last year of each dyad
### Combining these into one row per dyad

In [8]:
dyad_df['year'] = dyad_df['year'].astype(float)


def _conflict_pair_member(c_code, participant):
    """Return the label one side of a dyad contributes to "conflict_pair".

    The stringified c_code is used when the code is a key of c_code_dic,
    otherwise the participant name is used unchanged.
    """
    if c_code in c_code_dic.keys():
        return str(c_code)
    return participant


## creating unique identifier "conflict_pair" for each dyad
## the two labels are sorted, so a vs b and b vs a receive the same identifier
## rows are walked positionally (dyad_df carries the default 0..n-1 index after the merge)
conflict_pairs = []
for code_a, name_a, code_b, name_b in zip(dyad_df['c_code_a'].values,
                                          dyad_df['participant_a'].values,
                                          dyad_df['c_code_b'].values,
                                          dyad_df['participant_b'].values):
    pair_members = [_conflict_pair_member(code_a, name_a),
                    _conflict_pair_member(code_b, name_b)]
    conflict_pairs.append(str(sorted(pair_members)))
dyad_df['conflict_pair'] = pd.Series(conflict_pairs, index=dyad_df.index, dtype=object)

dyad_df_columns = ['war_num', 'year', 'participant_a', 'participant_b', 'conflict_pair']
dyad_df_columns.extend(descriptive_df_2.columns)

## not including any row without any participants
both_participants_known = dyad_df['participant_a'].notnull() & dyad_df['participant_b'].notnull()
dyad_df = dyad_df[both_participants_known].copy()

## creating new dyad_df to get the max of each field during all the years of the dyad
## (the merge keys of descriptive_df_2 are not descriptive fields)
descriptive_columns = [column for column in descriptive_df_2.columns
                       if column not in ('year', 'c_code_a', 'c_code_b')]
z_column_names = {column: column + '_z' for column in descriptive_columns}
aggregations = {z_column: 'max' for z_column in z_column_names.values()}
max_dyad_df = (dyad_df.rename(columns=z_column_names)
               .groupby(['war_num', 'conflict_pair'])
               .agg(aggregations)
               .reset_index())

## ordering by year once: the first / last row kept per war and conflict pair
## then carries the values of the first / last year of conflict within that dyad
## need to dedupe across conflict pair so a vs b are never repeated interchangeably
dyad_key = ['war_num', 'conflict_pair']
dyads_by_year = dyad_df.sort_values(by='year', ascending=True)
first_year_dyad_df = (dyads_by_year.drop_duplicates(subset=dyad_key, keep='first')
                      .rename(columns={'year': 'first_year'}))
last_year_dyad_df = (dyads_by_year.drop_duplicates(subset=dyad_key, keep='last')
                     .rename(columns={'year': 'last_year'}))

## combining first and last year dyads into one dataframe
merge_key = ['conflict_pair', 'war_num']
dyad_df = first_year_dyad_df.merge(last_year_dyad_df, how='left', on=merge_key)
## combining the maximum df into the final version of the dataframe
dyad_df = dyad_df.merge(max_dyad_df, how='left', on=merge_key).reset_index(drop=True)

## the "_a" / "_b" columns come out of the first merge twice (suffixes _x and _y):
## the _x copy (first year) gets its original name back and the _y copy is dropped.
## adding them into the join would fail (since not all have c_codes)
first_year_side_columns = {column: column[:-2] for column in dyad_df.columns
                           if column[-4:] in ('_a_x', '_b_x')}
last_year_side_columns = [column for column in dyad_df.columns
                          if column[-4:] in ('_a_y', '_b_y')]
dyad_df = dyad_df.drop(last_year_side_columns, axis=1).rename(columns=first_year_side_columns)

In [9]:
print('Counting Total Dyadic War Combinations (Overall) by Descriptive Field\n')
## every aggregated column of max_dyad_df, i.e. everything except the two grouping keys
max_dyad_columns = [column for column in max_dyad_df.columns
                    if column not in ('war_num', 'conflict_pair')]
the_networks_of_war_python_functions.print_new_fields(dyad_df, None, max_dyad_columns)

## disabled alternative that counts every descriptive dyad column instead of only the max columns
# print('Counting Total Dyadic War Combinations by Descriptive Field\n')
# descriptive_dyad_columns = list(dyad_df.drop(['war_num',
#                                               'c_code_a',
#                                               'c_code_b',
#                                               'participant_a',
#                                               'participant_b',
#                                               'first_year',
#                                               'last_year'], axis=1).columns)
# the_networks_of_war_python_functions.print_new_fields(dyad_df, None, descriptive_dyad_columns)

Counting Total Dyadic War Combinations (Overall) by Descriptive Field

 Overall         inter_governmental_organizations  394
 Overall                          trade_relations  337
 Overall                                    mtops  249
 Overall                               contiguity  201
 Overall                      diplomatic_exchange  130
 Overall                     elective_legislature   99
 Overall                 multi_party_state_exists   92
 Overall                                     atop   90
 Overall           defense_cooperation_agreements   67
 Overall                                 alliance   65
 Overall                       territory_exchange   62
 Overall                            dictatorships   61
 Overall                      colonial_contiguity   52
 Overall                      non_elected_leaders   38
 Overall            multi_party_legislature_legal   30
 Overall                         same_leader_type   28
 Overall                         military_leaders

### Addressing null values, missing data, and conversions for dyads and participants

In [10]:
## the following participant columns need _z labels to be categorized as overall metrics
add_z_list = ['peak_forces_available',
              'peak_battle_forces',
              'days_at_war',
              'battle_deaths',
              'total_deaths_both_sides']
part_df = part_df.rename(columns={column: column + '_z' for column in add_z_list})

## conversion factor per participant column, grouped by factor
## left out because their factor is still 'unsure': land_mass_exchange_gain, land_mass_exchange_loss,
## population_exchange_gain, population_exchange_loss
conversion_dic = {}
for factor, factor_columns in [(1000000, ['money_flow_in',
                                          'money_flow_out']),
                               (1000, ['military_expenditure',
                                       'military_personnel',
                                       'population',
                                       'urban_population',
                                       'refugees_originated',
                                       'internally_displaced_persons',
                                       'refugees_hosted']),
                               ## these are thousands of tons
                               (2000000, ['iron_steel_production',
                                          'energy_consumption'])]:
    for column in factor_columns:
        conversion_dic[column] = factor

print('\nAddressng null values, missing data, and conversions for part_df.')
part_df = deepcopy(
    the_networks_of_war_python_functions.column_fills_and_converions(part_df, 'participant', conversion_dic))

print('\nAddressng null values, missing data, and conversions for dyad_df.')
dyad_df = deepcopy(
    the_networks_of_war_python_functions.column_fills_and_converions(dyad_df, 'dyad', None))


Addressng null values, missing data, and conversions for part_df.

Total Columns Adjusted: 49
Total Columns Adjusted for Conversion: 22
Total Null Values Notated: 48,899
Total Unknown Values Notated: 0

Addressng null values, missing data, and conversions for dyad_df.

Total Columns Adjusted: 90
Total Columns Adjusted for Conversion: 0
Total Null Values Notated: 94,481
Total Unknown Values Notated: 0


### Saving the data

In [11]:
## (summary message, dataframe, pickle file name) for each of the three final dataframes
frames_to_save = [('Total Participants: {}', part_df, 'part_df.pkl'),
                  ('Total Dyadic Combinations: {}', dyad_df, 'dyad_df.pkl'),
                  ('Total Wars: {}', war_df, 'war_df.pkl')]

## reporting all row counts first, then writing the pickles
for summary_message, frame, pickle_name in frames_to_save:
    print(summary_message.format(format(len(frame), ',d')))

for summary_message, frame, pickle_name in frames_to_save:
    frame.to_pickle(pickle_directory + pickle_name)

Total Participants: 1,713
Total Dyadic Combinations: 1,117
Total Wars: 678


In [12]:
## reloading the three saved dataframes so the JSON export can start from the pickles
part_df, dyad_df, war_df = [pd.read_pickle(pickle_directory + pickle_name)
                            for pickle_name in ('part_df.pkl',
                                                'dyad_df.pkl',
                                                'war_df.pkl')]

## JSON Export for D3.js Processing

In [13]:
## Writes one JSON file per war_num for D3.js:
##   {"war": [ {war fields} ], "nodes": [ {participant fields}, ... ], "links": [ {dyad fields}, ... ]}
## Every value is written as a string. A node's "id" is its position within the war's
## participants, and a link's "source" / "target" refer to those ids.
## Afterwards the war dataframe (with the file_name of each war) is written to war_file_list.csv.


def _has_c_code(value):
    """Return True when str(value) starts with an ASCII digit, i.e. a country code is available.

    NaN, None, names and empty strings all return False. This replaces a membership test
    against str(np.arange(0, 10)), which also matched '[', ']' and blanks and raised
    IndexError for an empty string.
    """
    return str(value)[:1] in set('0123456789')


def _c_code_label(value):
    """Return value as an int for diagnostic messages, or value unchanged when it cannot be converted.

    Keeps a diagnostic print from raising when the country code is NaN, None or text.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value


def _json_object(pairs):
    """Return a one-line JSON object built from (key, value) pairs, all values stringified.

    json.dumps escapes quotes, backslashes and control characters, so a name containing
    them can no longer break the file. ensure_ascii=False leaves all other text as it is.
    """
    return '{' + ', '.join(json.dumps(str(key), ensure_ascii=False)
                           + ': '
                           + json.dumps(str(value), ensure_ascii=False)
                           for key, value in pairs) + '}'


war_column_list = list(war_df.columns)
## dropping fields that won't be needed in the participant section of the json file
## values may also differ between those in part_df and war_df.
## those in war_df have undergone additional preprocessing.
part_column_list = list(part_df.drop(['war_num',
                                      'war_name',
                                      'war_type',
                                      'war_sub_type',
                                      'total_deaths_both_sides_z'], axis=1).columns)
## dropping fields that won't be needed in the dyad section of the json file
dyad_column_list = list(dyad_df.drop(['war_num',
                                      'c_code_a',
                                      'c_code_b',
                                      'conflict_pair',
                                      'side_a',
                                      'side_b',
                                      'participant_a',
                                      'participant_b'], axis=1).columns)

json_directory = csv_output_directory + 'json_files_by_war/'

print('\nUpdating/recreating one JSON file per war_num.')
print('JSON Files to be Rewritten: {}\n'.format(format(len(war_df), ',d')))

for war in list(war_df['war_num']):

    file_name = 'war_num_' + str(war).replace('.', '_') + '.json'
    ## need to filter by war here because position does not follow the sorting of the dataframe
    war_df.loc[war_df['war_num']==war, 'file_name'] = file_name

    part_df_copy = part_df[part_df['war_num']==war].reset_index(drop=True)
    dyad_df_copy = dyad_df[dyad_df['war_num']==war].reset_index(drop=True)
    war_df_copy = war_df[war_df['war_num']==war].reset_index(drop=True)
    war_name = war_df_copy.loc[0, 'war_name']

    if len(part_df_copy)==0:
        ## reports the dyad count (this used to print the participant count, which is always 0 here)
        print('No participants for war_num {} ({}), {} dyads'.format(war, war_name, len(dyad_df_copy)))
    if len(dyad_df_copy)==0:
        print('No dyads for war_num {} ({}), {} participants'.format(war, war_name, len(part_df_copy)))

    war_object = _json_object((column, war_df_copy.loc[0, column]) for column in war_column_list)

    ## node_names[id] is the key used to match dyads to participants:
    ## the c_code when available, otherwise the participant name
    ## (since c_code is not always available and participant names are not standardized)
    node_names = []
    sides = []
    node_lines = []
    for node_id in range(len(part_df_copy)):
        c_code = part_df_copy.loc[node_id, 'c_code']
        if _has_c_code(c_code):
            node_names.append(c_code)
        else:
            node_names.append(part_df_copy.loc[node_id, 'participant'])
        sides.append(part_df_copy.loc[node_id, 'side'])

        node_fields = [('id', node_id)]
        node_fields.extend((column, part_df_copy.loc[node_id, column]) for column in part_column_list)
        node_lines.append('    ' + _json_object(node_fields))

    link_lines = []
    dyad_inputs = []
    for row in range(len(dyad_df_copy)):
        c_code_a = dyad_df_copy.loc[row, 'c_code_a']
        c_code_b = dyad_df_copy.loc[row, 'c_code_b']
        participant_a = dyad_df_copy.loc[row, 'participant_a']
        participant_b = dyad_df_copy.loc[row, 'participant_b']
        ## assigning the inputs based on whether a c_code is available
        node_1_input = c_code_a if _has_c_code(c_code_a) else participant_a
        node_2_input = c_code_b if _has_c_code(c_code_b) else participant_b
        dyad_inputs.extend([node_1_input, node_2_input])

        ## sometimes a country could be in the dyad and not in the participant df_copy.
        ## this is rare but has happened (see Spain in WWII)
        ## should be taken care of in processes above, so checking below to see if that process failed
        ## both sides are checked, so a second missing participant is no longer hidden by the first
        missing_participant = False
        if node_1_input not in node_names:
            missing_participant = True
            print("Participant Missing in Dyadic Data: {} ({}) for {}.".format(participant_a, _c_code_label(c_code_a), war_name))
        if node_2_input not in node_names:
            missing_participant = True
            print("Participant Missing in Dyadic Data: {} {} for {}.".format(participant_b, _c_code_label(c_code_b), war_name))
        if missing_participant:
            ## no link can be written without both of its nodes
            continue

        source_id = node_names.index(node_1_input)
        target_id = node_names.index(node_2_input)
        link_fields = [('source', source_id), ('target', target_id)]
        link_fields.extend((column, dyad_df_copy.loc[row, column]) for column in dyad_column_list)
        link_lines.append('    ' + _json_object(link_fields))

        ## checking for any dyads that are paired with participants on the same side as them
        ## this would most likely suggest a mistake in pre-processing
        ## does not apply to side 3 (switched sides)
        if sides[source_id]==sides[target_id] and sides[source_id]!=3:
            print('Dyads on the Same Side: {} {} and {} {} for {}.'.format(participant_a, _c_code_label(c_code_a), participant_b, _c_code_label(c_code_b), war_name))

    ## rewriting the file each time; the with-block closes the handle even if a write fails
    with open(json_directory + file_name, 'w') as graph_file:
        graph_file.write('{\n  "war": [\n    ' + war_object + '\n')
        graph_file.write('  ],\n  "nodes": [\n' + ',\n'.join(node_lines) + '\n')
        graph_file.write('  ],\n  "links": [\n' + ',\n'.join(link_lines) + '\n\n  ]\n}')

    ## participants that never appear in any dyad of this war
    for node in node_names:
        if node not in dyad_inputs:
            if _has_c_code(node):
                ## falls back to the raw key when the code has no entry in c_code_dic
                floating_label = c_code_dic.get(node, node)
            else:
                floating_label = node
            print('Floating Node in {}: {}'.format(war_name, floating_label))

print('\nCompleted JSON File Updates.')

print('\nUpdating war_file_list.csv')
war_df.to_csv(csv_output_directory + 'war_file_list.csv', index=None)
print('\nCompleted war_file_list update.')


Updating/recreating one JSON file per war_num.
JSON Files to be Rewritten: 678

Floating Node in Africa's World War: Rwanda
Floating Node in Africa's World War: Burundi
Floating Node in Africa's World War: Uganda
Floating Node in Fourth Lebanese War: Iran
Floating Node in Fourth Lebanese War: Syria
Floating Node in Chad - FAN War: France
Floating Node in Chad - FAN War: Democratic Republic of the Congo
Floating Node in Spanish Civil War: Portugal
Floating Node in Spanish Civil War: Germany
Floating Node in Spanish Civil War: Italy
Floating Node in World War I: Japan

Completed JSON File Updates.

Updating war_file_list.csv

Completed war_file_list update.
