In [1]:
import json
from copy import deepcopy
from traceback import format_exc
# from pprint import pprint

import numpy as np
import pandas as pd
from pandasql import sqldf

import the_networks_of_war_python_functions

pysqldf = lambda q: sqldf(q, globals())

In [2]:
## showing every column and rendering floats with five decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

In [3]:
pwd

'/Users/charlieyaris/Personal/github_repositories/the_networks_of_war'

In [4]:
## output locations, relative to the notebook's working directory
website_assets_directory = '../cyaris.github.io/assets/'
csv_output_directory = website_assets_directory + 'csv/the_networks_of_war/'
json_output_directory = website_assets_directory + 'json/the_networks_of_war/'

## input locations for the source csvs and the intermediate pickles
data_source_directory = '/Users/charlieyaris/Personal/data_sources/the_networks_of_war/'
csv_directory = data_source_directory + 'csvs/'
pickle_directory = data_source_directory + 'pickles/'

## Setup for Identifying Countries by Code
### Note: This is helpful for when different names are used for the same country.

In [5]:
pwd

'/Users/charlieyaris/Personal/github_repositories/the_networks_of_war'

In [6]:
## loading the country code csv; the file is read as latin-1
c_code_df = pd.read_csv(csv_directory + 'COW country codes.csv', encoding='latin-1')

## renaming the code/name/abbreviation columns and grouping by all three of them,
## which leaves one row per distinct (c_code, state_name, state_name_abbreviation) combination.
## pysqldf resolves the "c_code_df" table name through globals(), so the query reads the dataframe loaded above.
query_text = """

select
    ccode as c_code,
    statenme as state_name,
    stateabb as state_name_abbreviation
from c_code_df
group by 1, 2, 3

"""

## replacing the raw dataframe with the deduplicated, renamed version
c_code_df = deepcopy(pysqldf(query_text))

## Integrating Descriptive Data (Defined in Outside Notebook)

### Merging Participant Data with Participant-Level Descriptive Data

In [7]:
## loading the participant, dyadic and war dataframes from the pickle directory
part_df, dyad_df, war_df = [
    pd.read_pickle(pickle_directory + pickle_name)
    for pickle_name in ('participant_descriptive_df.pkl',
                        'dyadic_descriptive_df.pkl',
                        'initial_war_df.pkl')
]

### Merging Dyadic Data with Dyadic-Level Descriptive Data

In [8]:
print('Counting Total Dyadic War and Year Combinations by Descriptive Field\n')

## identifying columns of the dyadic data; every other column is treated as a descriptive field
initial_dyad_columns = [
    'start_date',
    'start_year',
    'end_date',
    'end_year',
    'war_num',
    'c_code_a',
    'c_code_b',
    'participant_a',
    'participant_b',
]
descriptive_dyad_columns = list(dyad_df.drop(initial_dyad_columns, axis=1).columns)
the_networks_of_war_python_functions.print_new_fields(dyad_df, initial_dyad_columns, descriptive_dyad_columns)

Counting Total Dyadic War and Year Combinations by Descriptive Field

       inter_governmental_organizations_x 397
       inter_governmental_organizations_z 397
       inter_governmental_organizations_y 396
                                  mtops_y 254
                                  mtops_z 254
                                  mtops_x 248
                             contiguity_z 201
                             contiguity_x 199
                             contiguity_y 198
                        trade_relations_z 167
                    diplomatic_exchange_z 161
                        trade_relations_y 155
                        trade_relations_x 149
                   elective_legislature_z 101
                                   atop_z  97
               multi_party_state_exists_z  97
               multi_party_state_exists_x  96
               multi_party_state_exists_y  93
                   elective_legislature_y  92
                    diplomatic_exchange_x  91
          

## Finalizing Participant and Dyadic Dataframes

### Keeping values for first and last year of each dyad
### Combining these into one row per dyad

In [9]:
## creating unique identifier "conflict_pair" for each dyad.
## each side is represented by its c_code (as text) when that code exists in c_code_df,
## and by its participant name otherwise; the two sides are sorted so a vs b and b vs a match.

## building the lookup once; a set keeps each membership check constant time
known_c_codes = set(c_code_df['c_code'])

## iterating over the column values directly instead of using dyad_df.loc[position, ...].
## positional labels only exist on a fresh RangeIndex, so the old lookup broke (KeyError or
## silently appended rows) when this cell was re-run after the drop_duplicates below.
conflict_pairs = []
for c_code_a, c_code_b, participant_a, participant_b in zip(dyad_df['c_code_a'],
                                                            dyad_df['c_code_b'],
                                                            dyad_df['participant_a'],
                                                            dyad_df['participant_b']):
    ## standardizing participant names with ccode when available
    side_a = str(c_code_a) if c_code_a in known_c_codes else participant_a
    side_b = str(c_code_b) if c_code_b in known_c_codes else participant_b
    conflict_pairs.append(str(sorted([side_a, side_b])))

dyad_df['conflict_pair'] = conflict_pairs

## keeping the values once for each a/b dyad combination.
## need to dedupe across conflict pair so a vs b are never repeated interchangeably.
dyad_df.drop_duplicates(subset=['war_num', 'conflict_pair'], keep='first', inplace=True)

### Addressing null values, missing data, and conversions for dyads and participants

In [10]:
## participant columns paired with a numeric factor, passed to column_fills_and_converions below.
## NOTE(review): the factors look like multipliers that turn stored units (millions, thousands)
## into base units -- confirm against the helper's implementation.
conversion_dic = {'money_flow_in': 1000000,
                  'money_flow_out': 1000000,
                  'military_expenditure': 1000,
                  'military_personnel': 1000,
                  'population': 1000,
                  'urban_population': 1000,
                  'refugees_originated': 1000,
                  'internally_displaced_persons': 1000,
                  'refugees_hosted': 1000,
                  ## units for the exchange fields below are unknown, so they are left out of the conversion
#                   'land_mass_exchange_gain': 'unsure',
#                   'land_mass_exchange_loss': 'unsure',
#                   'population_exchange_gain': 'unsure',
#                   'population_exchange_loss': 'unsure',
                  ## these are thousands of tons
                  ## NOTE(review): 2,000,000 presumably converts thousands of tons to pounds -- confirm
                  'iron_steel_production': 2000000,
                  'energy_consumption': 2000000
                 }
## the helper is called with the dataframe, a level label ('participant' or 'dyad') and the conversion dictionary
print('\nAddressng null values, missing data, and conversions for part_df.')
part_df = deepcopy(the_networks_of_war_python_functions.column_fills_and_converions(part_df, 'participant', conversion_dic))

## no conversion dictionary is passed for the dyadic data
print('\nAddressng null values, missing data, and conversions for dyad_df.')
dyad_df = deepcopy(the_networks_of_war_python_functions.column_fills_and_converions(dyad_df, 'dyad', None))


Addressng null values, missing data, and conversions for part_df.

Total Columns Adjusted: 66
Total Columns Adjusted for Conversion: 22
Total Null Values Notated: 68,529
Total Unknown Values Notated: 671

Addressng null values, missing data, and conversions for dyad_df.

Total Columns Adjusted: 90
Total Columns Adjusted for Conversion: 0
Total Null Values Notated: 93,866
Total Unknown Values Notated: 0


### Saving the data

In [11]:
## reporting the row count of each dataframe before it is saved
for label, frame in [('Participants', part_df),
                     ('Dyadic Combinations', dyad_df),
                     ('Wars', war_df)]:
    print('Total {}: {:,d}'.format(label, len(frame)))

## writing each dataframe to its own pickle in the pickle directory
for pickle_name, frame in [('part_df.pkl', part_df),
                           ('dyad_df.pkl', dyad_df),
                           ('war_df.pkl', war_df)]:
    frame.to_pickle(pickle_directory + pickle_name)

Total Participants: 1,715
Total Dyadic Combinations: 1,106
Total Wars: 678


In [12]:
## reloading the saved dataframes so the export below can start from the pickles
saved_frames = {
    frame_name: pd.read_pickle(pickle_directory + frame_name + '.pkl')
    for frame_name in ('part_df', 'dyad_df', 'war_df')
}
part_df = saved_frames['part_df']
dyad_df = saved_frames['dyad_df']
war_df = saved_frames['war_df']

## JSON Export for D3.js Processing

In [15]:
## listing the distinct participant c_codes in ascending order
sorted(set(part_df['c_code']))

[-9,
 -8,
 2,
 20,
 40,
 41,
 42,
 70,
 90,
 91,
 92,
 93,
 94,
 100,
 101,
 130,
 135,
 140,
 145,
 150,
 155,
 160,
 165,
 200,
 210,
 211,
 220,
 230,
 235,
 240,
 245,
 255,
 267,
 269,
 271,
 273,
 275,
 280,
 290,
 300,
 310,
 315,
 325,
 327,
 329,
 332,
 337,
 344,
 345,
 346,
 350,
 352,
 355,
 359,
 360,
 365,
 366,
 367,
 368,
 369,
 371,
 372,
 373,
 375,
 385,
 390,
 404,
 432,
 433,
 435,
 436,
 437,
 438,
 450,
 451,
 452,
 471,
 475,
 482,
 483,
 484,
 490,
 500,
 501,
 510,
 516,
 517,
 520,
 522,
 530,
 531,
 540,
 541,
 552,
 553,
 560,
 565,
 600,
 615,
 616,
 620,
 625,
 626,
 630,
 640,
 645,
 651,
 652,
 660,
 663,
 666,
 670,
 678,
 679,
 680,
 690,
 694,
 696,
 698,
 700,
 702,
 710,
 712,
 713,
 730,
 731,
 732,
 740,
 750,
 770,
 775,
 780,
 790,
 800,
 811,
 812,
 816,
 817,
 840,
 850,
 900,
 910,
 920]

In [19]:
def get_c_code_or_part_name(c_code_input, participant_input):
    """Return the c_code when it looks like a usable code, otherwise the participant name.

    A code is used when its text form starts with a digit and is not '-8' or '-9'.
    Anything else (negative codes, NaN, text, an empty string) falls back to the
    participant name.

    Parameters
    ----------
    c_code_input : the participant's country code (any type convertible with str()).
    participant_input : the participant's name, returned when the code is not usable.

    Returns
    -------
    c_code_input unchanged, or participant_input.
    """
    c_code_text = str(c_code_input)

    ## slicing instead of indexing so an empty code falls back to the name rather than raising IndexError
    if c_code_text not in ['-8', '-9'] and c_code_text[:1].isdigit():
        return c_code_input
    return participant_input

In [None]:
## Writes one JSON file per war with three sections:
##   "war"   - a single record holding every war_df column for that war
##   "nodes" - one record per participant, with "id" set to its position in the war
##   "links" - one record per dyad, with "source"/"target" pointing at node ids
## Every value is written as a string.
## Also records each war's file name in war_df and saves war_df as war_file_list.csv.

war_column_list = list(war_df.columns)
## dropping fields that won't be needed in the participant section of the json file
## values may also differ between those in part_df and war_df.
## those in war_df have undergone additional preprocessing.
part_column_list = list(part_df.drop(['war_num',
                                      'war_name',
                                      'war_type',
                                      'war_subtype'
#                                       'deaths_both_sides_z'
                                     ], axis=1).columns)
## dropping fields that won't be needed in the dyad section of the json file
dyad_column_list = list(dyad_df.drop(initial_dyad_columns, axis=1).columns)

## first state name listed for each c_code, used to label floating nodes.
## a dictionary lookup does not fail for nodes that have no c_code (identified by participant name).
state_name_by_c_code = (c_code_df
                        .drop_duplicates(subset='c_code', keep='first')
                        .set_index('c_code')['state_name']
                        .to_dict())

print('\nUpdating/recreating one JSON file per war_num.')
print('JSON Files to be Rewritten: {}\n'.format(format(len(war_df), ',d')))

## iterating over a list copy because the file_name column is added to war_df inside the loop
for war_num in war_df['war_num'].tolist():

    file_name = 'war_num_' + str(war_num).replace('.', '_') + '.json'
    ## need to filter by war here because the loop does not follow the sorting of the dataframe
    war_df.loc[war_df['war_num']==war_num, 'file_name'] = file_name

    part_df_copy = part_df[part_df['war_num']==war_num].reset_index(drop=True)
    dyad_df_copy = dyad_df[dyad_df['war_num']==war_num].reset_index(drop=True)
    war_df_copy = war_df[war_df['war_num']==war_num].reset_index(drop=True)
    war_name = war_df_copy.loc[0, 'war_name']

    if len(part_df_copy)==0:
        ## reporting the dyad count here (the participant count is always zero in this branch)
        print('No participants for war_num {} ({}), {} dyads'.format(war_num, war_name, len(dyad_df_copy)))
    if len(dyad_df_copy)==0:
        print('No dyads for war_num {} ({}), {} participants'.format(war_num, war_name, len(part_df_copy)))

    war_record = {column: str(war_df_copy.loc[0, column]) for column in war_column_list}

    node_names = []
    sides = []
    node_records = []
    for part_index, participant in enumerate(part_df_copy['participant']):

        ## assigning the input based on whether a c_code is available
        ## this will be used to link part_df_copy to the dyad_df_copy
        ## (since c_code is not always available and participant names are not standardized)
        node_names.append(get_c_code_or_part_name(part_df_copy.loc[part_index, 'c_code'], participant))
        sides.append(part_df_copy.loc[part_index, 'side'])

        ## the node id is the participant's position, which is what the links refer to
        node_record = {'id': str(part_index)}
        for column in part_column_list:
            node_record[column] = str(part_df_copy.loc[part_index, column])
        node_records.append(node_record)

    link_records = []
    dyad_inputs = []
    for dyad_index in range(len(dyad_df_copy)):
        c_code_a = dyad_df_copy.loc[dyad_index, 'c_code_a']
        c_code_b = dyad_df_copy.loc[dyad_index, 'c_code_b']
        participant_a = dyad_df_copy.loc[dyad_index, 'participant_a']
        participant_b = dyad_df_copy.loc[dyad_index, 'participant_b']

        ## assigning the input based on whether a c_code is available
        node_1 = get_c_code_or_part_name(c_code_a, participant_a)
        node_2 = get_c_code_or_part_name(c_code_b, participant_b)
        dyad_inputs.extend([node_1, node_2])

        ## sometimes a country could be in the dyad and not in the participant df_copy.
        ## this is rare but has happened (see Spain in WWII)
        ## should be taken care of in processes above, so checking below to see if that process failed
        link_is_valid = True
        for node, c_code, participant in [(node_1, c_code_a, participant_a),
                                          (node_2, c_code_b, participant_b)]:
            if node not in node_names:
                print("Participant Missing in Dyadic Data: {} ({}) for {}.".format(int(c_code), participant, war_name))
                link_is_valid = False
        if not link_is_valid:
            ## a link cannot be drawn without both of its nodes
            continue

        source_index = node_names.index(node_1)
        target_index = node_names.index(node_2)
        link_record = {'source': str(source_index), 'target': str(target_index)}
        for column in dyad_column_list:
            link_record[column] = str(dyad_df_copy.loc[dyad_index, column])
        link_records.append(link_record)

        ## checking for any dyads that are paired with participants on the same side as them
        ## this would most likely suggest a mistake in pre-processing
        ## does not apply to side 3 (switched sides)
        if sides[source_index]==sides[target_index] and sides[source_index]!=3:
            print('Dyads on the Same Side: {} ({}) and {} ({}) for {}.'.format(int(c_code_a), participant_a, int(c_code_b), participant_b, war_name))

    ## rewriting all of the files each time.
    ## json.dump escapes quotes/backslashes inside values, and the with-block closes the file handle.
    graph = {'war': [war_record], 'nodes': node_records, 'links': link_records}
    with open(json_output_directory + file_name, 'w') as graph_file:
        json.dump(graph, graph_file, indent=2)

    ## reporting participants that never appear in any dyad for this war
    for node in node_names:
        if node not in dyad_inputs:
            print('Floating Node in {}: {} ({})'.format(war_name, node, state_name_by_c_code.get(node, 'no matching state name')))

print('\nCompleted JSON File Updates.')

print('Updating war_file_list.csv')
war_df.to_csv(csv_output_directory + 'war_file_list.csv', index=None)
print('Completed war_file_list update.')


Updating/recreating one JSON file per war_num.
JSON Files to be Rewritten: 678

Floating Node in Third Somalia War: 500 (Uganda)
Floating Node in Third Somalia War: 501 (Kenya)
Floating Node in Third Somalia War: 516 (Burundi)
Floating Node in First Waziristan War: 2 (United States of America)
Floating Node in Cote d'Ivoire Military War: 220 (France)
Floating Node in Africa's World War: 516 (Burundi)
Floating Node in Second Sierra Leone War: 200 (United Kingdom)
Floating Node in Bosnian-Serb Rebellion: 2 (United States of America)
Floating Node in Bosnian-Serb Rebellion: 344 (Croatia)
Floating Node in Fourth Lebanese War: 630 (Iran)
Floating Node in Fourth Lebanese War: 652 (Syria)
Floating Node in Chad - FAN War: 220 (France)
Floating Node in Chad - FAN War: 490 (Democratic Republic of the Congo)
Floating Node in Second Lebanese War: 652 (Syria)
Floating Node in First Laotian War: 2 (United States of America)
Floating Node in Spanish Civil War: 235 (Portugal)
Floating Node in Spanish