# Data preprocessing for IGO


### Installing and importing libraries
Install any uninstalled packages, and import necessary libraries

In [1]:
"""
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install networkx
!pip install pyvis
"""

'\n!pip install numpy\n!pip install matplotlib\n!pip install pandas\n!pip install networkx\n!pip install pyvis\n'

In [2]:
import numpy as np
import networkx as nx
from pyvis.network import Network
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import re

### Reading in country codes
Read in using `pd.read_csv()`

In [3]:
names_to_abb = pd.read_csv("abb_ccode_names.csv", index_col='IGO_dataset').loc[:, 'StateAbb']
names_to_abb

IGO_dataset
Antigua and Barbuda    AAB
Afghanistan            AFG
Albania                ALB
Algeria                ALG
Andorra                AND
                      ... 
Venezuela              VEN
Samoa                  WSM
Yemen                  YEM
Zambia                 ZAM
Zimbabwe               ZIM
Name: StateAbb, Length: 195, dtype: object

### Reading in important countries and non-states
Read in important countries and non-states using `open()`, and split them to make them into a list using `re.split()`

In [4]:
# Read the important_countries.txt file
with open('important_countries.txt', 'r') as file:
    text = file.read()

# Split the text by blank lines to separate the countries
important_countries = re.split(r'\n\s*\n', text)
print(important_countries)

# Read the non_state.txt file
with open('non_states.txt', 'r') as file:
    text = file.read()

# Split the text by blank lines to separate the countries
non_states = re.split(r'\n\s*\n', text)
print(non_states)

['KOR', 'FRN', 'RUS', 'JPN', 'GER', 'CHN', 'UKG', 'USA', 'AUL', 'IND']
['American Samoa', 'Anguilla', 'Aruba', 'Bermuda', 'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Cook Islands', 'Curacao', 'Falkland Islands (Islas Malvinas)', 'Faroe Islands', 'French Polynesia', 'French Southern and Antarctic Lands', 'Gibraltar', 'Greenland', 'Guam', 'Guernsey', 'Holy See (Vatican City)', 'Hong Kong', 'Isle of Man', 'Jersey', 'Macau', 'New Caledonia', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Pitcairn Islands', 'Puerto Rico', 'Saint Barthelemy', 'Saint Helena, Ascension, and Tristan da Cunha', 'Saint Martin', 'Saint Pierre and Miquelon', 'Sint Maarten', 'South Georgia and South Sandwich Islands', 'Tokelau', 'Turks and Caicos Islands', 'Virgin Islands', 'Wallis and Futuna', 'European Union', 'Montserrat']


### Reading in IGO dataset

In [5]:
# Read the IGO.txt file
with open('IGO.txt', 'r') as file:
    text = file.read()

# Split the text by blank lines to separate the countries
splitted_text = re.split(r'\n\s*\n|\n', text)
print(splitted_text)

['Afghanistan', 'ADB, CICA, CP, ECO, EITI (candidate country), FAO, G-77, IAEA, IBRD, ICAO, ICC (NGOs), ICCt, ICRM, IDA, IDB, IFAD, IFC, IFRCS, ILO, IMF, Interpol, IOC, IOM, IPU, ISO (correspondent), ITSO, ITU, ITUC (NGOs), MIGA, NAM, OIC, OPCW, OSCE (partner), SAARC, SACEP, SCO (dialogue member), UN, UNAMA, UNCTAD, UNESCO, UNHCR, UNIDO, UNWTO, UPU, WCO, WFTU (NGOs), WHO, WIPO, WMO, WTO', 'Albania', 'BSEC, CD, CE, CEI, EAPC, EBRD, EITI (compliant country), FAO, IAEA, IBRD, ICAO, ICC (national committees), ICCt, ICRM, IDA, IDB, IFAD, IFC, IFRCS, ILO, IMF, IMO, Interpol, IOC, IOM, IPU, ISO (correspondent), ITU, ITUC (NGOs), MIGA, NATO, OAS (observer), OIC, OIF, OPCW, OSCE, PCA, SELEC, UN, UNCTAD, UNESCO, UNIDO, UNWTO, UPU, WCO, WFTU (NGOs), WHO, WIPO, WMO, WTO', 'note: Albania is an EU candidate country whose satisfactory completion of accession criteria is required before being granted full EU membership', 'Algeria', 'ABEDA, AfDB, AFESD, AMF, AMU, AU, BIS, CAEU, CD, FAO, G-15, G-24, G-7

### Build dataframe from text file

In [6]:
# Define two lists to store country names and their organizations
country_names = []
organizations = []

# Iterate through each line (element)
i = 0
for line in splitted_text:
    # if a note is found, skip the note
    if line.find("note") != -1 or line.find("Note") != -1:
        continue
    # increment the number of lines processed
    i += 1
    # if i is odd, the line denotes the name of the country
    if i % 2 == 1:
        country_names.append(line)
        continue
    # if i in even, the line denotes the list of IGOs assiociated with the country
    # split the line with ', ' and remove parentheses
    orgs = line.split(', ')
    orgs_without_parentheses = []
    for org in orgs:
        org_without_parentheses = re.sub(r' \(.+\)', '', org.strip().replace(';', ''))
        orgs_without_parentheses.append(org_without_parentheses)
    organizations.append(orgs_without_parentheses)


# Create a dictionary where keys are country names and values are lists of organizations
country_dict = dict(zip(country_names, organizations))

# Create an empty dataframe with country names as index
df = pd.DataFrame(index=country_names)

# Fill the dataframe
for country, orgs in country_dict.items():
    for org in orgs:
        if org not in df.columns:
            df[org] = 0
        df.loc[country, org] = 1

# drop the non-states
df.drop(labels=non_states, axis=0, inplace=True)
df.index = names_to_abb[df.index]

# remove the index's name
df.index.name = None

  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[org] = 0
  df[o

### Define normalization function

`normalize()` normalizes the numpy array to the range $(0,\,1)$

In [7]:
# function for normalizing ndarrays

def normalize(array):
    array_min, array_max = np.min(array), np.max(array)
    return (array - array_min) / (array_max - array_min)

### Define filtering-by-percentile function
`top_p_percent()`  first calculates the threshold using the np.percentile() function, and uses np.where() to create a new array where only the values above the threshold remain

In [8]:
def top_p_percent(array, p):
    # Calculate the threshold for top (100*p)%
    threshold = np.percentile(array, 100-p)
    

    # Create a new array: values above threshold remain, others become 0
    result = np.where(array > threshold, array, 0)

    return result

In [9]:
# Turn dataframe into a numpy array
X = df.to_numpy()
print(X.shape)

# Calculate the adjacency matrix, and obtain a scaled version of it
adjacency_matrix = X @ X.T
adjacency_matrix_scaled = normalize(adjacency_matrix)
print(adjacency_matrix.shape)

# Build dataframes
adjmat_df = pd.DataFrame(adjacency_matrix, index=df.index, columns=df.index)
adjmat_scaled_df = pd.DataFrame(adjacency_matrix_scaled, index=df.index, columns=df.index)
adjmat_important_df = adjmat_df.loc[important_countries, important_countries]
adjmat_scaled_important_df = adjmat_scaled_df.loc[important_countries, important_countries]
zero_diag = adjmat_scaled_important_df.to_numpy()
np.fill_diagonal(zero_diag, 0)
adjmat_scaled_topXpct_important_df = pd.DataFrame(top_p_percent(zero_diag, 50), index=important_countries, columns=important_countries)
adjmat_scaled_topXpct_important_df

(195, 239)
(195, 195)


Unnamed: 0,KOR,FRN,RUS,JPN,GER,CHN,UKG,USA,AUL,IND
KOR,0.0,0.677083,0.645833,0.739583,0.677083,0.677083,0.0,0.708333,0.645833,0.645833
FRN,0.677083,0.0,0.645833,0.739583,0.854167,0.0,0.8125,0.802083,0.0,0.0
RUS,0.645833,0.645833,0.0,0.0,0.0,0.0,0.0,0.65625,0.0,0.0
JPN,0.739583,0.739583,0.0,0.0,0.75,0.0,0.739583,0.802083,0.6875,0.645833
GER,0.677083,0.854167,0.0,0.75,0.0,0.0,0.833333,0.760417,0.0,0.0
CHN,0.677083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667
UKG,0.0,0.8125,0.0,0.739583,0.833333,0.0,0.0,0.78125,0.0,0.0
USA,0.708333,0.802083,0.65625,0.802083,0.760417,0.0,0.78125,0.0,0.677083,0.0
AUL,0.645833,0.0,0.0,0.6875,0.0,0.0,0.0,0.677083,0.0,0.0
IND,0.645833,0.0,0.0,0.645833,0.0,0.666667,0.0,0.0,0.0,0.0


In [10]:
# Select countries with more than 70 participating IGOs
countries_to_keep = []
for i in range(len(adjmat_df)):
    if adjmat_df.iloc[i, i] >= 70:
        countries_to_keep.append(i)
adjmat_over70_df = adjmat_df.iloc[countries_to_keep, countries_to_keep]

In [11]:
# Save dataframes
df.to_csv('IGO_preprocessed.csv')
adjmat_df.to_csv('IGO_adjmat.csv')
adjmat_scaled_df.to_csv('IGO_adjmat_scaled.csv')
adjmat_important_df.to_csv('IGO_adjmat_important.csv')
adjmat_scaled_important_df.to_csv('IGO_adjmat_scaled_important.csv')
adjmat_over70_df.to_csv('IGO_adjmat_over70.csv')
adjmat_scaled_topXpct_important_df.to_csv('IGO_adjmat_scaled_topXpct_important.csv')