# Data preprocessing for DCAD


### Installing and importing libraries
Install any uninstalled packages, and import necessary libraries

In [66]:
"""
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install networkx
!pip install pyvis
"""

'\n!pip install numpy\n!pip install matplotlib\n!pip install pandas\n!pip install networkx\n!pip install pyvis\n'

In [67]:
import numpy as np
import networkx as nx
from pyvis.network import Network
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import re

### Reading in country codes
Read in using `pd.read_csv()`, and erase the name of the index

In [68]:
abbs = pd.read_csv("abb_ccode_names.csv", index_col='StateAbb').index
abbs.name = None

### Reading in important countries and non-states
Read in important countries and non-states using `open()`, and split them to make them into a list using `re.split()`

In [69]:
# Read the important_countries.txt file
with open('important_countries.txt', 'r') as file:
    text = file.read()

# Split the text by blank lines to separate the countries
important_countries = re.split(r'\n\s*\n', text)
print(important_countries)

# Read the non_state.txt file
with open('non_states.txt', 'r') as file:
    text = file.read()

# Split the text by blank lines to separate the countries
non_states = re.split(r'\n\s*\n', text)
print(non_states)

['KOR', 'FRN', 'RUS', 'JPN', 'GER', 'CHN', 'UKG', 'USA', 'AUL', 'IND']
['American Samoa', 'Anguilla', 'Aruba', 'Bermuda', 'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Cook Islands', 'Curacao', 'Falkland Islands (Islas Malvinas)', 'Faroe Islands', 'French Polynesia', 'French Southern and Antarctic Lands', 'Gibraltar', 'Greenland', 'Guam', 'Guernsey', 'Holy See (Vatican City)', 'Hong Kong', 'Isle of Man', 'Jersey', 'Macau', 'New Caledonia', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Pitcairn Islands', 'Puerto Rico', 'Saint Barthelemy', 'Saint Helena, Ascension, and Tristan da Cunha', 'Saint Martin', 'Saint Pierre and Miquelon', 'Sint Maarten', 'South Georgia and South Sandwich Islands', 'Tokelau', 'Turks and Caicos Islands', 'Virgin Islands', 'Wallis and Futuna', 'European Union', 'Montserrat']


### Reading in DCAD dataset

In [70]:
# Read the DCAD.csv file
DCAD = pd.read_csv('DCAD.csv')
DCAD = DCAD[DCAD.signYear >= 2000]
print(len(DCAD))
country_pairs = set(zip(DCAD.cowName1, DCAD.cowName2))
adjmat_df = pd.DataFrame(np.zeros((len(abbs), len(abbs))), index=abbs, columns=abbs)
for ctry1, ctry2 in country_pairs:
    print(ctry1, ctry2)
    adjmat_df.loc[ctry1, ctry2] = 1
    adjmat_df.loc[ctry2, ctry1] = 1
adjmat_df

996
IND MAL
SPN ALG
POL LAT
SUD IRN
SAU PAK
VEN BOL
BOS BUL
MLI SAF
KOR IND
LIT UKR
UZB IND
TUR AFG
FRN NIG
USA ALB
RUS KZK
TUR KOR
BRA CHN
UKG ALG
SWZ FIN
MLD LIT
SER TUR
RUS MYA
SWD IND
SOM TUR
BLR SAF
USA PAR
CRO TUR
EQG SAF
SAL ARG
USA CHL
LAT UKR
UKR SWD
OMA IND
USA ARG
HUN BUL
KZK MON
SPN ALB
CHL RUS
RUS JOR
SAF SAU
USA FIN
CHN BNG
INS AUL
RUS BEN
PER ITA
ITA DJI
IRN LEB
LAT AZE
DRV BRU
GRC BUL
RUS TUR
MAC CRO
EGY KOR
BEL SAF
UKR FIN
BUL LIT
SER SLV
ROM ISR
BOS PAK
BEL EST
UKR LIB
KOR MAL
JPN AUL
BOL VEN
ITA OMA
BOS ROM
USA SAF
CRO SLV
BRA FRN
UKR KOR
GRC UAE
MZM LIB
SER EGY
TAJ PAK
SLV ROM
UAE KOR
EST AZE
UKG CRO
GER CRO
FRN DRV
ECU SPN
LIT ARM
BUL MON
MNG SLV
RUS GAM
SPN MAC
NOR KOR
BOS CHN
ARG ITA
ARG RUS
HUN IND
MZM SAF
UGA ANG
HUN ROM
MAC UKR
LAT ARM
AUS SER
ITA SAU
POL LIT
LIT KZK
GRC ROM
RUS LEB
ECU URU
HUN CHN
FRN SWZ
CZE SER
PAR ARG
CRO SWD
FIN UAE
DRC ZIM
TUR MON
IND CAM
PER UKR
UKR GUI
SWZ CZE
BEL LIT
UKG UKR
FRN ARM
DOM BRA
SLO MAC
THI SIN
BRA NIG
BRA UKG
NOR ICE
KUW 

Unnamed: 0,AAB,AFG,ALB,ALG,AND,ANG,ARG,ARM,AUL,AUS,...,UKR,URU,USA,UZB,VAN,VEN,WSM,YEM,ZAM,ZIM
AAB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VEN,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WSM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZAM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Select countries with more than X participating DCAs

degrees = adjmat_df.sum(axis=1)
threshold = np.percentile(degrees, 95)
countries_to_keep = degrees[degrees > threshold].index.tolist()

print(degrees.loc[countries_to_keep])

BRA    42.0
BUL    32.0
CHN    37.0
IND    38.0
RUS    54.0
SAF    43.0
SPN    41.0
TUR    48.0
UKR    46.0
USA    39.0
dtype: float64


In [72]:
adjmat_top5pct_df = adjmat_df.loc[countries_to_keep, countries_to_keep]
adjmat_important_df = adjmat_df.loc[important_countries, important_countries]

In [73]:
# Save dataframes
adjmat_df.to_csv('DCAD_adjmat.csv')
adjmat_important_df.to_csv('DCAD_adjmat_important.csv')
adjmat_top5pct_df.to_csv('DCAD_adjmat_top5pct.csv')