# Data Prepping Polity5
This notebook loads data from the Polity5 project and merges it with the existing dataset, which already includes data from UCDP, FORGE, the World Bank and the Quality of Government Dataset. One variable is added: the combined polity score (`polity2`). This variable has missing values, which need to be imputed.

In [1]:
### Load libraries -------
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import os

# Pandas parameters
#pd.set_option('display.max_colwidth', -1)
#pd.set_option('display.max_rows', 2000)
#pd.set_option('display.max_columns', 500)

In [2]:
### Define plot parameters ------
# Set all matplotlib defaults in one call: font family, base font size and
# automatic layout adjustment for every figure.
plt.rcParams.update({
    "font.family": "garamond",
    "font.size": 22,
    "figure.autolayout": True,
})

In [3]:
### Define out paths ------

# Root output folder. Every backslash is escaped explicitly so that no path
# depends on an unrecognised escape sequence (such as "\d") being passed through.
OUT_ROOT = 'C:\\Users\\frank\\Desktop\\master_thesis_out'

# One subfolder per kind of output, keyed by the subfolder name.
out_paths = {
    subfolder: OUT_ROOT + '\\' + subfolder
    for subfolder in (
        "desciptive_plots_outcome",
        "desciptive_tables_outcome",
        "data",
    )
}

# Create the root folder and all subfolders; exist_ok makes the cell safe to re-run.
os.makedirs(OUT_ROOT, exist_ok=True)
for val in out_paths.values():
    os.makedirs(val, exist_ok=True)

out_paths["desciptive_tables_outcome"]

'C:\\Users\\frank\\Desktop\\master_thesis_out\\desciptive_tables_outcome'

In [4]:
### Load data --------
# Read the existing dataset (UCDP, FORGE, World Bank, QoG) from the "data" output folder.
ucdp_forge_wb_qog = pd.read_csv(os.path.join(out_paths["data"], "ucdp_forge_wb_qog.csv"))

# Row count of the existing dataset, shown as a reference for the row counts after merging.
len(ucdp_forge_wb_qog)

2065

In [5]:
### Load Polity5: Regime Authority Characteristics and Transitions Datasets ----------
# Codebook: http://www.systemicpeace.org/inscr/p5manualv2018.pdf

# Location of the Polity5 spreadsheet; it is read directly from the web.
polity_url = "https://www.systemicpeace.org/inscr/p5v2018.xls"
polity = pd.read_excel(polity_url)
polity.head()

Unnamed: 0,p5,cyear,ccode,scode,country,year,flag,fragment,democ,autoc,...,interim,bmonth,bday,byear,bprec,post,change,d5,sf,regtrans
0,0,7001800,700,AFG,Afghanistan,1800,0,,1,7,...,,1.0,1.0,1800.0,1.0,-6.0,88.0,1.0,,
1,0,7001801,700,AFG,Afghanistan,1801,0,,1,7,...,,,,,,,,,,
2,0,7001802,700,AFG,Afghanistan,1802,0,,1,7,...,,,,,,,,,,
3,0,7001803,700,AFG,Afghanistan,1803,0,,1,7,...,,,,,,,,,,
4,0,7001804,700,AFG,Afghanistan,1804,0,,1,7,...,,,,,,,,,,


In [6]:
### Check cow country codes in polity data ----
# check manually with home page: https://correlatesofwar.org/data-sets/cow-country-codes

# One row per (country, ccode) pair with the number of Polity5 rows it covers.
polity_codes = (
    polity
    .groupby(["country", "ccode"])
    .size()
    .reset_index(name="count")
)
polity_codes.head(3)

Unnamed: 0,country,ccode,count
0,Afghanistan,700,219
1,Albania,339,105
2,Algeria,615,57


In [7]:
### Make subset that only includes the needed variables -----
# Keep the merge keys and the polity score; the country code gets a
# "_polity" suffix so it can be told apart from the existing "ccode" after merging.
polity_s = polity[["ccode", "year", "polity2"]].rename(columns={"ccode": "ccode_polity"})
polity_s.head(3)

Unnamed: 0,ccode_polity,year,polity2
0,700,1800,-6.0
1,700,1801,-6.0
2,700,1802,-6.0


In [8]:
### Merge polity variable with existing dataset (how = left) -----
# Left join on (year, country code): every row of the existing dataset is kept,
# and rows without a Polity5 match get NaN in the added columns.
ucdp_forge_wb_qog_polity = pd.merge(
    ucdp_forge_wb_qog,
    polity_s,
    how='left',
    left_on=['year', 'ccode'],
    right_on=['year', 'ccode_polity'],
)

# A left join adds rows when a key occurs more than once on the right side.
# Fail loudly here instead of silently carrying duplicated observations forward.
assert len(ucdp_forge_wb_qog_polity) == len(ucdp_forge_wb_qog), (
    "Polity5 merge changed the number of rows - "
    "check polity_s for duplicate (ccode_polity, year) keys"
)
ucdp_forge_wb_qog_polity.head(3)

Unnamed: 0,year,StateAbb,ccode,cname,side_a,side_a_new_id,side_b,side_b_new_id,dyad_name,dyad_new_id,...,ccode_wb,oil,forest_cov,rural,internet_use,gov_qual,milexp_in,milper_in,ccode_polity,polity2
0,1990,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,IRN,21.474983,5.572374,43.67,0.0,0.305556,1.84,0.52,630.0,-6.0
1,1991,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,IRN,,5.587699,43.135,0.0,0.37037,1.52,0.52,630.0,-6.0
2,1992,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,IRN,,5.603023,42.347,0.0,0.467593,1.34,0.51,630.0,-6.0


In [9]:
### Check if merge was successful ---------
# Matched (ccode_polity, ccode) pairs. groupby leaves out rows whose key is NaN,
# so only country codes that found a Polity5 match are counted here.
codes = ucdp_forge_wb_qog_polity.groupby(["ccode_polity", 'ccode']).size()

# Compare against the country codes of the existing dataset and report every
# code that never matched a Polity5 row.
source_codes = ucdp_forge_wb_qog["ccode"].unique().tolist()
matched_codes = codes.index.get_level_values("ccode").unique().tolist()
codes_without_polity = sorted(set(source_codes) - set(matched_codes))
print("ccodes without any Polity5 match:", codes_without_polity)

len(codes)

73

In [10]:
### Delete merge variable ----
# The helper key from the Polity5 side is no longer needed after the merge.
# `columns=` is used because passing the axis positionally (`drop([...], 1)`)
# is rejected by recent pandas versions.
ucdp_forge_wb_qog_polity = ucdp_forge_wb_qog_polity.drop(columns=['ccode_polity'])
len(ucdp_forge_wb_qog_polity)

2065

In [11]:
### Which variables are in data -------------
# Column names of the merged dataset as a plain Python list.
dat_var = list(ucdp_forge_wb_qog_polity.columns)
dat_var

['year',
 'StateAbb',
 'ccode',
 'cname',
 'side_a',
 'side_a_new_id',
 'side_b',
 'side_b_new_id',
 'dyad_name',
 'dyad_new_id',
 'active_year',
 'best',
 'high',
 'low',
 'foundloc',
 'foundyear',
 'foundmo',
 'foundday',
 'fightyear',
 'fightmo',
 'fightday',
 'goalnominal',
 'goalindep',
 'goalauto',
 'goalrights',
 'goalrep',
 'goalchange',
 'goaldem',
 'goalother',
 'goalnote',
 'ideology',
 'ideolcom',
 'ideolleft',
 'ideolright',
 'ideolnat',
 'ideolanti',
 'ideolrel',
 'ideoloth',
 'ideolnote',
 'religious',
 'religion',
 'ethnic',
 'ethnicity',
 'preorg',
 'preorgno',
 'preorgreb',
 'preorgter',
 'preorgpar',
 'preorgmvt',
 'preorgyou',
 'preorglab',
 'preorgmil',
 'preorggov',
 'preorgfmr',
 'preorgrel',
 'preorgfor',
 'preorgref',
 'preorgeth',
 'preorgoth',
 'preorgname',
 'merger',
 'splinter',
 'splinterUCDP',
 'foundloc_cat',
 'foundloc_cat_lab',
 'foundloc_cat_cat',
 'foundloc_cat_cat_lab',
 'age_formation',
 'age_active',
 'goal_territory',
 'goal_gov_represent',
 'go

In [12]:
### Check missing values in added variables ----
# Identifier columns plus the newly added polity score.
missing = ucdp_forge_wb_qog_polity[['year', 'cname', 'ccode', 'polity2']]

# Keep the rows in which at least one of these columns is missing.
has_gap = missing.isna().any(axis=1)
null_data = missing.loc[has_gap]
null_data.sort_values(by=["cname", "year"]).head(3)

Unnamed: 0,year,cname,ccode,polity2
1808,2001,Afghanistan,700,
1816,2001,Afghanistan,700,
774,2002,Afghanistan,700,


In [13]:
### Check countries with missing values to rule out a merging issue ----
def _polity_rows_for(code):
    """Return country, ccode, polity2 and year of the Polity5 rows whose ccode equals `code`."""
    return polity.loc[polity["ccode"] == code, ["country", "ccode", "polity2", "year"]]

# Only the last expression of the cell is rendered as output.
_polity_rows_for(700) # Afghanistan, check
_polity_rows_for(530) # Ethiopia, check
_polity_rows_for(645) # Iraq, check
_polity_rows_for(365).head(3) # USSR, check

Unnamed: 0,country,ccode,polity2,year
13040,Russia,365,-10.0,1800
13041,Russia,365,-10.0,1801
13042,Russia,365,-10.0,1802


In [14]:
### Reset index -----------
# reset_index returns a new frame, so the result has to be assigned to take effect.
# drop=True discards the old index instead of inserting it as an extra "index"
# column, which would otherwise end up in the saved dataset.
ucdp_forge_wb_qog_polity = ucdp_forge_wb_qog_polity.reset_index(drop=True)
ucdp_forge_wb_qog_polity.head(3)

Unnamed: 0,year,StateAbb,ccode,cname,side_a,side_a_new_id,side_b,side_b_new_id,dyad_name,dyad_new_id,...,parent_merger,ccode_wb,oil,forest_cov,rural,internet_use,gov_qual,milexp_in,milper_in,polity2
0,1990,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,0,IRN,21.474983,5.572374,43.67,0.0,0.305556,1.84,0.52,-6.0
1,1991,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,0,IRN,,5.587699,43.135,0.0,0.37037,1.52,0.52,-6.0
2,1992,IRN,630,Iran,Government of Iran,114,KDPI,164,Government of Iran - KDPI,406,...,0,IRN,,5.603023,42.347,0.0,0.467593,1.34,0.51,-6.0


In [15]:
### Save dataset -----------
# Write the merged dataset to the "data" output folder, without the row index.
out_file = os.path.join(out_paths["data"], "ucdp_forge_wb_qog_polity.csv")
ucdp_forge_wb_qog_polity.to_csv(out_file, sep=',', index=False)
len(ucdp_forge_wb_qog_polity)

2065