# Preparation of [WGI dataset](http://info.worldbank.org/governance/wgi/)

In [None]:
import numpy as np
import pandas as pd
from causal import ccm_bivariate, granger, eval_candidate_DAGs, causal_discovery
from kernels import K_ID
from synthetic_data import spline_multi_sample
import networkx as nx
import matplotlib.pyplot as plt

Create a separate dataframe for each variable

In [None]:
# Location of the WGI workbook; each indicator is read from its own sheet.
WGI_XLSX = r'/home/felix/Dropbox/Research/projects/causal-fda/data/wgidataset.xlsx'


def _read_wgi_sheet(workbook, sheet_name):
    """Read one indicator sheet and name its first column 'Country'.

    The header is taken from row 13 (0-based) of the sheet, and the column
    pandas labels 'Unnamed: 0' is renamed to 'Country'.
    """
    frame = pd.read_excel(workbook, sheet_name=sheet_name, header=13)
    return frame.rename(columns={'Unnamed: 0': 'Country'})


# Open the workbook once and parse all six sheets from the same handle
# instead of re-opening the file for every indicator.
with pd.ExcelFile(WGI_XLSX) as _wgi_book:
    df_VA = _read_wgi_sheet(_wgi_book, 'VoiceandAccountability')
    df_PS = _read_wgi_sheet(_wgi_book, 'Political StabilityNoViolence')
    df_GE = _read_wgi_sheet(_wgi_book, 'GovernmentEffectiveness')
    df_RQ = _read_wgi_sheet(_wgi_book, 'RegulatoryQuality')
    df_RL = _read_wgi_sheet(_wgi_book, 'RuleofLaw')
    df_CC = _read_wgi_sheet(_wgi_book, 'ControlofCorruption')

In [None]:
# Collect the six indicator frames so later cells can loop over them; the list
# holds references to the same DataFrame objects, not copies.
dfs = [df_VA, df_PS, df_GE, df_RQ, df_RL, df_CC]

In [None]:
# check: display the frame read from the 'GovernmentEffectiveness' sheet
df_GE

Keep only the Estimate columns and drop every country whose time series contains NaNs:

In [None]:
# Column labels to keep besides 'Country': the years 1996..2020.
year_labels = list(np.arange(1996, 2021, 1))

for frame in dfs:
    # Everything that is neither the country label nor a plain year column goes.
    unwanted = [col for col in frame if not (col == 'Country' or col in year_labels)]
    frame.drop(columns=unwanted, inplace=True)
    # Remove the row with index label 0
    # (presumably a sub-header row under the year header -- confirm against the workbook).
    frame.drop(index=0, inplace=True)
    # Discard every remaining row that still contains a NaN.
    frame.dropna(inplace=True)

Linearly interpolate missing years: 1997, 1999, 2001

In [None]:
# Target column order: the country label followed by every year from 1996 to 2020.
columns = ['Country', *range(1996, 2021)]

In [None]:
years = np.arange(1996, 2021, 1)

for frame in dfs:
    # Add an all-NaN column for every year the frame does not have yet.
    absent = [year for year in years if year not in frame.columns]
    for year in absent:
        frame[year] = np.nan

    # Select the year columns in chronological order, cast to float and fill
    # the gaps by interpolating along each row.
    filled = frame[years].astype(float).interpolate(axis=1)
    frame[years] = filled

Reorder the columns:

In [None]:
# Rebind each indicator name to a frame whose columns follow `columns`
# ('Country' first, then the years in ascending order).
df_VA, df_PS, df_GE, df_RQ, df_RL, df_CC = (
    frame[columns] for frame in (df_VA, df_PS, df_GE, df_RQ, df_RL, df_CC)
)

Only work with countries where data are available for all six variables:

In [None]:
# Countries that appear in every one of the six indicator frames.
countries = set.intersection(
    *(set(frame['Country']) for frame in (df_VA, df_PS, df_GE, df_RQ, df_RL, df_CC))
)

In [None]:
def _keep_common(frame):
    """Return the rows of `frame` whose 'Country' value is in `countries`."""
    in_all_frames = frame['Country'].isin(countries)
    return frame.loc[in_all_frames]


df_VA = _keep_common(df_VA)
df_PS = _keep_common(df_PS)
df_GE = _keep_common(df_GE)
df_RQ = _keep_common(df_RQ)
df_RL = _keep_common(df_RL)
df_CC = _keep_common(df_CC)

In [None]:
# check: display the 'RegulatoryQuality' frame after restricting it to `countries`
df_RQ

### Causal discovery on corruption and income inequality in Africa

In [None]:
# Country names used to select the African subset; the spellings must match
# the 'Country' column of the data frames exactly.
Africa_countries = [
    'Algeria', 'Egypt, Arab Rep.', 'Morocco', 'Tunisia',
    'Burundi', 'Comoros', 'Djibouti', 'Eritrea', 'Ethiopia', 'Kenya',
    'Madagascar', 'Malawi', 'Mauritius', 'Mozambique', 'Rwanda',
    'Seychelles', 'Somalia', 'Uganda', 'Tanzania', 'Zambia', 'Zimbabwe',
    'Angola', 'Cameroon', 'Central African Republic', 'Chad',
    'Congo, Rep.', 'Congo, Dem. Rep.', 'Equatorial Guinea', 'Gabon',
    'Botswana', 'Lesotho', 'Namibia', 'South Africa',
    'Benin', 'Burkina Faso', "Côte d'Ivoire", 'Gambia, The', 'Ghana',
    'Guinea-Bissau', 'Liberia', 'Mali', 'Mauritania', 'Niger', 'Nigeria',
    'Senegal', 'Sierra Leone', 'Togo',
]

In [None]:
# corruption data
# Keep the African rows and only the first 22 columns; if df_CC follows the
# `columns` order this is 'Country' plus the years 1996-2016
# (presumably chosen to match the span of the Gini data -- confirm).
_is_african = df_CC['Country'].isin(Africa_countries)
df_CC_Africa = df_CC[_is_african].iloc[:, :22]

In [None]:
# check: display the African subset of the corruption frame
df_CC_Africa

In [None]:
# income inequality data
df_IE = pd.read_csv(r'/home/felix/Dropbox/Research/projects/causal-fda/data/gini.csv')
# Align the spelling with the name used in `Africa_countries`.
df_IE.replace("Cote d'Ivoire", "Côte d'Ivoire", inplace=True)

# Restrict to the African countries and split off the non-'Country' columns.
df_IE_Africa = df_IE[df_IE['Country'].isin(Africa_countries)]
df_IE_Africa_time = df_IE_Africa.drop(columns='Country')

# '..' entries become NaN; after the cast to float, gaps are interpolated
# along each row.
df_IE_Africa_time_1 = (
    df_IE_Africa_time
    .replace('..', np.nan)
    .astype(float)
    .interpolate(axis=1)
)

# Whatever is still NaN is replaced by the mean of its column.
_column_means = df_IE_Africa_time_1.mean(axis=0)
df_IE_Africa_time_2 = df_IE_Africa_time_1.fillna(_column_means)

# Put the 'Country' column back in front of the filled values.
df_IE_Africa_int = pd.concat([df_IE_Africa[['Country']], df_IE_Africa_time_2], axis=1)

In [None]:
# check: display the gap-filled income inequality frame
df_IE_Africa_int

#### CCM analysis

In [None]:
%%time

# Results for every country are keyed by country name; the two
# direction-specific subsets are keyed by the loop position instead.
DAGs, p_values = {}, {}
DAGs_01, p_values_01 = {}, {}
DAGs_10, p_values_10 = {}, {}

# Column masks that select everything except the 'Country' label.
_cc_value_cols = df_CC_Africa.columns != 'Country'
_ie_value_cols = df_IE_Africa_int.columns != 'Country'

for idx, name in enumerate(df_CC_Africa['Country']):
    cc_series = df_CC_Africa.loc[df_CC_Africa['Country'] == name].loc[:, _cc_value_cols].to_numpy()
    ie_series = df_IE_Africa_int.loc[df_IE_Africa_int['Country'] == name].loc[:, _ie_value_cols].to_numpy()
    # Index 0 holds the corruption series, index 1 the income inequality series.
    arr = np.asarray([cc_series, ie_series]).squeeze()

    DAG, _, p_value, _ = ccm_bivariate(arr, alpha=0.05)
    DAGs[name] = DAG
    p_values[name] = p_value

    # Record the result separately when it equals one of the two one-edge patterns.
    for pattern, dag_store, p_store in (
        ({0: [], 1: 0}, DAGs_01, p_values_01),
        ({0: 1, 1: []}, DAGs_10, p_values_10),
    ):
        if DAG == pattern:
            dag_store[idx] = DAG
            p_store[idx] = p_value

In [None]:
# Share of countries whose CCM result equals {0: [], 1: 0}
# (index 0 is corruption, index 1 is income inequality).
len(DAGs_01)/len(DAGs)

In [None]:
# Share of countries whose CCM result equals {0: 1, 1: []}
# (index 0 is corruption, index 1 is income inequality).
len(DAGs_10)/len(DAGs)

#### Granger analysis

In [None]:
%%time

# Fresh containers (this overwrites those of any earlier analysis): the full
# results are keyed by country name, the direction-specific subsets by the
# loop position.
DAGs, p_values = {}, {}
DAGs_01, p_values_01 = {}, {}
DAGs_10, p_values_10 = {}, {}

# Column masks that select everything except the 'Country' label.
_cc_value_cols = df_CC_Africa.columns != 'Country'
_ie_value_cols = df_IE_Africa_int.columns != 'Country'

for idx, name in enumerate(df_CC_Africa['Country']):
    cc_series = df_CC_Africa.loc[df_CC_Africa['Country'] == name].loc[:, _cc_value_cols].to_numpy()
    ie_series = df_IE_Africa_int.loc[df_IE_Africa_int['Country'] == name].loc[:, _ie_value_cols].to_numpy()
    # Index 0 holds the corruption series, index 1 the income inequality series.
    arr = np.asarray([cc_series, ie_series]).squeeze()

    DAG, _, p_value, _ = granger(arr, alpha=0.05)
    DAGs[name] = DAG
    p_values[name] = p_value

    # Record the result separately when it equals one of the two one-edge patterns.
    for pattern, dag_store, p_store in (
        ({0: [], 1: 0}, DAGs_01, p_values_01),
        ({0: 1, 1: []}, DAGs_10, p_values_10),
    ):
        if DAG == pattern:
            dag_store[idx] = DAG
            p_store[idx] = p_value

In [None]:
# Share of countries whose Granger result equals {0: [], 1: 0}
# (index 0 is corruption, index 1 is income inequality).
len(DAGs_01)/len(DAGs)

In [None]:
# Share of countries whose Granger result equals {0: 1, 1: []}
# (index 0 is corruption, index 1 is income inequality).
len(DAGs_10)/len(DAGs)

#### Regression-based analysis

In [None]:
# Both arrays take the shape of df_CC_Africa without its first column, so the
# income inequality rows must have the same number of values per country.
_value_shape = df_CC_Africa.iloc[:, 1:].shape
np_CC_Africa = np.zeros(_value_shape)
np_IE_Africa = np.zeros(_value_shape)

_cc_value_cols = df_CC_Africa.columns != 'Country'
_ie_value_cols = df_IE_Africa_int.columns != 'Country'

# Fill row by row, following the country order of df_CC_Africa so that row k
# of both arrays refers to the same country.
for row, name in enumerate(df_CC_Africa['Country']):
    cc_row = df_CC_Africa.loc[df_CC_Africa['Country'] == name].loc[:, _cc_value_cols]
    ie_row = df_IE_Africa_int.loc[df_IE_Africa_int['Country'] == name].loc[:, _ie_value_cols]
    np_CC_Africa[row] = cc_row.to_numpy()
    np_IE_Africa[row] = ie_row.to_numpy()

In [None]:
%%time

# Stack both variables along a new leading axis (0: corruption, 1: income inequality).
arr = np.asarray([np_CC_Africa, np_IE_Africa]).squeeze()

# Settings handed positionally to eval_candidate_DAGs.
pred_points = np.linspace(0, 1, 21)
n_intervals, n_neighbours = 12, 5
n_perms, alpha = 1000, 0.05
make_K = K_ID
regressor = 'hist'
analyse = True

print(
    eval_candidate_DAGs(
        arr,
        pred_points,
        n_intervals,
        n_neighbours,
        n_perms,
        alpha,
        make_K,
        analyse,
        regressor,
        pd_graph=None,
    )
)

### Causal discovery on the WGI dataset

Countries are treated as independent samples, and the data cover the years 1996 to 2020. Each country has one score per variable per year.

In [None]:
def _values_without_country(frame):
    """Return every column of `frame` except 'Country' as a NumPy array."""
    return frame.drop(columns='Country').to_numpy()


np_VA = _values_without_country(df_VA)
np_PS = _values_without_country(df_PS)
np_GE = _values_without_country(df_GE)
np_RQ = _values_without_country(df_RQ)
np_RL = _values_without_country(df_RL)
np_CC = _values_without_country(df_CC)

In [None]:
n_vars = 6
n_samples, n_years = np_VA.shape

# One (n_samples, n_years) slab per indicator, in the order
# VA, PS, GE, RQ, RL, CC along the first axis.
nps = np.zeros((n_vars, n_samples, n_years))
for slot, values in enumerate((np_VA, np_PS, np_GE, np_RQ, np_RL, np_CC)):
    nps[slot] = values

In [None]:
# experiment parameters

# Grids: one row of equally spaced observation points on [0, 1] per sample
# (as many points as there are years in `nps`), and 100 equally spaced
# prediction points on the same interval.
obs_points = np.tile(np.linspace(0, 1, nps.shape[2]), (n_samples, 1))
pred_points = np.linspace(0, 1, 100)

# Settings passed on to causal_discovery; their exact meaning is defined there.
cd_type = 'combined'
init = 'cond_set'
regressor = 'hist'
make_K = K_ID
analyse = True
find_lambda = False

n_intervals, n_neighbours = 12, 3
n_perms, alpha = 1000, 0.05
n_pretests, n_steps = 100, 50
lambs = 1e-1

# Zero-initialised arrays with one entry per value in range(n_vars - 2).
l_cond = np.zeros(n_vars - 2)
r_opts = np.zeros(n_vars - 2)

In [None]:
# data preparation
# Resample every variable from the observation grid onto `pred_points`.
n_variables, n_rows = nps.shape[0], nps.shape[1]
nps_int = np.zeros((n_variables, n_rows, len(pred_points)))
for d, variable_values in enumerate(nps):
    fitted = spline_multi_sample(variable_values, obs_points, pred_points)
    nps_int[d] = fitted.evaluate(pred_points).squeeze()

In [None]:
%%time
# Run causal discovery on the resampled data, then unpack its seven return values.
_cd_result = causal_discovery(
    cd_type,
    nps_int,
    pred_points,
    n_intervals,
    n_neighbours,
    n_perms,
    alpha,
    make_K,
    lambs,
    n_pretests,
    n_steps,
    analyse,
    regressor,
    l_cond,
    r_opts,
    init,
    find_lambda,
    pd_graph=None,
)
sparse_g, _DAGs, p_values, lamb_cond, rejects_opts, lags, corr_values = _cd_result

In [None]:
# Print everything returned by causal_discovery in one line.
print(sparse_g, _DAGs, p_values, lamb_cond, rejects_opts, lags, corr_values)

### Draw networks

#### Undirected network

In [None]:
# Make sure every variable index has an entry in `_DAGs`: indices that are not
# yet keys get an empty list, existing entries are left untouched.
# `setdefault` does the membership test and the insert in one step, without
# rebuilding a list of keys on every iteration.
for i in range(nps.shape[0]):
    _DAGs.setdefault(i, [])

In [None]:
# First pass: one node per key of `_DAGs`, and one undirected edge between
# each key and every element of its list.
pg_graph = nx.Graph()
for node, linked in _DAGs.items():
    pg_graph.add_node(node)
    pg_graph.add_edges_from((other, node) for other in linked)

# Second pass: copy into a new graph with the nodes inserted in sorted order.
ordered_nodes = sorted(pg_graph.nodes(data=True))
pg = nx.Graph()
pg.add_nodes_from(ordered_nodes)
pg.add_edges_from(pg_graph.edges(data=True))

In [None]:
# label variables according to official WGI names
# Position k in this tuple is the label given to node k.
mapping = dict(enumerate(("VA", "PS", "GE", "RQ", "RL", "CC")))
pg_ = nx.relabel_nodes(pg, mapping)

In [None]:
# Draw the undirected network with the nodes placed on a circle.
plt.figure(figsize=(10,6))
# NOTE(review): tight_layout runs before anything has been drawn on the figure;
# presumably it was meant to follow draw_networkx -- confirm.
plt.tight_layout()
nx.draw_networkx(pg_, pos=nx.circular_layout(pg_), node_color='grey', node_size=1000, font_color='white', font_size=16)
# Hide the axis and the frame so that only the graph is shown.
plt.axis('off')
plt.box(False)
plt.show()

#### Directed network

In [None]:
# First pass: one node per key of `_DAGs`, and one directed edge from every
# element of its list to the key itself.
g_graph = nx.DiGraph()
for child, sources in _DAGs.items():
    g_graph.add_node(child)
    g_graph.add_edges_from((source, child) for source in sources)

# Second pass: copy into a new graph with the nodes inserted in sorted order.
ordered_nodes = sorted(g_graph.nodes(data=True))
g = nx.DiGraph()
g.add_nodes_from(ordered_nodes)
g.add_edges_from(g_graph.edges(data=True))

In [None]:
# Replace the integer node ids 0..5 with the WGI abbreviations.
_wgi_labels = ["VA", "PS", "GE", "RQ", "RL", "CC"]
mapping = dict(zip(range(6), _wgi_labels))
g_ = nx.relabel_nodes(g, mapping)

In [None]:
# Draw the directed network with the nodes placed on a circle.
plt.figure(figsize=(10,6))
# NOTE(review): tight_layout runs before anything has been drawn on the figure;
# presumably it was meant to follow draw_networkx -- confirm.
plt.tight_layout()
nx.draw_networkx(g_, pos=nx.circular_layout(g_), arrowsize=24, node_color='grey', node_size=1000, font_color='white', font_size=16)
# Hide the axis and the frame so that only the graph is shown.
plt.axis('off')
plt.box(False)
plt.show()