In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from utils import *

In [2]:
# Country reference table from the atlas data directory (file is Latin-1 encoded).
countries = pd.read_csv(
    f"{data_paths['atlas']}/countries.csv",
    encoding="latin1",
)

In [3]:
# Trustworthiness scores; `cmd` is forced to string so that it is read with the
# same dtype as the discrepancy product codes loaded below.
trustworthiness = pd.read_csv(
    f"{data_paths['graphs_data']}/trustworthiness_scores.csv",
    dtype={"cmd": str},
)

# One zipped CSV per year (2017-2022); the file-name prefix runs 4.2.1 ... 4.2.6.
# Named temporaries are used instead of `_`: assigning to `_` in a notebook
# shadows IPython's "last output" variable for the rest of the session.
# The per-year frames are collected in their own list so that `discrepancy`
# is only ever bound to the final DataFrame.
discrepancy_columns = ["CountryA", "ProductCode", "Year", "DI"]
yearly_discrepancy = []
for file_no, year in enumerate(range(2017, 2023), start=1):
    yearly_frame = pd.read_csv(
        f"{data_paths['additional']}/4.2.{file_no} DISCREPANCY_INDEX_H5_{year}_csv.zip",
        dtype={"ProductCode": str},
    )
    # Keep only the columns used by the rest of the notebook.
    yearly_discrepancy.append(yearly_frame[discrepancy_columns])
discrepancy = pd.concat(yearly_discrepancy, ignore_index=True)

In [4]:
# Preview the first rows of the trustworthiness table.
trustworthiness.head()

Unnamed: 0,country_id,year,cmd,trustworthiness
0,4,2012,1,0.0
1,4,2012,2,0.0
2,4,2012,3,0.363488
3,4,2012,4,0.300776
4,4,2012,5,0.011893


In [5]:
# Preview the first rows of the concatenated discrepancy table.
discrepancy.head()

Unnamed: 0,CountryA,ProductCode,Year,DI
0,ARE,1,2017,1.0
1,ARM,1,2017,0.057946
2,BRA,1,2017,-0.931879
3,CAN,1,2017,-1.0
4,CAN,1,2017,-1.0


Convert all discrepancies to their absolute value (ignore the 'direction' of the discrepancy)

In [6]:
# Only the magnitude of the discrepancy is kept; the sign is discarded.
discrepancy.loc[:, "DI"] = discrepancy["DI"].abs()

Reduce ProductCode to its first 2 digits

In [7]:
# Keep only the leading two characters of every product code.
discrepancy.loc[:, "ProductCode"] = discrepancy["ProductCode"].str.slice(stop=2)

Aggregate by Country, Year & Commodity (mean discrepancy per group)

In [8]:
# Average per (country, year, product code); the group keys are turned back
# into ordinary columns afterwards.
discrepancy = (
    discrepancy
    .groupby(["CountryA", "Year", "ProductCode"])
    .mean()
    .reset_index()
)

Add Country ID

In [9]:
# Attach country_id by matching CountryA against iso_code. The inner join
# drops rows whose code has no match in `countries`.
discrepancy = pd.merge(
    discrepancy,
    countries[["iso_code", "country_id"]],
    left_on="CountryA",
    right_on="iso_code",
    how="inner",
)
discrepancy.head()

Unnamed: 0,CountryA,Year,ProductCode,DI,iso_code,country_id
0,ABW,2017,1,1.0,ABW,533
1,ABW,2017,2,1.0,ABW,533
2,ABW,2017,3,1.0,ABW,533
3,ABW,2017,4,1.0,ABW,533
4,ABW,2017,5,1.0,ABW,533


In [10]:
# Use the same column names as the trustworthiness table so both can be
# merged on shared keys.
discrepancy = discrepancy.rename(
    columns={"Year": "year", "ProductCode": "cmd"},
)

In [11]:
# Pair every trustworthiness score with the discrepancy index of the same
# country, year and commodity; rows present in only one table are dropped.
trust_discr_keys = ["country_id", "year", "cmd"]
trust_discr = trustworthiness.merge(
    discrepancy[[*trust_discr_keys, "DI"]],
    on=trust_discr_keys,
    how="inner",
)
trust_discr.head()

Unnamed: 0,country_id,year,cmd,trustworthiness,DI
0,4,2017,1,0.65198,1.0
1,4,2017,2,1.0,1.0
2,4,2017,3,0.0,1.0
3,4,2017,4,0.970628,1.0
4,4,2017,5,0.997369,1.0


In [12]:
# Summary statistics of the discrepancy index after the merge.
trust_discr["DI"].describe()

count    119046.000000
mean          0.871354
std           0.128096
min           0.209813
25%           0.765985
50%           0.879175
75%           1.000000
max           1.000000
Name: DI, dtype: float64

In [13]:
# Spearman rank correlation between the discrepancy index and trustworthiness.
spearmanr(
    trust_discr["DI"],
    trust_discr["trustworthiness"],
)

SignificanceResult(statistic=-0.6117783528691697, pvalue=0.0)

### Corruption Perceptions Index

In [14]:
# Corruption Perception Index table from the additional data directory.
corruption = pd.read_csv(
    f"{data_paths['additional']}/4.3 Corruption Perception Index.csv",
)

In [15]:
# Preview the corruption table as loaded from the CSV.
corruption.head()

Unnamed: 0,Country,ISO3,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012
0,Afghanistan,AFG,20.0,24.0,16.0,19.0,16.0,16.0,15.0,15.0,11.0,12.0,8.0,8.0
1,Albania,ALB,37.0,36.0,35.0,36.0,35.0,36.0,38.0,39.0,36.0,33.0,31.0,33.0
2,Algeria,DZA,36.0,33.0,33.0,36.0,35.0,35.0,33.0,34.0,36.0,36.0,36.0,34.0
3,Angola,AGO,33.0,33.0,29.0,27.0,26.0,19.0,19.0,18.0,15.0,19.0,23.0,22.0
4,Argentina,ARG,37.0,38.0,38.0,42.0,45.0,40.0,39.0,36.0,32.0,34.0,34.0,35.0


In [16]:
# Give the ISO code column the same name it has in `countries`.
corruption = corruption.rename(columns=dict(ISO3="iso_code"))

In [17]:
# Attach country_id via the shared iso_code column; the inner join drops
# rows whose code has no match in `countries`.
corruption = pd.merge(
    corruption,
    countries[["iso_code", "country_id"]],
    on="iso_code",
    how="inner",
)

In [18]:
# Reshape from one column per year (2012-2023) to one row per country and year.
year_columns = [str(year) for year in range(2012, 2024)]
corruption = corruption.melt(
    id_vars=["country_id", "iso_code"],
    value_vars=year_columns,
    var_name="year",
)
# The former column labels are strings; convert the year to an integer.
corruption["year"] = corruption["year"].astype(int)

In [19]:
# Preview the corruption table after reshaping to long format.
corruption.head()

Unnamed: 0,country_id,iso_code,year,value
0,4,AFG,2012,8.0
1,8,ALB,2012,33.0
2,12,DZA,2012,34.0
3,24,AGO,2012,22.0
4,32,ARG,2012,35.0


In [20]:
# Collapse the commodity dimension: mean trustworthiness and mean DI per
# country and year.
trust_discr_avg = (
    trust_discr
    .groupby(["country_id", "year"])[["trustworthiness", "DI"]]
    .mean()
    .reset_index()
)
trust_discr_avg.head()

Unnamed: 0,country_id,year,trustworthiness,DI
0,4,2017,0.385121,1.0
1,4,2018,0.519441,1.0
2,4,2019,0.529528,1.0
3,4,2020,0.277879,1.0
4,4,2021,0.275534,1.0


In [21]:
# Join the corruption values with the per-country, per-year averages; only
# country/year pairs present in both tables are kept.
trust_discr_corrup = pd.merge(
    corruption,
    trust_discr_avg,
    on=["country_id", "year"],
    how="inner",
)
trust_discr_corrup.head()

Unnamed: 0,country_id,iso_code,year,value,trustworthiness,DI
0,4,AFG,2017,15.0,0.385121,1.0
1,8,ALB,2017,38.0,0.937757,0.85617
2,12,DZA,2017,33.0,0.76321,0.806892
3,24,AGO,2017,19.0,0.83818,1.0
4,32,ARG,2017,39.0,0.976712,1.0


In [22]:
# Discard every row that has a missing value in any column.
trust_discr_corrup = trust_discr_corrup.dropna()

In [23]:
# Spearman rank correlation of the corruption value with trustworthiness
# first, then with the discrepancy index.
for score_column in ("trustworthiness", "DI"):
    print(spearmanr(trust_discr_corrup[score_column], trust_discr_corrup["value"]))

SignificanceResult(statistic=0.5702511246543168, pvalue=1.3295499297918967e-88)
SignificanceResult(statistic=-0.49056075714776826, pvalue=1.4154963245920621e-62)
