# Logic:
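* Min-max normalize every indicator (proxy) separately within each year
* Weight the normalized proxies, then weight the categories they belong to
* Aggregate the weighted values into one stateness score per country and year (scaled by 100)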

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Run date (local time, YYYY-MM-DD); it is embedded in the exported file name.
DATE = datetime.today().date().isoformat()

# Indicator ids that are normalized with reverse=True (1 - scaled value).
NEGATIVES = ["SH.DYN.MORT"]

# Weight applied to each normalized proxy; keys are matched against the
# `id` column of the input table.
PROXIES_WEIGHT = dict([
    ("MS.MIL.TOTL.TF.ZS", 1.25),
    ("MS.MIL.XPND.GD.ZS", 1),
    ("EG.ELC.ACCS.RU.ZS", 0.1),
    ("EG.USE.ELEC.KH.PC", 0.9),
    ("SH.DYN.MORT", 1.25),
    ("SH.MED.BEDS.ZS", 1),
    ("GC.TAX.TOTL.GD.ZS", 1),
    ("IQ.CPA.FISP.XQ", 0.7),
])

# Weight applied to each category total; keys are matched against the
# `category` column of the input table.
CATEGORIES_WEIGHT = dict([
    ("Coercion", 1.3),
    ("Infrastructure", 1.4),
    ("Taxes", 1.2),
])

In [3]:
def min_max_normalization(v, reverse=False):
    """Linearly rescale ``v`` so its minimum maps to 0 and its maximum to 1.

    Intended to be passed to ``groupby(...).transform`` so the scaling is
    applied per group (here: per year and indicator).

    Parameters
    ----------
    v : object exposing ``.min()``, ``.max()`` and element-wise arithmetic
        (e.g. a pandas Series).
    reverse : bool, default False
        If true, return ``1 - scaled`` so the smallest input maps to 1 and
        the largest to 0.

    Returns
    -------
    The rescaled values, in the same container type the arithmetic on ``v``
    produces.

    Notes
    -----
    There is no guard for ``v.max() == v.min()``: a constant input divides
    by zero (NaN for a float pandas Series).
    """
    lowest = v.min()
    span = v.max() - lowest
    scaled = (v - lowest) / span
    return 1 - scaled if reverse else scaled

In [4]:
# Read the interim world_bank_data workbook and show its (rows, columns) shape.
data = pd.read_excel(io="./../data/interim/world_bank_data.xlsx")
data.shape

(2304, 7)

In [5]:
# TODO: the NaN policy is still undecided. Rows with a missing `value` are
# kept for now; the alternative considered is to keep only the rows where
# data["value"].notnull().
# Work on a copy of the raw table with each row's proxy weight attached
# (looked up from its indicator id).
df = data.assign(proxy_weight=data["id"].map(PROXIES_WEIGHT))
df.shape

(2304, 8)

In [6]:
# Split the rows by indicator direction: ids listed in NEGATIVES go to
# `negatives`, every other row goes to `positives`. Both are independent copies.
negative_mask = df["id"].isin(NEGATIVES)
positives = df.loc[~negative_mask].copy()
negatives = df.loc[negative_mask].copy()

In [7]:
# Min-max scale `value` inside every (year, indicator) group; the negative
# indicators are reversed so that a lower raw value gives a higher `norm`.
norm_keys = ["year", "indicator"]
positives["norm"] = positives.groupby(norm_keys)["value"].transform(min_max_normalization)
negatives["norm"] = negatives.groupby(norm_keys)["value"].transform(
    min_max_normalization, reverse=True
)

# Put both halves back together (original row labels are kept) and order
# the rows by year, then indicator id.
stateness = pd.concat([positives, negatives]).sort_values(["year", "id"])

In [8]:
# Spot check: normalized values of one indicator (SH.MED.BEDS.ZS) in one year (1999).
beds_1999 = stateness["year"].eq(1999) & stateness["id"].eq("SH.MED.BEDS.ZS")
stateness.loc[beds_1999]

Unnamed: 0,iso3,indicator,id,year,value,region,category,proxy_weight,norm
1457,ARM,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,6.2,Transcaucasia,Infrastructure,1.0,0.111724
1475,AZE,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,8.9,Transcaucasia,Infrastructure,1.0,0.484138
1493,BLR,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,12.64,Eastern Europe,Infrastructure,1.0,1.0
1511,EST,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,7.53,Baltic States,Infrastructure,1.0,0.295172
1529,GEO,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,5.69,Transcaucasia,Infrastructure,1.0,0.041379
1547,KAZ,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,7.25,Central Asia,Infrastructure,1.0,0.256552
1565,KGZ,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,7.47,Central Asia,Infrastructure,1.0,0.286897
1583,LTU,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,9.85,Baltic States,Infrastructure,1.0,0.615172
1601,LVA,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,9.03,Baltic States,Infrastructure,1.0,0.502069
1619,MDA,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,1999,8.19,Eastern Europe,Infrastructure,1.0,0.386207


In [9]:
# Step 1: weight each normalized proxy value and total it per
# (year, country, category).
# NOTE(review): pandas `sum` skips NaN by default, so a proxy with a missing
# `norm` appears to contribute 0 to its category instead of being left out
# of the denominator. Confirm this matches the intended NaN policy.
stateness["proxy_score"] = stateness["norm"] * stateness["proxy_weight"]
g = stateness.groupby(["year", "iso3", "category"], as_index=False)["proxy_score"].sum()

# Step 2: scale the category totals. The divisor is the sum of ALL proxy
# weights, not only the weights of the proxies inside the row's category.
# NOTE(review): with that divisor a category's score stays below 1 whenever
# other categories also hold proxies, so the final index cannot reach 100.
# Confirm this is intended.
g["weighted_proxy_score"] = g["proxy_score"] / sum(PROXIES_WEIGHT.values())
g["category_weight"] = g["category"].map(CATEGORIES_WEIGHT)
g["category_score"] = g["weighted_proxy_score"] * g["category_weight"]

# Step 3: add up the weighted category scores per (year, country), divide by
# the total category weight and express the result on a x100 scale.
index = g.groupby(["year", "iso3"], as_index=False)["category_score"].sum()
index["stateness"] = index["category_score"] / sum(CATEGORIES_WEIGHT.values()) * 100

# Use the `columns=` keyword: passing the axis positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
result = index.drop(columns="category_score")

In [10]:
# Spot check: stateness scores of every country for 2016.
result[result["year"] == 2016]

Unnamed: 0,year,iso3,stateness
272,2016,ARM,17.979636
273,2016,AZE,10.519046
274,2016,BLR,13.288546
275,2016,EST,12.760979
276,2016,GEO,12.639133
277,2016,KAZ,6.916129
278,2016,KGZ,9.804923
279,2016,LTU,12.448891
280,2016,LVA,11.872775
281,2016,MDA,9.020566


---

In [11]:
# Read the interim P5 composite-indicators workbook (provides `polity2`).
polity = pd.read_excel(io="./../data/interim/P5_composite_indicators.xlsx")

In [12]:
# Min-max scale `polity2` within each year and express it on a x100 scale,
# matching the scale factor used for the stateness score.
polity["polity2_norm"] = (
    polity.groupby("year")["polity2"].transform(min_max_normalization) * 100
)

In [13]:
# Attach the normalized polity2 value to every (year, country) stateness row.
# Left join: every row of `result` is kept, and rows without a polity match
# get NaN in `polity2_norm`.
polity_columns = polity[["iso3", "year", "polity2_norm"]]
m = result.merge(polity_columns, on=["year", "iso3"], how="left")

In [14]:
# Export the merged table to a date-stamped workbook, without the row index.
m.to_excel(excel_writer=f"./../data/processed/stateness_{DATE}.xlsx", index=False)

In [15]:
# Display the merged table: stateness score and normalized polity2 per year and country.
m

Unnamed: 0,year,iso3,stateness,polity2_norm
0,1999,ARM,15.136754,73.684211
1,1999,AZE,11.170572,10.526316
2,1999,BLR,19.638342,10.526316
3,1999,EST,17.548126,84.210526
4,1999,GEO,6.537743,68.421053
...,...,...,...,...
283,2016,SRB,8.579495,89.473684
284,2016,TJK,2.067613,31.578947
285,2016,TKM,2.573390,5.263158
286,2016,UKR,13.605215,68.421053
