# Logic:
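* Min-max normalize each indicator within each year
* Assign weights to the proxy indicators and to the categories
* Calculate weighted category scores and the overall stateness score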

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Run date as YYYY-MM-DD; used to stamp the exported file name.
DATE = datetime.now().date().isoformat()

In [3]:
def min_max_normalization(v, reverse=False):
    """Rescale ``v`` linearly so its minimum maps to 0 and its maximum to 1.

    The function itself does no grouping; callers apply it per group
    (e.g. through ``groupby(...).transform``).

    Parameters
    ----------
    v : array-like exposing ``.min()`` / ``.max()`` (e.g. a pandas Series)
        Values to rescale.
    reverse : bool, default False
        When True, return ``1 - scaled`` so the largest input maps to 0.

    Returns
    -------
    Same kind of object as ``v`` holding the rescaled values. If every
    value in ``v`` is identical the range is zero and the division yields
    NaN under pandas/NumPy arithmetic.
    """
    lowest = v.min()
    value_range = v.max() - lowest
    scaled = (v - lowest) / value_range
    return 1 - scaled if reverse else scaled

In [4]:
# Load the raw indicator dataset. Later cells read its "year", "iso3",
# "category", "indicator" and "value" columns.
df = pd.read_excel(io="./../data/raw/dataset_2020-05-16 21_28.xlsx")

In [5]:
# Min-max scale "value" within each (year, indicator) group. Rows whose
# value is missing are excluded before grouping, so after index alignment
# on assignment they carry NaN in "min_max_norm".
observed = df.loc[df["value"].notnull()]
value_by_group = observed.groupby(["year", "indicator"])["value"]
df["min_max_norm"] = value_by_group.transform(min_max_normalization)

In [6]:
# Weight applied to each proxy indicator. Keeping label and weight side by
# side avoids the two lists drifting out of order. Negative weights make
# higher values of that indicator lower the score.
indicator_weights = {
    "Access to electricity, rural (% of rural population)": 1.15,
    "Armed forces personnel (% of total labor force)": 1.25,
    "Mortality rate, under-5 (per 1,000 live births)": -1,
    "Prevalence of HIV, total (% of population ages 15-49)": -1,
    "Tax revenue (% of GDP)": 1.1,
}
proxy_weights_l = list(indicator_weights.values())
low_weights_sum = np.sum(proxy_weights_l)

# np.select falls back to its default of 0 for any indicator not listed
# above, so unlisted indicators contribute nothing to the score.
df["weights"] = np.select(
    condlist=[df["indicator"].eq(label) for label in indicator_weights],
    choicelist=proxy_weights_l,
)

In [7]:
# Weighted indicator score per row, then summed per (year, country, category).
df["score"] = df["min_max_norm"].mul(df["weights"])
yic = (
    df.groupby(["year", "iso3", "category"], as_index=False)["score"]
    .sum()
)
# NOTE(review): low_weights_sum is the signed sum of all five indicator
# weights (the negative ones included), while each row here is the sum for a
# single category -- confirm this is the intended normaliser.
yic["total_score"] = yic["score"].div(low_weights_sum).mul(10)

In [8]:
# Weight applied to each category when combining category scores.
category_weights = {
    "Medicine": 1.15,
    "Infrastructure": 1.35,
    "Public Sector": 1.75,
}
proxy_weights_h = list(category_weights.values())
high_weights_sum = np.sum(proxy_weights_h)

# Categories not listed above receive np.select's default weight of 0.
yic["high_cat_weights"] = np.select(
    condlist=[yic["category"].eq(label) for label in category_weights],
    choicelist=proxy_weights_h,
)

In [9]:
# Weight each category score, add the categories up per (year, country) and
# rescale by the total category weight.
yic["category_scores"] = yic["total_score"].mul(yic["high_cat_weights"])
stateness = (
    yic.groupby(["year", "iso3"], as_index=False)["category_scores"]
    .sum()
)
# NOTE(review): groupby().sum() skips NaN, so a group whose scores are all
# NaN sums to 0.0 and cannot be told apart from a genuine zero score --
# consider sum(min_count=1) if missing data should stay NaN.
stateness["stateness"] = stateness["category_scores"].div(high_weights_sum).mul(10)

---

In [10]:
# Load the Polity composite indicators. Later cells read its "scode",
# "year" and "polity2" columns.
polity = pd.read_excel(io="./../data/interim/P5_composite_indicators.xlsx")

In [11]:
# Min-max scale polity2 within each year.
polity2_by_year = polity.groupby("year")["polity2"]
polity["polity2_norm"] = polity2_by_year.transform(min_max_normalization)

In [12]:
# Attach the normalised polity2 score to every stateness row. The left join
# keeps all (year, iso3) rows; rows without a polity match get NaN in
# "scode" and "polity2_norm".
# `columns=` replaces the positional axis argument (`drop("...", 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
# NOTE(review): the join matches iso3 against scode; the displayed output has
# unmatched rows (e.g. GEO in 1991) -- confirm scode uses the same country
# codes as iso3 for every country.
m = pd.merge(
    stateness.drop(columns="category_scores"),
    polity[["scode", "year", "polity2_norm"]],
    how="left",
    left_on=["year", "iso3"], right_on=["year", "scode"]
)

In [13]:
# Display the merged stateness / polity frame for a visual check.
m

Unnamed: 0,year,iso3,stateness,scode,polity2_norm
0,1991,ARM,-6.367964,ARM,1.0
1,1991,AZE,-16.191100,AZE,0.4
2,1991,BLR,0.000000,BLR,1.0
3,1991,EST,-0.462029,EST,1.0
4,1991,GEO,-6.488493,,
...,...,...,...,...,...
430,2019,RUS,0.000000,,
431,2019,TJK,0.000000,,
432,2019,TKM,0.000000,,
433,2019,UKR,0.000000,,


In [14]:
# Export the merged table; the file name carries the run date (DATE).
output_path = f"./../data/processed/stateness_{DATE}.xlsx"
m.to_excel(output_path, index=False)