# Human Development Index vs. trial coverage disproportionality
Linear regression analyses with statsmodels.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%load_ext blackcellmagic

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme with color codes enabled, then switch to the
# "whitegrid" style with gainsboro-colored grid lines.
sns.set(color_codes=True)
sns.set_style("whitegrid", {"grid.color": "gainsboro"})

from tools import visualization

## Load trial data

In [None]:
# Grouping columns for which count sheets are loaded from the results workbook
# (one "ovr_<name>" and one "phs_<name>" sheet per entry).
grouper_column_names = (
    # Geographic groupings
    ["country_ISO", "country_continent", "subregion"]
    # Economic and human-development groupings
    + [
        "economy_level",
        "consolidated_economy_level",
        "income_group",
        "consolidated_income_group",
        "hdi_category",
    ]
)

In [None]:
# Load the overall ("ovr_<grouper>") count sheets, keyed by grouper column name.
# Passing a list of sheet names lets pandas open the workbook once and return a
# {sheet_name: DataFrame} dict, instead of re-reading the file for every grouper.
overall_sheets = pd.read_excel(
    "data/results/trials_sites_counts.xlsx",
    sheet_name=["ovr_" + name for name in grouper_column_names],
)
consolidated_counts_overall = {
    name: overall_sheets["ovr_" + name] for name in grouper_column_names
}

In [None]:
# Display the overall counts table loaded from the "ovr_country_continent" sheet.
consolidated_counts_overall["country_continent"]

In [None]:
# Load the per-phase ("phs_<grouper>") count sheets, keyed by grouper column name.
# Passing a list of sheet names lets pandas open the workbook once and return a
# {sheet_name: DataFrame} dict, instead of re-reading the file for every grouper.
per_phase_sheets = pd.read_excel(
    "data/results/trials_sites_counts.xlsx",
    sheet_name=["phs_" + name for name in grouper_column_names],
)
consolidated_counts_per_phase = {
    name: per_phase_sheets["phs_" + name] for name in grouper_column_names
}

In [None]:
# Show the first rows of the per-phase table loaded from the "phs_country_continent" sheet.
consolidated_counts_per_phase["country_continent"].head()

## Combine country data with HDI data
Our count data are only broken down by HDI category; the regression analyses need the actual HDI value, so we have to load that data again.

### Geographic and socioeconomic data

In [None]:
# Base dataset sheet of the results workbook; the per-country geographic and
# socioeconomic attributes are extracted from it.
ms_trials_socioeconomic = pd.read_excel(
    io="data/results/trials_sites_counts.xlsx",
    sheet_name="Base_dataset",
)

In [None]:
# One row per distinct combination of country code and its geographic /
# socioeconomic attributes, re-indexed from 0.
country_socioeconomic_data = (
    ms_trials_socioeconomic.loc[
        :,
        [
            "country_ISO",
            "country_continent",
            "subregion",
            "consolidated_income_group",
            "consolidated_economy_level",
            "hdi_category",
        ],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Display the de-duplicated country attribute table.
country_socioeconomic_data

### Trial data

In [None]:
# Work on a copy of the per-country overall counts so the loaded sheet stays unmodified.
country_overall = consolidated_counts_overall["country_ISO"].copy()

In [None]:
# Work on a copy of the per-country, per-phase counts so the loaded sheet stays unmodified.
country_per_phase = consolidated_counts_per_phase["country_ISO"].copy()

In [None]:
# Tag the overall rows with a "phase" value of their own; presumably this lets
# them share the "phase" column with the per-phase rows once both are stacked.
country_overall["phase"] = "All phases"

In [None]:
# Stack the overall counts on top of the per-phase counts with a fresh 0..n-1
# index. Any "_phase" in a per-phase column name is replaced by "_overall",
# presumably so that both tables line up on the same column names.
trials_sites_counts = pd.concat(
    [
        country_overall,
        country_per_phase.rename(
            columns=lambda colname: colname.replace("_phase", "_overall")
        ),
    ],
    ignore_index=True,
)

In [None]:
# Display the stacked overall and per-phase country counts.
trials_sites_counts

### HDI data

In [None]:
# Raw Human Development Index table, read from the source CSV.
hdi_raw = pd.read_csv(
    filepath_or_buffer="data/source/unstats/human-development-index.csv"
)

In [None]:
# Show the first rows of the raw HDI table.
hdi_raw.head()

#### Get most recent per country

In [None]:
# Latest year present in the HDI table for every (Entity, Code) pair, returned
# as a flat table with the columns Entity, Code and Year.
max_year_hdi_per_country = (
    hdi_raw.groupby(["Entity", "Code"])["Year"].max().reset_index()
)

In [None]:
# Show the first rows of the latest-year lookup table.
max_year_hdi_per_country.head()

In [None]:
# Keep only the HDI rows whose (Entity, Code, Year) matches the latest year
# found for that country.
hdi_raw = hdi_raw.merge(
    max_year_hdi_per_country,
    how="inner",
    on=["Entity", "Code", "Year"],
)

In [None]:
# Display the HDI rows kept after restricting to each country's latest year.
hdi_raw

### Rename columns for convenience

In [None]:
# Slim HDI table: country name, country code and index value, renamed so the
# code column is called "country_ISO" like in the trial count tables.
hdi_data = (
    hdi_raw.loc[:, ["Entity", "Code", "Human Development Index"]]
    .rename(
        {
            "Entity": "hdi_country",
            "Code": "country_ISO",
            "Human Development Index": "hdi",
        },
        axis="columns",
    )
    .copy()
)

In [None]:
# Show the first rows of the renamed HDI table.
hdi_data.head()

### Merge

We use an inner join here, since we can only run the regression for countries for which we have HDI data.

In [None]:
# Attach the HDI value to every count row; rows whose country code has no HDI
# entry are dropped by the inner join.
trials_sites_counts = trials_sites_counts.merge(
    hdi_data[["country_ISO", "hdi"]],
    how="inner",
    on="country_ISO",
)

In [None]:
# Display the count table with the "hdi" column attached.
trials_sites_counts

### Add info

In [None]:
# Add the geographic / socioeconomic attributes per country; the left join
# keeps every count row even when no attributes are found for its country code.
trials_sites_counts = trials_sites_counts.merge(
    country_socioeconomic_data,
    how="left",
    on="country_ISO",
)

### Nice-ify data

In [None]:
# Turn the literal text "PHASE" in the phase labels into "Phase " (with a
# trailing space); labels without "PHASE" are left unchanged.
trials_sites_counts["phase"] = trials_sites_counts["phase"].str.replace(
    "PHASE", "Phase ", regex=False
)

## Regression plots

### Prepare data

In [None]:
# Keep only rows with a positive trial count, as an independent copy;
# presumably because the log-disproportionality is not meaningful without trials.
plot_data_nonzero = trials_sites_counts.loc[
    trials_sites_counts["n_trials"].gt(0)
].copy()

Compute the logs of the disproportionality factor:

In [None]:
# Add a base-10 log column ("log_<factor column>") for the trials and the
# sites disproportionality factor.
for count_kind in ("trials", "sites"):
    factor_column = "factor_deviation_n_" + count_kind + "_from_expected"
    plot_data_nonzero["log_" + factor_column] = np.log10(
        plot_data_nonzero[factor_column]
    )

We also need a list of continents to create the hue/hue order in scatterplots:

In [None]:
# Alphabetically sorted list of the distinct continent names in the plot data.
continents = sorted(plot_data_nonzero["country_continent"].unique())

In [None]:
# Display the sorted continent names.
continents

In [None]:
# Color names passed as the scatter palette; presumably matched by position
# with the sorted `continents` list used as hue order.
continent_colors = "black indigo deeppink orange olivedrab teal".split()

### All phases

#### Trials

In [None]:
# Log-disproportionality of trials against HDI for the "All phases" rows,
# drawn by the project's regression-and-scatter helper and colored by continent.
fig, ax = plt.subplots(figsize=(6, 6))
g = visualization.linear_regression_and_scatter_plot(
    data=plot_data_nonzero.loc[plot_data_nonzero["phase"].eq("All phases")],
    x_column="hdi",
    y_column="log_factor_deviation_n_trials_from_expected",
    scatter_palette=continent_colors,
    scatter_hue_column="country_continent",
    scatter_hue_order=continents,
    scatter_alpha=0.75,
    xlim=(0.55, 0.975),
    ylim=(-1.5, 3.5),
    n_points=100,
    ax=ax,
)

ax.legend(title="Continent")
ax.set(
    xlabel="Human Development Index",
    ylabel="Log-disproportionality of trials",
)

# Remove all four spines around the plot area.
sns.despine(ax=ax, top=True, right=True, bottom=True, left=True)
fig.tight_layout()
plt.show()

#### Trial sites

In [None]:
# Log-disproportionality of trial sites against HDI for the "All phases" rows,
# drawn by the project's regression-and-scatter helper and colored by continent.
fig, ax = plt.subplots(figsize=(6, 6))
g = visualization.linear_regression_and_scatter_plot(
    data=plot_data_nonzero.loc[plot_data_nonzero["phase"].eq("All phases")],
    x_column="hdi",
    y_column="log_factor_deviation_n_sites_from_expected",
    scatter_palette=continent_colors,
    scatter_hue_column="country_continent",
    scatter_hue_order=continents,
    scatter_alpha=0.75,
    xlim=(0.55, 0.975),
    ylim=(-2.75, 2.25),
    n_points=100,
    ax=ax,
)

ax.legend(title="Continent")
ax.set(
    xlabel="Human Development Index",
    ylabel="Log-disproportionality of trial sites",
)

# Remove all four spines around the plot area.
sns.despine(ax=ax, top=True, right=True, bottom=True, left=True)
fig.tight_layout()
plt.show()

### 4 phases

#### Trials

In [None]:
# Log-disproportionality of trials against HDI, one panel per trial phase
# (1-4) on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 9))

# axes.flat walks the grid row by row, so phases 1-4 land on panels A-D in
# reading order (top-left, top-right, bottom-left, bottom-right).
for i, panel_ax in enumerate(axes.flat, start=1):

    g = visualization.linear_regression_and_scatter_plot(
        data=plot_data_nonzero[
            plot_data_nonzero["phase"] == "Phase " + str(i)
        ],
        x_column="hdi",
        y_column="log_factor_deviation_n_trials_from_expected",
        scatter_palette=continent_colors,
        scatter_hue_column="country_continent",
        scatter_hue_order=continents,
        scatter_alpha=.75,
        xlim=(0.55, 0.975),
        ylim=(-1.5, 3.5),
        n_points=100,
        ax=panel_ax,
    )

    panel_ax.legend(title="Continent")
    panel_ax.set_xlabel("Human Development Index")
    panel_ax.set_ylabel("Log-disproportionality of trials")
    # Panel letter (A-D) followed by the phase number.
    panel_ax.set_title("ABCD"[i - 1] + ") Phase " + str(i))

fig.tight_layout()
plt.show()

#### Trial sites

In [None]:
# Log-disproportionality of trial sites against HDI, one panel per trial phase
# (1-4) on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 9))

# axes.flat walks the grid row by row, so phases 1-4 land on panels A-D in
# reading order (top-left, top-right, bottom-left, bottom-right).
for i, panel_ax in enumerate(axes.flat, start=1):
    phase_label = "Phase " + str(i)

    g = visualization.linear_regression_and_scatter_plot(
        data=plot_data_nonzero.loc[plot_data_nonzero["phase"].eq(phase_label)],
        x_column="hdi",
        y_column="log_factor_deviation_n_sites_from_expected",
        scatter_palette=continent_colors,
        scatter_hue_column="country_continent",
        scatter_hue_order=continents,
        scatter_alpha=0.75,
        xlim=(0.55, 0.975),
        ylim=(-2.75, 2.25),
        n_points=100,
        ax=panel_ax,
    )

    panel_ax.legend(title="Continent")
    panel_ax.set_xlabel("Human Development Index")
    panel_ax.set_ylabel("Log-disproportionality of trial sites")
    # Panel letter (A-D) followed by the phase label.
    panel_ax.set_title("ABCD"[i - 1] + ") " + phase_label)

fig.tight_layout()
plt.show()