In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from linearmodels import PanelOLS, RandomEffects
from statsmodels.stats.outliers_influence import variance_inflation_factor
import plotly.express as px

sns.set_theme()

# Functions

In [2]:
# Map a country (identified by its ISO3 code) to a region code
def assign_region(country):
    """Return the region code for the country behind an ISO3 code.

    The code is translated to a country name through the module-level
    ``HDR_ISO_country`` dict, and that name is then tested, in order, against
    the module-level membership lists (``Asia``, ``Europe_Central_Asia``,
    ``Weastern_Europe``, ``North_America``, ``Arab_states``, ``Oceania``,
    ``Areas``). The first list containing the name wins.

    Args:
        country: A key of ``HDR_ISO_country``.

    Returns:
        'EAP', 'ECA', 'WE', 'NA', 'AS', 'OC' or 'AREA'; ``None`` when the name
        appears in none of the lists.

    Raises:
        KeyError: if ``country`` is not a key of ``HDR_ISO_country``.
    """
    name = HDR_ISO_country[country]
    if name in Asia:
        return 'EAP'
    if name in Europe_Central_Asia:
        return 'ECA'
    if name in Weastern_Europe:
        return 'WE'
    if name in North_America:
        return 'NA'
    if name in Arab_states:
        return 'AS'
    if name in Oceania:
        return 'OC'
    if name in Areas:
        return 'AREA'
    # No list matched: the caller receives None (a missing region).
    return None

In [3]:
def Pivoting(df): # Function to pivot the dataframe
    """Pivot a long indicator table to one column per indicator.

    Args:
        df: DataFrame with columns 'iso3', 'hdicode', 'region', 'year',
            'indicator_name' and 'value'. Each
            (iso3, hdicode, region, year, indicator_name) combination must be
            unique, otherwise ``DataFrame.pivot`` raises ``ValueError``.

    Returns:
        A new DataFrame with a default integer index, the four key columns
        followed by one column per distinct 'indicator_name', and 'year'
        converted to a numeric calendar year (values that cannot be parsed
        with the '%Y' format become missing).
    """
    # A single reset_index() is enough to turn the pivot keys back into
    # columns; no helper 'index' column has to be created and dropped again.
    df_r = df.pivot(
        index=['iso3', 'hdicode', 'region', 'year'],
        columns='indicator_name',
        values='value',
    ).reset_index()
    # Remove the leftover 'indicator_name' label on the column axis.
    df_r.columns.name = None
    df_r['year'] = pd.to_datetime(df_r['year'], format='%Y', errors='coerce').dt.year
    return df_r

In [4]:
def Gap(df: pd.DataFrame, inds: list[str]) -> pd.DataFrame:
    """
    For each iso3, find the overall min and max year in df,
    then for each indicator in inds compute newest - oldest.

    Args:
        df: DataFrame with at least the columns 'iso3', 'year' and every
            name listed in ``inds``.
        inds: Names of the indicator columns to compare.

    Returns a DataFrame with one row per iso3 and the columns, in this order:
      iso3, min_year, max_year,
      oldest_<ind> (one per indicator),
      newest_<ind> (one per indicator),
      <ind>_diff   (one per indicator)

    NOTE(review): if df holds several rows for the same (iso3, year), the
    merges below multiply rows for that country -- confirm the input is unique
    per country-year.
    """
    # 1) Find global min and max year per country
    year_range = (
        df.groupby('iso3')['year']
          .agg(min_year='min', max_year='max')
          .reset_index()
    )

    # Attach the per-country bounds once; both edge selections reuse it.
    with_range = df.merge(year_range, on='iso3')

    def _edge_rows(bound: str, prefix: str) -> pd.DataFrame:
        """Rows whose year equals the given bound, indicators renamed with prefix."""
        at_bound = with_range['year'] == with_range[bound]
        # Only iso3 + indicators are kept: the year itself is already in
        # year_range, and carrying it along would make the merges below emit
        # min_year_x/min_year_y (and max_year_x/max_year_y) duplicates.
        return (
            with_range.loc[at_bound, ['iso3'] + inds]
            .rename(columns={ind: f'{prefix}_{ind}' for ind in inds})
        )

    # 2) + 3) Pull out oldest and newest rows, 4) merge them all together
    result = (
        year_range
        .merge(_edge_rows('min_year', 'oldest'), on='iso3')
        .merge(_edge_rows('max_year', 'newest'), on='iso3')
    )

    # 5) Compute diffs
    for ind in inds:
        result[f'{ind}_diff'] = result[f'newest_{ind}'] - result[f'oldest_{ind}']

    return result

# Example usage:
# df has columns ['iso3','year','logGDP','pop','ind1',...]
# gaps = Gap(df, ['logGDP','pop'])


In [5]:
def PrepareData(df, ind, min_years=20): # Function to prepare the data for analysis
    """Add a lagged copy of an indicator and drop countries with short panels.

    Args:
        df: DataFrame with an 'iso3' column and the column named by ``ind``.
            It is not modified; a copy is processed.
        ind: Name of the column to lag.
        min_years: Minimum number of rows a country must still have after the
            lag is applied in order to be kept (default 20, the previously
            hard-coded threshold).

    Returns:
        A new DataFrame with an extra column ``<ind without '_value'>_shifted``
        holding the previous row's value of ``ind`` within the same iso3.
        Rows with a missing lag are removed, as are all rows of countries left
        with fewer than ``min_years`` rows.

    NOTE(review): the lag is taken by row position within each iso3, not by
    calendar year -- this assumes rows are already in chronological order for
    every country. Confirm against the caller.
    """
    Panel = df.copy()
    shift = ind.replace('_value', '') + '_shifted'
    Panel[shift] = Panel.groupby('iso3')[ind].shift(1)
    # The first row of every country has no predecessor and is dropped here.
    Panel = Panel.dropna(subset=[shift])

    # Count the remaining rows per country in one pass instead of re-filtering
    # the whole frame once per country.
    rows_per_country = Panel['iso3'].map(Panel['iso3'].value_counts())
    Panel = Panel[rows_per_country >= min_years]
    return Panel

In [6]:
def Regression(Panel,independent_vars,dependent_var): # Function to run the regression
    """Fit a PanelOLS model with entity and time effects and return its summary.

    Args:
        Panel: DataFrame with 'iso3' and 'year' columns plus every column named
            in ``independent_vars`` and ``dependent_var``. It is not modified.
        independent_vars: Sequence of regressor column names.
        dependent_var: Name of the dependent-variable column.

    Returns:
        The ``summary`` of the fitted model, estimated with
        ``entity_effects=True``, ``time_effects=True`` and entity-clustered
        covariance.
    """
    regressors = list(independent_vars)
    model_cols = regressors + [dependent_var]

    model_df = Panel[['iso3', 'year'] + model_cols].set_index(['iso3', 'year'])
    # Coerce to numeric BEFORE dropping missing rows: values that cannot be
    # parsed become NaN and must be removed together with the original NaNs.
    model_df = model_df.apply(pd.to_numeric, errors='coerce')
    model_df = model_df.dropna(subset=model_cols)

    Y = model_df[dependent_var]
    X = sm.add_constant(model_df[regressors])

    mod_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
    Results_Panel = mod_fe.fit(cov_type='clustered', cluster_entity=True)

    return Results_Panel.summary

# Data loading and preparation

## Data Loading

In [7]:
# All input files live under this folder (relative to the notebook).
path = 'Datasets/'

# GDP per capita, PPP (current international $)
gdp_pc_ppp = pd.read_csv(
    path + 'GDP per capita, PPP (current international)/WB_WDI_NY_GDP_PCAP_PP_CD.csv'
)

# HDR composite indices, complete time series
hdr = pd.read_csv(
    path + 'HDR/HDR25_Composite_indices_complete_time_series.csv',
    encoding='latin1',
)

# 'codebook' sheet of the HDR metadata workbook (indicator names)
hdr_labels = pd.read_excel(
    path + 'HDR/HDR25_Composite_indices_metadata.xlsx',
    sheet_name='codebook',
)

# Schooling data (INDICATOR_ID is read as text) and its label table
schooling = pd.read_csv(
    path+'UNESCO/OPRI_DATA_NATIONAL.zip',
    dtype={'INDICATOR_ID': 'object'},
    compression='zip',
)
schooling_labels = pd.read_csv(path+'UNESCO/OPRI_LABEL.csv')

# Gender statistics
gs = pd.read_csv(path + 'GenderStatistics/GS.csv')

  schooling = pd.read_csv(path+'UNESCO/OPRI_DATA_NATIONAL.zip', dtype={'INDICATOR_ID': 'object'}, compression='zip')  # Schooling data


## Data preparation

### HDR

In [8]:
# ISO3 code -> country name lookup. Pairing the two columns positionally avoids
# one label-based lookup per row and does not rely on hdr having a 0..n-1 index.
HDR_ISO_country = dict(zip(hdr['iso3'], hdr['country']))

In [9]:
# Keep only codebook rows that have a short name, and rename the name columns
# to the 'indicator' / 'indicator_name' labels used in the rest of the notebook.
hdr_labels = (
    hdr_labels
    .drop(columns=['Time series'])
    .dropna(subset=['Short name'])
    .rename(columns={'Full name': 'indicator_name', 'Short name': 'indicator'})
)

In [10]:
# Reshape the wide HDR table into long format: one row per id columns + source column.
hdr_id_columns = ['iso3', 'country', 'hdicode', 'region', 'hdi_rank_2023']
HDR = hdr.copy()
df_melted = HDR.melt(
    id_vars=hdr_id_columns,
    var_name='indicator_year',
    value_name='value',
)

# Split each source column name into an indicator code and a 4-digit year.
df_melted[['indicator', 'year']] = (
    df_melted['indicator_year'].str.extract(r'([a-z0-9_]+)_(\d{4})')
)

# Reorder and select final columns, then attach the indicator labels.
df_final = df_melted[['iso3','hdicode','region','year', 'indicator', 'value']]
HDR = hdr_labels.merge(df_final, on=['indicator'], how='right')

In [11]:
# Country-name membership lists consulted by assign_region.
North_America = ['Canada', 'United States']
Oceania = ['Australia', 'New Zealand']
Asia = ['Hong Kong, China (SAR)', 'Korea (Republic of)', 'Japan']
Europe_Central_Asia = ['Cyprus', 'Russian Federation', 'Israel']
Weastern_Europe = [
    'Andorra', 'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Czechia',
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
    'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Liechtenstein',
    'Lithuania', 'Luxembourg', 'Malta', 'Monaco', 'Netherlands', 'Norway',
    'Poland', 'Portugal', 'Romania', 'San Marino', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'Switzerland', 'United Kingdom',
]
Arab_states = ['Algeria', 'Bahrain', 'Djibouti', 'Egypt', 'Iraq', 'Jordan']

# Aggregate rows (development groups, regions, world) rather than countries.
Areas = [
    'Very high human development',
    'High human development',
    'Medium human development',
    'Low human development',
    'Arab States',
    'East Asia and the Pacific',
    'Europe and Central Asia',
    'Latin America and the Caribbean',
    'South Asia',
    'Sub-Saharan Africa',
    'World',
]

In [12]:
# Region code -> display label. Each code appears exactly once
# (the 'AS' entry used to be listed twice).
region_labels = {
    'ECA': 'Europe & Central Asia',
    'AS':  'Arab States',
    'SSA': 'Sub-Saharan Africa',
    'LAC': 'Latin America & Caribbean',
    'SA':  'South Asia',
    'EAP': 'East Asia & Pacific',
    'NA': 'North America',
    'WE': 'Western Europe',
    'OC': 'Oceania'
}

In [13]:
# Rows whose 'region' is missing get a region code from assign_region (by iso3).
region_missing = HDR['region'].isna()
h = HDR[region_missing].copy()
h.loc[:, 'region'] = h.loc[:, 'iso3'].apply(assign_region)

# Put the rows that already had a region and the newly filled rows back together.
HDR = pd.concat([HDR[~region_missing], h], ignore_index=True)

# Drop rows with a missing 'value', then discard the aggregate ('AREA') rows.
HDR = HDR.dropna(subset=['value'])
HDR = HDR[HDR['region'] != 'AREA']

In [14]:
# Indicator codes grouped by the composite index they are used with.
HDI_indicators = ['hdi', 'le', 'eys', 'mys', 'gnipc']
GDI_indicators = [
    'gdi_group', 'gdi',
    'hdi_f', 'le_f', 'eys_f', 'mys_f', 'gni_pc_f',
    'hdi_m', 'le_m', 'eys_m', 'mys_m', 'gni_pc_m',
]
IHDI_indicators = ['ihdi', 'coef_ineq', 'loss', 'ineq_le', 'ineq_edu', 'ineq_inc']
GII_indicators = [
    'gii_rank', 'gii', 'mmr', 'abr',
    'se_f', 'se_m', 'pr_f', 'pr_m', 'lfpr_f', 'lfpr_m',
]
PHDI_indicators = ['rankdiff_hdi_phdi', 'phdi', 'diff_hdi_phdi', 'co2_prod', 'mf']
# Population is added to every subset built from the lists above.
Population_indicator = ['pop_total']

In [15]:
def _rows_for(indicator_codes):
    """Rows of HDR whose indicator is in the given list or is the population indicator."""
    wanted = indicator_codes + Population_indicator
    return HDR[HDR['indicator'].isin(wanted)]


# One long-format subset per composite index.
hdi = _rows_for(HDI_indicators)
gdi = _rows_for(GDI_indicators)
ihdi = _rows_for(IHDI_indicators)
gii = _rows_for(GII_indicators)
phdi = _rows_for(PHDI_indicators)

In [16]:
# Wide (one column per indicator) version of each long-format subset.
HDI, GDI, IHDI, GII = (
    Pivoting(long_subset) for long_subset in (hdi, gdi, ihdi, gii)
)

In [17]:
# Index code -> name of the column holding that index's value.
HDR_idx = {
    'hdi': 'Human Development Index (value)',
    'gii': 'Gender Inequality Index (value)',
    'gdi': 'Gender Development Index (value)',
    'ihdi': 'Inequality-adjusted Human Development Index (value)',
}

### GDP per capita, PPP

In [18]:
# REF_AREA_ID -> REF_AREA_NAME lookup. Pairing the two columns positionally
# avoids one label-based lookup per row and does not rely on a 0..n-1 index.
WB_ISO_country = dict(zip(gdp_pc_ppp['REF_AREA_ID'], gdp_pc_ppp['REF_AREA_NAME']))

In [19]:
# Keep the area id, period and observed value under short names, then add the
# numeric year and the natural log of GDP.
GDP = (
    gdp_pc_ppp[['REF_AREA_ID', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'REF_AREA_ID':'iso3','TIME_PERIOD':'year','OBS_VALUE':'GDP'})
    .assign(
        year=lambda frame: pd.to_datetime(frame['year'], format='%Y', errors='coerce').dt.year,
        logGDP=lambda frame: np.log(frame['GDP']),
    )
)

In [20]:
# GDP and GII: keep only country-years present in both tables.
GDP_GII = GDP.merge(GII, on=["iso3", "year"], how="inner")
GDP_GII.columns.name = None

# Leave these four ISO3 codes out of the analysis sample.
excluded_iso3 = ['KWT', 'ARE', 'SAU', 'QAT']
GDP_GII = GDP_GII[~GDP_GII['iso3'].isin(excluded_iso3)]

# Cross-sectional

In [21]:
# Cross-sectional OLS of logGDP on GII, one regression every fifth year.
gii_value_col = HDR_idx['gii']
Panel_GII_CS = PrepareData(GDP_GII, gii_value_col) # Prepare the GII data

first_year = Panel_GII_CS['year'].min()
last_year = Panel_GII_CS['year'].max()
# range() stops before last_year, so the final year itself is never used.
for y in range(first_year, last_year, 5):
    df = Panel_GII_CS.loc[Panel_GII_CS['year'] == y].copy()
    Y = df['logGDP']
    X = sm.add_constant(df[gii_value_col])

    model = sm.OLS(Y, X).fit()
    print(f"Year: {y}")
    print(model.summary())

Year: 1991
                            OLS Regression Results                            
Dep. Variable:                 logGDP   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     245.5
Date:                Fri, 16 May 2025   Prob (F-statistic):           1.16e-25
Time:                        10:06:23   Log-Likelihood:                -44.993
No. Observations:                  79   AIC:                             93.99
Df Residuals:                      77   BIC:                             98.73
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------

# Panel regression

In [25]:
# Compare panel estimators for logGDP on the lagged GII column.
gii_value_col = HDR_idx['gii']
df = PrepareData(GDP_GII, gii_value_col)
df = df.set_index(['iso3', 'year'])
df = df.dropna(subset=['logGDP', gii_value_col])
y = df['logGDP']
X = df[gii_value_col + '_shifted']
X = sm.add_constant(X)

# Every model uses the same entity-clustered covariance settings.
fit_options = dict(cov_type='clustered', cluster_entity=True)

pooled = PanelOLS(y, X, entity_effects=False, time_effects=False).fit(**fit_options)
re = RandomEffects(y, X).fit(**fit_options)
fe = PanelOLS(y, X, entity_effects=True, time_effects=False).fit(**fit_options)
fe2 = PanelOLS(y, X, entity_effects=True, time_effects=True).fit(**fit_options)

print("Pooled OLS:\n", pooled.summary)
print("Random Effects:\n", re.summary)
print("Fixed Effects:\n", fe.summary)
# fe2 was fitted but never reported; show it alongside the other estimators.
print("Fixed Effects (entity + time):\n", fe2.summary)

Pooled OLS:
                           PanelOLS Estimation Summary                           
Dep. Variable:                 logGDP   R-squared:                        0.6956
Estimator:                   PanelOLS   R-squared (Between):              0.6886
No. Observations:                3875   R-squared (Within):               0.6614
Date:                Fri, May 16 2025   R-squared (Overall):              0.6956
Time:                        10:28:37   Log-likelihood                   -3595.4
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8849.9
Entities:                         130   P-value                           0.0000
Avg Obs:                       29.808   Distribution:                  F(1,3873)
Min Obs:                       20.000                                           
Max Obs:                       33.000   F-statistic (robust):             495.79
               

In [23]:
# Entity + time effects regression of logGDP on the lagged GII column.
gii_lag_col = HDR_idx['gii'] + '_shifted'
Panel_GII = PrepareData(GDP_GII, HDR_idx['gii']) # Prepare the GII data
keep_rows = ~Panel_GII['iso3'].isin(['KWT', 'ARE', 'SAU', 'QAT'])
Panel_GII = Panel_GII[keep_rows]
Regression(Panel_GII, independent_vars=[gii_lag_col], dependent_var='logGDP')

0,1,2,3
Dep. Variable:,logGDP,R-squared:,0.0529
Estimator:,PanelOLS,R-squared (Between):,0.2840
No. Observations:,3875,R-squared (Within):,0.2377
Date:,"Fri, May 16 2025",R-squared (Overall):,0.2860
Time:,10:06:24,Log-likelihood,1676.3
Cov. Estimator:,Clustered,,
,,F-statistic:,207.51
Entities:,130,P-value,0.0000
Avg Obs:,29.808,Distribution:,"F(1,3712)"
Min Obs:,20.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,9.6499,0.1263,76.425,0.0000,9.4023,9.8974
Gender Inequality Index (value)_shifted,-1.1113,0.3396,-3.2726,0.0011,-1.7770,-0.4455
