In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from linearmodels import PanelOLS
import plotly.express as px

# Activate seaborn's default theme once, up front, before any figure is drawn.
sns.set_theme()

In [84]:
# Read the wide-format Gender Statistics CSV once and keep `DF` as the pristine
# copy; all later cleaning happens on the independent working copy `df`.
DF = pd.read_csv('Datasets/GenderStatistics/WB_GS_WIDEF.csv')
df = DF.copy(deep=True)

In [85]:
# Remove metadata columns that the rest of the notebook does not use.
# `df` is rebuilt from the untouched raw frame `DF` instead of being mutated in
# place: with `inplace=True` a second run of this cell raised a KeyError because
# the columns were already gone. Deriving from `DF` makes the cell re-runnable
# and yields the same resulting frame.
df = DF.drop(columns=['FREQ', 'FREQ_LABEL','INDICATOR','AGE','UNIT_MEASURE',
       'COMP_BREAKDOWN_1_LABEL', 'COMP_BREAKDOWN_2', 'COMP_BREAKDOWN_2_LABEL',
       'DATABASE_ID', 'DATABASE_ID_LABEL', 'UNIT_MULT', 'UNIT_MULT_LABEL',
       'OBS_STATUS', 'OBS_STATUS_LABEL', 'OBS_CONF', 'OBS_CONF_LABEL'])

In [89]:
# Reshape the wide table (one column per year) into long format: one row per
# identifier combination and year, with the observation stored in VALUE.
df_melt = (
    df.melt(
        id_vars=[
            'REF_AREA', 'REF_AREA_LABEL', 'INDICATOR_LABEL', 'SEX', 'SEX_LABEL',
            'UNIT_MEASURE_LABEL', 'AGE_LABEL', 'COMP_BREAKDOWN_1',
        ],
        # Year columns are exactly those whose header consists only of digits.
        value_vars=[name for name in df.columns if name.isdigit()],
        var_name='YEAR',
        value_name='VALUE',
    )
    .assign(
        # Column headers are strings; parse them as years and keep the year number.
        YEAR=lambda long: pd.to_datetime(long['YEAR'], format='%Y', errors='coerce').dt.year,
        # Anything that is not numeric becomes NaN ...
        VALUE=lambda long: pd.to_numeric(long['VALUE'], errors='coerce'),
    )
    # ... and rows without an observation are discarded.
    .dropna(subset=['VALUE'])
)

In [90]:
# Indicator labels to exclude from the analysis: rows of the long table whose
# INDICATOR_LABEL appears in this list are filtered out when `df2` is built.
# NOTE(review): several labels contain the sequence 'Â\xa0', which looks like a
# mis-decoded non-breaking space; presumably they are kept verbatim so that they
# match the labels exactly as read from the CSV -- confirm before "fixing" them.
INDICATOR_NON_NEEDED = ['Inflation, consumer prices (annual %)', 'GDP (current US$)',
       'GDP growth (annual %)', 'GDP per capita (current US$)',
       'GDP per capita (constant 2010 US$)',
       'GNI, Atlas method (current US$)',
       'GNI per capita, Atlas method (current US$)',
       'GNI per capita, PPP (current international $)',
       'Children out of school, primary (Number)',
       'Educational attainment by level of education, cumulative (% population 25+)',
       'School enrollment, gender parity index',
       'School enrollment, preprimary (% gross)',
       'Primary completion rate, based on completers (%)',
       'Primary completion rate (% of relevant age group)',
       'Primary education, pupils (% female)',
       'School enrollment, primary (%)',
       'Gross intake ratio in grade 1, total (% of relevant age group)',
       'Net intake rate in grade 1 (% of official school-age population)',
       'Persistence to grade 5 (% of cohort)',
       'Primary education, teachers (% female)',
       'Adjusted net enrollment rate, primary (% of primary school age children)',
       'Children out of school, primary (Number)',  # duplicate of an earlier entry; harmless for a membership test
       'Children out of school (% of primary school age)',
       'Expected years of schooling',
       'Lower secondary completion rate (% of relevant age group)',
       'Secondary education, pupils (% female)',
       'Students in secondary education enrolled in vocational programmes, male (%)',
       'Secondary education, vocational pupils',
       'Secondary education, vocational pupils (% female)',
       'Vocational and technical enrolment (% of total secondary enrolment)',
       'School enrollment, secondary (%)',
       'Progression to secondary school (%)',
       'Secondary education, teachers (% female)',
       'Gross graduation ratio, tertiary (%)',
       'School enrollment, tertiary (% gross)',
       'Share of graduates by field, female (%)',
       'Tertiary education, academic staff (% female)',
       'Ratio of female to male labor force participation rate (%)',
       'Ratio of female to male youth unemployment rate (% ages 15-24)',
       'Share of youth not in education, employment or training (% of youth population)',
       'Unemployment (%)',
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Mortality rate, adult (per 1,000 adults)',
       'Birth rate, crude (per 1,000 people)',
       'Death rate, crude (per 1,000 people)',
       'Life expectancy at birth (years)',
       'Life expectancy at age 60 (years)', 'Mean age at first marriage',
       'Fertility rate, total (births per woman)',
       'Survival to age 65 (% of cohort)', 'Population (number)',
       'Sex ratio at birth (male births per female births)',
       'Age dependency ratio (% of working-age population)',
       'Age population, interpolated',
       'Population, female (% of total population)',
       'Population (age group as % of total population)',
       'Students in lower secondary vocational education, female (%)',
       'Students in upper secondary vocational education, female (%)',
       'Students in post-secondary non-tertiary vocational education,  female (%)',
       'Share of graduates in Business, Administration and Law programmes, female (%)',
       'Share of graduates in other fields than Science, Technology, Engineering and Mathematics programmes, female (%)',
       'Share of all students in lower secondary education enrolled in vocational programmes (%)',
       'Share of all students in upper secondary education enrolled in vocational programmes (%)',
       'Share of all students in post-secondary non-tertiary education enrolled in vocational programmes (%)',
       'Youth illiterate population, 15-24 years, female (%)',
       'Youth illiterate population, 15-24 years (number)',
       'Rate of out-of-school youth of upper secondary school age (%)',
       'Intentional homicides (per 100,000 people)',
       'Cost of business start-up procedures (% of GNI per capita)',
       'Time required to start a business (days)',
       'Start-up procedures to register a business (number)',
       'Proportion of seats held by women in national parliaments (%)',
       'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
       'Prevalence of anemia among women of reproductive age (% of women ages 15-49)',
       'Prevalence of anemia among non-pregnant women (% of women ages 15-49), with Hb<120 g/L',
       'Number of infant deaths', 'Number of under-five deaths',
       'Number of stillbirths', 'Cause of death (%)',
       'Share of population ages 15+ living with HIV, female (%)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%)',
       'Stillbirth rate (per 1,000 total births)',
       'Prevalence of HIV (% ages 15-24)',
       'Antiretroviral therapy coverage (% of adults living with HIV)',
       'Incidence of HIV (per 1,000 uninfected population)',
       'Antiretroviral therapy coverage for PMTCT (% of pregnant women living with HIV)',
       'Immunization, DPT (% of children ages 12-23 months)',
       'Immunization, measles (% of children ages 12-23 months)',
       'Intermittent Preventive Treatment of malaria in pregnancy (% of pregnant women)',
       'Number of maternal deaths',
       'Lifetime risk of maternal death (1 in: rate varies by country)',
       'Lifetime risk of maternal death (%)',
       'Prevalence of anemia among pregnant women (any age), with Hb<110 g/L',
       'Prevalence of syphilis (% of women attending antenatal care)',
       'Prevalence of current tobacco use (% of adults)',
       'Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)',
       'Maternal mortality ratio (per 100,000 live births)',
       'People practicing open defecation, rural (% of rural population)',
       'People practicing open defecation, urban (% of urban population)',
       'Prevalence of overweight (% of adults)',
       'Mortality rate attributed to unintentional poisoning (per 100,000 population)',
       'Suicide mortality rate (per 100,000 population)',
       'Mortality caused by road traffic injury (per 100,000 population)',
       'Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population)',
       'Employment by sector (%)',
       'Employers (% of employment) (modeled ILO estimate)',
       'Self-employed (% of employment) (modeled ILO estimate)',
       'Vulnerable employment (% of employment) (modeled ILO estimate)',
       'Wage and salaried workers (% of employment) (modeled ILO estimate)',
       'Labor force, female (% of total labor force)',
       'Labor force (number)',
       'Mortality rate, infant (per 1,000 live births)',
       'Completeness of birth registration (%)',
       'Unmet need for contraception (% of married women ages 15-49)',
       'No account because financial institutions are too far away (% age 15+)',
       'No account because financial institutions are too far away (% without an account, age 15+)',
       'No account because financial services are too expensive (% age 15+)',
       'No account because financial services are too expensive (% without an account, age 15+)',
       'No account because of a lack of necessary documentation (% age 15+)',
       'No account because of a lack of necessary documentation (% without an account, age 15+)',
       'No account because of a lack of trust in financial institutions (% age 15+)',
       'No account because of a lack of trust in financial institutions (% without an account, age 15+)',
       'No account because of religious reasons (% age 15+)',
       'No account because of religious reasons (% without an account, age 15+)',
       'No account because of insufficient funds (% age 15+)',
       'No account because of insufficient funds (% without an account, age 15+)',
       'Can use account at a bank or financial institution without help if opened (% without an account, age 15+)',
       'Used a mobile phone or the internet to send money (% age 15+)',
       'Used a mobile phone or the internet to pay bills in the past year (% age 15+)',
       'Used a mobile phone or the internet to buy something online in the past year(% age 15+)',
       'Made a digital in-store merchant payment: using a mobile phone (% age 15+)',
       'Saved to start, operate, or expand a farm or business (% age 15+)',
       'Saved for old age (% age 15+)',
       'Saved money using a mobile money account (% age 15+)',
       'Saved at a financial institution or using a mobile money account (% age 15+)',
       'Saved at a financial institution (% age 15+)',
       'Saved using a savings club or a person outside the family (% age 15+)',
       'Saved for education or school fees (% age 15+)',
       'Saved any money in the past year (% age 15+)',
       'Financial institution account (% age 15+)',
       'Borrowed for health or medical purposes (% age 15+)',
       'Borrowed from a store by buying on credit (% age 15+)',
       'Borrowed to start, operate, or expand a farm or business (% age 15+)',
       'Borrowed for education or school fees (% age 15+)',
       'Borrowed any money from a formal financial institution or using a mobile money account (% age 15+)',
       'Borrowed from a financial institution (% age 15+)',
       'Borrowed from family or friends (% age 15+)',
       'Borrowed from a savings club (% age 15+)',
       'Borrowed any money in the past year (% age 15+)',
       'Coming up with emergency funds in 30 days: possible (% age 15+)',
       'Coming up with emergency funds in 7 days: possible (% age 15+)',
       'Coming up with emergency funds in 30 days: not possible (% age 15+)',
       'Sent or received domestic remittances in the past year (% age 15+)',
       'Received domestic remittances in the past year (% age 15+)',
       'Sent domestic remittances in the past year (% age 15+)',
       'Debit card ownership (% age 15+)',
       'Paid utility bills in the past year (% age 15+)',
       'Paid school fees in the past year (% age 15+)',
       'Received government transfer or pension (% age 15+)',
       'Received government transfers in the past year (% age 15+)',
       'Received a public sector pension in the past year (% age 15+)',
       'Received payments for the sale of agricultural products, livestock, or crops (% age 15+)',
       'Worried about not having enough money for old age (% age 15+)',
       'Worried about not being able to pay for medical costs in case of a serious illness or accident (% age 15+)',
       'Worried about not having enough money for monthly expenses or bills (% age 15+)',
       'Worried about not being able to pay school fees or fees for education (% age 15+)',
       'Experience or continue to experience severe financial hardship as a result of the disruption caused by COVID-19 (% age 15+)',
       'Most worrying financial issue (% age 15+)',
       'Used a mobile phone or the internet to access an account (% with an account, age 15+)',
       'Credit card ownership (% age 15+)',
       'Has an inactive account (% age 15+)',
       'Received government payments in the past year (% age 15+)',
       'Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+)',
       'Made or received digital payments in the past year (% age 15+)',
       'Made digital payments in the past year (% age 15+)',
       'Received digital payments in the past year (% age 15+)',
       'Survival Rate from Age 15-60', 'Expected Years of School',
       'Harmonized Test Scores', 'Learning-Adjusted Years of School',
       'Probability of Survival to Age 5',
       'Human Capital Index (HCI) (scale 0-1)',
       'Human Capital Index, Lower Bound (scale 0-1)',
       'Human Capital Index, Upper Bound (scale 0-1)',
       'Fraction of Children Under 5 Not Stunted', 'Number of directors',
       'Mobile money account (% age 15+)',
       'Learning poverty: Share of Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%)',
       'Pupils below minimum reading proficiency at end of primary (%). Low GAML threshold',
       'Mandatory retirement age',
       'Main cooking fuel: charcoal (% of households)',
       'Main cooking fuel: agricultural crop (% of households)',
       'Main cooking fuel: dung (% of households)',
       'Main cooking fuel: electricity (% of households)',
       'Location of cooking: inside the house (% of households)',
       'Main cooking fuel: LPG/natural gas/biogas (% of households)',
       'Location of cooking: other places (% of households)',
       'Location of cooking: outdoors (% of households)',
       'Location of cooking: separate building (% of households)',
       'Main cooking fuel: straw/shrubs/grass (% of households)',
       'Main cooking fuel: wood (% of households)',
       'Households with water on the premises (%)',
       'Households with water less than 30 minutes away round trip (%)',
       'Households with water 30 minutes or longer away round trip (%)',
       'Proportion who initiated sexual intercourse by or before age 15',
       'Proportion of women subjected to physical and/or sexual violence in the last 12 months (%)',
       'Proportion of women who have ever experienced any form of sexual violence (% of women age 15-49)',
       'Proportion of women who have never sought help to stop violence, and never told anyoneÂ\xa0 (% of ever-married women ages 15-49 who have ever experienced any physical or sexual violence)',
       'Proportion of women who have never sought help to stop violence, but told someone (% of ever-married women ages 15-49 who have ever experienced any physical or sexual violence)',
       'Proportion of women who have sought help to stop physical or sexual violence (% of women age 15-49)',
       'Comprehensive correct knowledge of HIV/AIDS (2 prevent ways and reject 3 misconceptions)',
       'Length of paid leave (calendar days)',
       'Length of parental leave (calendar days)',
       'Prevalence of underweight, weight for age (% of children under 5)',
       'Prevalence of obesity (% of population ages 18+)',
       'Prevalence of overweight, weight for height (% of children under 5)',
       'Prevalence of stunting, height for age (% of children under 5)',
       'Prevalence of wasting, weight for height (% of children under 5)',
       'Prevalence of severe wasting, weight for height (% of children under 5)',
       'Poverty headcount ratio at national poverty lines (% of population)',
       'Time-related underemployment (% of employment)',
       'Children in employment (% of children ages 7-14)',
       'Part time employment (% of employment)',
       'Labor force by level of education (%)',
       'Unemployment by level of education (%)',
       'Wanted fertility rate (births per woman)',
       'Retirement age by type of benefits',
       'Condom use at last high-risk sex (% ages 15-49)',
       'Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)',
       'Gini index (World Bank estimate)',
       'Informal employment (% of total non-agricultural employment)',
       'Share of graduates in Information and Communication Technologies programmes, female (%)',
       'Withdrew money from a financial institution account 2 or more times a month (% age 15+)',
       'Withdrew money from a financial institution account 2 or more times a month (% who had withdrawn money, age 15+)',
       'Made a digital in-store merchant payment for the first time after COVID-19 started (% age 15+)',
       'Made a digital in-store merchant payment for the first time after COVID-19 started (% who made a digital in-store payment, age 15+)',
       'Proportion of time spent on unpaid domestic and care work (% of 24 hour day)',
       'Use a mobile money account two or more times a month (% age 15+)',
       'Use a mobile money account two or more times a month (% with a mobile money account, age 15+)',
       'Can use a mobile money account without help from anyone, including a mobile money agent (% age 15+)',
       'Can use a mobile money account without help from anyone, including a mobile money agent (% with a mobile money account, age 15+)',
       'Primary school age children out-of-school (%)',
       'Made a digital online payment for an online purchase for the first time after COVID-19 started (% age 15+)',
       'Made a utility payment: using an account for the first time after COVID-19 started (% age 15+)',
       'Reason for not having a mobile money account: mobile money agents are too far away (% age 15+)',
       'Reason for not having a mobile money account: mobile money agents are too far away (% without an account, age 15+)',
       'Reason for not having a mobile money account: available mobile money products are too expensive (% age 15+)',
       'Reason for not having a mobile money account: available mobile money products are too expensive (% without an account, age 15+)',
       "Reason for not having a mobile money account: don't have the necessary documentation (% age 15+)",
       "Reason for not having a mobile money account: don't have the necessary documentation (% without an account, age 15+)",
       "Reason for not having a mobile money account: don't have enough money to use a mobile money account (% age 15+)",
       "Reason for not having a mobile money account: don't have enough money to use a mobile money account (% without an account, age 15+)",
       'Reason for not having a mobile money account: use an agent or someone else to make payments (% age 15+)',
       'Reason for not having a mobile money account: use an agent or someone else to make payments (% without an account, age 15+)',
       'Reason for not having a mobile money account: do not have their own mobile phone (% age 15+)',
       'Reason for not having a mobile money account: do not have their own mobile phone (% without an account, age 15+)',
       'Reason for not using their inactive account: bank or financial institution is too far away (% age 15+)',
       'Reason for not using their inactive account: bank or financial institution is too far away (% with an inactive account, age 15+)',
       'Reason for not using their inactive account: no need for an account (% age 15+)',
       'Reason for not using their inactive account: no need for an account (% with an inactive account, age 15+)',
       "Reason for not using their inactive account: don't have enough money to use an account (% age 15+)",
       "Reason for not using their inactive account: don't have enough money to use an account (% with an inactive account, age 15+)",
       "Reason for not using their inactive account: don't feel comfortable using the account by themselves (% age 15+)",
       "Reason for not using their inactive account: don't feel comfortable using an account by themselves (% with an inactive account, age 15+)",
       "Reason for not using their inactive account: don't trust banks or financial institutions (% age 15+)",
       "Reason for not using their inactive account: don't trust banks or financial institutions (% with an inactive account, age 15+)",
       'Literacy rate, gender parity index (youth ages 15-24)',
       'Migrants, female (% of international migrant stock)',
       'People practicing open defecation (% of population)',
       'Employment to population ratio (%)',
       'Labor force participation rate (% of population)',
       'Rural population (%)', 'Urban population (%)',
       'Contraceptive prevalence (% of women ages 15-49)',
       'Pregnant women receiving prenatal care of at least four visits (% of pregnant women)',
       'Number of business owners',
       'Number of sole proprietors',
       'Women Business and the Law Index Score (scale 1-100)',
       'Women, Business and the Law: Assets Indicator Score (scale 1-100)',
       'Women, Business and the Law: Entrepreneurship Indicator Score (scale 1-100)',
       'Women, Business and the Law: Mobility Indicator Score (scale 1-100)',
       'Women, Business and the Law: Marriage Indicator Score (scale 1-100)',
       'Women, Business and the Law: Pension Indicator Score (scale 1-100)',
       'Women, Business and the Law: Parenthood Indicator Score (scale 1-100)',
       'Women, Business and the Law: Pay Indicator Score (scale 1-100)',
       'Women, Business and the Law: Workplace Indicator Score (scale 1-100)',
       'The law provides for the valuation of nonmonetary contributions (1=yes; 0=no)',
       'Women whose first experience of spousal physical or sexual violence was before marriageÂ\xa0 (% of currently married women age 15-49 who have been married only once)',
       'Women whose first experience of spousal physical or sexual violence was during marriageÂ\xa0 (% of currently married women age 15-49 who have been married only once)',
       'Women who have not experienced spousal physical or sexual violence (% of currently married women age 15-49 who have been married only once)',
       'Proportion of women who have ever experienced intimate partner violence (%)',
       'Women who have ever experienced violence committed by their husband/partner by type of violenceÂ\xa0 (% of ever-married women ages 15-49)',
       'Women who have experienced violence committed by their husband/partner in the last 12 months by type of violence (% of ever-married women ages 15-49)',
       'Women who experienced first sexual violence before age 15, 18, or 22 (% of women ages 15-49)',
       'Knowledge of contraception methods (%)',
       'Fertility Planning Status (% of births)']

In [105]:
# Narrow the long table down to the observations used for modelling, then split
# it by sex code.
df2 = (
    df_melt
    # Keep only indicators that are not on the exclusion list.
    .loc[lambda rows: ~rows['INDICATOR_LABEL'].isin(INDICATOR_NON_NEEDED)]
    # Discard the two age brackets that are not wanted.
    .loc[lambda rows: ~rows['AGE_LABEL'].isin(['under 15 years old', '15 to 24 years old'])]
    # For education spending keep only the 'ISIC4_P' breakdown; every other
    # indicator passes through untouched.
    .loc[lambda rows: (rows['INDICATOR_LABEL'] != 'Government expenditure on education (% of GDP)')
                      | (rows['COMP_BREAKDOWN_1'] == 'ISIC4_P')]
    # Helper/label columns are no longer needed once the filters have run.
    .drop(columns=['UNIT_MEASURE_LABEL', 'SEX_LABEL', 'REF_AREA_LABEL',
                   'AGE_LABEL', 'COMP_BREAKDOWN_1'])
)

# One frame per sex code: '_T' and '_F'.
df_T = df2.loc[df2['SEX'].eq('_T')]
df_F = df2.loc[df2['SEX'].eq('_F')]

In [108]:
def _pivot_by_indicator(long_frame, key_columns):
    """Return a wide frame with one column per INDICATOR_LABEL.

    `key_columns` identify a row of the result; they are pivoted as the index
    and then moved back into ordinary columns. The column-axis name left behind
    by the pivot is cleared.
    """
    wide = long_frame.pivot(
        index=key_columns,
        columns='INDICATOR_LABEL',
        values='VALUE',
    ).reset_index()
    wide.columns.name = None
    return wide


# Wide tables: one row per (area, year) for the '_T' data, and one row per
# (area, year, sex) for the '_F' data.
GS_T = _pivot_by_indicator(df_T, ['REF_AREA', 'YEAR'])
GS_F = _pivot_by_indicator(df_F, ['REF_AREA', 'YEAR', 'SEX'])

In [122]:
# A column survives only if at least 75% of the rows have a value for it;
# afterwards every row that still contains a missing value is removed.
min_non_null = int(len(GS_T) * 0.75)
GS_T = (
    GS_T
    .dropna(axis='columns', thresh=min_non_null)
    .dropna(axis='index', how='any')
)

In [116]:
# 80% of the current number of rows in GS_T.
# NOTE(review): the recorded output (9987.2) implies 12484 rows, i.e. this cell
# was evaluated before the NaN-pruning cell (In [122]) even though it sits after
# it; a top-to-bottom run would show a different value.
len(GS_T) * 0.8

9987.2

In [125]:
# (rows, columns) of the cleaned frame. The boolean mask from isna() has the
# same shape as the frame itself, so the shape is read directly.
GS_T.shape

(10206, 36)