In [1]:
import requests, zipfile, io
import pandas as pd
import matplotlib.pyplot as plt
import os
import matplotlib as mpl
import datetime as dt
import seaborn as sns

idx = pd.IndexSlice  # shorthand for slicing MultiIndex frames with .loc

# Apply the style sheet FIRST: the poster style carries its own figure.figsize,
# so calling it after the rcParams assignments silently undoes the size below.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one is available.
plt.style.use(
    'seaborn-v0_8-poster'
    if 'seaborn-v0_8-poster' in plt.style.available
    else 'seaborn-poster'
)

# Notebook-specific overrides layered on top of the style sheet.
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams.update({'font.size': 22})
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False

# 1. Download zip file
Data source: World Bank gender statistics — <https://www.worldbank.org/en/data/datatopics/gender>

In [3]:
file = 'Gender_StatsEXCEL.xlsx'
# Check for the exact workbook path. This also works on a fresh checkout where
# the 'Input' folder does not exist yet (os.listdir would raise there), and it
# is not fooled by look-alike names that merely contain the workbook's name.
if os.path.isfile(os.path.join('Input', file)):
    print('File already downloaded')
else:
    zip_file_url = 'http://databank.worldbank.org/data/download/Gender_stats_excel.zip'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(zip_file_url, timeout=60)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile.
    r.raise_for_status()
    # Close the in-memory archive once its members have been written to disk.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('Input/')
    

# 2. Explore data

## 2.1 Sheet names

In [4]:
# Path to the extracted workbook, relative to the notebook's working directory.
file = 'Input/Gender_StatsEXCEL.xlsx'

# Open the workbook and list the sheets it contains.
xl = pd.ExcelFile(file)
xl.sheet_names

['Data', 'Country', 'Series', 'Country-Series', 'Series-Time', 'FootNote']

## 2.2 Available series

In [5]:
# Indicator metadata lives on the 'Series' sheet; preview the first two rows.
se = pd.read_excel(file, sheet_name='Series')
se.iloc[:2]

Unnamed: 0,Series Code,Topic,Indicator Name,Short definition,Long definition,Unit of measure,Periodicity,Base Period,Other notes,Aggregation method,Limitations and exceptions,Notes from original source,General comments,Source,Statistical concept and methodology,Development relevance,Related source links,Other web links,Related indicators,License Type
0,fin1.t.a,Assets,Financial institution account (% age 15+),The percentage of respondents who report havin...,The percentage of respondents who report havin...,Percent,Triennial,,,Weighted Average,,,,Global Findex database,,,,,,
1,fin1.t.a.1,Assets,"Financial institution account,male(% age 15+)",The percentage of respondents who report havin...,The percentage of respondents who report havin...,Percent,Triennial,,,Weighted Average,,,,Global Findex database,,,,,,


### 2.2.1 Topics

In [6]:
# Distinct topic labels (the result includes NaN for indicators with no topic).
se['Topic'].unique()

array(['Assets', 'Technology', 'Entrepreneurship',
       'Economic and Social Context', 'Health', 'Education', 'Leadership',
       'Employment and Time Use', 'Norms and Decision-making', 'Violence',
       'Population', nan, 'Environment'], dtype=object)

### 2.2.2 Search by category

In [7]:
def search(topic, series=None):
    """Return the unique indicator names filed under ``topic``.

    Parameters
    ----------
    topic : str or missing value
        Topic label, matched exactly against the ``Topic`` column. Passing a
        missing value (``None`` / ``NaN``) selects the indicators that have no
        topic, which a plain ``==`` comparison can never match.
    series : pandas.DataFrame, optional
        Table with ``Topic`` and ``Indicator Name`` columns. Defaults to the
        notebook-level ``se`` frame.

    Returns
    -------
    array
        Unique ``Indicator Name`` values in order of first appearance; empty
        when no row matches.
    """
    frame = se if series is None else series
    topics = frame['Topic']
    # NaN != NaN, so missing topics need an explicit isna() mask.
    if pd.api.types.is_scalar(topic) and pd.isna(topic):
        mask = topics.isna()
    else:
        mask = topics == topic
    return frame.loc[mask, 'Indicator Name'].unique()

In [8]:
# First four indicator names filed under the employment topic.
search(topic='Employment and Time Use')[:4]

array(['There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)',
       'The age at which men and women can retire with full pension benefits is the same (1=yes; 0=no)',
       'The mandatory retirement age for men and women is the same (1=yes; 0=no)',
       'The age at which men and women can retire with partial pension benefits is the same (1=yes; 0=no)'],
      dtype=object)

# 3. Import all data

In [9]:
# Name the sheet explicitly rather than relying on 'Data' being the first sheet
# in the workbook (the default), mirroring how the 'Series' sheet is loaded.
df = pd.read_excel(file, sheet_name='Data')
df.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Africa Eastern and Southern,AFE,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,,,,,,,...,,,,,,,,,,


In [10]:
# Compact summary only: verbose=False skips the per-column listing.
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254665 entries, 0 to 254664
Columns: 66 entries, Country Name to 2021
dtypes: float64(62), object(4)
memory usage: 128.2+ MB


## 3.1 Wide to long

In [11]:
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns='Country Code', errors='ignore')

In [12]:
# Unpivot: every non-id column becomes a (variable, value) pair on its own row.
df = pd.melt(df, id_vars=['Country Name', 'Indicator Name', 'Indicator Code'])
df.iloc[:2]

Unnamed: 0,Country Name,Indicator Name,Indicator Code,variable,value
0,Africa Eastern and Southern,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,1960,
1,Africa Eastern and Southern,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,1960,


In [13]:
# Give the melted column a meaningful name, then index by (country, date).
df = df.rename(columns={'variable': 'date'})
df = df.set_index(['Country Name', 'date'])
df.iloc[:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicator Name,Indicator Code,value
Country Name,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa Eastern and Southern,1960,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,
Africa Eastern and Southern,1960,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,
