# Relation of schooling years to education

In this notebook we analyze the correlation between years of schooling and proficiency in certain skills.

First we will load all the data and extract the relevant indicators (see notebook `11` for details):

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from matplotlib import pyplot as plt
import edstats_utils as edstats
%matplotlib inline

# some pandas options
# Show full cell contents instead of truncating them (indicator names are long).
# `None` means "no limit"; the former `-1` sentinel is deprecated since pandas 1.0
# and is no longer accepted by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Load the EdStats CSV tables; the path is relative to the notebook's working directory.
datapath = "./datasets/edstats"
df_country = pd.read_csv(os.path.join(datapath, "EdStatsCountry.csv"))
df_data = pd.read_csv(os.path.join(datapath, "EdStatsData.csv"))
df_series = pd.read_csv(os.path.join(datapath, "EdStatsSeries.csv"))
df_note = pd.read_csv(os.path.join(datapath, "EdStatsFootNote.csv"))
df_cseries = pd.read_csv(os.path.join(datapath, "EdStatsCountry-Series.csv"))

In [2]:
# Indicator codes of the Wittgenstein projections "population by highest level
# of education" restricted to the "Total" (both genders) series.
witt_total_pattern = 'Population in .* highest level .* Total'
list_witt_total = edstats.list_indicators(witt_total_pattern, df_data)
# Hard-coded fallback, should the lookup helper not be available:
#list_witt_total = ['PRJ.POP.ALL.S1.MF', 'PRJ.POP.ALL.1.MF', 'PRJ.POP.ALL.2.MF', 'PRJ.POP.ALL.3.MF', 'PRJ.POP.ALL.4.MF', 'PRJ.POP.ALL.NED.MF']
list_witt_total

['PRJ.POP.ALL.S1.MF',
 'PRJ.POP.ALL.2.MF',
 'PRJ.POP.ALL.NED.MF',
 'PRJ.POP.ALL.4.MF',
 'PRJ.POP.ALL.1.MF',
 'PRJ.POP.ALL.3.MF']

In [8]:
def _retrieve_indicator(df, reg_str):
    '''Retrieve all datapoints that contain the given indicator regex string.'''
    # retireve list of relevant indicator codes
    ls_inds = df[df['Indicator Name'].str.contains(reg_str, regex=True, case=False)]['Indicator Code'].tolist()
    # extract all relevant items
    return df[df['Indicator Code'].isin(ls_inds)]

In [46]:
# Select all Barro-Lee "Percentage of ..." indicators. This assignment must be
# live: otherwise `df_barro` is undefined after a kernel restart.
df_barro = _retrieve_indicator(df_data, 'Barro-Lee: Percentage of .*')

# Split the indicator name into its parts. Named groups become the columns
# `indicator`, `gender`, `age_group`, `schooling` and `completed`; the two
# unnamed helper groups show up as the integer-labelled columns 3 and 5.
# Raw string, so that `\+`, `\s` and `\S` reach the regex engine without
# invalid-escape warnings (the pattern itself is unchanged).
barro_pattern = r'Barro-Lee: (?P<indicator>Percentage of[ ]?(?P<gender>female)? population[ ]?(?P<age_group>age [0-9]+(-[0-9]+|\+))? with (?P<schooling>[\s\S]+?)(\. (?P<completed>[\s\S]+))?$)'
df_barro['Indicator Name'].str.extract(barro_pattern, expand=True)

Unnamed: 0,indicator,gender,age_group,3,schooling,5,completed
269,Percentage of female population age 15+ with no education,female,age 15+,+,no education,,
270,Percentage of female population age 15+ with primary schooling. Completed Primary,female,age 15+,+,primary schooling,. Completed Primary,Completed Primary
271,Percentage of female population age 15+ with primary schooling. Total (Incomplete and Completed Primary),female,age 15+,+,primary schooling,. Total (Incomplete and Completed Primary),Total (Incomplete and Completed Primary)
272,Percentage of female population age 15+ with secondary schooling. Completed Secondary,female,age 15+,+,secondary schooling,. Completed Secondary,Completed Secondary
273,Percentage of female population age 15+ with secondary schooling. Total (Incomplete and Completed Secondary),female,age 15+,+,secondary schooling,. Total (Incomplete and Completed Secondary),Total (Incomplete and Completed Secondary)
274,Percentage of female population age 15+ with tertiary schooling. Completed Tertiary,female,age 15+,+,tertiary schooling,. Completed Tertiary,Completed Tertiary
275,Percentage of female population age 15+ with tertiary schooling. Total (Incomplete and Completed Tertiary),female,age 15+,+,tertiary schooling,. Total (Incomplete and Completed Tertiary),Total (Incomplete and Completed Tertiary)
276,Percentage of female population age 15-19 with no education,female,age 15-19,-19,no education,,
277,Percentage of female population age 15-19 with primary schooling. Completed Primary,female,age 15-19,-19,primary schooling,. Completed Primary,Completed Primary
278,Percentage of female population age 15-19 with primary schooling. Total (Incomplete and Completed Primary),female,age 15-19,-19,primary schooling,. Total (Incomplete and Completed Primary),Total (Incomplete and Completed Primary)


In [29]:
# Put the parsed indicator / age-group / gender columns next to the
# Wittgenstein rows they were extracted from (aligned on the row index).
witt_group_cols = df_witt_groups[['indicator', 'age_group', 'gender']]
pd.concat([df_witt, witt_group_cols], axis=1)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2075,2080,2085,2090,2095,2100,Unnamed: 69,indicator,age_group,gender
3349,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 0-19. Female,PRJ.MYS.0T19.FE,,,,,,,...,,,,,,,,Mean years of schooling. Age 0-19. Female,Age 0-19,Female
3350,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 0-19. Male,PRJ.MYS.0T19.MA,,,,,,,...,,,,,,,,Mean years of schooling. Age 0-19. Male,Age 0-19,Male
3351,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 0-19. Total,PRJ.MYS.0T19.MF,,,,,,,...,,,,,,,,Mean years of schooling. Age 0-19. Total,Age 0-19,Total
3352,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15+. Female,PRJ.MYS.15UP.FE,,,,,,,...,,,,,,,,Mean years of schooling. Age 15+. Female,Age 15+,Female
3353,Arab World,ARB,Wittgenstein Projection: Mean Years of Schooling. Age 15+. Gender Gap,PRJ.MYS.15UP.GPI,,,,,,,...,,,,,,,,Mean Years of Schooling. Age 15+. Gender Gap,Age 15+,Gender Gap
3354,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15+. Male,PRJ.MYS.15UP.MA,,,,,,,...,,,,,,,,Mean years of schooling. Age 15+. Male,Age 15+,Male
3355,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15+. Total,PRJ.MYS.15UP.MF,,,,,,,...,,,,,,,,Mean years of schooling. Age 15+. Total,Age 15+,Total
3356,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15-19. Female,PRJ.MYS.1519.FE,,,,,,,...,,,,,,,,Mean years of schooling. Age 15-19. Female,Age 15-19,Female
3357,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15-19. Male,PRJ.MYS.1519.MA,,,,,,,...,,,,,,,,Mean years of schooling. Age 15-19. Male,Age 15-19,Male
3358,Arab World,ARB,Wittgenstein Projection: Mean years of schooling. Age 15-19. Total,PRJ.MYS.1519.MF,,,,,,,...,,,,,,,,Mean years of schooling. Age 15-19. Total,Age 15-19,Total
