# Relation of schooling years to education

In this notebook we analyze the correlation between years of schooling and proficiency in specific skills.

First we will load all the data and extract the relevant indicators (see notebook `11` for details):

In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from matplotlib import pyplot as plt
import edstats_utils as edstats
%matplotlib inline

# pandas display options: never truncate the text shown in a cell, so long
# indicator names stay fully readable. `None` is the supported "no limit" value;
# the older `-1` spelling was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Directory that holds the EdStats CSV files; every table below is read from it.
datapath = "./datasets/edstats"


def _read_edstats(filename):
    '''Read a single EdStats CSV file located in `datapath` into a DataFrame.'''
    return pd.read_csv(os.path.join(datapath, filename))


# One DataFrame per CSV file of the EdStats dataset.
df_country = _read_edstats("EdStatsCountry.csv")
df_data = _read_edstats("EdStatsData.csv")
df_series = _read_edstats("EdStatsSeries.csv")
df_note = _read_edstats("EdStatsFootNote.csv")
df_cseries = _read_edstats("EdStatsCountry-Series.csv")

In [2]:
# Codes of the "Total" Wittgenstein projection indicators: those whose name
# matches "Population in ... highest level ... Total".
witt_total_regex = 'Population in .* highest level .* Total'
list_witt_total = edstats.list_indicators(witt_total_regex, df_data)
list_witt_total

['PRJ.POP.ALL.S1.MF',
 'PRJ.POP.ALL.2.MF',
 'PRJ.POP.ALL.NED.MF',
 'PRJ.POP.ALL.4.MF',
 'PRJ.POP.ALL.1.MF',
 'PRJ.POP.ALL.3.MF']

In [8]:
def _retrieve_indicator(df, reg_str):
    '''Retrieve all datapoints that contain the given indicator regex string.'''
    # retireve list of relevant indicator codes
    ls_inds = df[df['Indicator Name'].str.contains(reg_str, regex=True, case=False)]['Indicator Code'].tolist()
    # extract all relevant items
    return df[df['Indicator Code'].isin(ls_inds)]

In [73]:
# all PIAAC rows that report a share of adults per proficiency level
df_skill = _retrieve_indicator(df_data, 'PIAAC: .* proficiency level')

# Regex that splits the indicator name into its parts. Written as raw strings so
# that `\(`, `\)` and `\S` reach the regex engine without triggering Python's
# invalid-escape-sequence warning; the pattern itself is unchanged.
piaac_name_regex = (
    r'PIAAC: (?P<indicator>'
    r'(?P<gender>(Male|Female))?[ ]*'      # optional gender prefix
    r'(?P<type>Young)?[ ]*'                # optional "Young" prefix
    r'[aA]dults by .*?'
    r'(?P<subject>((?<=proficiency level in ).*|.*(?= proficiency level)))'
    r'.*? \(%\). '
    r'(?P<level>[\S ]+)'                   # e.g. "Below Level 1", "Level 3"
    r')'
)

# extract columns
df_skill_groups = df_skill['Indicator Name'].str.extract(piaac_name_regex, expand=True)

# Append the parsed parts to the raw rows; names without a gender / "Young"
# prefix are labelled 'Total' / 'all'.
pd.concat([df_skill, df_skill_groups[['indicator', 'gender', 'type', 'subject', 'level']].fillna({'gender': 'Total', 'type': 'all'})], axis=1)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2085,2090,2095,2100,Unnamed: 69,indicator,gender,type,subject,level
2085,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Below Level 1,LO.PIAAC.LIT.BE,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Below Level 1,Total,all,literacy,Below Level 1
2086,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Level 1,LO.PIAAC.LIT.1,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Level 1,Total,all,literacy,Level 1
2087,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Level 2,LO.PIAAC.LIT.2,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Level 2,Total,all,literacy,Level 2
2088,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Level 3,LO.PIAAC.LIT.3,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Level 3,Total,all,literacy,Level 3
2089,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Level 4,LO.PIAAC.LIT.4,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Level 4,Total,all,literacy,Level 4
2090,Arab World,ARB,PIAAC: Adults by literacy proficiency level (%). Level 5,LO.PIAAC.LIT.5,,,,,,,...,,,,,,Adults by literacy proficiency level (%). Level 5,Total,all,literacy,Level 5
2091,Arab World,ARB,PIAAC: Adults by numeracy proficiency level (%). Below Level 1,LO.PIAAC.NUM.BE,,,,,,,...,,,,,,Adults by numeracy proficiency level (%). Below Level 1,Total,all,numeracy,Below Level 1
2092,Arab World,ARB,PIAAC: Adults by numeracy proficiency level (%). Level 1,LO.PIAAC.NUM.1,,,,,,,...,,,,,,Adults by numeracy proficiency level (%). Level 1,Total,all,numeracy,Level 1
2093,Arab World,ARB,PIAAC: Adults by numeracy proficiency level (%). Level 2,LO.PIAAC.NUM.2,,,,,,,...,,,,,,Adults by numeracy proficiency level (%). Level 2,Total,all,numeracy,Level 2
2094,Arab World,ARB,PIAAC: Adults by numeracy proficiency level (%). Level 3,LO.PIAAC.NUM.3,,,,,,,...,,,,,,Adults by numeracy proficiency level (%). Level 3,Total,all,numeracy,Level 3


In [53]:
# Attach the parsed name parts to the raw rows. `pd.concat(axis=1)` aligns on
# the index, so a row present in only one of the two frames comes out NaN-filled.
parsed_parts = df_witt_groups[['indicator', 'age_group', 'gender']]
df = pd.concat([df_witt, parsed_parts], axis=1)

# Year columns: yearly from 1970 to 2017, then every fifth year from 2020 to 2100.
# `years` ends up holding every year label except the ones listed in `rm`.
rm = ['2010', '2015']
years = [
    label
    for label in map(str, [*range(1970, 2018), *range(2020, 2101, 5)])
    if label not in rm
]

# show the frame without the irrelevant year columns
df.drop(columns=years)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2010,2015,Unnamed: 69,indicator,age_group,gender
3349,,,,,,,,Mean years of schooling. Age 0-19. Female,Age 0-19,Female
3350,,,,,,,,Mean years of schooling. Age 0-19. Male,Age 0-19,Male
3351,,,,,,,,Mean years of schooling. Age 0-19. Total,Age 0-19,Total
3352,,,,,,,,Mean years of schooling. Age 15+. Female,Age 15+,Female
3353,,,,,,,,Mean Years of Schooling. Age 15+. Gender Gap,Age 15+,Gender Gap
3354,,,,,,,,Mean years of schooling. Age 15+. Male,Age 15+,Male
3355,,,,,,,,Mean years of schooling. Age 15+. Total,Age 15+,Total
3356,,,,,,,,Mean years of schooling. Age 15-19. Female,Age 15-19,Female
3357,,,,,,,,Mean years of schooling. Age 15-19. Male,Age 15-19,Male
3358,,,,,,,,Mean years of schooling. Age 15-19. Total,Age 15-19,Total
