# Exploration of Cedefop Skills Forecast 2030 data set
Felix Zaussinger | 04.05.2021

## Core Analysis Goal(s)
1. Understand structure of dataset

## Key Insight(s)
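1. The forecast is split into a `Supply` table, a `Demand` table and a `lookups` table that maps the coded values of each variable to readable names.
2. Supply has 294,624 rows described by `popLF`, `year`, `country`, `gender`, `ageband`, `qualification` and `adjustment`, plus a numeric `weight`.
3. Demand has 16,609,428 rows described by `year`, `country`, `industry`, `occupation`, `qualification` and `adjustment`, plus a numeric `weight`; it occupies about 222 MB in memory even with categorical dtypes.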

In [1]:
import os
import sys
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Order matters: sns.set() re-applies seaborn's default "notebook" context and
# default style, so it has to run first. Calling it after set_context("poster")
# would silently discard the poster scaling.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_context("poster")
sns.set_style("ticks")

import pandas as pd

# Show up to 120 rows and 120 columns before pandas truncates a displayed frame.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 120)

# Route log records of level INFO and above to stdout so they appear in cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Define directory structure

In [119]:
# Project root: the parent of the current working directory
# (os.path.abspath('') resolves to the directory the kernel runs in).
abspath = os.path.abspath('')
project_dir = str(Path(abspath).parents[0])

# Data sub-directories, one per processing stage
data_raw, data_interim, data_processed = (
    os.path.join(project_dir, "data", stage)
    for stage in ("raw", "interim", "processed")
)

# Output location for figures
figure_dir = os.path.join(project_dir, "plots")

## Cedefop Skills Forecast data set components

In [132]:
# Target dtypes for the decoded tables (passed to DataFrame.astype further down):
# every dimension column becomes categorical, the weight stays a float.
dtypes_supply = {
    **dict.fromkeys(
        ["popLF", "year", "country", "gender", "ageband", "qualification", "adjustment"],
        "category",
    ),
    "weight": "float",
}

dtypes_demand = {
    **dict.fromkeys(
        ["year", "country", "industry", "occupation", "qualification", "adjustment"],
        "category",
    ),
    "weight": "float",
}

# Load the three forecast components from the interim data directory.
cedefop_interim_dir = os.path.join(data_interim, "cedefop_skills_forecast")
demand, supply, lookups = (
    pd.read_csv(os.path.join(cedefop_interim_dir, file_name))
    for file_name in ("Demand.csv", "Supply.csv", "lookups.csv")
)

In [133]:
# Only the last expression of a cell is rendered, so a bare `supply` line is
# evaluated and then discarded. Display it explicitly; `demand` remains the
# cell's final expression and is shown as before.
display(supply)
demand

Unnamed: 0,year,country,industry,occupation,qualification,adjustment,weight
0,2000,1,1,1,1,1,0.000000
1,2000,1,1,1,1,2,0.000000
2,2000,1,1,1,2,1,0.000000
3,2000,1,1,1,2,2,0.000000
4,2000,1,1,1,3,1,0.000000
...,...,...,...,...,...,...,...
16609423,2030,33,66,41,1,2,1285.680494
16609424,2030,33,66,41,2,1,51.731323
16609425,2030,33,66,41,2,2,51.731323
16609426,2030,33,66,41,3,1,12.448825


## Decode supply and demand data using lookup table

In [128]:
# Decode the supply table: replace the coded values of each column with the
# names listed for that column in `lookups` (columns: variable, value, name).
supply_decoded = supply.copy()
for col in supply_decoded.columns:
    print(col)

    # Filter the lookup table once per column and build the code -> name mapping.
    col_lookup = lookups[lookups.variable == col]
    renamer = dict(zip(col_lookup["value"].values, col_lookup["name"].values))

    # A column without lookup entries has nothing to decode; skipping it avoids
    # a no-op replace pass over the whole column.
    if not renamer:
        continue

    supply_decoded[col] = supply_decoded[col].replace(to_replace=renamer)

# Cast the decoded columns to their target dtypes (categoricals + float weight).
supply_decoded = supply_decoded.astype(dtypes_supply)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (e.g. on a fresh checkout).
supply_out_dir = os.path.join(data_processed, "cedefop_skills_forecast")
os.makedirs(supply_out_dir, exist_ok=True)
supply_decoded.to_csv(os.path.join(supply_out_dir, "Supply_decoded.csv"))
supply_decoded.info()

popLF
year
country
gender
ageband
qualification
adjustment
weight
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294624 entries, 0 to 294623
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   popLF          294624 non-null  category
 1   year           294624 non-null  category
 2   country        294624 non-null  category
 3   gender         294624 non-null  category
 4   ageband        294624 non-null  category
 5   qualification  294624 non-null  category
 6   adjustment     294624 non-null  category
 7   weight         294624 non-null  float64 
dtypes: category(7), float64(1)
memory usage: 4.2 MB


In [134]:
# Decode the demand table: replace the coded values of each column with the
# names listed for that column in `lookups` (columns: variable, value, name).
demand_decoded = demand.copy()
for col in demand_decoded.columns:
    print(col)

    # Filter the lookup table once per column and build the code -> name mapping.
    col_lookup = lookups[lookups.variable == col]
    renamer = dict(zip(col_lookup["value"].values, col_lookup["name"].values))

    # A column without lookup entries has nothing to decode; skipping it avoids
    # a no-op replace pass over the whole (multi-million row) column.
    if not renamer:
        continue

    demand_decoded[col] = demand_decoded[col].replace(to_replace=renamer)

# Cast the decoded columns to their target dtypes (categoricals + float weight).
demand_decoded = demand_decoded.astype(dtypes_demand)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (e.g. on a fresh checkout).
demand_out_dir = os.path.join(data_processed, "cedefop_skills_forecast")
os.makedirs(demand_out_dir, exist_ok=True)
demand_decoded.to_csv(os.path.join(demand_out_dir, "Demand_decoded.csv"))
demand_decoded.info()

year
country
industry
occupation
qualification
adjustment
weight
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16609428 entries, 0 to 16609427
Data columns (total 7 columns):
 #   Column         Dtype   
---  ------         -----   
 0   year           category
 1   country        category
 2   industry       category
 3   occupation     category
 4   qualification  category
 5   adjustment     category
 6   weight         float64 
dtypes: category(6), float64(1)
memory usage: 221.8 MB


In [135]:
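# Optional round-trip check (disabled): re-read the decoded supply table with the
# categorical dtypes; index_col=0 picks up the index column that to_csv wrote.
# test = pd.read_csv(os.path.join(data_processed, "cedefop_skills_forecast", "Supply_decoded.csv"), dtype=dtypes_supply, index_col=0)
# test.info()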