# Skills Space
Felix Zaussinger | 21.07.2021

## Core Analysis Goal(s)
1. visualise adjacency matrix of skills space

## Key Insight(s)
1.

In [1]:
import os
import sys
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2


import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# NOTE(review): the three seaborn calls below are order-dependent. sns.set()
# presumably re-applies seaborn's default context ("notebook") because no
# `context=` argument is passed, which would override the "poster" context set
# on the line before it -- confirm against the seaborn docs and reorder (or pass
# context="poster" to sns.set) if poster-sized scaling is actually wanted.
sns.set_context("poster")
# default figure size for all plots in this notebook (width, height)
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("ticks")

import pandas as pd
# raise the pandas display limits to 120 rows / 120 columns
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

import networkx as nx
# route log records of level INFO and above to stdout
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

Define directory structure

In [2]:
# Project root: the parent of the current working directory
# (os.path.abspath('') resolves to the working directory).
abspath = os.path.abspath('')
project_dir = str(Path(abspath).parents[0])

# Data sub-directories, one per processing stage under <project>/data
data_raw, data_interim, data_processed, data_external = (
    os.path.join(project_dir, "data", stage)
    for stage in ("raw", "interim", "processed", "external")
)

# Output location for saved figures
figure_dir = os.path.join(project_dir, "reports", "figures")

Read data

In [3]:
# skills_en.csv from the ESCO v1.0.3 folder of the raw data directory
skills_csv = os.path.join(data_raw, "esco", "v1.0.3", "skills_en.csv")
skills = pd.read_csv(skills_csv)

In [4]:
#adj_matrix = pd.read_pickle(
#        os.path.join(project_dir, "data", "processed", "adjacency_matrix.pkl")
#)

In [5]:
# A = adj_matrix.values
# At = A.transpose()

# np.matmul(At, A)

Read skills adjacency matrix from Skilllab

In [6]:
# nesta report
# Skill-to-skill matrix stored as a .npy file in the external data directory.
sim_skills_path = os.path.join(data_external, "Neighborhood_Model_skill_to_skill.npy")
sim_skills = np.load(sim_skills_path)

Remove links below a certain threshold (weakly connected skills). Currently disabled: the thresholding code below is commented out, so all links are kept.

In [7]:
#w_thresh = 0.001
#sim_skills[sim_skills < w_thresh] = np.nan

Load subset of full graph (`subset` is currently set to the full matrix size, so the whole graph is loaded)

In [None]:
# `subset` is the number of leading rows/columns to keep; it equals the row
# count here, so the slice below selects the top-left subset x subset corner.
subset = len(sim_skills)
adjacency_sub = sim_skills[:subset, :subset]
Gsub = nx.from_numpy_array(adjacency_sub)

Apply threshold to remove irrelevant edges (currently disabled: the code below is commented out)

In [None]:
#w_thresh = 0.01
#edge_weights = nx.get_edge_attributes(Gsub,'weight')
#Gsub.remove_edges_from((e for e, w in edge_weights.items() if w < w_thresh))

In [None]:
def _values_as_array(score_by_node):
    """Return the values of a node -> score mapping as a NumPy array."""
    return np.array(list(score_by_node.values()))


# Node-level metrics of Gsub; every array follows the iteration order of the
# mapping that networkx returns. All calls use the networkx defaults.
closeness_centrality = _values_as_array(nx.closeness_centrality(Gsub))
degree_centrality = _values_as_array(nx.degree_centrality(Gsub))
betweenness_centrality = _values_as_array(nx.betweenness_centrality(Gsub))
eigenvector_centrality = _values_as_array(nx.eigenvector_centrality(Gsub))
clustering_coefficient = _values_as_array(nx.clustering(Gsub))

# One attribute record per node index: the skill label found at the same
# position in `skills`, plus the metrics computed above.
attr_dict = {
    i: {
        "label": skills.preferredLabel.values[i],
        "closeness_centrality": closeness_centrality[i],
        "degree_centrality": degree_centrality[i],
        "betweenness_centrality": betweenness_centrality[i],
        "eigenvector_centrality": eigenvector_centrality[i],
        "clustering_coefficient": clustering_coefficient[i],
    }
    for i in np.arange(subset)
}

# set attributes
nx.set_node_attributes(Gsub, attr_dict)

In [None]:
# edge -> weight mapping of Gsub, and the bare weight values in the same order
weights = nx.get_edge_attributes(Gsub, 'weight')
weights_array = [*weights.values()]

In [None]:
# Start with one label per node index (skill name at the same position in `skills`).
node_labels = dict(zip(np.arange(subset), skills.preferredLabel.values[:subset]))

# remove labels for non-central skills
centrality_thresh = 0.01
for i, c in enumerate(betweenness_centrality):
    if c <= centrality_thresh:
        node_labels[i] = ""

# plot
# Compute the layout once and pass it to the drawing call. Previously `pos` was
# computed but never handed to draw_networkx, which then ran its own spring
# layout: the costly layout was computed twice and `pos` did not describe the
# saved figure. The fixed seed makes the figure reproducible across runs.
layout_seed = 42
pos = nx.spring_layout(Gsub, seed=layout_seed)
#pos = nx.kamada_kawai_layout(Gsub)

nx.draw_networkx(
    Gsub,
    pos=pos,
    labels=node_labels,
    font_size=6,
    horizontalalignment="left",
    verticalalignment="top",
    # node area scales with betweenness centrality
    node_size=betweenness_centrality * 1000,
    # edges are coloured by weight on a 0..max(weight) scale
    edge_cmap=plt.cm.Blues,
    edge_color=weights_array,
    edge_vmin=0,
    edge_vmax=max(weights_array),
    font_color="lightgrey"
)

plt.box(False)
plt.tight_layout()

# make sure the output folder exists so the save does not fail after the
# expensive layout/drawing steps
os.makedirs(figure_dir, exist_ok=True)
plt.savefig(
    os.path.join(figure_dir, "skills_centrality_all_edges.png"),
    dpi=300,
    bbox_inches="tight"
)

In [None]:
# Column-wise collection of the node metrics; labels are cut to the first
# `subset` skills so that every column has the same length.
skills_centrality = dict(
    label=skills.preferredLabel.values[:subset],
    closeness_centrality=closeness_centrality,
    degree_centrality=degree_centrality,
    betweenness_centrality=betweenness_centrality,
    eigenvector_centrality=eigenvector_centrality,
    clustering_coefficient=clustering_coefficient,
)

df_skills_centrality = pd.DataFrame(skills_centrality)

# from kanders 2020
def coreness(b, e, c):
    """Measure of node coreness proposed by Kanders et al. 2020.

    Computes ``0.5 * (b / max(b) + e / max(e)) * (1 - c)`` element-wise.

    Parameters
    ----------
    b, e : NumPy array or pandas Series
        Per-node scores that are rescaled by their own maximum. In this
        notebook they are the betweenness and eigenvector centralities.
    c : NumPy array or pandas Series
        Per-node score entering as ``(1 - c)``. In this notebook it is the
        clustering coefficient.

    Returns
    -------
    Same container type as the inputs (a pandas Series keeps its index).

    Notes
    -----
    If the maximum of ``b`` or ``e`` is exactly 0 (every node scores 0), the
    rescaled term is defined as 0 instead of evaluating 0/0, which would
    make every coreness value NaN.
    """
    def _scale_by_max(values):
        # .max() is vectorised and, for pandas Series, skips NaN; the builtin
        # max() iterates element by element and is order-dependent with NaN.
        peak = values.max()
        if peak == 0:
            # keep shape/index/float dtype while avoiding the 0/0 division
            return values * 0.0
        return values / peak

    return 0.5 * (_scale_by_max(b) + _scale_by_max(e)) * (1 - c)

# Derive the coreness score from the betweenness, eigenvector and clustering
# columns and store it as a new column.
df_skills_centrality["coreness"] = coreness(
    df_skills_centrality["betweenness_centrality"],
    df_skills_centrality["eigenvector_centrality"],
    df_skills_centrality["clustering_coefficient"],
)

# Persist the full table to the processed-data directory.
coreness_csv = os.path.join(data_processed, "skills_coreness_all_edges.csv")
df_skills_centrality.to_csv(coreness_csv)

In [None]:
# export graph files (GEXF and GraphML) into the "networks" folder of the raw data directory
networks_dir = os.path.join(data_raw, "networks")
nx.write_gexf(Gsub, os.path.join(networks_dir, "skills_network_all_edges.gexf"))
nx.write_graphml(Gsub, os.path.join(networks_dir, "skills_network_all_edges.graphml"))

In [None]:
# display the skills ordered from highest to lowest coreness
df_skills_centrality.sort_values(by="coreness", ascending=False)