In [1]:
import json

from collections import Counter

import pandas as pd

# Data exploration

In [2]:
# Relative path to the JSON dataset that the rest of this notebook loads and explores.
DATA_FILE_PATH: str = "../data/data.json"

In [3]:
# Read the whole dataset into memory as parsed JSON.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding (JSON files are conventionally UTF-8).
with open(DATA_FILE_PATH, "r", encoding="utf-8") as file:
    data = json.load(file)

In [4]:
# Check which top-level container type the parsed JSON was loaded into.
type(data)

list

In [5]:
# Keep the first record as a sample and display it to inspect the record structure.
example = data[0]
example

{'talent': {'languages': [{'rating': 'C2', 'title': 'German'},
   {'rating': 'C2', 'title': 'English'},
   {'rating': 'B2', 'title': 'French'},
   {'rating': 'A2', 'title': 'Turkish'}],
  'job_roles': ['frontend-developer',
   'backend-developer',
   'full-stack-developer',
   'java-developer',
   'mobile-developer'],
  'seniority': 'junior',
  'salary_expectation': 48000,
  'degree': 'bachelor'},
 'job': {'languages': [{'title': 'German', 'rating': 'C1', 'must_have': True},
   {'title': 'English', 'rating': 'B2', 'must_have': True}],
  'job_roles': ['frontend-developer'],
  'seniorities': ['junior', 'midlevel'],
  'max_salary': 70000,
  'min_degree': 'none'},
 'label': True}

In [6]:
# Print each top-level field of the sample record on its own bullet line.
for field_name in example:
    print(f"- {field_name}: {example[field_name]}")

- talent: {'languages': [{'rating': 'C2', 'title': 'German'}, {'rating': 'C2', 'title': 'English'}, {'rating': 'B2', 'title': 'French'}, {'rating': 'A2', 'title': 'Turkish'}], 'job_roles': ['frontend-developer', 'backend-developer', 'full-stack-developer', 'java-developer', 'mobile-developer'], 'seniority': 'junior', 'salary_expectation': 48000, 'degree': 'bachelor'}
- job: {'languages': [{'title': 'German', 'rating': 'C1', 'must_have': True}, {'title': 'English', 'rating': 'B2', 'must_have': True}], 'job_roles': ['frontend-developer'], 'seniorities': ['junior', 'midlevel'], 'max_salary': 70000, 'min_degree': 'none'}
- label: True


## Job data

### Languages overview

In [7]:
# Flatten the language entries of every job ad into one list, preserving order.
job_language_entries = [
    entry for record in data for entry in record["job"]["languages"]
]

# Pull each attribute into its own list; a placeholder string marks a missing key.
title_languages = [
    entry.get("title", "no title") for entry in job_language_entries
]
rating_languages = [
    entry.get("rating", "no rating") for entry in job_language_entries
]
must_have_languages = [
    entry.get("must_have", "no must_have") for entry in job_language_entries
]

In [8]:
# Tally how often each language title appears across the job language entries.
Counter(title_languages)

Counter({'German': 2000, 'English': 1318})

In [9]:
# Tally how often each rating value appears across the job language entries.
Counter(rating_languages)

Counter({'C1': 1765, 'B2': 888, 'C2': 424, 'B1': 241})

In [10]:
# Tally the must_have flag values collected from the job language entries.
Counter(must_have_languages)

Counter({True: 2806, False: 512})

Not every language listed in a job ad is mandatory (`must_have` is `False` for 512 of the 3,318 entries), so the features we create will need to distinguish required languages from optional ones.

### Minimum degree overview

In [11]:
# Minimum degree stated by each job ad, followed by the tally per degree value.
min_degrees = [record["job"]["min_degree"] for record in data]

Counter(min_degrees)

Counter({'none': 899,
         'bachelor': 467,
         'master': 245,
         'apprenticeship': 195,
         'doctorate': 194})

### Job roles

In [12]:
# Number of roles listed by each job ad, followed by the distribution of those counts.
job_roles_number = [len(record["job"]["job_roles"]) for record in data]

Counter(job_roles_number)

Counter({1: 1303, 2: 538, 3: 137, 5: 22})

In [13]:
# Print the role list of every job ad that names five or more roles.
for roles in (record["job"]["job_roles"] for record in data):
    if len(roles) < 5:
        continue
    print(roles)

['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack-developer', 'c-net-developer', 'java-developer']
['frontend-developer', 'backend-developer', 'full-stack

A job offer can list more than one matching role, so we must take this into account in the **Feature Engineering** phase.

## Talent data

### Languages overview

In [14]:
# Collect the title and rating of every language entry attached to a talent.
# `title_languages` and `rating_languages` are re-bound here, so from this cell
# on they describe talents rather than jobs.
# No `must_have` value is gathered in this loop, so `must_have_languages` is
# deliberately left untouched: resetting it here would silently wipe the
# job-side tally and make the earlier `Counter(must_have_languages)` cell
# show an empty result when re-run.
title_languages = []
rating_languages = []

for batch in data:
    talent_languages = batch["talent"]["languages"]
    for language_dict in talent_languages:
        # A placeholder string marks entries that lack the key.
        title_languages.append(language_dict.get("title", "no title"))
        rating_languages.append(language_dict.get("rating", "no rating"))

In [15]:
# Tally how often each language title appears across the talent language entries.
Counter(title_languages)

Counter({'English': 1997,
         'German': 1996,
         'French': 416,
         'Spanish': 313,
         'Russian': 132,
         'Turkish': 106,
         'Arabic': 87,
         'Italian': 67,
         'Polish': 49,
         'Portuguese': 31,
         'Dutch': 31,
         'Chinese': 28,
         'Persian': 27,
         'Japanese': 26,
         'Croatian': 21,
         'Swedish': 21,
         'Hindi': 15,
         'Greek': 10,
         'Serbian': 10,
         'Romanian': 8,
         'Hungarian': 8,
         'Albanian': 7,
         'Korean': 6,
         'Danish': 5,
         'Czech': 5,
         'Bulgarian': 4,
         'Hebrew': 4,
         'Tamil': 3,
         'Norwegian': 2,
         'Slovak': 1,
         'Bengalese': 1,
         'Macedonian': 1,
         'Finnish': 1,
         'Armenian': 1,
         'Latvian': 1})

In [16]:
# Tally how often each rating value appears across the talent language entries.
Counter(rating_languages)

Counter({'C2': 2230, 'C1': 1011, 'B2': 698, 'A2': 640, 'A1': 464, 'B1': 398})

Talents cover more languages and more rating levels than jobs do, but this is not going to be an issue. We can compute an 'overlap' feature, as well as a feature for the optional languages spoken by the talent, and see whether they help the prediction.

### Degree overview

In [17]:
# Degree held by each talent, followed by the tally per degree value.
degrees = [record["talent"]["degree"] for record in data]

Counter(degrees)

Counter({'bachelor': 644,
         'none': 453,
         'apprenticeship': 442,
         'master': 441,
         'doctorate': 20})

We will need to encode these degree values as ordered categories so that the features reflect the hierarchy between degrees.

### Seniority overview

In [18]:
# Seniority level of each talent, followed by the tally per level.
seniorities = [record["talent"]["seniority"] for record in data]

Counter(seniorities)

Counter({'midlevel': 625, 'junior': 588, 'senior': 448, 'none': 339})

### Salary

In [19]:
# Summary statistics of the salary expectation stated by each talent.
salary_expectations = [record["talent"]["salary_expectation"] for record in data]
pd.Series(salary_expectations).describe()

count      2000.000000
mean      71710.889500
std       21867.684002
min       25000.000000
25%       55000.000000
50%       70000.000000
75%       86800.000000
max      137080.000000
dtype: float64

There do not seem to be any missing values in the data.