#### Aim
- perform an overall EDA on the data
- extract required features from the dataset
- train a sample model and check its performance
- modularise components

In [8]:
import pandas as pd
# import numpy as np
import json
import joblib

# from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Remove pandas' display limits on both axes so wide and long frames are shown in full
# (with max_rows unset, always preview frames with .head() rather than displaying them whole)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [13]:
# Load the raw file as-is; the nested `talent` and `job` objects stay as dict-valued columns
df = pd.read_json('./data/data.json')

# (rows, columns) of the raw frame
print(df.shape)

# Preview the first rows
df.head()

(2000, 3)


Unnamed: 0,talent,job,label
0,"{'languages': [{'rating': 'C2', 'title': 'Germ...","{'languages': [{'title': 'German', 'rating': '...",True
1,"{'languages': [{'rating': 'C2', 'title': 'Germ...","{'languages': [{'title': 'German', 'rating': '...",True
2,"{'languages': [{'rating': 'C2', 'title': 'Engl...","{'languages': [{'title': 'German', 'rating': '...",True
3,"{'languages': [{'rating': 'C2', 'title': 'Germ...","{'languages': [{'title': 'German', 'rating': '...",True
4,"{'languages': [{'rating': 'C2', 'title': 'Germ...","{'languages': [{'title': 'German', 'rating': '...",True


In [9]:
# Summary of the target column: row count, number of distinct values, most frequent value and its frequency

df['label'].describe()

count     2000
unique       2
top       True
freq      1000
Name: label, dtype: object

In [10]:
labels = df['label']

# Summing a boolean column counts its True entries; negating it first counts the False ones
true_count = labels.sum()
false_count = (~labels).sum()

# The mean of a boolean column is the share of True entries; the remainder is the False share
true_proportion = labels.mean()
false_proportion = 1 - true_proportion

print(
    f"Number of True values: {true_count}",
    f"Number of False values: {false_count}",
    f"Proportion of True values: {true_proportion:.2f}",
    f"Proportion of False values: {false_proportion:.2f}",
    sep="\n",
)


Number of True values: 1000
Number of False values: 1000
Proportion of True values: 0.50
Proportion of False values: 0.50


In [None]:
# summary
# - overall we have a balanced dataset with equal proportions of positive and negative samples

#### Section 1.1 : Data cleaning
-  data cleaning, transformation & feature extraction

In [16]:
# Load the raw records.
# The encoding is given explicitly: JSON text is UTF-8, while open() otherwise falls back to a
# platform-dependent default that can mis-decode non-ASCII characters.
with open('./data/data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the nested `talent` / `job` objects into dotted columns (e.g. `talent.degree`, `job.max_salary`).
# NOTE: this rebinds `df`, replacing the un-flattened frame loaded earlier in the notebook.
df = pd.json_normalize(data)

# Preview the flattened structure
df.head()

Unnamed: 0,label,talent.languages,talent.job_roles,talent.seniority,talent.salary_expectation,talent.degree,job.languages,job.job_roles,job.seniorities,job.max_salary,job.min_degree
0,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, backend-developer, full-s...",junior,48000,bachelor,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none
1,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, full-stack-developer, c-c...",junior,44000,master,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none
2,True,"[{'rating': 'C2', 'title': 'English'}, {'ratin...","[frontend-developer, backend-developer, php-de...",senior,40000,none,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[php-developer],"[midlevel, senior]",65000,none
3,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, backend-developer, full-s...",junior,46000,apprenticeship,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none
4,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[devops-engineer, frontend-developer, backend-...",midlevel,75000,master,"[{'title': 'German', 'rating': 'C1', 'must_hav...","[backend-developer, full-stack-developer]",[midlevel],80000,bachelor


In [12]:
# print(df.head())

In [4]:
# Print the distinct values of the degree columns (talent side, job side) and of the talent seniority column
for column_name in ('talent.degree', 'job.min_degree', 'talent.seniority'):
    print(df[column_name].unique())

['bachelor' 'master' 'none' 'apprenticeship' 'doctorate']
['none' 'bachelor' 'master' 'apprenticeship' 'doctorate']
['junior' 'senior' 'midlevel' 'none']


In [7]:
# # One-hot encode categorical variables
# categorical_columns = ['talent.seniority', 'talent.degree', 'job.min_degree']
# df = pd.get_dummies(df, columns=categorical_columns)

# # Create features for job roles
# job_roles = set([role for sublist in df['talent.job_roles'].tolist() + df['job.job_roles'].tolist() for role in sublist])
# for role in job_roles:
#     df[f'talent.role_{role}'] = df['talent.job_roles'].apply(lambda x: int(role in x))
#     df[f'job.role_{role}'] = df['job.job_roles'].apply(lambda x: int(role in x))

# # Drop the original job roles columns
# df.drop(columns=['talent.job_roles', 'job.job_roles'], inplace=True)

# # Display the first few rows to verify the transformation
# df.head()


#### Section 1.2 : Feature extraction
- define features from various available attributes
- extract their values in numerical format

In [17]:
# Ordinal ranks used by the matching functions: a higher rank means a higher degree / a stronger
# language rating. Each name's rank is its position in the tuple.
degree_hierarchy = {
    degree: rank
    for rank, degree in enumerate(('none', 'apprenticeship', 'bachelor', 'master', 'doctorate'))
}
proficiency_hierarchy = {
    rating: rank
    for rank, rating in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
}

def seniority_match(talent_seniority, job_seniorities):
    """Return 1 if the talent's seniority is among the job's accepted seniorities, else 0.

    Args:
        talent_seniority: The talent's seniority value (e.g. ``'junior'``).
        job_seniorities: Collection of seniority values the job accepts.
    """
    return int(talent_seniority in job_seniorities)

def degree_match(talent_degree, job_min_degree):
    """Return 1 if the talent's degree ranks at least as high as the job's minimum degree, else 0.

    Both names are lower-cased and looked up in ``degree_hierarchy``.

    Raises:
        KeyError: If either degree name is not a key of ``degree_hierarchy``.
    """
    held_rank = degree_hierarchy[talent_degree.lower()]
    required_rank = degree_hierarchy[job_min_degree.lower()]
    return int(held_rank >= required_rank)

def salary_match(job_max_salary, talent_salary_expectation):
    """Return the ratio of the job's maximum salary to the talent's salary expectation.

    A ratio of 1 or more means the job's maximum covers the expectation. A zero
    expectation is not guarded against; the division is performed as-is.
    """
    ratio = job_max_salary / talent_salary_expectation
    return ratio

def language_match(talent_languages, job_languages):
    """Return 1 if the talent satisfies at least one of the job's languages, else 0.

    A job language is satisfied when the talent lists the same title (compared
    case-insensitively) with a rating that ranks at least as high in
    ``proficiency_hierarchy``.

    NOTE(review): a single satisfied language is enough, and the ``must_have`` flag
    carried by job languages is never consulted here - confirm this is intended.

    Args:
        talent_languages: Iterable of dicts with ``'title'`` and ``'rating'`` keys.
        job_languages: Iterable of dicts with ``'title'`` and ``'rating'`` keys.

    Raises:
        KeyError: If a rating that gets looked up is not a key of ``proficiency_hierarchy``.
    """
    for required in job_languages:
        required_title = required['title'].lower()
        required_rank = proficiency_hierarchy[required['rating']]
        for spoken in talent_languages:
            # Both values are resolved before comparing, so an unknown rating fails loudly
            spoken_title, spoken_rank = spoken['title'].lower(), proficiency_hierarchy[spoken['rating']]
            if spoken_title != required_title:
                continue
            if spoken_rank >= required_rank:
                return 1
    return 0

def job_role_match(talent_job_roles, job_job_roles):
    """Return 1 if the talent and the job share at least one role, else 0."""
    shared_roles = set(talent_job_roles) & set(job_job_roles)
    return int(bool(shared_roles))

# One entry per engineered feature:
# (result column, scoring function, column passed as 1st argument, column passed as 2nd argument).
# Note that salary_match takes the job column first; every other scorer takes the talent column first.
_pair_features = [
    ('seniority_match', seniority_match, 'talent.seniority', 'job.seniorities'),
    ('degrees_match', degree_match, 'talent.degree', 'job.min_degree'),
    ('salary_match', salary_match, 'job.max_salary', 'talent.salary_expectation'),
    ('language_match', language_match, 'talent.languages', 'job.languages'),
    ('job_role_match', job_role_match, 'talent.job_roles', 'job.job_roles'),
]

# Score every row and store the result as a new column on df
for _column, _scorer, _first, _second in _pair_features:
    df[_column] = df.apply(lambda row: _scorer(row[_first], row[_second]), axis=1)

# Preview the frame with the engineered columns appended
df.head()


Unnamed: 0,label,talent.languages,talent.job_roles,talent.seniority,talent.salary_expectation,talent.degree,job.languages,job.job_roles,job.seniorities,job.max_salary,job.min_degree,seniority_match,degrees_match,salary_match,language_match,job_role_match
0,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, backend-developer, full-s...",junior,48000,bachelor,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none,1,1,1.458333,1,1
1,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, full-stack-developer, c-c...",junior,44000,master,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none,1,1,1.590909,1,1
2,True,"[{'rating': 'C2', 'title': 'English'}, {'ratin...","[frontend-developer, backend-developer, php-de...",senior,40000,none,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[php-developer],"[midlevel, senior]",65000,none,1,1,1.625,1,1
3,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[frontend-developer, backend-developer, full-s...",junior,46000,apprenticeship,"[{'title': 'German', 'rating': 'C1', 'must_hav...",[frontend-developer],"[junior, midlevel]",70000,none,1,1,1.521739,1,1
4,True,"[{'rating': 'C2', 'title': 'German'}, {'rating...","[devops-engineer, frontend-developer, backend-...",midlevel,75000,master,"[{'title': 'German', 'rating': 'C1', 'must_hav...","[backend-developer, full-stack-developer]",[midlevel],80000,bachelor,1,1,1.066667,1,1


#### Section 1.3 : Train a model
- train a simple logistic regression model
    - use the above calculated features
    - predict target variable `label` 

In [18]:
# Engineered columns fed to the classifier, and the target column
features = ['seniority_match', 'degrees_match', 'salary_match', 'language_match', 'job_role_match']
X, y = df[features], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier on the training rows (fit() returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and compare against their true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.9775
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.96      0.98       201
        True       0.96      1.00      0.98       199

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



In [9]:
# Persist the fitted model to a file in the current working directory.
# joblib files are pickle-based: only load this file back from a trusted location.
joblib.dump(model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

In [19]:
# degree_hierarchy = {'none': 0, 'apprenticeship': 1, 'bachelor': 2, 'master': 3, 'doctorate': 4 }
# proficiency_hierarchy = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}

def extract_features(talent, job):
    """Build the feature vector for a single talent/job pair.

    Each value is computed by the same helper that builds the corresponding
    training column (``seniority_match``, ``degree_match``, ``salary_match``,
    ``language_match``, ``job_role_match``), so the features seen at prediction
    time cannot drift from the ones the model was trained on.

    Args:
        talent: Dict with keys ``'seniority'``, ``'degree'``, ``'salary_expectation'``,
            ``'languages'`` and ``'job_roles'``.
        job: Dict with keys ``'seniorities'``, ``'min_degree'``, ``'max_salary'``,
            ``'languages'`` and ``'job_roles'``.

    Returns:
        ``[seniority_match, degrees_match, salary_match, language_match, job_role_match]``,
        in the same order as the ``features`` list used to train the model.

    Raises:
        KeyError: If a required key is missing, or a degree / language rating is unknown.
    """
    return [
        seniority_match(talent['seniority'], job['seniorities']),
        degree_match(talent['degree'], job['min_degree']),
        # salary_match takes the job side first: the ratio is max_salary / salary_expectation
        salary_match(job['max_salary'], talent['salary_expectation']),
        language_match(talent['languages'], job['languages']),
        job_role_match(talent['job_roles'], job['job_roles']),
    ]

def predict_match(talent, job, model):
    """Predict whether a talent matches a job, together with a confidence score.

    Args:
        talent: Talent dict as accepted by ``extract_features``.
        job: Job dict as accepted by ``extract_features``.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        ``(label, score)``: the predicted class for the pair, and the second column
        of ``predict_proba`` (the probability of the model's second class, which is
        the positive class for a binary model).
    """
    vector = extract_features(talent, job)

    # A model fitted on a DataFrame remembers its column names and warns when it is later
    # given unnamed input. Rebuild a one-row frame with those names when they are available;
    # otherwise fall back to the plain nested list.
    column_names = getattr(model, 'feature_names_in_', None)
    if column_names is not None:
        sample = pd.DataFrame([vector], columns=list(column_names))
    else:
        sample = [vector]

    label = model.predict(sample)[0]
    score = model.predict_proba(sample)[0][1]
    return label, score

# Example usage: score one hand-written talent/job pair with the model trained above.
# The talent record uses the keys read by extract_features.
talent = {
    "languages": [
        {"rating": "C2", "title": "German"},
        {"rating": "C2", "title": "English"},
        {"rating": "B2", "title": "French"},
        {"rating": "A2", "title": "Turkish"}
    ],
    "job_roles": ["frontend-developer", "backend-developer", "full-stack-developer", "java-developer", "mobile-developer"],
    "seniority": "junior",
    "salary_expectation": 48000,
    "degree": "bachelor"
}

# The job record; the "must_have" flag is carried in the data but is not read by extract_features.
job = {
    "languages": [
        {"title": "German", "rating": "C1", "must_have": True},
        {"title": "English", "rating": "B2", "must_have": True}
    ],
    "job_roles": ["frontend-developer"],
    "seniorities": ["junior", "midlevel"],
    "max_salary": 70000,
    "min_degree": "none"
}

# label is the predicted class; score is the predicted probability of the positive class
label, score = predict_match(talent, job, model)
print(f"Label: {label}, Score: {score}")


Label: True, Score: 0.962364665401537


-----
EOF