In [1]:
DF_PATH       = "../data/processed/2_cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

--------------------------------------------------------------------------------------------------------------------------------------------------

Read Data

In [3]:
# Read Data 
df = pd.read_pickle(DF_PATH)

Functions

In [4]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth,role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

------------------------------------------------------------------------------------------------------------------------------------------

Initialize

Create directories

In [5]:
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [6]:
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skill_group_15,skill_group_16,skill_group_2,skill_group_3,skill_group_4,skill_group_5,skill_group_6,skill_group_7,skill_group_8,skill_group_9
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,1,1,0,0,...,0,0,2,0,0,2,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64416,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
64422,0,0,0,0,0,0,1,0,0,0,...,0,0,2,0,0,3,0,0,0,0
64428,0,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,1,0,0,0,0
64447,0,0,0,0,0,0,1,0,0,0,...,0,1,2,0,0,0,0,0,0,2


Balance classes

In [7]:
# check the total samples of roles
roles_df = df['DevType'].copy()
role_sum = df['DevType'].sum(axis=0)
role_sum

Academic researcher                              1039
Data or business analyst                         1059
Data scientist or machine learning specialist    1275
Database administrator                            745
DevOps specialist                                1212
Developer, QA or test                             791
Developer, back-end                              9144
Developer, desktop or enterprise applications    2985
Developer, embedded applications or devices      1193
Developer, front-end                             5177
Developer, full-stack                            8718
Developer, game or graphics                       608
Developer, mobile                                2573
Engineer, data                                    916
Scientist                                         590
System administrator                              880
dtype: int64

In [8]:
roles_df[roles_df['Academic researcher'] == 1]

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
30,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
76,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
206,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
270,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64142,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
64231,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64238,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64243,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [9]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns :
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        #upsample
        sub_df = sub_df.sample(samples_per_class , replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=0) 
    
    resampled_roles.append(sub_df)

In [10]:
resampled_roles[0].sum(axis=0)

Academic researcher                              1200
Data or business analyst                           72
Data scientist or machine learning specialist     307
Database administrator                             13
DevOps specialist                                  23
Developer, QA or test                              20
Developer, back-end                               213
Developer, desktop or enterprise applications      81
Developer, embedded applications or devices        70
Developer, front-end                               76
Developer, full-stack                             159
Developer, game or graphics                        27
Developer, mobile                                  60
Engineer, data                                     53
Scientist                                         338
System administrator                               27
dtype: int64

In [11]:
# Construct dfs
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [12]:
roles_df.sum(axis=0)

Academic researcher                              2511
Data or business analyst                         2176
Data scientist or machine learning specialist    2715
Database administrator                           1900
DevOps specialist                                2056
Developer, QA or test                            1665
Developer, back-end                              6666
Developer, desktop or enterprise applications    3130
Developer, embedded applications or devices      1914
Developer, front-end                             3059
Developer, full-stack                            4977
Developer, game or graphics                      1553
Developer, mobile                                2381
Engineer, data                                   2048
Scientist                                        1964
System administrator                             1999
dtype: int64

In [13]:
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skill_group_15,skill_group_16,skill_group_2,skill_group_3,skill_group_4,skill_group_5,skill_group_6,skill_group_7,skill_group_8,skill_group_9
43321,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,2,0,0
32110,1,0,1,0,0,0,0,0,0,0,...,0,0,2,0,0,2,0,1,2,1
51455,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,0
48097,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,2
63955,1,0,0,0,0,0,0,0,0,0,...,0,0,3,0,2,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19530,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,2,1,0,0,0,1
40193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,1,0,0
31156,0,0,0,0,0,0,0,0,0,1,...,0,0,2,0,1,0,0,0,2,2
16594,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


Split

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1), 
                                                    df["DevType"], 
                                                    random_state=0)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


----------------------------------------------------------------------------------------------------------------------------------------------

Train models

Initialize MLflow

In [15]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

INFO: 'skills_jobs_stackoverflow' does not exist. Creating a new experiment


1. Logistic regression

In [16]:
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train, Y_train)
predictions =  pd.DataFrame(clf.predict(X_train),
                            columns=Y_train.columns)

In [17]:
predictions

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14395,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14396,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14397,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
14398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Evaluate on training set
predictions =  pd.DataFrame(clf.predict(X_train),
                            columns=Y_train.columns)
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

In [19]:
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,87.84,62.37,19.16,29.31
Data or business analyst,90.56,66.25,33.05,44.1
Data scientist or machine learning specialist,90.94,75.34,52.75,62.05
Database administrator,90.16,50.38,4.65,8.52
DevOps specialist,91.7,74.72,34.69,47.38
"Developer, QA or test",91.4,100.0,0.08,0.16
"Developer, back-end",72.97,66.0,43.7,52.59
"Developer, desktop or enterprise applications",83.69,52.02,10.89,18.01
"Developer, embedded applications or devices",91.07,61.18,30.78,40.96
"Developer, front-end",87.74,68.01,41.14,51.27


In [20]:
# Evaluate on test set
predictions =  pd.DataFrame(clf.predict(X_test), columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,88.21,61.68,21.43,31.81
Data or business analyst,89.65,61.35,27.8,38.26
Data scientist or machine learning specialist,90.94,76.06,54.47,63.48
Database administrator,90.12,58.7,5.6,10.23
DevOps specialist,91.46,70.93,31.88,43.99
"Developer, QA or test",91.12,0.0,0.0,0.0
"Developer, back-end",71.62,66.79,41.95,51.53
"Developer, desktop or enterprise applications",83.94,47.02,10.38,17.01
"Developer, embedded applications or devices",91.02,56.8,30.54,39.72
"Developer, front-end",87.54,73.33,39.83,51.62


In [22]:
mean_test_scores

accuracy_score     88.209375
precision_score    60.970625
recall_score       27.938125
f1_score           36.593125
dtype: float64

Log run

1. Prepare

In [23]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [24]:
str(clf)

"Pipeline(steps=[('standardscaler', StandardScaler()),\n                ('multioutputclassifier',\n                 MultiOutputClassifier(estimator=LogisticRegression()))])"

In [25]:
# Model
model = {"model_description": "Baseline model: Logistic Regression ",
         "model_details": str(clf),
         "model_object": clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [26]:
# Performance details
classes_metrics = {"train_scores": train_scores, 
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

2. Log

In [27]:
exp

<Experiment: artifact_location='../models/mlruns/0', experiment_id='0', lifecycle_stage='active', name='skills_jobs_stackoverflow', tags={}>

In [28]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score) 
    