In [1]:
# --- Input data -------------------------------------------------------------
# Pickled DataFrame loaded at the start of the notebook.
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# --- Column groups ----------------------------------------------------------
# ROLE_COLS is the label group: it is dropped from the features and used as the
# target when the data is split into train and test sets.
ROLE_COLS = ['DevType']
# NOTE(review): TECH_COLS is not referenced in the cells visible here - confirm
# it is still needed and that the names match the column groups in the data.
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
]

# --- MLflow -----------------------------------------------------------------
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# --- Temporary artifacts ----------------------------------------------------
# Each run writes these pickles under LOG_PATH before logging them as artifacts.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"


In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt


## Initialize

Create directories

In [3]:
# Make sure the MLflow store and the temporary artifact folder exist before
# anything is written to them; existing directories are left untouched.
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

In [4]:
# Load the pickled DataFrame.
# NOTE: unpickling executes arbitrary code - only load files from a trusted source.
df = pd.read_pickle(DF_PATH)

In [5]:
# Peek at the first rows to check the two-level column layout (group, item).
df.head(n=5)

Unnamed: 0_level_0,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,...,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith
Unnamed: 0_level_1,skill_group_0,skill_group_1,skill_group_10,skill_group_11,skill_group_12,skill_group_2,skill_group_3,skill_group_4,skill_group_5,skill_group_6,...,Node.js,Pandas,Puppet,React Native,TensorFlow,Teraform,Torch/PyTorch,Unity 3D,Unreal Engine,Xamarin
0,1,0,2,0,1,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,2,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Column-wise totals of the DevType indicator columns, i.e. rows flagged per role.
role_counts = df['DevType']
role_counts.sum()

Academic researcher                               581
Data or business analyst                          669
Data scientist or machine learning specialist     799
Database administrator                            296
DevOps specialist                                 677
Developer, QA or test                             493
Developer, back-end                              5503
Developer, desktop or enterprise applications    1671
Developer, embedded applications or devices       795
Developer, front-end                             2890
Developer, full-stack                            5578
Developer, game or graphics                       342
Developer, mobile                                1859
Engineer, data                                    483
Scientist                                         292
System administrator                              440
dtype: int64

## Resampling

In [7]:
# Balance the data set: draw `sample_size` rows for every role so that the
# dominant roles do not swamp the rare ones.
roles = df['DevType'].columns.tolist()
sample_size = 500
final_sample = []  # one sampled DataFrame per role; concatenated in the next cell

for role in roles:
    # Rows flagged with this role.
    df_role = df[df[('DevType', role)] == 1]

    # Sample with replacement only when the role has fewer rows than requested;
    # otherwise draw distinct rows so no needless duplicates are introduced.
    # The seed is fixed in both cases so that Restart & Run All reproduces the
    # same sample (previously the larger roles were drawn unseeded).
    needs_oversampling = len(df_role) < sample_size
    df_role = df_role.sample(sample_size, replace=needs_oversampling, random_state=42)

    final_sample.append(df_role)



In [8]:
# Stack the per-role samples into one frame. The name `final_sample` is rebound
# from a list to a DataFrame here, so guard on the type: re-running this cell
# on its own must not feed a DataFrame back into pd.concat (which raises).
if not isinstance(final_sample, pd.DataFrame):
    final_sample = pd.concat(final_sample)

In [9]:
# Rows flagged per role after resampling. A row can be flagged for several
# roles, so the totals need not equal the per-role sample size.
balanced_roles = final_sample['DevType']
balanced_roles.sum()

Academic researcher                               806
Data or business analyst                          776
Data scientist or machine learning specialist     870
Database administrator                            592
DevOps specialist                                 695
Developer, QA or test                             604
Developer, back-end                              1840
Developer, desktop or enterprise applications     875
Developer, embedded applications or devices       653
Developer, front-end                              870
Developer, full-stack                            1430
Developer, game or graphics                       571
Developer, mobile                                 807
Engineer, data                                    618
Scientist                                         654
System administrator                              627
dtype: int64

## Split

In [10]:
# Features are every column group except the role labels; targets are the
# role indicator columns themselves.
# NOTE(review): final_sample is built by sampling rows per role with
# replacement, so identical rows can land in both the train and the test set.
# Consider splitting before resampling to avoid optimistic test scores.
features = final_sample.drop(ROLE_COLS, axis=1)
targets = final_sample[ROLE_COLS]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

  X_train, X_test, y_train, y_test = train_test_split(final_sample.drop(ROLE_COLS, axis=1),final_sample[ROLE_COLS], test_size=0.2, random_state=42)


## Train models

#### Initialize MLflow

In [11]:
# Point MLflow at the local tracking store and select the experiment
# (the cell output shows it being created on the first run).
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# The client is created after the tracking URI is set; `exp` supplies the
# experiment id used when starting and searching runs further down.
client = MlflowClient()
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

2023/08/05 16:09:53 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


## 1. Logistic regression

In [12]:
# Baseline: scale the features, then wrap LogisticRegression in a
# MultiOutputClassifier so the multi-column DevType target can be fitted.
clf = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(
        LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1)
    ),
)

clf.fit(X_train, y_train)

# Carry the test index over to the predictions so that `pred` and `y_test`
# stay row-aligned for any later comparison or join (previously `pred` got a
# fresh 0..n-1 index and lost the link to the original rows).
pred = pd.DataFrame(
    clf.predict(X_test),
    index=y_test.index,
    columns=y_test.columns,
)

In [13]:
# Score every role as its own binary problem and collect the four metrics.
table = {}
for role in roles:
    target = ('DevType', role)
    actual = y_test[target]
    predicted = pred[target]

    # zero_division=0 returns the same value as the library default but does so
    # explicitly: a role with no predicted (or no true) positives scores 0
    # without emitting a warning.
    table[role] = [
        metrics.accuracy_score(actual, predicted),
        metrics.precision_score(actual, predicted, zero_division=0),
        metrics.recall_score(actual, predicted, zero_division=0),
        metrics.f1_score(actual, predicted, zero_division=0),
    ]

# Rows are metrics, columns are roles ordered by descending precision.
table = pd.DataFrame(
    table,
    index=['Accuracy', 'Precision', 'Recall', 'F1'],
).sort_values(by='Precision', axis=1, ascending=False)

# Display one row per role.
table.T

Unnamed: 0,Accuracy,Precision,Recall,F1
"Developer, QA or test",0.925,1.0,0.02439,0.047619
"Developer, mobile",0.946875,0.809917,0.6125,0.697509
"Developer, game or graphics",0.956875,0.794872,0.53913,0.642487
"Developer, front-end",0.925,0.788991,0.469945,0.589041
System administrator,0.9225,0.785714,0.157143,0.261905
Data scientist or machine learning specialist,0.92,0.75,0.456522,0.567568
"Engineer, data",0.925,0.72973,0.19708,0.310345
DevOps specialist,0.92875,0.64557,0.372263,0.472222
Academic researcher,0.906875,0.638889,0.144654,0.235897
Data or business analyst,0.91,0.62963,0.309091,0.414634


In [14]:
# Unweighted mean precision across all roles.
precision_by_role = table.T['Precision']
precision_by_role.mean()

0.6652159235698092

## Log run

In [15]:
# Record which data file and which rows/columns were used, so the run can be
# traced back to its inputs.
train_rows = X_train.index.to_list()
test_rows = X_test.index.to_list()
# droplevel(0) removes the group level, keeping only the item-level labels.
feature_labels = X_train.columns.droplevel(0).to_list()
target_labels = y_train.columns.droplevel(0).to_list()

# NOTE(review): 'traing_index' and 'tareget_name' are misspelled; they are kept
# as-is because readers of the pickle may rely on these exact keys.
meta_data = {
    'data_path': DF_PATH,
    'traing_index': train_rows,
    'test_index': test_rows,
    'feature_name': feature_labels,
    'tareget_name': target_labels,
}

data_pkl_path = os.path.join(LOG_PATH, LOG_DATA_PKL)
with open(data_pkl_path, 'wb') as handle:
    pickle.dump(meta_data, handle)


In [16]:
# Bundle the fitted pipeline with a human-readable description and its repr,
# then pickle the bundle so it can be attached to the MLflow run.
model = {
    'model_description': 'Baseline model with Logistic Regression',
    'model_details': str(clf),
    'model': clf,
}

model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, 'wb') as handle:
    pickle.dump(model, handle)

In [17]:
# Summarise the per-role table as one unweighted mean per metric.
scores_by_role = table.T

# (logged key, column in the per-role table)
# NOTE(review): 'percision' is misspelled; it is kept because the MLflow
# logging cell reads this exact key.
metric_columns = (
    ('accuracy', 'Accuracy'),
    ('percision', 'Precision'),
    ('recall', 'Recall'),
    ('f1', 'F1'),
)
performance = {key: scores_by_role[column].mean() for key, column in metric_columns}

metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, 'wb') as handle:
    pickle.dump(performance, handle)

In [18]:
# Log the baseline as one MLflow run: parameters, mean metrics, and the three
# pickles written by the previous cells.
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model['model_description']):

    # Parameters describing the model.
    for param_name in ("model_description", "model_details"):
        mlflow.log_param(param_name, model[param_name])

    # Mean metrics across roles, logged under the same keys used in `performance`.
    for metric_name in ("accuracy", "percision", "recall", "f1"):
        mlflow.log_metric(metric_name, performance[metric_name])

    # Attach the pickled metadata, model and metrics files.
    for pkl_name in (LOG_DATA_PKL, LOG_MODEL_PKL, LOG_METRICS_PKL):
        mlflow.log_artifact(os.path.join(LOG_PATH, pkl_name))
    

In [19]:
# List every run recorded for this experiment.
experiment_ids = [exp.experiment_id]
runs = mlflow.search_runs(experiment_ids)
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.f1,metrics.recall,metrics.accuracy,metrics.percision,params.model_description,params.model_details,tags.mlflow.source.name,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.user
0,77264420ba6343909c955b6a0686d10b,241658575313021916,FINISHED,file:///g:/workspace/e2e_ml_project/notebooks/...,2023-08-05 13:10:18.401000+00:00,2023-08-05 13:10:18.787000+00:00,0.375621,0.289507,0.912031,0.665216,Baseline model with Logistic Regression,"Pipeline(steps=[('standardscaler', StandardSca...",c:\Users\Sameh\anaconda3\envs\e2e_env\lib\site...,Baseline model with Logistic Regression,LOCAL,Sameh
