In [1]:
DF_PATH       = "../data/processed/2_cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data_rf.pkl"
LOG_MODEL_PKL   =  "model_rf.pkl"
LOG_METRICS_PKL =  "metrics_rf.pkl"


In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt


In [3]:
df = pd.read_pickle(DF_PATH)

In [4]:
df['DevType'].sum(axis=0)

Academic researcher                               581
Data or business analyst                          669
Data scientist or machine learning specialist     799
Database administrator                            296
DevOps specialist                                 677
Developer, QA or test                             493
Developer, back-end                              5503
Developer, desktop or enterprise applications    1671
Developer, embedded applications or devices       795
Developer, front-end                             2890
Developer, full-stack                            5578
Developer, game or graphics                       342
Developer, mobile                                1859
Engineer, data                                    483
Scientist                                         292
System administrator                              440
dtype: int64

In [5]:
roles = df['DevType'].columns.tolist()
sample_size = 500
final_sample = []

for role in roles:
    df_role = df[df[('DevType',role)] == 1]
    
    if len(df_role) < sample_size:
        df_role = df_role.sample(sample_size, replace = True,random_state=42)
    
        
    else:
        df_role = df_role.sample(sample_size, replace = True)
        
    final_sample.append(df_role)



In [6]:
final_sample = pd.concat(final_sample)
final_sample['DevType'].sum()

Academic researcher                               810
Data or business analyst                          772
Data scientist or machine learning specialist     863
Database administrator                            587
DevOps specialist                                 712
Developer, QA or test                             596
Developer, back-end                              1771
Developer, desktop or enterprise applications     887
Developer, embedded applications or devices       660
Developer, front-end                              855
Developer, full-stack                            1413
Developer, game or graphics                       566
Developer, mobile                                 798
Engineer, data                                    608
Scientist                                         657
System administrator                              653
dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(final_sample.drop(ROLE_COLS, axis=1),final_sample[ROLE_COLS], test_size=0.2, random_state=42)

  X_train, X_test, y_train, y_test = train_test_split(final_sample.drop(ROLE_COLS, axis=1),final_sample[ROLE_COLS], test_size=0.2, random_state=42)


In [8]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

## Random Forest

In [9]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
pred = pd.DataFrame(clf.predict(X_test), columns=y_test.columns)

In [10]:
table = {}
for role in roles:
    
    mat = metrics.accuracy_score(y_test[('DevType',role)], pred[('DevType',role)])
    per = metrics.precision_score(y_test[('DevType',role)], pred[('DevType',role)])
    recal = metrics.recall_score(y_test[('DevType',role)], pred[('DevType',role)])
    f1= metrics.f1_score(y_test[('DevType',role)], pred[('DevType',role)])
    table[role] = [mat, per, recal, f1]
    
table = pd.DataFrame(table, index = ['Accuracy', 'Precision', 'Recall', 'F1']).sort_values(by='Precision', axis=1, ascending=False)
table.T

Unnamed: 0,Accuracy,Precision,Recall,F1
System administrator,0.97,0.961165,0.692308,0.804878
Data scientist or machine learning specialist,0.965,0.954545,0.715909,0.818182
"Engineer, data",0.97125,0.944444,0.674603,0.787037
"Developer, mobile",0.97125,0.941667,0.743421,0.830882
Scientist,0.980625,0.926606,0.814516,0.866953
DevOps specialist,0.9725,0.922414,0.753521,0.829457
Data or business analyst,0.9675,0.92,0.732484,0.815603
"Developer, game or graphics",0.978125,0.884211,0.777778,0.827586
Academic researcher,0.958125,0.883333,0.666667,0.759857
Database administrator,0.970625,0.875,0.728,0.79476


In [11]:
table.T['Precision'].mean()

0.8882542714743562

In [12]:
meta_data = {
    'data_path':DF_PATH,
    'traing_index':X_train.index.to_list(),
    'test_index':X_test.index.to_list(),
    'feature_name':X_train.columns.to_list(),
    'tareget_name':y_train.columns.to_list(),
}

with open(os.path.join(LOG_PATH,LOG_DATA_PKL),'wb') as f:
    pickle.dump(meta_data,f)


In [13]:
model ={
    'model_description':clf.__class__.__name__,
    'model_details':str(clf),
    'model':clf,
}

with open(os.path.join(LOG_PATH,LOG_MODEL_PKL),'wb') as f:
    pickle.dump(model,f)

In [14]:
performance = {
    'accuracy':table.T['Accuracy'].mean(),
    'percision':table.T['Precision'].mean(),
    'recall':table.T['Recall'].mean(),
    'f1':table.T['F1'].mean(),
}

with open(os.path.join(LOG_PATH,LOG_METRICS_PKL),'wb') as f:
    pickle.dump(performance,f)

In [15]:
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model['model_description']):
    
    mlflow.log_param("model_description", model['model_description'])
    mlflow.log_param("model_details", model['model_details'])
    
    mlflow.log_metric("accuracy", performance['accuracy'])
    mlflow.log_metric("percision", performance['percision'])
    mlflow.log_metric("recall", performance['recall'])
    mlflow.log_metric("f1", performance['f1'])
    
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_DATA_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_MODEL_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_METRICS_PKL))
    

## Random Frorest + PCA + RBF

In [16]:
clf = make_pipeline(StandardScaler(),PCA(n_components=0.95) ,RandomForestClassifier(random_state=42))
clf.fit(X_train, y_train)
pred = pd.DataFrame(clf.predict(X_test), columns=y_test.columns)

In [17]:
table = {}
for role in roles:
    
    mat = metrics.accuracy_score(y_test[('DevType',role)], pred[('DevType',role)])
    per = metrics.precision_score(y_test[('DevType',role)], pred[('DevType',role)])
    recal = metrics.recall_score(y_test[('DevType',role)], pred[('DevType',role)])
    f1= metrics.f1_score(y_test[('DevType',role)], pred[('DevType',role)])
    table[role] = [mat, per, recal, f1]
    
table = pd.DataFrame(table, index = ['Accuracy', 'Precision', 'Recall', 'F1']).sort_values(by='Precision', axis=1, ascending=False)
table.T

Unnamed: 0,Accuracy,Precision,Recall,F1
System administrator,0.968125,0.979167,0.657343,0.786611
DevOps specialist,0.970625,0.970297,0.690141,0.806584
Data scientist or machine learning specialist,0.959375,0.966387,0.653409,0.779661
"Developer, game or graphics",0.980625,0.963855,0.740741,0.837696
"Developer, mobile",0.965625,0.961905,0.664474,0.785992
"Engineer, data",0.970625,0.94382,0.666667,0.781395
Data or business analyst,0.969375,0.942623,0.732484,0.824373
Scientist,0.98125,0.935185,0.814516,0.87069
"Developer, QA or test",0.96875,0.927711,0.636364,0.754902
Database administrator,0.97375,0.919192,0.728,0.8125


In [18]:
table.T['Precision'].mean()

0.9229932692399849

In [19]:
meta_data = {
    'data_path':DF_PATH,
    'traing_index':X_train.index.to_list(),
    'test_index':X_test.index.to_list(),
    'feature_name':X_train.columns.droplevel(0).to_list(),
    'tareget_name':y_train.columns.droplevel(0).to_list(),
}

with open(os.path.join(LOG_PATH,LOG_DATA_PKL),'wb') as f:
    pickle.dump(meta_data,f)


In [20]:
model ={
    'model_description':clf.__class__.__name__,
    'model_details':str(clf),
    'model':clf,
}

with open(os.path.join(LOG_PATH,LOG_MODEL_PKL),'wb') as f:
    pickle.dump(model,f)

In [21]:
performance = {
    'accuracy':table.T['Accuracy'].mean(),
    'percision':table.T['Precision'].mean(),
    'recall':table.T['Recall'].mean(),
    'f1':table.T['F1'].mean(),
}

with open(os.path.join(LOG_PATH,LOG_METRICS_PKL),'wb') as f:
    pickle.dump(performance,f)

In [22]:
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model['model_description']):
    
    mlflow.log_param("model_description", model['model_description'])
    mlflow.log_param("model_details", model['model_details'])
    
    mlflow.log_metric("accuracy", performance['accuracy'])
    mlflow.log_metric("percision", performance['percision'])
    mlflow.log_metric("recall", performance['recall'])
    mlflow.log_metric("f1", performance['f1'])
    
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_DATA_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_MODEL_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_METRICS_PKL))
    

## Hyper parameter tuning

In [23]:
clf = make_pipeline(StandardScaler(),PCA() ,RandomForestClassifier(random_state=42))

In [24]:
params ={
    'pca__n_components':[0.95,0.99,0.999],
    'randomforestclassifier__n_estimators':[100,200,300],
    'randomforestclassifier__max_depth':[5,10,15],
}

In [25]:
hpt_clf = GridSearchCV(clf, params, cv=5, scoring='precision_macro', n_jobs=-1)
hpt_clf.fit(X_train, y_train)

In [26]:
clf =hpt_clf.best_estimator_

In [27]:
clf.fit(X_train, y_train)
pred = pd.DataFrame(clf.predict(X_test), columns = y_test.columns)

In [28]:
table = {}
for role in roles:
    
    mat = metrics.accuracy_score(y_test[('DevType',role)], pred[('DevType',role)])
    per = metrics.precision_score(y_test[('DevType',role)], pred[('DevType',role)])
    recal = metrics.recall_score(y_test[('DevType',role)], pred[('DevType',role)])
    f1= metrics.f1_score(y_test[('DevType',role)], pred[('DevType',role)])
    table[role] = [mat, per, recal, f1]
    
table = pd.DataFrame(table, index = ['Accuracy', 'Precision', 'Recall', 'F1']).sort_values(by='Precision', axis=1, ascending=False)
table.T

Unnamed: 0,Accuracy,Precision,Recall,F1
System administrator,0.958125,1.0,0.531469,0.694064
"Developer, QA or test",0.955625,0.980769,0.421488,0.589595
DevOps specialist,0.965,0.977778,0.619718,0.758621
Data scientist or machine learning specialist,0.954375,0.972477,0.602273,0.74386
"Developer, mobile",0.96375,0.97,0.638158,0.769841
"Developer, game or graphics",0.97875,0.9625,0.712963,0.819149
Data or business analyst,0.9625,0.953271,0.649682,0.772727
"Engineer, data",0.9625,0.945946,0.555556,0.7
"Developer, desktop or enterprise applications",0.93125,0.93617,0.291391,0.444444
Database administrator,0.969375,0.931818,0.656,0.769953


In [29]:
table.T['Precision'].mean()

0.938507816632391

In [30]:
meta_data = {
    'data_path':DF_PATH,
    'traing_index':X_train.index.to_list(),
    'test_index':X_test.index.to_list(),
    'feature_name':X_train.columns.droplevel(0).to_list(),
    'tareget_name':y_train.columns.droplevel(0).to_list(),
}

with open(os.path.join(LOG_PATH,LOG_DATA_PKL),'wb') as f:
    pickle.dump(meta_data,f)


In [31]:
model ={
    'model_description':'RandomForestClassifier with PCA',
    'model_details':str(clf),
    'model':clf,
}

with open(os.path.join(LOG_PATH,LOG_MODEL_PKL),'wb') as f:
    pickle.dump(model,f)

In [32]:
performance = {
    'accuracy':table.T['Accuracy'].mean(),
    'percision':table.T['Precision'].mean(),
    'recall':table.T['Recall'].mean(),
    'f1':table.T['F1'].mean(),
}

with open(os.path.join(LOG_PATH,LOG_METRICS_PKL),'wb') as f:
    pickle.dump(performance,f)

In [33]:
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model['model_description']):
    
    mlflow.log_param("model_description", model['model_description'])
    mlflow.log_param("model_details", model['model_details'])
    
    mlflow.log_metric("accuracy", performance['accuracy'])
    mlflow.log_metric("percision", performance['percision'])
    mlflow.log_metric("recall", performance['recall'])
    mlflow.log_metric("f1", performance['f1'])
    
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_DATA_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_MODEL_PKL))
    mlflow.log_artifact(os.path.join(LOG_PATH,LOG_METRICS_PKL))
    

In [34]:
runs = mlflow.search_runs([exp.experiment_id])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,metrics.recall,metrics.percision,metrics.f1,params.model_description,params.model_details,tags.mlflow.source.name,tags.mlflow.runName,tags.mlflow.user,tags.mlflow.source.type
0,a6a1acf9579f4b858116fa024bff1619,241658575313021916,FINISHED,file:///g:/workspace/e2e_ml_project/notebooks/...,2023-08-05 13:37:06.684000+00:00,2023-08-05 13:37:10.182000+00:00,0.94793,0.549573,0.938508,0.685822,RandomForestClassifier with PCA,"Pipeline(steps=[('standardscaler', StandardSca...",c:\Users\Sameh\anaconda3\envs\e2e_env\lib\site...,RandomForestClassifier with PCA,Sameh,LOCAL
1,ce9c69fc63d541059a1edbdd9130ba77,241658575313021916,FINISHED,file:///g:/workspace/e2e_ml_project/notebooks/...,2023-08-05 13:11:28.482000+00:00,2023-08-05 13:11:30.350000+00:00,0.954375,0.634733,0.922993,0.747998,Pipeline,"Pipeline(steps=[('standardscaler', StandardSca...",c:\Users\Sameh\anaconda3\envs\e2e_env\lib\site...,Pipeline,Sameh,LOCAL
2,4f106e0e9c8845369bb40c9f2c618d71,241658575313021916,FINISHED,file:///g:/workspace/e2e_ml_project/notebooks/...,2023-08-05 13:11:04.796000+00:00,2023-08-05 13:11:10.010000+00:00,0.954648,0.663479,0.888254,0.756416,RandomForestClassifier,RandomForestClassifier(random_state=42),c:\Users\Sameh\anaconda3\envs\e2e_env\lib\site...,RandomForestClassifier,Sameh,LOCAL
3,77264420ba6343909c955b6a0686d10b,241658575313021916,FINISHED,file:///g:/workspace/e2e_ml_project/notebooks/...,2023-08-05 13:10:18.401000+00:00,2023-08-05 13:10:18.787000+00:00,0.912031,0.289507,0.665216,0.375621,Baseline model with Logistic Regression,"Pipeline(steps=[('standardscaler', StandardSca...",c:\Users\Sameh\anaconda3\envs\e2e_env\lib\site...,Baseline model with Logistic Regression,Sameh,LOCAL
