In [1]:
# MLflow run to load, and the location of the local tracking store.
MLFLOW_TRACKING_URI = "../models/mlruns"
RUN_ID = "a6a1acf9579f4b858116fa024bff1619"

# File names of the pickled artifacts read from the run's artifact directory.
# (LOG_METRICS_PKL is declared here but not read in the cells shown.)
LOG_DATA_PKL = "data_rf.pkl"
LOG_MODEL_PKL = "model_rf.pkl"
LOG_METRICS_PKL = "metrics_rf.pkl"

# Pickle holding the skill-group -> skills mapping.
CLUSTERS_PATH = "../data/processed/skills_group_clusters.pkl"


In [2]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient


### Initialize MLflow

In [3]:
# Point MLflow at the local tracking store and look up the run whose
# artifacts are loaded below.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

run = client.get_run(RUN_ID)
# Kept as the raw artifact URI; the loading cells remove the scheme themselves.
artifact_path = run.info.artifact_uri

# Preview the local directory behind the URI.
# str.strip("file:///") would treat its argument as a *set of characters*
# ({f, i, l, e, :, /}) and strip any of them from both ends, which can also
# eat real path characters (e.g. a leading drive letter "e:"). Remove the
# scheme as an exact prefix instead.
_FILE_SCHEME = "file:///"
artifact_path[len(_FILE_SCHEME):] if artifact_path.startswith(_FILE_SCHEME) else artifact_path

'g:/workspace/e2e_ml_project/notebooks/../models/mlruns/241658575313021916/a6a1acf9579f4b858116fa024bff1619/artifacts'

In [4]:
# Build the local path to the model artifact by joining the run's artifact
# URI with the file name and dropping the "file:///" scheme.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts from a trusted tracking store.
model_pkl_path = os.path.join(artifact_path, LOG_MODEL_PKL).replace("file:///", "")
with open(model_pkl_path, "rb") as model_file:
    model = pickle.load(model_file)

In [5]:
# The unpickled artifact is indexed by key; the estimator used for
# prediction below lives under "model".
model_rf = model["model"]

In [6]:
# Same path handling as for the model artifact, this time for the data pickle.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts from a trusted tracking store.
data_pkl_path = os.path.join(artifact_path, LOG_DATA_PKL).replace("file:///", "")
with open(data_pkl_path, "rb") as data_file:
    data = pickle.load(data_file)

In [7]:
# Feature names stored with the run; used further down to order the
# model input vector.
features = data["feature_name"]

In [8]:
# Load the pickled skill clusters (skill-group label -> list of skills).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files you trust.
with open(CLUSTERS_PATH, "rb") as clusters_file:
    cluser_data = pickle.load(clusters_file)

In [9]:
# Display the clusters: the index holds the skill-group labels and each value
# is the list of skills assigned to that group.
cluser_data

skill_group_0     [Cassandra, DynamoDB, Elasticsearch, MongoDB, ...
skill_group_1                             [Unity 3D, Unreal Engine]
skill_group_10    [SQL, VBA, IBM DB2, Microsoft SQL Server, Oracle]
skill_group_11                                   [Assembly, C, C++]
skill_group_12              [HTML/CSS, PHP, MySQL, Laravel, jQuery]
skill_group_2         [Couchbase, MariaDB, Drupal, Gatsby, Symfony]
skill_group_3     [Dart, Java, Kotlin, Objective-C, Swift, Fireb...
skill_group_4     [Python, Scala, Django, Flask, Apache Spark, H...
skill_group_5          [C#, ASP.NET, ASP.NET Core, .NET, .NET Core]
skill_group_6     [Bash/Shell/PowerShell, Go, Rust, Ansible, Ter...
skill_group_7     [JavaScript, TypeScript, Angular, Angular.js, ...
skill_group_8     [Haskell, Julia, R, Keras, Pandas, TensorFlow,...
skill_group_9             [Perl, Ruby, Ruby on Rails, Chef, Puppet]
dtype: object

In [10]:
# Example skill set to score against the model.
sample = [
    "Python",
    "Keras",
    "Pandas",
    "TensorFlow",
    "Torch/PyTorch",
]


In [11]:
# Wrap the feature names in a Series so they can be filtered with a mask.
features = pd.Series(features)

# Feature names that are not skill-group labels are the individual skill
# columns; the skill-group features are handled separately below.
is_cluster_feature = features.isin(cluser_data.index)
features_skill = features[~is_cluster_feature]
features_skill

13                 Assembly
14    Bash/Shell/PowerShell
15                        C
16                       C#
17                      C++
              ...          
82                 Teraform
83            Torch/PyTorch
84                 Unity 3D
85            Unreal Engine
86                  Xamarin
Length: 74, dtype: object

## One Hot Encoding

In [12]:
# 1 where the skill feature appears in `sample`, 0 otherwise; re-indexed by
# skill name so the values can be looked up by feature name later.
skill_flags = features_skill.isin(sample).astype(int)
ohe_skils = pd.Series(skill_flags.to_list(), index=features_skill.values)

## Clustering Skills

In [13]:
# For every skill group, count how many of its skills appear in `sample`:
# one row per (group, skill), flagged 0/1, then summed per group label.
cluster_skills = (
    pd.Series(cluser_data)
    .explode()
    .isin(sample)
    .astype(int)
    .groupby(level=0)
    .sum()
)

## Combine Features

In [14]:
# Stack the per-skill flags and the per-group counts into a single Series
# indexed by feature name (row-wise concatenation is pd.concat's default).
combine_df = pd.concat([ohe_skils, cluster_skills])

In [15]:
# Display the combined feature values (individual skill flags followed by
# skill-group counts).
combine_df

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
skill_group_5            0
skill_group_6            0
skill_group_7            0
skill_group_8            4
skill_group_9            0
Length: 87, dtype: int64

In [16]:
# Reorder the combined values to follow `features`, the feature-name order
# loaded from the run's data pickle. `.loc` raises KeyError if a name is missing.
final_features = combine_df.loc[features]

# Sanity check: one value per feature name.
final_features.shape

(87,)

In [17]:
# Reshape the 1-D feature vector into a single-row 2-D array before
# passing it to predict_proba.
model_input = final_features.values.reshape(1, -1)
pred = model_rf.predict_proba(model_input)

In [18]:
# Take element [0][1] of each entry in `pred`.
# NOTE(review): presumably `pred` holds one probability array per target, with
# row 0 the single sample and column 1 the positive class — confirm against
# how the model was trained.
positive_probs = [per_target[0][1] for per_target in pred]

# Label each probability with its target name (key spelled 'tareget_name' as
# in the original lookup) and show the highest scores first.
role_scores = pd.Series(positive_probs, index=data['tareget_name'])
role_scores.sort_values(ascending=False)


Data scientist or machine learning specialist    0.827832
Scientist                                        0.243659
Academic researcher                              0.200554
Data or business analyst                         0.106116
Engineer, data                                   0.067568
Developer, back-end                              0.052526
Developer, desktop or enterprise applications    0.014497
Developer, embedded applications or devices      0.008411
Developer, full-stack                            0.007098
Developer, mobile                                0.005451
System administrator                             0.003124
DevOps specialist                                0.002675
Developer, QA or test                            0.001623
Developer, front-end                             0.001605
Database administrator                           0.001048
Developer, game or graphics                      0.000920
dtype: float64