In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from tensorflow.keras.preprocessing import text

import numpy as np
import pandas as pd
import pickle

In [3]:
# Load the pre-processed job postings; each row carries the raw job title, a
# comma-separated skill list and a simplified title label (job_title_trans).
csv_path = "../data/data_transformed.csv"
data = pd.read_csv(csv_path)
data.head(20)

Unnamed: 0,index,job_title,skill_name,job_title_trans,job_title_color
0,0,Data Engineer,"5S,Amazon Web Services (AWS),CGI,Extract-Trans...",engineer,2
1,1,Data Scientist,"5S,Amazon Web Services (AWS),Extract-Transform...",scientist,0
2,2,Senior Data Engineer - Panorama Financial Inst...,"API,Amazon Web Services (AWS),Extract-Transfor...",engineer,2
3,3,Senior Production Support (DevOps) – Data Anal...,"5S,Artificial Intelligence,Amazon Web Services...",ops,3
4,4,MTB Process Data Analyst Engineer,"3D Modeling,5G,Artificial Intelligence,Dynamic...",analyst,1
5,5,Lead Data Scientist,"Artificial Intelligence,Human Computer Interac...",scientist,0
6,6,Senior Data Scientist,"Artificial Intelligence,API,Computer Aided Tra...",scientist,0
7,7,"Financial Crimes Compliance, Data Scientist/Da...","Activity-Based Costing (ABC),Artificial Intell...",engineer,2
8,8,Senior Principal Software Engineer - (IT Data ...,"Software Development Life Cycle (SDL),SQL,MS T...",analyst,1
9,9,Analytics Solution Architect & Data Engineer,"API,Internet of Things (IoT),Machine Learning,...",architect,3


### Tokenization

In [5]:
# Treat every comma-separated skill as one token (no character filtering) and
# cap the vocabulary with num_words=1000.
skill_texts = data["skill_name"].values
tokenizer = text.Tokenizer(num_words=1000, filters="", split=",")
tokenizer.fit_on_texts(skill_texts)

# One row per posting, one column per token index (default matrix mode).
bag_of_words = tokenizer.texts_to_matrix(skill_texts)

### Training Set

In [6]:
from sklearn.model_selection import train_test_split
from numpy import hstack, vstack

# Select the bag-of-words rows belonging to each of the two classes of interest.
title_label = data.job_title_trans
analysts = bag_of_words[title_label == "analyst"]
engineers = bag_of_words[title_label == "engineer"]

# Both classes are split the same way: 5000 training rows each with a fixed
# seed, so the training set is balanced; the remaining rows are held out.
split_options = dict(train_size=5000, random_state=0)

analyst_labels = np.repeat("analyst", len(analysts))
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    analysts, analyst_labels, **split_options
)

engineer_labels = np.repeat("engineer", len(engineers))
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(
    engineers, engineer_labels, **split_options
)

# Stack the per-class training portions into a single training set.
X_train = vstack((X_train_a, X_train_e))
y_train = hstack((y_train_a, y_train_e))

# Naive Bayes Classifier (Bernoulli)

In [7]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes model on the balanced analyst/engineer training set.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

In [8]:
# Evaluate the Naive Bayes model on the held-out analyst rows only.
X_test, y_test = X_test_a, y_test_a  # analysts

y_pred = bnb.predict(X_test)
n_total = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
accuracy = bnb.score(X_test, y_test)
print("Number of mislabeled analysts out of a total of %d : %d (Score: %f)" % (n_total, n_wrong, accuracy))

Number of mislabeled analysts out of a total of 2220 : 492 (Score: 0.778378)


In [9]:
# Evaluate the Naive Bayes model on the held-out engineer rows only.
X_test, y_test = X_test_e, y_test_e  # engineers

y_pred = bnb.predict(X_test)
summary = (
    X_test.shape[0],            # rows evaluated
    (y_test != y_pred).sum(),   # rows predicted as the wrong class
    bnb.score(X_test, y_test),  # score reported by the model
)
print("Number of mislabeled engineers out of a total of %d : %d (Score: %f)" % summary)

Number of mislabeled engineers out of a total of 10935 : 1404 (Score: 0.871605)


#### Feature Importances

In [16]:
# Combined held-out set (analysts followed by engineers) used for scoring.
X_test = vstack((X_test_a, X_test_e))
y_test = hstack((y_test_a, y_test_e))

# Permutation importance is expensive, so the result is cached on disk and only
# recomputed when the cache file is missing. Delete the file to force a
# recomputation after changing the model or the split, otherwise stale values
# are shown.
try:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file that
    # this notebook produced itself.
    with open("../data/importances.pkl", "rb") as cache_file:
        importances = pickle.load(cache_file)
except FileNotFoundError:
    # Fixed seed so a recomputed cache is reproducible.
    importances = permutation_importance(bnb, X_test, y_test, random_state=0)
    with open("../data/importances.pkl", "wb") as cache_file:
        pickle.dump(importances, cache_file)

# Column indices of the bag-of-words matrix, most important first.
importances_mean_sorted = np.argsort(importances.importances_mean)[::-1]

# Print the ten highest-ranked skills with their mean importance.
for rank, column in enumerate(importances_mean_sorted[:10], start=1):
    print("%d. %s (%f)" % (rank, tokenizer.index_word[column],
                           importances.importances_mean[column]))

1. overall equipment effectiveness (oee) (0.006507)
2. computerized maintenance management system (cmms) (0.001460)
3. sap plaint maintenace (pm) (0.001201)
4. ms excel (0.001201)
5. t-sql (transact-sql) (0.001155)
6. online marketing (0.000821)
7. azure devops (0.000775)
8. extract-transform-load (etl) technique (0.000699)
9. ms sql server integration services (0.000639)
10. striving for achievement (0.000623)


# Logistic Regression

In [19]:
# Fit a logistic regression on the same balanced training set; max_iter is
# raised to 5000 and the seed is fixed.
clf = LogisticRegression(max_iter=5000, random_state=0)
clf.fit(X_train, y_train)

In [20]:
# Evaluate the logistic regression on the held-out analyst rows only.
X_test, y_test = X_test_a, y_test_a  # analysts

y_pred_clf = clf.predict(X_test)
report = "Number of mislabeled analysts out of a total of %d : %d (Score: %f)"
print(report % (X_test.shape[0], (y_test != y_pred_clf).sum(), clf.score(X_test, y_test)))

Number of mislabeled analysts out of a total of 2220 : 280 (Score: 0.873874)


In [21]:
# Evaluate the logistic regression on the held-out engineer rows only.
X_test, y_test = X_test_e, y_test_e  # engineers

y_pred_clf = clf.predict(X_test)
row_count = X_test.shape[0]
error_count = (y_test != y_pred_clf).sum()
model_score = clf.score(X_test, y_test)
print("Number of mislabeled engineers out of a total of %d : %d (Score: %f)"
      % (row_count, error_count, model_score))

Number of mislabeled engineers out of a total of 10935 : 1289 (Score: 0.882122)


In [22]:
# Show the label order the classifier learned. NOTE(review): for a two-class
# scikit-learn linear model the single row of coef_ is expected to score
# classes_[1], so positive weights in the cells below should point towards the
# second label listed here — confirm against the scikit-learn documentation.
clf.classes_

array(['analyst', 'engineer'], dtype='<U8')

In [23]:
# The fitted two-class model stores its weights as a single row in clf.coef_.
# Work on that row as a flat vector so each lookup yields a true scalar:
# formatting a 1-element array with %f triggers NumPy's "array with ndim > 0
# to a scalar" deprecation warning.
coef = clf.coef_[0]

# Column indices ordered from the largest to the smallest weight, kept as a
# plain list of ints for the following cells.
weights_sorted = np.argsort(coef)[::-1].tolist()

# Ten skills with the largest positive weights.
for rank, column in enumerate(weights_sorted[:10], start=1):
    print("%d. %s (%f)" % (rank, tokenizer.index_word[column], coef[column]))

1. overall equipment effectiveness (oee) (2.278676)
2. pair programming (2.011320)
3. component design (1.970347)
4. payment process (1.753054)
5. hidden markov model (hmm) (1.656961)
6. mobile device (1.625826)
7. ms skype / lync (1.610856)
8. sap plaint maintenace (pm) (1.504822)
9. siebel crm (1.432077)
10. automation engineering (1.397523)


  print("%d. %s (%f)" % (f + 1, tokenizer.index_word[weights_sorted[f]],


In [24]:
# Ten skills with the most negative weights (the tail of the descending order).
# clf.coef_[0, column] is a scalar, so %f formats it without NumPy's
# array-to-scalar deprecation warning; max() keeps the start index
# non-negative if fewer than ten weights exist.
n_weights = len(weights_sorted)
for f in range(max(0, n_weights - 10), n_weights):
    column = weights_sorted[f]
    print("%d. %s (%f)" % (f + 1, tokenizer.index_word[column],
                           clf.coef_[0, column]))

991. secure shell (ssh) (-1.545504)
992. adobe analytics (-1.608604)
993. central processing unit (cpu) (-1.623046)
994. ibm - aix (-1.642375)
995. cluster validation (-1.649666)
996. reinsurance (-1.713266)
997. autodesk mechanical (-1.787711)
998. psychology (-1.858232)
999. apache mesos (-1.905243)
1000. objectivity (-1.956007)


  print("%d. %s (%f)" % (f + 1, tokenizer.index_word[weights_sorted[f]],


In [27]:
# Flag postings whose skill list mentions "bottleneck" (case-insensitive).
# The vectorised string methods replace the per-row lambda, and na=False makes
# rows with a missing skill list count as "no match" instead of raising.
data["find_skill"] = data.skill_name.str.lower().str.contains("bottleneck", regex=False, na=False)

# Show the full skill list of the first matching posting.
data[data.find_skill].skill_name.iloc[0]

'Amazon Web Services (AWS),Behaviour Driven Development (BDD),Extract-Transform-Load (ETL) Technique,General Data Protection Regulation (GDPR),Apache Hadoop Distributed File System (HDFS),Internet of Things (IoT),Java - JUnit,Machine Learning,MongoDB,NoSQL Database,Object-Relational Mapping,RabbitMQ,XML SOAP,SQL,svn,Test-Driven Development (TDD),Agile Software Development,Ambition,Apache Oozie,Architecture & Construction Design,MS Azure,Bash Script (Unix),Big Data,Bottleneck,Business Requirement,Cloud Services,Configuration Management,Container,Data Management,Process and Data Modeling,Data Storage,Data Visualization,Data Warehouse,Database,Debugging,Software Design Pattern,Distributed Computing,Docker,Elasticsearch,Apache Flume,Apache Hadoop,Hands-On Mentality,Apache HBase,Apache Hive,Java,Kubernetes,Apache MapReduce,Apache Maven,Networking,Pentaho,Problem Solving,Programming Languages,Python,Redis,Resource Management,Scala,Software Development,Apache Solr,Spark,Stream Processing,Tabl