In [6]:
from tensorflow.keras.preprocessing import text
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd

In [2]:
# Load the pre-processed job postings (one row per posting; `skill_name` holds
# a comma-separated skill list) and preview the first rows.
data = pd.read_csv(filepath_or_buffer="../data/data_transformed.csv")
data.head(20)

Unnamed: 0,index,job_title,skill_name,job_title_trans,job_title_trans_12,job_title_color,job_title_color_12
0,0,Data Engineer,"5S,Amazon Web Services (AWS),CGI,Extract-Trans...",engineer,engineer,tab:blue,2
1,1,Data Scientist,"5S,Amazon Web Services (AWS),Extract-Transform...",misc,scientist,tab:orange,0
2,2,Senior Data Engineer - Panorama Financial Inst...,"API,Amazon Web Services (AWS),Extract-Transfor...",engineer,engineer,tab:blue,2
3,3,Senior Production Support (DevOps) – Data Anal...,"5S,Artificial Intelligence,Amazon Web Services...",analyst,ops,tab:orange,3
4,4,MTB Process Data Analyst Engineer,"3D Modeling,5G,Artificial Intelligence,Dynamic...",analyst,analyst,tab:orange,1
5,5,Lead Data Scientist,"Artificial Intelligence,Human Computer Interac...",misc,scientist,tab:orange,0
6,6,Senior Data Scientist,"Artificial Intelligence,API,Computer Aided Tra...",misc,scientist,tab:orange,0
7,7,"Financial Crimes Compliance, Data Scientist/Da...","Activity-Based Costing (ABC),Artificial Intell...",engineer,engineer,tab:blue,2
8,8,Senior Principal Software Engineer - (IT Data ...,"Software Development Life Cycle (SDL),SQL,MS T...",analyst,analyst,tab:orange,1
9,9,Analytics Solution Architect & Data Engineer,"API,Internet of Things (IoT),Machine Learning,...",analyst,architect,tab:orange,3


# Association Rule Mining

In [4]:
# Vocabulary cap for the skill tokenizer; this is also the width of the
# bag-of-words matrix returned by `texts_to_matrix`.
NUM_WORDS = 1000

temp_df = data
skill_texts = temp_df['skill_name'].values

# Each posting lists its skills separated by commas. `filters=""` disables
# punctuation stripping so that a skill such as
# "extract-transform-load (etl) technique" stays one token.
tokenizer = text.Tokenizer(num_words=NUM_WORDS, split=",", filters="")
tokenizer.fit_on_texts(skill_texts)
bag_of_words = tokenizer.texts_to_matrix(skill_texts)

# Matrix column i corresponds to tokenizer word index i. Index 0 is reserved
# by Keras and never assigned to a word, hence the "TOKEN" placeholder column.
# Names are looked up by index rather than by slicing `index_word.values()`,
# so the column count always equals NUM_WORDS even when the corpus contains
# fewer than NUM_WORDS - 1 distinct skills (those columns are all zero).
columns = ["TOKEN"]
columns.extend(
    tokenizer.index_word.get(word_index, f"UNUSED_{word_index}")
    for word_index in range(1, NUM_WORDS)
)
items = pd.DataFrame(bag_of_words, columns=columns)
len(items)

38999

### 1 - Find all frequent itemsets with support ≥ 10 %

In [9]:
# Frequent itemsets are expensive to mine, so they are cached on disk: reuse
# the cache when it exists, otherwise compute it once and store it. This keeps
# the cell runnable on a fresh checkout (Restart Kernel -> Run All).
freq_items_cache = "../data/frequent_item_sets.pkl"
try:
    # NOTE(security): unpickling can execute arbitrary code; only load cache
    # files that were produced by this notebook.
    freq_items = pd.read_pickle(freq_items_cache)
except FileNotFoundError:
    # apriori is documented for boolean one-hot input; `items` holds 0/1
    # indicators, so the cast does not change which itemsets are found.
    freq_items = apriori(items.astype(bool), min_support=0.1, use_colnames=True, verbose=1)
    freq_items.to_pickle(freq_items_cache)
freq_items.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
0,0.825996,(python)
1,0.687659,(sql)
89,0.589784,"(python, sql)"
2,0.578297,(data science)
3,0.572194,(machine learning)
...,...,...
3423,0.100028,"(big data, extract-transform-load (etl) techni..."
1801,0.100028,"(conceptualization, big data, cloud services)"
296,0.100028,"(machine learning, marketing)"
2854,0.100003,"(scala, big data, sql, cloud services)"


### 2 - Drop all rules with lift < 1

In [12]:
# Association rules are derived from `freq_items` and cached on disk: reuse
# the cache when it exists, otherwise compute the rules once and store them.
rules_cache = "../data/association_rules.pkl"
try:
    # NOTE(security): unpickling can execute arbitrary code; only load cache
    # files that were produced by this notebook.
    rules = pd.read_pickle(rules_cache)
except FileNotFoundError:
    # metric='lift' with min_threshold=1 keeps only rules whose lift is >= 1.
    rules = association_rules(freq_items, metric='lift', min_threshold=1, support_only=False)
    rules.to_pickle(rules_cache)
rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
5564,"(tensorflow, machine learning)",(pytorch),0.165184,0.114157,0.103849,0.628687,5.507222,1.0,0.084992,2.385703,0.980360,0.591759,0.580836,0.769195
5569,(pytorch),"(tensorflow, machine learning)",0.114157,0.165184,0.103849,0.909704,5.507222,1.0,0.084992,9.245279,0.923888,0.591759,0.891837,0.769195
2980,"(tensorflow, python)",(pytorch),0.161132,0.114157,0.100618,0.624443,5.470048,1.0,0.082224,2.358745,0.974154,0.576042,0.576046,0.752922
2985,(pytorch),"(tensorflow, python)",0.114157,0.161132,0.100618,0.881402,5.470048,1.0,0.082224,7.073180,0.922495,0.576042,0.858621,0.752922
5566,"(machine learning, pytorch)",(tensorflow),0.112593,0.168774,0.103849,0.922341,5.464962,1.0,0.084846,10.703564,0.920678,0.585007,0.906573,0.768828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13041,"(python, machine learning)","(apache hadoop, apache hive)",0.515782,0.226826,0.117003,0.226846,1.000085,1.0,0.000010,1.000025,0.000176,0.187024,0.000025,0.371336
2670,"(architecture & construction design, conceptua...",(python),0.126901,0.825996,0.104823,0.826025,1.000036,1.0,0.000004,1.000172,0.000041,0.123602,0.000172,0.476465
2675,(python),"(architecture & construction design, conceptua...",0.825996,0.126901,0.104823,0.126905,1.000036,1.0,0.000004,1.000005,0.000208,0.123602,0.000005,0.476465
2665,(architecture & construction design),"(problem solving, python)",0.379420,0.287212,0.108977,0.287220,1.000027,1.0,0.000003,1.000011,0.000044,0.195420,0.000011,0.333325


### 3 - Select all rules (A -> B) where A and B share no skill family

In [13]:
def filter_rules(df, taxonomy_path="../data/skill_taxonomy.csv"):
    """Remove association rules whose two sides share a skill family.

    A rule ``A -> B`` is kept only when no skill in ``A`` maps to the same
    taxonomy family as any skill in ``B``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rules with ``antecedents`` and ``consequents`` columns, each cell an
        iterable of skill names. The frame is **modified in place**; pass a
        copy to keep the original.
    taxonomy_path : str or path-like, optional
        CSV file with ``skill_name`` and ``family_name`` columns. Defaults to
        the project's taxonomy file.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with the same-family rules dropped.

    Raises
    ------
    KeyError
        If a skill appearing in a rule has no entry in the taxonomy.
    """
    taxonomy = pd.read_csv(taxonomy_path)
    # Taxonomy names are lower-cased before being used as lookup keys, so the
    # skill names inside the rules must be lower-case as well.
    skill_to_family = dict(
        zip(taxonomy["skill_name"].str.lower(), taxonomy["family_name"])
    )

    def _families(skills):
        """Return the set of taxonomy families covered by ``skills``."""
        return {skill_to_family[skill] for skill in skills}

    # Collect the offending labels first and drop them in a single call:
    # dropping row by row inside the loop copies the frame on every hit and
    # mutates the frame that is being iterated.
    same_family_labels = [
        label
        for label, antecedents, consequents in zip(
            df.index, df["antecedents"], df["consequents"]
        )
        if not _families(antecedents).isdisjoint(_families(consequents))
    ]
    if same_family_labels:
        df.drop(index=same_family_labels, inplace=True)
    return df

# Filtered rules are cached on disk: reuse the cache when it exists, otherwise
# filter `rules` once, sort by descending lift and store the result.
filtered_rules_cache = "../data/filtered_association_rules.pkl"
try:
    # NOTE(security): unpickling can execute arbitrary code; only load cache
    # files that were produced by this notebook.
    filtered_rules = pd.read_pickle(filtered_rules_cache)
except FileNotFoundError:
    # filter_rules mutates its argument, so it is given a copy of `rules`.
    filtered_rules = filter_rules(rules.copy()).sort_values('lift', ascending=False)
    filtered_rules.to_pickle(filtered_rules_cache)
filtered_rules.head(n=20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
4159,"(sql, extract-transform-load (etl) technique)",(warehouse management),0.292341,0.166671,0.10249,0.350583,2.103446,1.0,0.053765,1.283196,0.741303,0.287471,0.220696,0.482753
4162,(warehouse management),"(sql, extract-transform-load (etl) technique)",0.166671,0.292341,0.10249,0.614923,2.103446,1.0,0.053765,1.837709,0.629511,0.287471,0.455844,0.482753
20954,(statistics),"(forecast & modeling, machine learning, r)",0.371881,0.131721,0.100746,0.270909,2.056686,1.0,0.051761,1.190907,0.817968,0.25008,0.160304,0.517876
20947,"(forecast & modeling, machine learning, r)",(statistics),0.131721,0.371881,0.100746,0.764843,2.056686,1.0,0.051761,2.671064,0.591724,0.25008,0.625617,0.517876
20637,"(data science, forecast & modeling, r)",(statistics),0.132388,0.371881,0.100823,0.761573,2.047892,1.0,0.05159,2.634425,0.589772,0.249905,0.62041,0.516345
20644,(statistics),"(data science, forecast & modeling, r)",0.371881,0.132388,0.100823,0.271116,2.047892,1.0,0.05159,1.19033,0.814644,0.249905,0.159897,0.516345
31772,(statistics),"(machine learning, r, data science, python, al...",0.371881,0.140388,0.106003,0.285044,2.030402,1.0,0.053795,1.202329,0.807947,0.260919,0.168281,0.520056
31721,"(machine learning, r, data science, python, al...",(statistics),0.140388,0.371881,0.106003,0.755068,2.030402,1.0,0.053795,2.564467,0.590367,0.260919,0.610055,0.520056
30973,(statistics),"(data science, algorithm, machine learning, r)",0.371881,0.144568,0.108746,0.292422,2.022734,1.0,0.054984,1.208959,0.804975,0.26673,0.172842,0.52232
30952,"(data science, algorithm, machine learning, r)",(statistics),0.144568,0.371881,0.108746,0.752217,2.022734,1.0,0.054984,2.534956,0.591069,0.26673,0.605516,0.52232


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# metric columns of the filtered rules.
filtered_rules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
count,2988.0,2988.0,2988.0,2988.0,2988.0,2988.0,2988.0
mean,0.349123,0.349123,0.131992,0.431502,1.255572,0.023446,1.197562
std,0.137693,0.137693,0.037709,0.167729,0.216468,0.016949,0.259439
min,0.110823,0.110823,0.100003,0.122218,1.000027,3e-06,1.000005
25%,0.22116,0.22116,0.10731,0.284099,1.072259,0.008584,1.041277
50%,0.361009,0.361009,0.119567,0.40987,1.215658,0.022001,1.097138
75%,0.480705,0.480705,0.141542,0.567617,1.376643,0.034441,1.236917
max,0.825996,0.825996,0.443934,0.934426,2.103446,0.088776,2.671064
