In [1]:
#load packages
import os.path
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt

import gensim 
from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet

import gensim.corpora as corpora
from gensim.corpora import Dictionary

from gensim import matutils, models

import pyLDAvis.gensim
import string
from nltk.tokenize import word_tokenize
pd.set_option('display.max_colwidth', 100)
%matplotlib inline

import spacy
# Load English model for SpaCy
nlp = spacy.load("en_core_web_sm")

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict
  from collections import Counter, Iterable


In [2]:
def get_most_prob_topic(unseen_document, model):
    """
    Given an unseen_document, and a trained LDA model, this function
    finds the most likely topic (topic with the highest probability) from the 
    topic distribution of the unseen document and returns the best topic with 
    its probability. . 
    
    Parameters
    ------------
    unseen_document : (str) 
        the document to be labeled with a topic
    model : (gensim ldamodel) 
        the trained LDA model
    
    Returns: 
    -------------
        (str) a string of the form 
        `most likely topic label:probability of that label` 
    
    Examples:
    ----------
    >> get_most_prob_topic("The research uses an HMM for discovering gene sequence.", 
                            model = lda)
    Science and Technology:0.435
    """    
    #preprocess 
    preprocessed = preprocess(unseen_document)
    
    #process into a doc_term_matrix
    corpus = preprocessed
    bow_vector = dictionary.doc2bow(corpus)
    
    
    #find simliar articals
    simliar_artical = model[bow_vector]
    
    
    #change the topic name and sort tuple
    items = []

    for idx, prob in simliar_artical:
        items.append((topic_labels[idx], prob))

    items = sorted(items, key=lambda x: x[1], reverse=True)
    out = items[0][0]
    return out

In [3]:
def preprocess(txt, 
               min_token_len = 3, 
               irrelevant_pos = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']): 
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text 
    and return a preprocessed string. 
    
    Parameters
    -------------
    text : (str) 
        the text to be preprocessed
    min_token_len : (int) 
        min_token_length required
    irrelevant_pos : (list) 
        a list of irrelevant pos tags
    
    Returns
    -------------
    (str) the preprocessed text
    """
    #change to string
    txt = str(txt)

    #weird character
    txt = re.sub('[^a-zA-Z0-9]+',' ', txt)

    #multiple spaces
    txt = re.sub(' +',' ', txt)
    
    #to lower
    txt = txt.lower()
    
    #remove numbers
    txt = re.sub('\d+', '', txt)
    

    #spacy remove stopwords
    text_tokens = nlp(txt)
    tokens = [word for word in text_tokens if not word in stopwords]
    
    #lenght
    tokens = [word for word in tokens if len(word)>min_token_len]
    
    #remove irrelevant_pos
    tokens = [word for word in tokens if not word.pos_ in irrelevant_pos]
    
    #lemmentatiation
    tokens = [word.lemma_ for word in tokens]
    
    return tokens
    

  txt = re.sub('\d+', '', txt)


In [4]:
def remove_stopwords(tokens):
    tokens = [word for word in tokens if not word in stopwords]
    return tokens

In [5]:
def join_text(tokens):
    return " ".join(tokens)

In [6]:
# bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# bigram_mod = gensim.models.phrases.Phraser(bigram)

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

In [7]:
topic_labels = {0:'01',
                1:'02',
                2:'03',
                3:'04',
                4:'05',
                5:'06',
                6:'07',
                7:'08',
                8:'09',
                9:'10',
                10:'11',
                11:'12',
                12:'13',
                13:'14',
                14:'15'}

## Load and merge data frames

In [26]:
global_path= "../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/make_processed/"

df_train_long = pd.read_csv(global_path+"train_dataset.csv")
df_resume_long = pd.read_csv(global_path+"english_clean_resumes.csv")


In [27]:
df_train_long.head(2)

Unnamed: 0.1,Unnamed: 0,employee_code,original_hire_date,max_hire_date,gender,job_title,position_status,worker_category,birth_year,report_date_week_ending_,...,language,resume_found,months_high_perf,months_with_perf,hp_perc,hp_class,perf_found,rehired_,referral_flag,exclusion_code
0,2825,MZE,"Saturday, May 18, 2019",2019-05-18,Female,SALES ASSOCIATE,Active,Permanent Full Time,1997,2019-05-24,...,English,1.0,0.0,3.0,0.0,0.0,1.0,,1.0,06-Pass
1,2639,MRU,"Friday, February 22, 2019",2019-02-22,Male,SALES ASSOCIATE,Active,Permanent Full Time,1991,2019-02-22,...,English,1.0,0.0,3.0,0.0,0.0,1.0,,,06-Pass


In [28]:
df_resume_long.head(2)

Unnamed: 0.1,Unnamed: 0,employee_name,employee_code,store,raw_resume,resume_text,resume_bline,language,file_type,clean_text
0,0,"Abbasi, Samiee Z",N42,WW 374,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"Samiee Zahid Abbasi 61 Muscat Crescent Ajax, ON L1Z 0B8 Phone: 647-285-3809 E-mail: sabbasii@ic...","['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',...",English,docx,Zahid Muscat Crescent look build great work fastpaced environment skill Abilities read write spe...
1,1,"Abdi, Melika",N3O,TB 146,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkills\n\nExperien...,"Skills Experience MELIKA ABDI Carmelo Avenue Coquitlam, BC V3B7M9 Ph: 6047049487 melikaabdi94@g...","['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',...",English,pdf,Skills Experience Carmelo Avenue Ph ability work pressure effective leadership managemet Teamwor...


In [29]:
df_resume_long.shape

(496, 10)

In [30]:
df_train = df_train_long[['employee_code', 'hp_class']]
df_resume = df_resume_long[['employee_code', 'clean_text']]

In [31]:
df_topic_model = pd.merge(df_train, df_resume)

## Topic modeling

### Full training set

In [32]:
sp = spacy.load('en_core_web_sm')
sp.Defaults.stop_words |= {'customer', 'skill', 'experience', 'complete', 'june', 'program', 'sale', 'july', 'october', 'september', 'high', 'able', 'meet', 'year', 'need', 'joan', 'mean', 'management', 'client', 'service', 'team', 'product', 'store', 'ensure', 'manager', 'manager', 'communication', 'company', 'environment', 'work', 'provide', 'knowledge', 'business', 'target', 'associate', 'good', 'maintain', 'april', 'nee', 'employee', 'include', 'november'}
stopwords = sp.Defaults.stop_words

In [33]:
df_topic_model['token'] = df_topic_model['clean_text'].apply(preprocess)

In [34]:
df_topic_model['token_2'] = df_topic_model['token'].apply(remove_stopwords)

In [35]:
corpus = df_topic_model["token"]
dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

In [36]:
lda = models.LdaModel(corpus=doc_term_matrix, 
                      id2word=dictionary, 
                      num_topics=15,
                      passes=20,
                      random_state=42)

In [37]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, sort_topics=False)
vis

## Cluster Analysis

In [38]:
df_topic_model['token_join'] = df_topic_model['token_2'].apply(join_text)
df_topic_model['cluster'] = df_topic_model['token_join'].apply(get_most_prob_topic, model=lda)
df_topic_model_group = df_topic_model.drop(['employee_code', 'clean_text', 'token', 'token_2'], axis=1)
df_topic_model_group_hp = df_topic_model_group.groupby('hp_class')
df_topic_model_group_cluster = df_topic_model_group.groupby('cluster')
df_topic_model_group_all = df_topic_model_group.groupby(['hp_class', 'cluster'])
df_topic_model_group['cluster'] = df_topic_model_group["cluster"].astype('category')
df_topic_model_group['hp_class'] = df_topic_model_group["hp_class"].astype('category')

In [39]:
alt.Chart(df_topic_model_group).mark_bar().encode(
    x=alt.X('cluster:O'),
    y=alt.Y('count()'),
    color=alt.Color('hp_class:N')
)

In [40]:
alt.Chart(df_topic_model_group).mark_bar().encode(
    x=alt.X('cluster:O'),
    y=alt.Y('count()'),
    color=alt.Color('hp_class:N'),
    column=alt.Column('hp_class:N')
)

In [41]:
(df_topic_model_group_cluster['hp_class'].count()/df_topic_model_group_cluster.count()['hp_class'].sum())

cluster
01    0.364583
02    0.006944
03    0.024306
04    0.003472
05    0.031250
06    0.010417
07    0.020833
08    0.083333
09    0.086806
10    0.006944
11    0.034722
12    0.166667
13    0.052083
14    0.069444
15    0.038194
Name: hp_class, dtype: float64

In [42]:
df_topic_model_group_all.size()

hp_class  cluster
0.0       01         70
          02          1
          03          7
          04          1
          05          8
          06          3
          07          5
          08         17
          09         22
          10          2
          11          8
          12         36
          13         13
          14         17
          15          8
1.0       01         35
          02          1
          05          1
          07          1
          08          7
          09          3
          11          2
          12         12
          13          2
          14          3
          15          3
dtype: int64

### Only high performers 

In [44]:
df_topic_model_high = df_topic_model.query("hp_class == 1.0")

In [45]:
corpus = df_topic_model_high["token_2"]
dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

In [46]:
lda_high = models.LdaModel(corpus=doc_term_matrix, 
                           id2word=dictionary, 
                           num_topics=15,
                           passes=30)

In [47]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_high, doc_term_matrix, dictionary, sort_topics=False)
vis

### Only low performers

In [48]:
df_topic_model_low = df_topic_model.query("hp_class == 0.0")

In [49]:
corpus = df_topic_model_low["token_2"]
dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

In [50]:
lda_low = models.LdaModel(corpus=doc_term_matrix, 
                           id2word=dictionary, 
                           num_topics=15, 
                           passes=30)

In [51]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_low, doc_term_matrix, dictionary, sort_topics=False)
vis

In [52]:
# def get_most_prob_topic_w_prob(unseen_document, model = lda):
#     """
#     Given an unseen_document, and a trained LDA model, this function
#     finds the most likely topic (topic with the highest probability) from the 
#     topic distribution of the unseen document and returns the best topic with 
#     its probability. . 
    
#     Parameters
#     ------------
#     unseen_document : (str) 
#         the document to be labeled with a topic
#     model : (gensim ldamodel) 
#         the trained LDA model
    
#     Returns: 
#     -------------
#         (str) a string of the form 
#         `most likely topic label:probability of that label` 
    
#     Examples:
#     ----------
#     >> get_most_prob_topic("The research uses an HMM for discovering gene sequence.", 
#                             model = lda)
#     Science and Technology:0.435
#     """    
#     #preprocess 
#     preprocessed = preprocess(unseen_document)
    
#     #process into a doc_term_matrix
#     corpus = preprocessed
#     bow_vector = dictionary.doc2bow(corpus)
    
    
#     #find simliar articals
#     simliar_artical = model[bow_vector]
    
    
#     #change the topic name and sort tuple
#     items = []

#     for idx, prob in simliar_artical:
#         items.append((topic_labels[idx], prob))

#     items = sorted(items, key=lambda x: x[1], reverse=True)
#     out = str(items[0][0])+":"+str(items[0][1])
#     return out