# The following file contains: 

- LDA
- Topic Modeling/Coherence
- TF-IDF/Bag of Words with Polynomial Model
- Similarity Scores

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import nltk
import re
import spacy
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

import glob
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from nltk.probability import FreqDist

import json


In [2]:
df = pd.read_csv('../data_processed/concatenated_data_cleaned_labeled_preprocessed_alt.csv') 

### LDA With Gensim
As seen on: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

list of tokens -> bag-of-words corpus -> dictionary -> bag-of-words corpus -> LDA model



from LDA model -> extract top topics

Documentation of Gensim

https://radimrehurek.com/gensim/models/ldamodel.html

### Preprocessing for LDA

In [148]:
'''Fixing desc_tokens column'''
#df_test = df.iloc[0:10, :]


desc_tokens = list(df['new_description'])

new_col = []
for i, item in enumerate(desc_tokens):
    listed = list(item.split())
    new_col.append(listed)

df['desc_tokens'] = new_col

desc_tokens = list(df['desc_tokens'])
desc_tokens[0:2]

[['noom',
  'scientifically',
  'proven',
  'method',
  'help',
  'user',
  'create',
  'healthier',
  'lifestyle',
  'manage',
  'important',
  'condition',
  'like',
  'ii',
  'diabetes',
  'obesity',
  'hypertension',
  'engineering',
  'team',
  'forefront',
  'challenge',
  'solving',
  'complex',
  'technical',
  'problem',
  'center',
  'around',
  'habit',
  'behavior',
  'lifestyle',
  'looking',
  'data',
  'engineer',
  'join',
  'data',
  'team',
  'help',
  'u',
  'improve',
  'maintain',
  'data',
  'warehouse',
  'like',
  'billion',
  'row',
  'data',
  'center',
  'data',
  'driven',
  'decision',
  'love',
  'like',
  'u',
  'problem',
  'affect',
  'life',
  'real',
  'people',
  'user',
  'depend',
  'u',
  'make',
  'positive',
  'change',
  'health',
  'life',
  'base',
  'scientifically',
  'proven',
  'peer',
  'reviewed',
  'methodology',
  'designed',
  'medical',
  'professionalswe',
  'respectful',
  'diverse',
  'dynamic',
  'environment',
  'engineering',


### Creating a dictionary from aggregated data descriptions. 

In [149]:
# Create a dictionary from the data
dictionary = corpora.Dictionary(desc_tokens)

In [153]:
# Create a bag-of-words corpus 
# also called corpus! 

doc_term_matrix = [dictionary.doc2bow(text) for text in desc_tokens]
print(len(doc_term_matrix))
print(doc_term_matrix[0:2])

8880
[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 9), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 3), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 3), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 3), (66, 2), (67, 5), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 5), (97, 1), (98, 1), (99, 2), (100, 1), (101, 2), (102, 1), (103, 2), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (11

In [151]:
pickle.dump(doc_term_matrix, open('doc_term_matrix.pkl', 'wb'))
#dictionary.save('dictionary.gensim')

### Preprocessing of Descriptions Grouped by Bin Roles

In [170]:
by_role = df.groupby('bin_role')

'''Creates a dictionary called by_role_tokens where an aggregated list of tokens is saved for each bin role
NOTE: that desc_tokens should be a list of lists - one list of words per document'''

by_role_tokens = {}

for key, item in by_role:
    aggregated_tokens = []
    print(key)
    #print(item['desc_tokens'])
    desc_tokens = list(item['desc_tokens'])

    #for item2 in item['desc_tokens']:
        #aggregated_tokens = aggregated_tokens + item2
    by_role_tokens[key] = desc_tokens

Business Analyst
Data Analyst
Data Engineer
Data Scientist
ML/AI Researcher
Machine Learning Engineer
Software Engineer
Statistical Modeler/Researcher
Unclassified


## Topic modeling for each bin_role group

### Question --> Can we use the same dictionary for each subject/bin role subject analysis? Or should we create new models for each role? 

In [175]:
# Top 3 subjects/topics for each bin_role grouping

NUM_TOPICS = 3

for key, values in by_role_tokens.items():
    
    doc_term_matrix = [dictionary.doc2bow(text) for text in values]
    
    ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

    topics = ldamodel.print_topics(num_words=4)
    print(key)
    for topic in topics:
        print(topic)
    print()

Business Analyst
(0, '0.026*"business" + 0.012*"project" + 0.011*"process" + 0.010*"system"')
(1, '0.028*"business" + 0.023*"data" + 0.011*"team" + 0.007*"reporting"')
(2, '0.018*"data" + 0.011*"business" + 0.009*"management" + 0.009*"system"')

Data Analyst
(0, '0.037*"data" + 0.012*"business" + 0.008*"team" + 0.008*"analysis"')
(1, '0.022*"data" + 0.006*"analysis" + 0.006*"position" + 0.006*"preferred"')
(2, '0.031*"data" + 0.006*"analytics" + 0.006*"team" + 0.006*"process"')

Data Engineer
(0, '0.036*"u" + 0.016*"data" + 0.010*"learning" + 0.010*"team"')
(1, '0.033*"data" + 0.008*"cloud" + 0.008*"technology" + 0.008*"team"')
(2, '0.051*"data" + 0.009*"business" + 0.008*"team" + 0.006*"design"')

Data Scientist
(0, '0.025*"data" + 0.007*"team" + 0.007*"business" + 0.006*"model"')
(1, '0.038*"data" + 0.010*"science" + 0.009*"learning" + 0.009*"business"')
(2, '0.029*"data" + 0.014*"business" + 0.009*"science" + 0.008*"learning"')

ML/AI Researcher
(0, '0.028*"learning" + 0.024*"machin

### Same as above but with new dictionaries for each bin role

In [177]:
# Top 3 subjects/topics for each bin_role grouping

NUM_TOPICS = 3

for key, values in by_role_tokens.items():
    
    # Create a dictionary from the data
    dictionary = corpora.Dictionary(values)
    
    doc_term_matrix = [dictionary.doc2bow(text) for text in values]
    
    ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

    topics = ldamodel.print_topics(num_words=4)
    print(key)
    for topic in topics:
        print(topic)
    print()

Business Analyst
(0, '0.025*"data" + 0.019*"business" + 0.010*"team" + 0.006*"system"')
(1, '0.033*"business" + 0.016*"data" + 0.011*"process" + 0.011*"project"')
(2, '0.019*"business" + 0.008*"analysis" + 0.008*"project" + 0.007*"system"')

Data Analyst
(0, '0.034*"data" + 0.013*"business" + 0.009*"analysis" + 0.008*"team"')
(1, '0.043*"data" + 0.013*"business" + 0.009*"team" + 0.007*"analytics"')
(2, '0.031*"data" + 0.007*"information" + 0.006*"management" + 0.006*"support"')

Data Engineer
(0, '0.044*"u" + 0.021*"data" + 0.014*"learning" + 0.011*"team"')
(1, '0.047*"data" + 0.008*"team" + 0.008*"business" + 0.008*"system"')
(2, '0.043*"data" + 0.009*"technology" + 0.009*"team" + 0.009*"cloud"')

Data Scientist
(0, '0.037*"data" + 0.011*"learning" + 0.010*"science" + 0.009*"team"')
(1, '0.037*"data" + 0.012*"business" + 0.009*"science" + 0.008*"team"')
(2, '0.027*"data" + 0.009*"business" + 0.008*"team" + 0.007*"learning"')

ML/AI Researcher
(0, '0.034*"learning" + 0.027*"machine" + 

# pyLDAvis

Establishes most salient terms and creates distance plot maps of words

In [None]:
#dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
#lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Text Classification Model
### Using Multinomial Naive Bayes Classification from scikit-learn

As seen on: 
https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

The following model uses the one-hot-encoded description column in the data frame as the set of X features and evaluates the searched role as the response y variable. 

## 1. Using TF-IDF


In [184]:
def group_descriptions(by_role):
    '''Creates concatenated/joined text for job descriptions for each bin role'''
    roles = []
    merged_desc = []
    for key, items in by_role.indices.items():
        # concatenate strings in descriptions column (by search_role grouping) and append to list of merged descriptions
        string=(" ").join(description for description in df.loc[items,'new_description'])
        merged_desc.append(string)
        roles.append(key)

    return merged_desc, roles

In [185]:
'''descriptions variable is list of concatenated descriptions separated by bin role'''
descriptions, roles = group_descriptions(by_role)

In [191]:
vectorizer = TfidfVectorizer()

# generate matrix of word vectors 
tfidf_matrix = vectorizer.fit_transform(descriptions)

print(tfidf_matrix)

  (0, 6511)	0.001014438350062388
  (0, 19322)	0.020021007121598784
  (0, 749)	0.0033523750826131636
  (0, 1)	0.025074853579478088
  (0, 807)	0.008677986892557077
  (0, 24218)	0.03479378907539983
  (0, 7354)	0.0023493786852747823
  (0, 21170)	0.00558719704041346
  (0, 16946)	0.001783147991621317
  (0, 4020)	0.03372470617084844
  (0, 1968)	0.0008321357294232813
  (0, 13028)	0.005468320507638706
  (0, 16754)	0.005824950105962969
  (0, 3666)	0.0008227794116481934
  (0, 13496)	0.02449387481723123
  (0, 18733)	0.03178091907166409
  (0, 13504)	0.018155547561895077
  (0, 5492)	0.01418964582404574
  (0, 9245)	0.01380088840420887
  (0, 15423)	0.0013606509694290435
  (0, 8584)	0.0005274174507629513
  (0, 3024)	0.002936723356593478
  (0, 21900)	0.08970577462735765
  (0, 20617)	0.004160678647116406
  (0, 4711)	0.07551612880331192
  :	:
  (8, 9754)	0.0004560426821465608
  (8, 16299)	7.600711369109347e-05
  (8, 14881)	7.600711369109347e-05
  (8, 13816)	0.00015201422738218693
  (8, 24323)	7.6007113691

In [193]:
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['description'])

# Build the training and sets using the one-hot-encoded TF-IDF vectorizer
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, df['bin_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.4804804804804805


## 2. Using Bag of Words


In [None]:
# You can generate document term matrix by using scikit-learn's CountVectorizer.
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['description'])


X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['search_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

In [None]:
# for trigram model

cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (3,3),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['description'])

X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['search_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

# Word Similarity Between Aggregated Posts Grouped by Search Role
- Uses Cosine Similarity score

As seen on:
https://www.datacamp.com/courses/feature-engineering-for-nlp-in-python

In [None]:
# load model and create Doc object

nlp = spacy.load('en_core_web_sm')


for i, merged_text in enumerate(descriptions):
    doc = nlp(merged_text)


    for token1 in doc:
        for token2 in doc:
            print(token1.text, token2.text, token1.similarity(token2))
            print()
        #print(token.vector)