# The following file contains: 

- LDA
- Topic Modeling/Coherence
- TF-IDF/Bag of Words with Polynomial Model
- Similarity Scores

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import nltk
import re
import spacy
from spacy.lang.en import English
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

import glob
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from nltk.probability import FreqDist

import json


In [2]:
df = pd.read_csv('../data_processed/concatenated_data_cleaned_labeled_preprocessed.csv') 

### LDA With Gensim

list of tokens -> bag-of-words corpus -> dictionary -> bag-of-words corpus -> LDA model



from LDA model -> extract top topics

In [7]:
desc_tokens = list(df['desc_tokens'])

In [10]:
df['desc_tokens']

0       ['scientifically', 'prove', 'method', 'user', ...
1       ['overview', 'numbers', 'finding', 'story', 'n...
2       ['world', 'premiere', 'travel', 'technology', ...
3       ['medical', 'scientist', 'least', 'years', 'cl...
4       ['class="jobsectionheader">company', 'overview...
5       ['responsible', 'assist', 'development', 'stat...
6       ['class="jobsectionheader">overview', 'respons...
7       ['description', 'looking', 'highly', 'motivate...
8       ['ready', 'challenge', 'celonis', 'leader', 'b...
9       ['overview', 'supply', 'chain', 'group', 'asso...
10      ['customer', 'operations', 'scientist', 'formu...
11      ['report', 'science', 'understand', 'business'...
12      ['getty', 'image', 'looking', 'individual', 'e...
13      ['class="jobsectionheader">description', 'repo...
14      ['jotitle', 'analyst', 'scientist', 'jodescrip...
15      ['position', 'works', 'scientist', 'report', '...
16      ['scientist', 'intern', 'industry', 'leading',...
17      ['summ

In [8]:
# Create a dictionary from the data
dictionary = corpora.Dictionary(desc_tokens)

# Create a bag-of-words corpus 
# also called corpus! 
doc_term_matrix = [dictionary.doc2bow(text) for text in desc_tokens]

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [None]:
pickle.dump(corpus, open('corpus.pkl', 'wb'))
#dictionary.save('dictionary.gensim')

Documentation

https://radimrehurek.com/gensim/models/ldamodel.html

## Topic modeling over all descriptions

In [None]:
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
#ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
    
    

## Determining Optimum Number of Topics to Use
### And coherence measurements

As seen on
https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, desc_tokens, stop, start=2, step=3):
    """
    From: https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSi model on range of num_topics 
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=desc_tokens, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(desc_tokens,start, stop, step, dictionary, doc_term_matrix):
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,desc_tokens,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.figure(figsize=(10,7))
    plt.plot(x, coherence_values)
    plt.title('Coherence Score to Assess Number of Topics')
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.show()

In [None]:

start,stop,step=2,12,1
plot_graph(desc_tokens,start,stop,step, dictionary, doc_term_matrix)

#### Results: 
Optimal number of topics to evaluate is 3

In [None]:
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
#ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
# Create new model using different data set

# pyLDAvis

In [None]:
#dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
#lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Text Classification Model
### Using Multinomial Naive Bayes Classification from scikit-learn

As seen on: 
https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

The following model uses the one-hot-encoded description column in the data frame as the set of X features and evaluates the searched role as the response y variable. 

## 1. Using TF-IDF


In [None]:

vectorizer = TfidfVectorizer()

# generate matrix of word vectors 
tfidf_matrix = vectorizer.fit_transform(descriptions)

print(tfidf_matrix)

In [None]:
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['description'])

# Build the training and sets using the one-hot-encoded TF-IDF vectorizer
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, df['search_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

## 2. Using Bag of Words


In [None]:
# You can generate document term matrix by using scikit-learn's CountVectorizer.
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['description'])


X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['search_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

In [None]:
# for trigram model

cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (3,3),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['description'])

X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['search_role'], test_size=0.3, random_state=1)

clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

# Word Similarity Between Aggregated Posts Grouped by Search Role
- Uses Cosine Similarity score

As seen on:
https://www.datacamp.com/courses/feature-engineering-for-nlp-in-python

In [None]:
# load model and create Doc object

nlp = spacy.load('en_core_web_sm')


for i, merged_text in enumerate(descriptions):
    doc = nlp(merged_text)


    for token1 in doc:
        for token2 in doc:
            print(token1.text, token2.text, token1.similarity(token2))
            print()
        #print(token.vector)