# Latent Dirichlet Allocation

In [None]:
import pandas as pd
import numpy as np
# Load the training set. index_col=False tells pandas not to use the first
# column as the index, so rows get a default 0..n-1 index.
data = pd.read_csv('NUS_training_data.csv', encoding='unicode_escape', index_col=False)

# Label column, captured before the shuffle below (it keeps the original
# row order but still aligns with `data` through the index).
labels = data['E_OCC']

# Shuffle the rows by sorting on a uniform random key in [0, 1).
# The key is drawn from a seeded generator so that Restart & Run All
# reproduces the same row order (and therefore the same downstream models).
data['rand'] = np.random.default_rng(0).uniform(0, 1, len(data))
data = data.sort_values(by=['rand'])

In [2]:
# Function to remove Punctuation

import string

def remove_punc(text):
    """Return ``text`` with every character listed in ``string.punctuation`` dropped."""
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue  # punctuation character: skip it
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from each entry of the E_OCC_Desc column.
data['desc_clean'] = data['E_OCC_Desc'].apply(remove_punc)

# data.head()

In [3]:
# Function to Tokenize words

import re

def tokenise(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of regex word characters (letters, digits,
    underscore); any other character acts as a separator.

    Args:
        text: The string to split.

    Returns:
        The tokens in order of appearance. Leading/trailing separators and
        an empty input yield no empty-string tokens, so ``tokenise("")``
        returns ``[]``.
    """
    # Raw string on purpose: a backslash-W inside a normal literal is an
    # invalid escape sequence. Matching the word runs directly (instead of
    # splitting on the separators) avoids the '' tokens that re.split emits
    # at the edges of the string.
    return re.findall(r'\w+', text)

# Lower-case each cleaned description, then split it into word tokens.
data['desc_tokenised'] = data['desc_clean'].apply(str.lower).apply(tokenise)

# data.head()

In [4]:
# Function to remove stopwords

import nltk
# Fetch NLTK's stop-word corpus so the word list below can be loaded.
nltk.download('stopwords')

# English stop words; consulted by remove_stopwords() and clean_text().
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(lst):
    """Return the tokens of ``lst``, in order, that are not in the module-level ``stopword`` list."""
    return list(filter(lambda token: token not in stopword, lst))

# Drop stop words from every token list.
data['desc_nostop'] = data['desc_tokenised'].apply(remove_stopwords)

# data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Lemmatizing

import nltk
# Fetch the WordNet corpus used by the lemmatizer created below.
nltk.download('wordnet')

# Lemmatizer instance shared by lemmatizing() and clean_text().
wn = nltk.WordNetLemmatizer()

def lemmatizing(token):
    """Lemmatize each word of the token list ``token`` with the shared ``wn`` lemmatizer.

    Returns a new list of the same length and order.
    """
    return list(map(wn.lemmatize, token))

# Lemmatize every stop-word-free token list.
data['desc_lemmatized'] = data['desc_nostop'].apply(lemmatizing)

# data

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def clean_text(text):
    """Run the whole cleaning pipeline on one raw string.

    Steps: drop punctuation characters and lower-case the rest, split into
    word tokens, discard tokens found in the module-level ``stopword`` list,
    and lemmatize the survivors with the shared ``wn`` lemmatizer.

    Args:
        text: The raw string to clean.

    Returns:
        A list of lemmatized tokens. No empty-string tokens are produced,
        so an empty or whitespace-only input returns ``[]``.
    """
    text = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw-string pattern (a backslash-W in a normal literal is an invalid
    # escape sequence). Matching word runs instead of splitting on the
    # separators keeps '' tokens out of the result.
    tokens = re.findall(r'\w+', text)
    return [wn.lemmatize(word) for word in tokens if word not in stopword]

In [None]:
#data['desc_lemmatized']

In [9]:
# Candidate topic counts to evaluate: every odd number 1, 3, 5, ..., 67

topic_nums = [*np.arange(1, 69, 2)]

# Bag of Words LDA

In [8]:
from gensim.corpora import Dictionary

# One lemmatized token list per document.
texts = data['desc_lemmatized']
# Vocabulary built from those token lists.
dictionary = Dictionary(texts)
# Bag-of-words representation of each document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [14]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel

# Train one LDA model on the bag-of-words corpus for each candidate
# topic count and record its c_v coherence, rounded to 5 decimals.
coherence_scores = []

for num in topic_nums:
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num,
        random_state=1,
        per_word_topics=True,
    )

    # Coherence of this model, measured against the tokenised texts
    cm = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    coherence_scores += [round(cm.get_coherence(), 5)]

In [15]:
from operator import itemgetter

# Pair every candidate topic count with its coherence score, then keep the
# count whose score ranks first in descending order.
scores = [*zip(topic_nums, coherence_scores)]
best_num_topics = sorted(scores, key=lambda pair: pair[1], reverse=True)[0][0]

print(best_num_topics)

27


In [18]:
# Final bag-of-words LDA model, trained with the selected topic count

lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=best_num_topics,
    random_state=2,
    per_word_topics=True,
)

# TF-IDF LDA

In [20]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus=corpus)
# ... and apply it to that same corpus to get the TF-IDF weighted version.
corpus_tfidf = tfidf[corpus]

In [21]:
# Train one LDA model on the TF-IDF corpus for each candidate topic
# count and record its c_v coherence, rounded to 5 decimals.
coherence_scores = []

for num in topic_nums:
    lda_model_tfidf = LdaMulticore(
        corpus=corpus_tfidf,
        id2word=dictionary,
        num_topics=num,
        random_state=3,
        per_word_topics=True,
    )

    # Coherence of this model, measured against the tokenised texts
    cm = CoherenceModel(model=lda_model_tfidf, texts=texts, dictionary=dictionary, coherence='c_v')

    coherence_scores += [round(cm.get_coherence(), 5)]

In [22]:
from operator import itemgetter

# Pair every candidate topic count with its coherence score, then keep the
# count whose score ranks first in descending order.
scores = [*zip(topic_nums, coherence_scores)]
best_num_topics = sorted(scores, key=lambda pair: pair[1], reverse=True)[0][0]

print(best_num_topics)

55


In [25]:
# Best TF-IDF LDA model.
# Train on the TF-IDF weighted corpus -- the same input the topic-count
# search above was scored on. Passing the plain bag-of-words `corpus` here
# would just produce a second bag-of-words model under a TF-IDF name.

lda_model_tfidf = LdaMulticore(corpus=corpus_tfidf,
                               id2word=dictionary,
                               num_topics=best_num_topics,
                               random_state=4,
                               per_word_topics=True)

In [None]:
#for idx, topic in lda_model_tfidf.print_topics(-1):
    #print('Topic: {} Word: {}'.format(idx, topic))