This notebook explores topic modelling with Latent Dirichlet Allocation (LDA), using gensim's implementation on German and English task descriptions. <hr>

> **TODO:** To improve results, plot the frequency distribution of words/lemmas in the corpus and remove its head (the most frequent terms) and its tail (the rarest terms).

In [1]:
import pandas as pd
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from pprint import pprint
import spacy
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
def get_corpus(data, min_len=3):
    """Build a gensim dictionary and a bag-of-words corpus from tokenised documents.

    Parameters
    ----------
    data : tokenised documents; the cells in this notebook pass a list of
        token lists (one list of strings per document).
    min_len : tokens whose string length is strictly below this value are
        removed from the dictionary before the corpus is built.

    Returns
    -------
    (id2word, corpus) : the filtered dictionary and the list of per-document
        bag-of-words representations produced by ``doc2bow``.
    """
    # Token <-> integer-id mapping over the whole collection.
    vocabulary = corpora.Dictionary(data)

    # Collect the ids of tokens that are too short, then drop them in one go.
    short_token_ids = []
    for token_id, token in vocabulary.items():
        if len(token) < min_len:
            short_token_ids.append(token_id)
    vocabulary.filter_tokens(bad_ids=short_token_ids)

    # Encode every document against the filtered vocabulary.
    bow_corpus = list(map(vocabulary.doc2bow, data))

    return vocabulary, bow_corpus

In [13]:
def get_best_model(corpus, id2word, title, language, plot=False, save_plot=False,
                   texts=None, num_topics=3, passes_grid=(10, 20, 30, 40, 50),
                   random_state=None):
    """Train one LDA model per value in ``passes_grid`` and return the most coherent.

    For every number of passes an ``LdaModel`` is fitted with a fixed number
    of topics, scored with the ``c_v`` coherence measure, and the score is
    printed. The model with the highest score is returned (on ties the later
    one wins).

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaModel``.
    id2word : dictionary passed to ``LdaModel`` and ``CoherenceModel``.
    title : title of the coherence plot.
    language : suffix used in the saved figure name
        (``figures/LDA_coh_<language>``).
    plot : if True, draw coherence score against number of passes.
    save_plot : if True, save that figure; the figure is created even when
        ``plot`` is False.
    texts : tokenised documents used by the ``c_v`` coherence computation.
        If omitted, the notebook-level variable ``data`` is used, which keeps
        older calls working; passing it explicitly is preferred.
    num_topics : number of topics for every fitted model.
    passes_grid : iterable of ``passes`` values to try.
    random_state : forwarded to ``LdaModel`` so a run can be reproduced.

    Returns
    -------
    The fitted ``LdaModel`` with the highest coherence score.

    Raises
    ------
    ValueError : if ``passes_grid`` is empty.
    """
    import math  # local import: only needed for the NaN guard below

    if texts is None:
        # Backward compatibility: earlier calls relied on the global `data`.
        texts = data

    passes_grid = list(passes_grid)
    if not passes_grid:
        raise ValueError("passes_grid must contain at least one value")

    coh_scores = []
    best_lda = None
    best_rank = float("-inf")
    for passes in passes_grid:
        lda_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            passes=passes,
            update_every=1,
            alpha='auto',
            eta=None,
            decay=0.9,
            eval_every=5,
            iterations=100,
            random_state=random_state,
            per_word_topics=True)

        coherence_model_lda = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=id2word,
            coherence='c_v')

        coherence_lda = coherence_model_lda.get_coherence()
        print("passes: {}, score: {}".format(passes, coherence_lda))
        coh_scores.append(coherence_lda)

        # Rank a NaN score below every real score so a model is always
        # selected and a NaN can never be picked over a valid score.
        rank = float("-inf") if math.isnan(coherence_lda) else coherence_lda
        if rank >= best_rank:
            best_lda, best_rank = lda_model, rank

    if plot or save_plot:
        fig, ax = plt.subplots(1, 1)
        # x-values are the passes actually tried, so x and y always have
        # the same length.
        ax.plot(passes_grid, coh_scores, marker='o', linestyle='--')
        ax.set_title(title)
        ax.set_ylabel("Coherence score")
        ax.set_xlabel("Number of passes")
        ax.grid(True)
        if save_plot:
            fig.savefig("figures/LDA_coh_" + language, bbox_inches="tight")
        if not plot:
            # The figure was only wanted on disk; do not display it.
            plt.close(fig)

    return best_lda

<hr>

**German tasks**

In [14]:
# Load the preprocessed German tasks and keep only rows that have a description.
# A chained dropna (instead of inplace=True) makes the cell safe to re-run.
df = pd.read_csv("data/all_preprocessed_tasks_DE.csv").dropna(subset=["description"])

# Whitespace-tokenise every description into a list of tokens.
data = df["description"].str.split().to_list()

# get_best_model sweeps the number of passes with a fixed number of topics,
# so the plot title names passes rather than topics.
title = "Coherence score by number of passes in German tasks"

id2word, corpus = get_corpus(data)

lda_model = get_best_model(corpus, id2word, title=title, language="DE")

passes: 10, score: 0.5499038060390347
passes: 20, score: 0.4591994068542385
passes: 30, score: 0.5044541822968583
passes: 40, score: 0.5614911877047016
passes: 50, score: 0.5420106761776485


In [5]:
# Display the topics of the model returned by get_best_model for the German tasks.
# NOTE(review): the saved AttributeError output ('int' object ...) comes from a
# stale run: this cell's execution count (5) is older than the In [14] cell
# that assigns lda_model. Re-run after that cell to refresh the output.
lda_model.show_topics()

AttributeError: 'int' object has no attribute 'show_topics'

In [None]:
# Visualization: interactive pyLDAvis view of the German-task model.
# Relies on lda_model, corpus and id2word as left by the German cells above,
# so run those first.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word) 
# Bare last expression so the notebook displays the prepared visualisation.
vis

<hr>

**English tasks**

In [None]:
# Load the preprocessed English tasks and keep only rows that have a description.
# A chained dropna (instead of inplace=True) makes the cell safe to re-run.
df = pd.read_csv("data/all_preprocessed_tasks_EN.csv").dropna(subset=["description"])

# Whitespace-tokenise every description into a list of tokens.
data = df["description"].str.split().to_list()

# get_best_model sweeps the number of passes with a fixed number of topics,
# so the plot title names passes rather than topics.
title = "Coherence score by number of passes in Augmented English tasks"

id2word, corpus = get_corpus(data)

lda_model = get_best_model(corpus, id2word, title=title, language="EN", save_plot=False)

In [5]:
# Display the topics of the model returned by get_best_model for the English tasks.
# NOTE(review): this cell's execution count (5) predates the load/fit cell
# above (In [None]), so the saved output may come from an older run. Re-run in order.
lda_model.print_topics() 

[(0,
  '0.031*"like" + 0.016*"name" + 0.015*"english" + 0.015*"rise" + 0.014*"translat" + 0.007*"societi" + 0.007*"bear" + 0.007*"woman" + 0.007*"mean" + 0.006*"may"'),
 (1,
  '0.021*"sie" + 0.019*"den" + 0.018*"satz" + 0.017*"ubersetzen" + 0.013*"say" + 0.008*"dora" + 0.008*"luke" + 0.007*"get" + 0.007*"chri" + 0.007*"see"'),
 (2,
  '0.012*"englisch" + 0.012*"ship" + 0.010*"auf" + 0.009*"schreib" + 0.009*"horst" + 0.009*"make" + 0.008*"oil" + 0.007*"peopl" + 0.006*"use" + 0.006*"translat"')]

In [6]:
# Visualization: interactive pyLDAvis view of the English-task model.
# Relies on lda_model, corpus and id2word as left by the English cells above,
# so run those first.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word) 
# Bare last expression so the notebook displays the prepared visualisation.
vis

<hr>

Useful methods of gensim's `LdaModel`: <br>
- get_document_topics(bow[, ...]) 	Return topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.<br>
- get_term_topics(word_id[, minimum_probability]) 	Returns most likely topics for a particular word in vocab.<br>
- get_topic_terms(topicid[, topn]) 	Return a list of (word_id, probability) 2-tuples for the most probable words in topic topicid. <br>
- show_topic(topicid[, topn]) Return a list of (word, probability) 2-tuples for the most probable words in topic topicid.<br>
- top_topics(corpus[, num_words]) 	Calculate the Umass topic coherence for each topic.<br>
  https://tedboy.github.io/nlps/generated/generated/gensim.models.LdaModel.html                                             