This notebook explores Latent Dirichlet Allocation (LDA) topic modelling with gensim on German and English task and concept-aspect descriptions. <hr>

> Possible improvement: plot the frequency distribution of words/lemmas in the corpus and remove the head (most frequent) and tail (rarest) of the distribution before training.

In [1]:
import pandas as pd
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from pprint import pprint
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
def get_corpus(data, min_len=3):
    """Build a gensim dictionary and bag-of-words corpus from tokenised documents.

    Parameters
    ----------
    data : list of list of str
        One list of tokens per document.
    min_len : int, default 3
        Tokens shorter than ``min_len`` characters are removed from the
        dictionary before the corpus is built.

    Returns
    -------
    tuple
        ``(id2word, corpus)``: the filtered dictionary and, for every document
        in ``data``, its bag-of-words representation from ``id2word.doc2bow``.
    """
    id2word = corpora.Dictionary(data)

    # Gather the ids of all too-short tokens first, then drop them in one call.
    short_token_ids = []
    for token_id, token in id2word.items():
        if len(token) < min_len:
            short_token_ids.append(token_id)
    id2word.filter_tokens(bad_ids=short_token_ids)

    # Tokens removed above are no longer in the dictionary when the
    # documents are converted.
    corpus = list(map(id2word.doc2bow, data))

    return id2word, corpus

In [3]:
def get_model(corpus, id2word, title, num_topics=3, passes=10, decay=0.5, iterations=50,
              texts=None, random_state=None):
    """Train an LDA model and compute its c_v coherence score.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus the model is trained on.
    id2word : gensim dictionary
        Mapping between token ids and tokens used by ``corpus``.
    title : str
        Unused here; kept so existing calls keep working.
    num_topics, passes, decay, iterations
        Forwarded unchanged to ``LdaModel``.
    texts : list of list of str, optional
        Tokenised documents the coherence score is computed against. When
        omitted, the notebook-level ``data`` variable is used, which is what
        earlier versions of this function always did.
    random_state : optional
        Forwarded to ``LdaModel``; pass a fixed seed for reproducible runs.

    Returns
    -------
    tuple
        ``(lda_model, coherence_lda)``: the trained model and its coherence.
    """
    if texts is None:
        # Backward-compatible fallback for callers that rely on the global
        # ``data``. Prefer passing ``texts`` explicitly so the score is
        # guaranteed to match the corpus the model was trained on.
        texts = data

    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=False,
        passes=passes,
        update_every=1,
        alpha='auto',
        eta=None,
        decay=decay,
        eval_every=5,
        iterations=iterations,
        random_state=random_state,
        per_word_topics=True)

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=id2word,
        coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence score: {coherence_lda}")

    return lda_model, coherence_lda

def plot_coh_score(coh_scores, title, language, save=True, min_topics=2):
    """Plot coherence scores against the number of topics.

    Parameters
    ----------
    coh_scores : sequence of float
        One coherence score per evaluated model, ordered by increasing number
        of topics in steps of one.
    title : str
        Title of the figure.
    language : str
        Suffix of the saved file name (``figures/LDA_coh_<language>``).
    save : bool, default True
        Whether to write the figure to the ``figures`` directory.
    min_topics : int, default 2
        Number of topics of the first score in ``coh_scores``.
    """
    # Derive the x axis from the data instead of hard-coding range(2, 11), so
    # the plot works for any number of evaluated models.
    topic_counts = range(min_topics, min_topics + len(coh_scores))

    fig, ax = plt.subplots(1, 1)
    ax.plot(topic_counts, coh_scores, marker='o', linestyle='--')
    ax.set_title(title)
    ax.set_ylabel("Coherence score")
    ax.set_xlabel('Number of topics')
    ax.grid(True)
    if save:
        import os

        # savefig does not create missing directories.
        os.makedirs("figures", exist_ok=True)
        fig.savefig("figures/LDA_coh_" + language, bbox_inches="tight")


In [4]:
def get_best_model(corpus, id2word, title, language, plot=False, save_plot=False,
                   passes=10, decay=0.5, iterations=50):
    """Train LDA models with 2 to 10 topics and return the most coherent one.

    Parameters
    ----------
    corpus, id2word, title
        Forwarded to ``get_model`` for every candidate number of topics.
    language : str
        Forwarded to ``plot_coh_score`` (used in the saved file name).
    plot : bool, default False
        Whether to plot the coherence score of every candidate.
    save_plot : bool, default False
        Forwarded to ``plot_coh_score`` as its ``save`` flag.
    passes, decay, iterations
        Training hyper-parameters forwarded to ``get_model``. They are explicit
        arguments so the search does not depend on notebook-level variables.

    Returns
    -------
    The model with the highest coherence score; on a tie the model with more
    topics wins.
    """
    coh_scores = []
    best_model = None
    for num_topics in range(2, 11):
        lda_model, coherence_lda = get_model(corpus,
                                             id2word,
                                             title,
                                             num_topics=num_topics,
                                             passes=passes,
                                             decay=decay,
                                             iterations=iterations)
        # Score bookkeeping has to happen per candidate, inside the loop,
        # otherwise only the last model is ever recorded.
        coh_scores.append(coherence_lda)
        if coherence_lda == max(coh_scores):
            best_model = lda_model

    if plot:
        plot_coh_score(coh_scores, title, language, save_plot)

    return best_model

<hr>

**German tasks**

In [95]:
# Quick look at the preprocessed German task descriptions.
df = pd.read_csv("data/all_preprocessed_tasks_DE.csv") 
df.head()

Unnamed: 0,taskId,language,description,topic_id,word_count
0,9Aa4h4yosMb9oAglIYVbMr,DE,translat into german він народився у швеицаріі,,7
1,2T3rZTRBlAv94jV8YNcbgq,DE,brief zeitung schreiben reaktion artikel zeitu...,,56
2,2z7BrRib1zJ6KE4gVWmz6U,DE,bericht radiosendung thema heiraen deutschland...,,34
3,a0s1XNwpeGQ9pLGnqdkhni,DE,tranlat i love my dog,,5
4,5XfzyS7dJ487uL4NZfz8fZ,DE,erganz mindester zwei weiter anspruchsgruppen,,5


In [6]:
# Load the German task descriptions; rows without a description are dropped
# because they cannot be tokenised.
df = pd.read_csv("data/all_preprocessed_tasks_DE.csv").dropna(subset=["description"])
# One list of whitespace-separated tokens per task.
data = df["description"].str.split().to_list()
title = "Coherence score by number of topics in german tasks"

id2word, corpus = get_corpus(data)

# LDA hyper-parameters for this run.
num_topics, passes, decay, iterations = 3, 80, 0.9, 100

lda_model, coherence_lda = get_model(
    corpus=corpus,
    id2word=id2word,
    title=title,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.3730639746406244


In [7]:
# Most probable words (with their weights) for each topic of the German task model.
lda_model.show_topics()

[(0,
  '0.045*"satz" + 0.042*"schreib" + 0.014*"passiv" + 0.012*"prasen" + 0.009*"frage" + 0.008*"infinitiv" + 0.007*"fur" + 0.007*"beispiel" + 0.007*"elektrisch" + 0.007*"zeit"'),
 (1,
  '0.011*"englisch" + 0.011*"energi" + 0.010*"fur" + 0.010*"schreib" + 0.009*"horst" + 0.007*"temperatur" + 0.006*"bewegung" + 0.005*"erd" + 0.005*"warm" + 0.005*"antwort"'),
 (2,
  '0.011*"fur" + 0.010*"magnet" + 0.008*"bild" + 0.006*"groß" + 0.006*"geschwindigkeit" + 0.006*"beschleunigung" + 0.005*"kraft" + 0.005*"roll" + 0.005*"zwei" + 0.005*"geben"')]

In [8]:
# Visualization: pyLDAvis view of the German task model, built from the
# model together with the corpus and dictionary it was trained on.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word) 
vis

**German concept aspects**

In [9]:
# Load the German concept-aspect descriptions; rows without a description are
# dropped because they cannot be tokenised.
df = pd.read_csv("data/preprocessed_concept_aspects_DE.csv")
df.dropna(subset=["description"], inplace=True)
# One list of whitespace-separated tokens per aspect.
data = df["description"].str.split().to_list()
# The title previously read "english tasks", which mislabelled this dataset.
title = "Coherence score by number of topics in german concept aspects"

id2word, corpus = get_corpus(data)

# LDA hyper-parameters for this run.
num_topics = 3
passes = 20
decay = 0.9
iterations = 100
lda_model_aspects, coherence_lda_aspects = get_model(corpus=corpus,
                                                     id2word=id2word,
                                                     title=title,
                                                     num_topics=num_topics,
                                                     passes=passes,
                                                     decay=decay,
                                                     iterations=iterations)

Coherence score: 0.5386744115953176


In [12]:
# Most probable words (with their weights) for each topic of the German concept-aspect model.
lda_model_aspects.show_topics()

[(0,
  '0.063*"lernend" + 0.039*"the" + 0.023*"person" + 0.023*"verstehen" + 0.023*"correctli" + 0.019*"korrekt" + 0.017*"abil" + 0.017*"learner" + 0.017*"lage" + 0.015*"text"'),
 (1,
  '0.073*"antwort" + 0.060*"enthalt" + 0.058*"lernend" + 0.036*"verb" + 0.021*"bilden" + 0.019*"wurd" + 0.015*"rechtschreibfehl" + 0.014*"verben" + 0.013*"geben" + 0.013*"artikel"'),
 (2,
  '0.138*"lernend" + 0.085*"antwort" + 0.033*"setzen" + 0.031*"geben" + 0.025*"verwenden" + 0.017*"worter" + 0.017*"prasen" + 0.014*"wurd" + 0.013*"hilfsverb" + 0.011*"verstehen"')]

In [13]:
# Visualization: pyLDAvis view of the German concept-aspect model, built from
# the model together with the corpus and dictionary it was trained on.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_aspects, corpus, id2word) 
vis

<hr>

**English tasks**

In [79]:
# Load the English task descriptions, drop rows without a description and
# renumber the rows 0..n-1 so positions and index labels coincide.
df = (
    pd.read_csv("data/all_preprocessed_tasks_EN.csv")
    .dropna(subset=["description"])
    .reset_index(drop=True)
)
# One list of whitespace-separated tokens per task.
data = df["description"].str.split().to_list()

In [80]:
title = "Coherence score by number of topics in Augmented English tasks"

id2word, corpus = get_corpus(data)

# LDA hyper-parameters for this run.
num_topics, passes, decay, iterations = 3, 80, 0.9, 100

lda_model_en, coherence_lda_en = get_model(
    corpus=corpus,
    id2word=id2word,
    title=title,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.2896502134148638


In [81]:
# Most probable words (with their weights) for each topic of the English task model.
lda_model_en.print_topics() 

[(0,
  '0.018*"dora" + 0.014*"gwen" + 0.010*"look" + 0.008*"want" + 0.008*"phone" + 0.008*"day" + 0.008*"chef" + 0.007*"help" + 0.007*"woman" + 0.007*"play"'),
 (1,
  '0.013*"name" + 0.011*"rise" + 0.009*"oil" + 0.007*"man" + 0.007*"ship" + 0.006*"still" + 0.006*"well" + 0.006*"mean" + 0.006*"water" + 0.006*"ice"'),
 (2,
  '0.008*"one" + 0.008*"ship" + 0.007*"find" + 0.007*"lifeboat" + 0.007*"peopl" + 0.007*"sentenc" + 0.007*"man" + 0.007*"write" + 0.006*"come" + 0.006*"think"')]

In [82]:
documents = df["description"].to_list()

# Topic distribution of every document in the corpus, as (topic_id, probability) pairs.
topic_distributions = lda_model_en.get_document_topics(corpus)

# Document position -> {topic_id: probability}
doc_to_topic = {
    doc_idx: dict(topic_probs)
    for doc_idx, topic_probs in enumerate(topic_distributions)
}

# One row per document, one column per topic id.
df1 = pd.DataFrame.from_dict(doc_to_topic, orient='index').sort_index()
# Keep only topic weights of at least 1/3; smaller ones become NaN.
df1 = df1.mask(df1 < 1/3)

In [88]:
# Attach the task ids to the per-document topic weights. The column-wise concat
# aligns on the index, so it relies on df having been re-indexed 0..n-1 when
# the English tasks were loaded (df1 is indexed by document position).
df_tasks_topics = pd.concat([df[["taskId"]], df1], axis=1) 
df_tasks_topics.head()

Unnamed: 0,taskId,0,1,2
0,7TVZOkAoQvS71zub3YI9Uy,,,0.96269
1,8lxRyLzStOK9eUhNRal38O,0.977905,,
2,aazKG44PsKc5UnpTiDECut,,,0.593247
3,8L1QdQwEG5XaRfdYKcLPed,0.665136,,
4,14ambh1obhw7TYMQE8lcC1,,,0.864768


In [93]:
# Number of distinct task ids in the task/topic table.
len(df_tasks_topics.taskId.unique())

2549

In [89]:
# Mapping between English tasks and concept aspects (columns taskId, aspectId).
df_aspects = pd.read_csv("data/taskAspects_EN.csv")
df_aspects.head()

Unnamed: 0,taskId,aspectId
17,14ambh1obhw7TYMQE8lcC1,9639
28,25RGLvb2p0G5zulfX9xQOj,9639
57,18Ccvc8NMJT5xqLv9nAgTH,9937
103,3Jr6T26XL13aKRh31JX0xi,9633
133,3gbjpjewKN1aa5y4aN20Yw,11401


In [91]:
# Inner join: keeps only tasks present in both the topic table and the aspect mapping.
# NOTE(review): the stored output of this cell is a shape-like tuple, so it was
# presumably produced by a different version of this cell - re-run to confirm.
pd.merge(df_tasks_topics, df_aspects, on="taskId", how="inner")

(11222, 951)

**English Aspects**

In [20]:
# Number of distinct aspect ids referenced by the English task/aspect mapping.
d = pd.read_csv("data/taskAspects_EN.csv")
len(d.aspectId.unique())

269

In [17]:
# Display the preprocessed English concept-aspect descriptions.
i = pd.read_csv("data/preprocessed_concept_aspects_EN.csv")
i

Unnamed: 0,aspectId,description,type,groupId,categoryId,word_count
0,9639,word right order,CONCEPT,,,3
1,9937,good answer orang utan strong social bond,CONCEPT,,,7
2,9633,subject verb congruent,CONCEPT,,,3
3,11401,answer mention keyword look correctstat,CONCEPT,55122.0,,5
4,9984,verb conjug expect ten expectedten,CONCEPT,,,5
...,...,...,...,...,...,...
264,381128,learner compos two dimension three dimension s...,CONCEPT,342103.0,4.0,12
265,379944,answer within scope task,CONCEPT,340377.0,,4
266,381286,climat chang refer signific chang global tempe...,CONCEPT,342514.0,4.0,32
267,381288,impact climat chang widespread affect weather ...,CONCEPT,342515.0,4.0,28


In [39]:
# Load the English concept-aspect descriptions; rows without a description are
# dropped because they cannot be tokenised.
df = pd.read_csv("data/preprocessed_concept_aspects_EN.csv")
df.dropna(subset=["description"], inplace=True)
# One list of whitespace-separated tokens per aspect.
data = df["description"].str.split().to_list()
# The title previously read "english tasks", which mislabelled this dataset.
title = "Coherence score by number of topics in english concept aspects"

id2word, corpus = get_corpus(data)

# LDA hyper-parameters for this run.
num_topics = 3
passes = 20
decay = 0.9
iterations = 100
# Note: this rebinds lda_model_aspects / coherence_lda_aspects, which the
# German concept-aspect section also assigns.
lda_model_aspects, coherence_lda_aspects = get_model(corpus=corpus,
                                                     id2word=id2word,
                                                     title=title,
                                                     num_topics=num_topics,
                                                     passes=passes,
                                                     decay=decay,
                                                     iterations=iterations)

Coherence score: 0.5102065045997334


In [40]:
# Most probable words (with their weights) for each topic of the English concept-aspect model.
lda_model_aspects.print_topics() 

[(0,
  '0.086*"correctli" + 0.082*"write" + 0.052*"spell" + 0.047*"verb" + 0.047*"student" + 0.041*"abil" + 0.040*"subject" + 0.038*"text" + 0.031*"match" + 0.026*"learner"'),
 (1,
  '0.098*"abil" + 0.061*"languag" + 0.042*"right" + 0.040*"answer" + 0.037*"text" + 0.037*"one" + 0.035*"anoth" + 0.035*"convert" + 0.033*"use" + 0.030*"correct"'),
 (2,
  '0.195*"student" + 0.179*"word" + 0.172*"know" + 0.026*"answer" + 0.010*"sentenc" + 0.008*"understand" + 0.007*"die" + 0.007*"english" + 0.005*"consist" + 0.005*"look"')]

In [41]:
# Visualization: pyLDAvis view of the English concept-aspect model, built from
# the model together with the corpus and dictionary it was trained on.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_aspects, corpus, id2word) 
vis

<hr>

Useful methods of gensim's `LdaModel`: <br>
- get_document_topics(bow[, ...]) 	Return topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.<br>
- get_term_topics(word_id[, minimum_probability]) 	Returns most likely topics for a particular word in vocab.<br>
- get_topic_terms(topicid[, topn]) 	Return a list of (word_id, probability) 2-tuples for the most probable words in topic topicid. <br>
- show_topic(topicid[, topn]) Return a list of (word, probability) 2-tuples for the most probable words in topic topicid.<br>
- top_topics(corpus[, num_words]) 	Calculate the Umass topic coherence for each topic.<br>
  https://tedboy.github.io/nlps/generated/generated/gensim.models.LdaModel.html                                             