This notebook explores Latent Dirichlet Allocation (LDA) topic modelling and applies it, using gensim, to task and aspect descriptions. <hr>

> Possible improvement: plot the frequency distribution of words/lemmas in the corpus and remove the head (most frequent tokens) and the tail (rarest tokens) of that distribution.

In [2]:
import os
from pprint import pprint

import gensim.corpora as corpora
import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [3]:
def get_corpus(data, min_len=3):
    """Build a gensim dictionary and a bag-of-words corpus from tokenised documents.

    Parameters
    ----------
    data : tokenised documents (the notebook passes a list of token lists).
        It is iterated twice, so it must be re-iterable.
    min_len : tokens shorter than this many characters are removed from the
        dictionary before the corpus is built (default 3, i.e. 1- and
        2-letter tokens are dropped).

    Returns
    -------
    (id2word, corpus) : the filtered dictionary and one bag-of-words entry
        per document in ``data``.
    """
    vocabulary = corpora.Dictionary(data)

    # Collect the ids of every token that is too short, then drop them in one call.
    short_token_ids = []
    for token_id, token in vocabulary.items():
        if len(token) < min_len:
            short_token_ids.append(token_id)
    vocabulary.filter_tokens(bad_ids=short_token_ids)

    # Encode each document against the filtered vocabulary.
    bow_corpus = list(map(vocabulary.doc2bow, data))

    return vocabulary, bow_corpus

In [4]:
def get_model(corpus, id2word, title, num_topics=3, passes=10, decay=0.5, iterations=50,
              texts=None, random_state=None):
    """Train an LDA model and compute its ``c_v`` coherence score.

    Parameters
    ----------
    corpus : bag-of-words corpus handed to ``LdaModel``.
    id2word : dictionary handed to both ``LdaModel`` and ``CoherenceModel``.
    title : not used by this function; kept so existing calls keep working.
    num_topics, passes, decay, iterations : forwarded to ``LdaModel``.
    texts : tokenised documents used by ``CoherenceModel``. When ``None`` the
        notebook-level ``data`` variable is used, which is what this function
        always did before the parameter existed.
    random_state : forwarded to ``LdaModel``; pass an int to make runs
        repeatable.

    Returns
    -------
    (lda_model, coherence_lda) : the fitted model and its coherence score.
        The score is also printed.
    """
    if texts is None:
        # Backward-compatible fallback to the notebook global; prefer passing
        # ``texts`` explicitly so the score cannot be computed on stale data.
        texts = data

    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=False,
        passes=passes,
        update_every=1,
        alpha='auto',
        eta=None,
        decay=decay,
        eval_every=5,
        iterations=iterations,
        random_state=random_state,
        per_word_topics=True)

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=id2word,
        coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence score: {coherence_lda}")

    return lda_model, coherence_lda

def plot_coh_score(coh_scores, title, language, save=True):
    """Plot coherence scores against the number of topics.

    Parameters
    ----------
    coh_scores : coherence scores, one per topic count, where the first score
        belongs to 2 topics, the second to 3, and so on.
    title : figure title.
    language : suffix for the output file name.
    save : when true, write the figure to ``figures/LDA_coh_<language>``.

    The x-axis is derived from the number of scores instead of being fixed to
    2..10, so a sweep of any length can be plotted; for nine scores the plot
    is the same as before.
    """
    scores = list(coh_scores)
    topic_counts = range(2, 2 + len(scores))

    fig, ax = plt.subplots(1, 1)
    ax.plot(topic_counts, scores, marker='o', linestyle='--')
    ax.set_title(title)
    ax.set_ylabel("Coherence score")
    ax.set_xlabel('Number of topics')
    ax.grid(True)
    if save:
        # savefig does not create missing directories.
        os.makedirs("figures", exist_ok=True)
        fig.savefig("figures/LDA_coh_" + language, bbox_inches="tight")


In [5]:
def get_best_model(corpus, id2word, title, language, plot=False, save_plot=False,
                   passes=10, decay=0.5, iterations=50):
    """Fit one LDA model per topic count (2..10) and return the most coherent.

    Parameters
    ----------
    corpus, id2word, title : forwarded to ``get_model``.
    language : forwarded to ``plot_coh_score`` (used in the saved file name).
    plot : when true, plot the coherence score of every fitted model.
    save_plot : forwarded to ``plot_coh_score`` as its ``save`` flag.
    passes, decay, iterations : forwarded to ``get_model``. These used to be
        read from notebook-level globals; they are explicit arguments now and
        default to the same values as ``get_model``.

    Returns
    -------
    The model with the highest coherence score; on a tie the model with more
    topics wins.
    """
    coh_scores = []
    best_model = None
    for num_topics in range(2, 11):
        lda_model, coherence_lda = get_model(corpus,
                                             id2word,
                                             title,
                                             num_topics=num_topics,
                                             passes=passes,
                                             decay=decay,
                                             iterations=iterations)
        # Record the score and update the best model inside the loop, so every
        # candidate is compared rather than only the last one.
        coh_scores.append(coherence_lda)
        if coherence_lda == max(coh_scores):
            best_model = lda_model

    if plot:
        plot_coh_score(coh_scores, title, language, save_plot)

    return best_model

<hr>

**German tasks**

<hr>

**English tasks**

In [10]:
# Load the task -> aspect mapping and the preprocessed English tasks, then join
# them on taskId. The inner join keeps only tasks that have at least one aspect
# and produces one row per (task, aspect) pair, so a task's description is
# repeated once for each of its aspects.
df_aspects = pd.read_csv("data/all_taskAspects_EN.csv")
df = pd.read_csv("data/all_preprocessed_tasks_EN.csv").merge(df_aspects, on="taskId", how="inner")

In [14]:
# Number of distinct task ids in df and of distinct aspect ids in df_aspects.
len(df.taskId.unique()), len(df_aspects.aspectId.unique())

(1247, 1171)

In [13]:
# Inspect the last rows of df.
df.tail()

Unnamed: 0,taskId,language,description,topic_id,word_count,aspectId
20187,aif5faqXBMr5BWFu35fcRC_SR,eng,blue part question call doubt tag swiss german...,,11,68184
20188,aif5faqXBMr5BWFu35fcRC_SR,eng,blue part question call doubt tag swiss german...,,11,8513
20189,aif5faqXBMr5BWFu35fcRC_SR,eng,blue part question call doubt tag swiss german...,,11,68181
20190,aif5faqXBMr5BWFu35fcRC_SR,eng,blue part question call doubt tag swiss german...,,11,68175
20191,aif5faqXBMr5BWFu35fcRC_SR,eng,blue part question call doubt tag swiss german...,,11,9159


In [7]:
# Inspect the first rows of the task -> aspect mapping.
df_aspects.head()

Unnamed: 0,taskId,aspectId
0,14ambh1obhw7TYMQE8lcC1,9639
1,25RGLvb2p0G5zulfX9xQOj,9639
2,18Ccvc8NMJT5xqLv9nAgTH,9937
3,3Jr6T26XL13aKRh31JX0xi,9633
4,3gbjpjewKN1aa5y4aN20Yw,11401


In [17]:
# Drop rows without a description and renumber the rows 0..n-1 so that row
# positions line up with the positions of the documents in `data`.
df = df.dropna(subset=["description"]).reset_index(drop=True)

# The descriptions are split on whitespace to obtain one token list per row.
data = df["description"].str.split().tolist()

In [16]:
# Look up the rows of one specific task id (empty result means it is not in df).
df[df["taskId"] == "8gooLcJt0bz7yVHSaWd0MM_SR"]

Unnamed: 0,taskId,language,description,topic_id,word_count,aspectId


In [6]:
# Fit a 3-topic LDA model on the English task descriptions held in `data`.
title = "Coherence score by number of topics in Augmented English tasks"

id2word, corpus = get_corpus(data)

# Hyper-parameters; kept as notebook-level names because other cells read them.
num_topics, passes, decay, iterations = 3, 80, 0.9, 100

lda_model_en, coherence_lda_en = get_model(
    corpus,
    id2word,
    title,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.3283477288665198


In [7]:
# Show the highest-weighted terms of each topic of the task model.
lda_model_en.print_topics() 

[(0,
  '0.012*"ship" + 0.011*"write" + 0.011*"lifeboat" + 0.010*"one" + 0.009*"sentenc" + 0.009*"mine" + 0.008*"peopl" + 0.008*"open" + 0.008*"day" + 0.007*"find"'),
 (1,
  '0.009*"oil" + 0.008*"ship" + 0.007*"time" + 0.007*"come" + 0.007*"man" + 0.007*"think" + 0.007*"food" + 0.006*"find" + 0.006*"ice" + 0.006*"water"'),
 (2,
  '0.016*"name" + 0.013*"rise" + 0.010*"dora" + 0.009*"woman" + 0.008*"gwen" + 0.008*"want" + 0.008*"look" + 0.006*"societi" + 0.006*"mean" + 0.006*"well"')]

In [8]:
documents = df["description"].to_list()

# Topic distribution of every document in the corpus, as (topic_id, probability) pairs.
topic_distributions = lda_model_en.get_document_topics(corpus)

# Map document position -> {topic_id: probability}.
doc_to_topic = {
    doc_idx: dict(topic_probs)
    for doc_idx, topic_probs in enumerate(topic_distributions)
}

# One row per document, one column per topic id.
df1 = pd.DataFrame.from_dict(doc_to_topic, orient='index').sort_index()
# Blank out (NaN) every topic weight below 1/3.
df1 = df1.mask(df1 < 1/3)

In [9]:
# Put each task id next to its topic weights. concat(axis=1) aligns the two
# frames on their index, and df1 is indexed by document position, so df must
# carry a 0..n-1 index in the same row order as the corpus.
df_tasks_topics = pd.concat([df[["taskId"]], df1], axis=1) 
df_tasks_topics.head()

Unnamed: 0,taskId,0,1,2
0,7TVZOkAoQvS71zub3YI9Uy,0.96944,,
1,8lxRyLzStOK9eUhNRal38O,0.984251,,
2,aazKG44PsKc5UnpTiDECut,0.993978,,
3,8L1QdQwEG5XaRfdYKcLPed,0.989354,,
4,14ambh1obhw7TYMQE8lcC1,0.990433,,


In [10]:
# Number of distinct task ids in the task/topic table.
len(df_tasks_topics.taskId.unique())

2549

Unnamed: 0,taskId,aspectId
0,14ambh1obhw7TYMQE8lcC1,9639
1,25RGLvb2p0G5zulfX9xQOj,9639
2,18Ccvc8NMJT5xqLv9nAgTH,9937
3,3Jr6T26XL13aKRh31JX0xi,9633
4,3gbjpjewKN1aa5y4aN20Yw,11401


In [25]:
# Inspect the last ten rows of the task/topic table.
df_tasks_topics.tail(10)

Unnamed: 0,taskId,0,1,2
2539,atOoDKljkeQ5MI1SEnEJSN_SR,0.986434,,
2540,3mkboeyrKOC61fFrrPVHFp_SR,,,0.999405
2541,2ezJ24ksPkl5xccVpuBAag_SR,0.999268,,
2542,9FKeQbZKUpkasDUbCYYm3a_SR,0.999284,,
2543,zhUY1CRUTt5J9eLf8anEr_SR,0.996971,,
2544,88uTpL8NbsA8ItamLmWQOl_SR,0.997062,,
2545,4wepjjR1qol8ge0hyeFhmy_SR,,,0.994626
2546,MeWfnN21TD5MlRoHkOAH4_SR,0.999647,,
2547,54yk3CYjufq5DW3Jm89h4k_SR,0.999684,,
2548,aif5faqXBMr5BWFu35fcRC_SR,0.991284,,


In [16]:
# Check whether this specific task id appears in the task/topic table.
"8gooLcJt0bz7yVHSaWd0MM" in df_tasks_topics["taskId"].unique()

True

In [26]:
# Inspect ten random rows of the task -> aspect mapping (unseeded, so the rows differ between runs).
df_aspects.sample(10)

Unnamed: 0,taskId,aspectId
15152,62HICEhR1sK6bT8gDgzzkI_BT,17486
112164,7tdnrdrMDFm6cTs9xegrNH_SR_RI_RS_RD,34029
220731,2SvLuio6AlL5V2q7XMFOod_SR,19487
57307,6f6eWYJq6Ly5RgdxmcB6z8_BT,8550
79078,6J0dM4IDurz8VMOHbXtZwl_SR_RI_RS_RD,43345
206843,26u1vZhdpuH9kDcA6xx78A_SR_RI_RS,15465
97506,89LDEDnuUOr6ulX3abc8aH_SR_RI_RS_RD,65267
67973,V2lGMV9xar6MerL0KV6H1_SR_RI_RS_RD,24639
256048,19ZVjTFUecy6HVUsYdOorw_SR,55532
142399,NTd8vnUk0VavHuZVM07XP_SR_RI,29098


In [12]:
# Number of distinct task ids in the task -> aspect mapping.
len(df_aspects.taskId.unique())

5866

In [91]:
# Attach aspect ids to each task's topic weights (inner join on taskId).
# The result is only displayed, not stored.
pd.merge(df_tasks_topics, df_aspects, on="taskId", how="inner")

(11222, 951)

**English Aspects**

In [20]:
# Load the English task -> aspect file and count its distinct aspect ids.
d = pd.read_csv("data/taskAspects_EN.csv")
len(d.aspectId.unique())

269

In [17]:
# Load and display the preprocessed English concept aspects.
i = pd.read_csv("data/preprocessed_concept_aspects_EN.csv")
i

Unnamed: 0,aspectId,description,type,groupId,categoryId,word_count
0,9639,word right order,CONCEPT,,,3
1,9937,good answer orang utan strong social bond,CONCEPT,,,7
2,9633,subject verb congruent,CONCEPT,,,3
3,11401,answer mention keyword look correctstat,CONCEPT,55122.0,,5
4,9984,verb conjug expect ten expectedten,CONCEPT,,,5
...,...,...,...,...,...,...
264,381128,learner compos two dimension three dimension s...,CONCEPT,342103.0,4.0,12
265,379944,answer within scope task,CONCEPT,340377.0,,4
266,381286,climat chang refer signific chang global tempe...,CONCEPT,342514.0,4.0,32
267,381288,impact climat chang widespread affect weather ...,CONCEPT,342515.0,4.0,28


In [39]:
# Fit a 3-topic LDA model on the preprocessed English aspect descriptions.
# Note: this cell rebinds the notebook-level names df, data, id2word and corpus.
df = pd.read_csv("data/preprocessed_concept_aspects_EN.csv").dropna(subset=["description"])
data = df["description"].str.split().tolist()
title = "Coherence score by number of topics in english tasks"

id2word, corpus = get_corpus(data)

# Hyper-parameters; kept as notebook-level names because other cells read them.
num_topics, passes, decay, iterations = 3, 20, 0.9, 100

lda_model_aspects, coherence_lda_aspects = get_model(
    corpus,
    id2word,
    title,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.5102065045997334


In [40]:
# Show the highest-weighted terms of each topic of the aspect model.
lda_model_aspects.print_topics() 

[(0,
  '0.086*"correctli" + 0.082*"write" + 0.052*"spell" + 0.047*"verb" + 0.047*"student" + 0.041*"abil" + 0.040*"subject" + 0.038*"text" + 0.031*"match" + 0.026*"learner"'),
 (1,
  '0.098*"abil" + 0.061*"languag" + 0.042*"right" + 0.040*"answer" + 0.037*"text" + 0.037*"one" + 0.035*"anoth" + 0.035*"convert" + 0.033*"use" + 0.030*"correct"'),
 (2,
  '0.195*"student" + 0.179*"word" + 0.172*"know" + 0.026*"answer" + 0.010*"sentenc" + 0.008*"understand" + 0.007*"die" + 0.007*"english" + 0.005*"consist" + 0.005*"look"')]

In [41]:
# Visualization: show the aspect model with pyLDAvis inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the model, its bag-of-words corpus and its dictionary.
vis = gensimvis.prepare(lda_model_aspects, corpus, id2word) 
vis

<hr>

Useful methods of gensim's `LdaModel`: <br>
- get_document_topics(bow[, ...]) 	Return topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.<br>
- get_term_topics(word_id[, minimum_probability]) 	Returns most likely topics for a particular word in vocab.<br>
- get_topic_terms(topicid[, topn]) 	Return a list of (word_id, probability) 2-tuples for the most probable words in topic topicid. <br>
- show_topic(topicid[, topn]) Return a list of (word, probability) 2-tuples for the most probable words in topic topicid.<br>
- top_topics(corpus[, num_words]) 	Calculate the Umass topic coherence for each topic.<br>
  https://tedboy.github.io/nlps/generated/generated/gensim.models.LdaModel.html                                             