This notebook explores and applies Latent Dirichlet Allocation (LDA) for topic modelling. <hr>

> Possible improvements: plot the distribution of words/lemmas in the corpus and remove the head and tail of that distribution (the most and the least frequent tokens).

In [2]:
import itertools
from collections import Counter
from pprint import pprint
from statistics import median

import gensim.corpora as corpora
import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [3]:
def get_corpus(data, min_len=3):
    """Build a gensim dictionary and a bag-of-words corpus from tokenised documents.

    Parameters
    ----------
    data : list of list of str
        Tokenised documents (one list of tokens per document).
    min_len : int, default 3
        Tokens shorter than ``min_len`` characters are removed from the dictionary.

    Returns
    -------
    tuple
        ``(id2word, corpus)``: the filtered dictionary and one ``doc2bow``
        result per document of ``data``.
    """
    # Token <-> integer id mapping built from every document
    id2word = corpora.Dictionary(data)

    # Gather the ids of the too-short tokens, then drop them in a single call
    short_token_ids = []
    for token_id, token in id2word.items():
        if len(token) < min_len:
            short_token_ids.append(token_id)
    id2word.filter_tokens(bad_ids=short_token_ids)

    # Represent each document as a bag of words over the filtered dictionary
    corpus = list(map(id2word.doc2bow, data))

    return id2word, corpus

In [45]:
def get_model(corpus, id2word, num_topics=3, passes=10, decay=0.5, iterations=50,
              texts=None, random_state=None):
    """Fit an LDA model and compute its ``c_v`` coherence score.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus, as returned by ``get_corpus``.
    id2word : gensim.corpora.Dictionary
        Dictionary matching ``corpus``.
    num_topics, passes, decay, iterations
        Forwarded unchanged to ``LdaModel``.
    texts : list of list of str, optional
        Tokenised documents used for the coherence computation. When omitted,
        the notebook-level variable ``data`` is used, which keeps existing
        calls working; passing it explicitly avoids depending on whatever
        ``data`` currently holds in the kernel.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``LdaModel``; set it to make runs repeatable.

    Returns
    -------
    tuple
        ``(lda_model, coherence_lda)``.
    """
    if texts is None:
        # Backward-compatible fallback to the notebook global
        texts = data

    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=False,
        passes=passes,
        update_every=1,
        alpha='auto',
        eta=None,
        decay=decay,
        eval_every=5,
        iterations=iterations,
        random_state=random_state,
        per_word_topics=True)

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=id2word,
        coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence score: {coherence_lda}")

    return lda_model, coherence_lda

def plot_coh_score(coh_scores, title, language, save=True, min_topics=2):
    """Plot coherence scores against the number of topics.

    Parameters
    ----------
    coh_scores : sequence of float
        One coherence score per topic count, in increasing order of topic count.
    title : str
        Title of the figure.
    language : str
        Suffix appended to the output file name.
    save : bool, default True
        When true, the figure is written to ``figures/LDA_coh_<language>``.
    min_topics : int, default 2
        Topic count that the first score corresponds to. The x axis runs from
        ``min_topics`` over as many consecutive values as there are scores, so
        the nine scores of a 2..10 sweep are plotted exactly as before.
    """
    topic_counts = range(min_topics, min_topics + len(coh_scores))

    fig, ax = plt.subplots(1, 1)
    ax.plot(topic_counts, coh_scores, marker='o', linestyle='--')
    ax.set_title(title)
    ax.set_ylabel("Coherence score")
    ax.set_xlabel('Number of topics')
    ax.grid(True)
    if save:
        import os

        # savefig does not create missing directories
        os.makedirs("figures", exist_ok=True)
        fig.savefig("figures/LDA_coh_"+language, bbox_inches="tight")


In [5]:
def get_best_model(corpus, id2word, title, language, plot=False, save_plot=False,
                   passes=10, decay=0.5, iterations=50):
    """Fit one LDA model per topic count (2 to 10) and return the most coherent one.

    Parameters
    ----------
    corpus, id2word
        Bag-of-words corpus and matching dictionary, as returned by ``get_corpus``.
    title : str
        Title of the coherence plot.
    language : str
        Suffix used in the file name of the saved plot.
    plot : bool, default False
        Whether to plot the coherence score of every topic count.
    save_plot : bool, default False
        Whether the plot is also saved to disk (only relevant when ``plot`` is true).
    passes, decay, iterations
        Forwarded to ``get_model``. They are explicit parameters so the function
        no longer depends on notebook-level variables of the same names.

    Returns
    -------
    LdaModel
        The model with the highest coherence score; on ties the larger
        topic count wins.
    """
    coh_scores = []
    best_model = None
    best_score = float("-inf")

    # Only the number of topics is swept; passes, iterations and decay could
    # be explored the same way.
    for num_topics in range(2, 11):
        lda_model, coherence_lda = get_model(corpus,
                                             id2word,
                                             num_topics=num_topics,
                                             passes=passes,
                                             decay=decay,
                                             iterations=iterations)
        # Score bookkeeping must happen once per fitted model, inside the loop
        coh_scores.append(coherence_lda)
        if coherence_lda >= best_score:
            best_model = lda_model
            best_score = coherence_lda

    if plot:
        plot_coh_score(coh_scores, title, language, save_plot)

    return best_model

<hr>

**German tasks**

<hr>

**English tasks**

In [6]:
folder = "gen_files/EN/"
df_taskaspects = pd.read_csv(f"{folder}all_taskAspects_EN.csv")
df = pd.read_csv(f"{folder}preprocessed/all_preprocessed_open_tasks_EN.csv")

# Inner join on taskId: only tasks that also appear in the aspects table
# (tasks with one or more aspects of type CONCEPT) are kept
df = df.merge(df_taskaspects, on="taskId", how="inner").reset_index(drop=True)

In [7]:
# Sanity check: distinct task ids in the merged frame vs. in the aspects table
len(df.taskId.unique()), len(df_taskaspects.taskId.unique())

(5706, 5706)

In [8]:
# One row per task, tasks without a description removed. The old index is
# kept as an "index" column by reset_index().
_df = (
    df[["taskId", "description", "topic_id"]]
    .drop_duplicates("taskId")
    .dropna(subset=["description"])
    .reset_index()
)

# Whitespace-tokenised descriptions: one list of tokens per task
data = _df["description"].str.split().to_list()

In [9]:
# Number of tokenised documents that go into the topic model
len(data)

5702

In [46]:
id2word, corpus = get_corpus(data)

# Hyper-parameters of the LDA run on the English task descriptions
num_topics, passes, decay, iterations = 3, 20, 0.9, 100

lda_model_en, coherence_lda_en = get_model(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.37872511552041693


In [11]:
# Show the weighted top words of every fitted topic
lda_model_en.print_topics() 

[(0,
  '0.013*"dora" + 0.010*"gwen" + 0.008*"think" + 0.007*"look" + 0.007*"come" + 0.007*"dad" + 0.007*"ask" + 0.006*"friend" + 0.006*"take" + 0.006*"want"'),
 (1,
  '0.022*"auf" + 0.021*"englisch" + 0.019*"schreib" + 0.019*"horst" + 0.013*"sie" + 0.011*"die" + 0.011*"ship" + 0.011*"english" + 0.011*"satz" + 0.010*"lifeboat"'),
 (2,
  '0.023*"translat" + 0.021*"sie" + 0.019*"satz" + 0.019*"ubersetzen" + 0.017*"english" + 0.015*"den" + 0.011*"oil" + 0.010*"ich" + 0.010*"sentenc" + 0.009*"ship"')]

In [12]:
documents = _df["description"].to_list()

# Topic distribution of every document, inferred by the fitted model
topic_distributions = lda_model_en.get_document_topics(corpus)

# Document position -> {topic id: probability}
doc_to_topic = {
    position: dict(distribution)
    for position, distribution in enumerate(topic_distributions)
}

df1 = pd.DataFrame.from_dict(doc_to_topic, orient='index').sort_index()

# Probabilities below 1/3 become NaN; reset_index() adds an "index" column
below_threshold = df1 < 1/3
df1 = df1.mask(below_threshold).reset_index()

In [13]:
# Column-wise concatenation: both frames were re-indexed 0..n-1 beforehand,
# so row i of the topic table lines up with task i
df_tasks_topics = pd.concat([_df[["taskId"]], df1], axis="columns")
print(_df.shape[0], df1.shape[0])

5702 5702


In [14]:
# Inspect the task -> topic probability table (NaN = probability below 1/3)
df_tasks_topics

Unnamed: 0,taskId,index,1,2,0
0,14ambh1obhw7TYMQE8lcC1,0,0.981932,,
1,25RGLvb2p0G5zulfX9xQOj,1,0.613683,,0.379606
2,18Ccvc8NMJT5xqLv9nAgTH,2,,0.694426,
3,3Jr6T26XL13aKRh31JX0xi,3,0.991638,,
4,3gbjpjewKN1aa5y4aN20Yw,4,,0.786927,
...,...,...,...,...,...
5697,1vmTwnxXvLHaBpLOllxi8E_SR,5697,0.474202,,0.507759
5698,aif5faqXBMr5BWFu35fcRC_SR,5698,0.984828,,
5699,2lqfJ6gZiLr7w2cuDlFWvF_SR,5699,,,0.968495
5700,16MreBTRQqA9TQI3zQU33G_SR,5700,0.919188,,


In [15]:
# Peek at the task -> aspect table (one row per task/aspect pair)
df_taskaspects.head()

Unnamed: 0,taskId,aspectId
0,14ambh1obhw7TYMQE8lcC1_BT,9639
1,25RGLvb2p0G5zulfX9xQOj_BT,9639
2,18Ccvc8NMJT5xqLv9nAgTH_BT,9937
3,3Jr6T26XL13aKRh31JX0xi_BT,9633
4,3gbjpjewKN1aa5y4aN20Yw_BT,11401


In [16]:
# One row per task, with all of its aspect ids gathered into a list
df_task_to_aspects = (
    df_taskaspects
    .groupby(by="taskId")["aspectId"]
    .apply(list)
    .reset_index()
)
df_task_to_aspects.head()

Unnamed: 0,taskId,aspectId
0,12QfXkR96hp8Hyu8mSN1gm,"[183, 184, 185, 186, 187, 17042, 18153, 19487]"
1,12QfXkR96hp8Hyu8mSN1gm_BT,"[183, 184, 185, 186, 187, 17042, 18153, 19487]"
2,12QfXkR96hp8Hyu8mSN1gm_RD,"[183, 184, 185, 186, 187, 17042, 18153, 19487]"
3,12QfXkR96hp8Hyu8mSN1gm_RI,"[183, 184, 185, 186, 187, 17042, 18153, 19487]"
4,12QfXkR96hp8Hyu8mSN1gm_RS,"[183, 184, 185, 186, 187, 17042, 18153, 19487]"


In [17]:
# Compare sizes before merging: rows and distinct tasks in the topic table vs. tasks with aspects
len(df_tasks_topics), len(df_tasks_topics.taskId.unique()), len(df_task_to_aspects)

(5702, 5702, 5706)

In [18]:
# Attach the aspect lists to the per-task topic probabilities and drop the
# helper "index" column
df_task_topic_aspect = pd.merge(df_tasks_topics,
                                df_task_to_aspects,
                                on="taskId",
                                how="inner").drop(columns=["index"])

# A fixed seed makes the preview show the same rows on every run
df_task_topic_aspect.sample(5, random_state=0)

Unnamed: 0,taskId,1,2,0,aspectId
4457,4XZ8sEvsgUl5DMsGoE5ufp_RS,,0.979604,,"[6095, 6096, 6097, 6098, 6099, 6100, 6101, 624..."
4987,2KO9ICLsLJT6Eqglg3Y4zi_SR,0.956628,,,"[1375, 1376, 1377, 124, 1380, 15321, 27937]"
5538,8nblrizP2BF7H4WWocLGff_SR,,0.983991,,"[1374, 71121, 71160]"
4642,7rYCHqM6wT35W1eFnKv0Rh_RS,0.964774,,,"[68053, 8783, 68218, 92129]"
3361,5dykdJ0qry1aM9Jzj6vJf1_RI,0.956821,,,"[1415, 1631, 1417, 1416, 1418, 1419, 1632, 160..."


In [19]:
# Probabilities of topic 0 per task (NaN where the value was masked or absent)
df_task_topic_aspect[0]

0            NaN
1       0.379606
2            NaN
3            NaN
4            NaN
          ...   
5697    0.507759
5698         NaN
5699    0.968495
5700         NaN
5701    0.950674
Name: 0, Length: 5702, dtype: float64

In [20]:
# For each topic, keep the tasks that have a (non-NaN) probability for it,
# together with their aspect lists
df_topic_0, df_topic_1, df_topic_2 = (
    df_task_topic_aspect[[topic_id, "aspectId"]]
    .dropna(subset=[topic_id])
    .reset_index(drop=True)
    for topic_id in (0, 1, 2)
)

In [21]:
def flatten_aspects(frame):
    """Return every aspect id found in the ``aspectId`` lists of ``frame``, as one flat list."""
    return list(itertools.chain.from_iterable(frame["aspectId"]))


# Aspect ids (with repetitions) and their occurrence counts, per topic
aspects_0 = flatten_aspects(df_topic_0)
occurrences_0 = Counter(aspects_0)

aspects_1 = flatten_aspects(df_topic_1)
occurrences_1 = Counter(aspects_1)

aspects_2 = flatten_aspects(df_topic_2)
occurrences_2 = Counter(aspects_2)

In [41]:
# The threshold is derived from the occurrence counts themselves (their median):
# a fixed threshold risks retaining no aspect at all.
def retain_frequent_aspects(occurrences):
    """Return the aspect ids whose count is strictly above the median count.

    ``occurrences`` maps aspect id -> count. An empty mapping yields an empty
    list, because ``median`` raises on empty input.
    """
    if not occurrences:
        return []
    threshold = median(occurrences.values())
    return [aspect for aspect, count in occurrences.items() if count > threshold]


retained_aspects_0 = retain_frequent_aspects(occurrences_0)
print(len(retained_aspects_0))

retained_aspects_1 = retain_frequent_aspects(occurrences_1)
print(len(retained_aspects_1))

retained_aspects_2 = retain_frequent_aspects(occurrences_2)
print(len(retained_aspects_2))

68
117
116


In [44]:
# Count the aspects shared by each pair of topics: the larger the overlap,
# the less the retained aspects tell the two topics apart.
topic_pairs = (
    (retained_aspects_0, retained_aspects_1),
    (retained_aspects_2, retained_aspects_1),
    (retained_aspects_0, retained_aspects_2),
)
for left, right in topic_pairs:
    print(len(set(left) & set(right)))

49
99
43


**Subtopics** (LDA re-run on the tasks assigned to topic 0)

In [55]:
# Tasks that have a probability for topic 0, joined back to their descriptions
tasks_topic0 = (
    df_tasks_topics[["taskId", 0]]
    .dropna(subset=[0])
    .reset_index(drop=True)
    .merge(_df, on="taskId", how="inner")
    .drop(columns=[0, "index"])
)

# NOTE: from here on `data`, `id2word`, `corpus` and `lda_model_en` refer to the
# topic-0 subset; the values of the full English run are overwritten.
data = tasks_topic0["description"].str.split().to_list()
id2word, corpus = get_corpus(data)

num_topics, passes, decay, iterations = 3, 20, 0.9, 100

lda_model_en, coherence_lda_en = get_model(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    decay=decay,
    iterations=iterations,
)

Coherence score: 0.3734310209940581


In [56]:
# Interactive pyLDAvis view of the model currently bound to lda_model_en
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_en, corpus, id2word) 
vis

**English Aspects**

In [None]:
# Number of distinct aspect ids in the English task/aspect table
d = pd.read_csv("data/taskAspects_EN.csv")
len(d.aspectId.unique())

In [None]:
# Inspect the preprocessed concept-aspect descriptions
i = pd.read_csv("data/preprocessed_concept_aspects_EN.csv")
i

In [None]:
# NOTE: `df`, `data`, `id2word` and `corpus` are rebound here to the aspect data.
df = pd.read_csv("data/preprocessed_concept_aspects_EN.csv")
df = df.dropna(subset=["description"])
data = df["description"].str.split().to_list() 
# NOTE(review): the title mentions "tasks" but this cell models aspect
# descriptions; confirm the wording before using it in a plot.
title = "Coherence score by number of topics in english tasks" 

id2word, corpus = get_corpus(data)

num_topics = 3 
passes = 20 
decay = 0.9
iterations = 100
# get_model() has no `title` parameter, so it is not passed (doing so raises
# a TypeError); the title is only meaningful for the coherence plot.
lda_model_aspects, coherence_lda_aspects = get_model(corpus=corpus,
                                                     id2word=id2word,
                                                     num_topics=num_topics,
                                                     passes=passes,
                                                     decay=decay,
                                                     iterations=iterations)

In [None]:
# Show the weighted top words of every topic of the aspects model
lda_model_aspects.print_topics() 

In [None]:
# Visualization 
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_aspects, corpus, id2word) 
vis

<hr>

Useful methods of gensim's `LdaModel`: <br>
- get_document_topics(bow[, ...]) 	Return topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.<br>
- get_term_topics(word_id[, minimum_probability]) 	Returns most likely topics for a particular word in vocab.<br>
- get_topic_terms(topicid[, topn]) 	Return a list of (word_id, probability) 2-tuples for the most probable words in topic topicid. <br>
- show_topic(topicid[, topn]) Return a list of (word, probability) 2-tuples for the most probable words in topic topicid.<br>
- top_topics(corpus[, num_words]) 	Calculate the Umass topic coherence for each topic.<br>
  https://tedboy.github.io/nlps/generated/generated/gensim.models.LdaModel.html                                             