This notebook explores and implements Latent Semantic Analysis. <hr>

In [76]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (pandas, scikit-learn,
# gensim) raised by later cells are hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

In [77]:
import pandas as pd
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import numpy as np
import spacy
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary


In [78]:
def lsa(texts, num_topics=5, num_top_terms=20, random_state=42):
    """Fit a TF-IDF + truncated-SVD (LSA) model and extract the top terms per topic.

    The TF-IDF vectorizer reads the notebook-level ``stop_words`` variable, which
    must be defined before this function is called.

    Parameters
    ----------
    texts : iterable of str
        Documents to fit the model on.
    num_topics : int, default 5
        Number of SVD components (latent topics) to keep.
    num_top_terms : int, default 20
        How many of the highest-weighted terms to report for each topic.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD`` so repeated runs give the same
        decomposition; pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(tfidf_vectorizer, svd, normalizer, latent_semantic_analysis, topics)``
        where the first three are the fitted transformers,
        ``latent_semantic_analysis`` is the normalized document-by-topic matrix
        and ``topics`` is a list with one list of top terms per SVD component.
    """
    # Step 1: Convert texts to TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=stop_words)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Step 2: Apply Singular Value Decomposition (SVD); seeded for reproducibility
    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    latent_semantic_analysis = svd.fit_transform(tfidf_matrix)

    # Step 3: Normalize the output of SVD
    normalizer = Normalizer(copy=False)
    latent_semantic_analysis = normalizer.fit_transform(latent_semantic_analysis)

    # Step 4: Collect the most relevant terms of every topic
    terms = tfidf_vectorizer.get_feature_names_out()
    topics = []
    for component in svd.components_:
        # argsort is ascending, so reverse it and keep the first `num_top_terms` indices
        top_terms_idx = component.argsort()[::-1][:num_top_terms]
        topics.append([terms[idx] for idx in top_terms_idx])
    return tfidf_vectorizer, svd, normalizer, latent_semantic_analysis, topics

In [79]:
def preprocess_text(texts):
    """Fit a new TF-IDF vectorizer on ``texts``.

    Uses the notebook-level ``stop_words`` variable for stop-word removal.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, tfidf_matrix)``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=1, max_df=0.5)
    document_term_matrix = vectorizer.fit_transform(texts)
    return vectorizer, document_term_matrix

def transform_text(text, tfidf_vectorizer, svd, normalizer):
    """Project a single raw text into the normalized LSA space.

    The text is wrapped in a one-element list and passed through the
    ``transform`` method of the vectorizer, then the SVD, then the normalizer,
    in that order. The normalizer's output is returned as is.
    """
    projected = [text]
    for stage in (tfidf_vectorizer, svd, normalizer):
        projected = stage.transform(projected)
    return projected

def assign_topic_to_text(text, topics_lsa_normalized, model=None):
    """Return the index of the topic most similar to ``text`` in LSA space.

    Parameters
    ----------
    text : str
        Raw text; it is projected into LSA space inside this function.
    topics_lsa_normalized : array-like
        One row per topic, already projected with the same fitted model.
    model : sequence, optional
        Fitted ``(tfidf_vectorizer, svd, normalizer, ...)``; only the first three
        items are used, so the 5-tuple returned by ``lsa`` can be passed directly.
        When omitted, the notebook-level ``tfidf_vectorizer``, ``svd`` and
        ``normalizer`` variables are used, which keeps existing calls working.

    Returns
    -------
    Index (as returned by ``np.argmax``) of the row of ``topics_lsa_normalized``
    with the highest cosine similarity to the text.
    """
    if model is None:
        # Backward-compatible default: rely on the currently bound notebook globals.
        model = (tfidf_vectorizer, svd, normalizer)
    fitted_vectorizer, fitted_svd, fitted_normalizer = model[:3]

    # Transform the text to LSA space
    text_lsa_normalized = transform_text(text, fitted_vectorizer, fitted_svd, fitted_normalizer)

    similarities = cosine_similarity(text_lsa_normalized, topics_lsa_normalized)
    most_similar_topic_index = np.argmax(similarities)
    return most_similar_topic_index

<hr>

In [80]:
# Load the open-task table and keep only rows that have a description
df = (
    pd.read_csv("gen_files/EN/preprocessed/trimmed_open_tasks.csv")
    .dropna(subset=["description"])
    .reset_index(drop=True)
)
# English stop words; read as a notebook-level variable by lsa()/preprocess_text()
stop_words = stopwords.words('english')
data = df["description"].tolist()

In [81]:
# Gensim needs token lists: split every description on whitespace
tokenized_docs = [text.split() for text in data]
# Token <-> id mapping plus the bag-of-words representation of every document
dictionary = Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [82]:
# Sweep the number of topics (2..10) and keep the model with the best u_mass coherence
coh_scores = []
for num_topics in range(2, 11):
    tfidf_vectorizer, svd, normalizer, lsa_output, topics = lsa(data, num_topics=num_topics)
    # CoherenceModel is given dictionary ids here, so map every top term to its id.
    # NOTE(review): this lookup assumes every vectorizer term also exists in `dictionary`.
    cm_topics = [
        [dictionary.token2id[term] for term in top_terms]
        for top_terms in topics
    ]
    coherence_model = CoherenceModel(
        topics=cm_topics,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='u_mass',
    )
    # get_coherence() aggregates the per-topic scores into a single value
    coherence_score = coherence_model.get_coherence()
    coh_scores.append(coherence_score)
    # The newest score is the running maximum -> this model is the best one so far
    if max(coh_scores) == coherence_score:
        best_n = num_topics
        best_model = (tfidf_vectorizer, svd, normalizer, lsa_output, topics)

In [83]:
# Number of topics of the model that scored the highest coherence in the sweep above
best_n

4

Assign topics to tasks

In [84]:
# Rebind the notebook-level model variables to the best model found by the sweep
# (the sweep loop left them pointing at the last model it fitted).
tfidf_vectorizer, svd, normalizer, lsa_output, topics = best_model

In [85]:
# Transform topics to LSA space: each topic's top terms are joined into one
# pseudo-document and projected with the fitted vectorizer, SVD and normalizer.
list_topics_normalized = []
for topic in topics:
    list_topics_normalized.append(transform_text(" ".join(topic), tfidf_vectorizer, svd, normalizer)[0].tolist())
# Build the topic matrix once, after the loop, so it is defined even when `topics`
# is empty and is not rebuilt on every iteration.
topics_lsa_normalized = np.array(list_topics_normalized)

In [86]:
# Assign a topic to each task of df

# Take an explicit copy: adding a column to a bare column selection of `df` is a
# chained assignment (SettingWithCopyWarning) and may or may not write through to `df`.
df_task_topic = df[["taskId", "description"]].copy()
df_task_topic["task_topic"] = df_task_topic["description"].apply(
    lambda text: assign_topic_to_text(text, topics_lsa_normalized)
)
# df_task_topic.head()

Evaluation

LSA on aspects 

In [87]:
# Load the aspect table and keep only rows that have a description.
# Note: this rebinds `df`, `stop_words` and `data` used by the task analysis above.
df = (
    pd.read_csv("gen_files/EN/preprocessed/concept_aspects.csv")
    .dropna(subset=["description"])
    .reset_index(drop=True)
)
stop_words = stopwords.words('english')
data = df["description"].tolist()

In [88]:
# Gensim needs token lists: split every aspect description on whitespace
tokenized_docs = [text.split() for text in data]
# Token <-> id mapping plus the bag-of-words representation of every document
dictionary = Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [89]:
# Sweep the number of topics (2..10) on the aspect descriptions and keep the model
# with the best u_mass coherence
coh_scores = []
for num_topics in range(2, 11):
    tfidf_vectorizer, svd, normalizer, lsa_output, topics = lsa(data, num_topics=num_topics)
    # CoherenceModel is given dictionary ids here, so map every top term to its id.
    # NOTE(review): this lookup assumes every vectorizer term also exists in `dictionary`.
    cm_topics = [
        [dictionary.token2id[term] for term in top_terms]
        for top_terms in topics
    ]
    coherence_model = CoherenceModel(
        topics=cm_topics,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='u_mass',
    )
    # get_coherence() aggregates the per-topic scores into a single value
    coherence_score = coherence_model.get_coherence()
    coh_scores.append(coherence_score)
    # The newest score is the running maximum -> this model is the best one so far
    if max(coh_scores) == coherence_score:
        best_n = num_topics
        best_model = (tfidf_vectorizer, svd, normalizer, lsa_output, topics)

In [90]:
# Rebind the notebook-level model variables to the best aspect model from the sweep
# (the sweep loop left them pointing at the last model it fitted).
tfidf_vectorizer, svd, normalizer, lsa_output, topics = best_model

In [91]:
# Sanity check: the selected model's topic list should have best_n entries
len(topics), best_n

(4, 4)

In [92]:
# Transform topics to LSA space
list_topics_normalized = []
for topic in topics:
    list_topics_normalized.append(transform_text(" ".join(topic), tfidf_vectorizer, svd, normalizer)[0].tolist())
    topics_lsa_normalized = np.array(list_topics_normalized)

In [93]:
# Assign a topic to each aspect of df

# Take an explicit copy: adding a column to a bare column selection of `df` is a
# chained assignment (SettingWithCopyWarning) and may or may not write through to `df`.
df_aspect_topic = df[["aspectId", "description"]].copy()
df_aspect_topic["aspect_topic"] = df_aspect_topic["description"].apply(
    lambda text: assign_topic_to_text(text, topics_lsa_normalized)
)
df_aspect_topic.head()

Unnamed: 0,aspectId,description,aspect_topic
0,9639,word right order,0
1,9937,good answer orang utan strong social bond,2
2,9633,subject verb congruent,3
3,11401,answer mention keyword look correctstat,2
4,9984,verb conjug expect ten expectedten,3


Mapping aspects

In [94]:
# Load the table linking tasks to aspects; it is merged below on "taskId" and "aspectId".
df_task_aspects = pd.read_csv("gen_files/EN/taskAspects.csv")
# df_task_aspects.head()

In [95]:
# Attach each task's description and topic label; the inner join drops links whose
# taskId has no entry in df_task_topic.
# Note: this cell rebinds df_task_aspects, so re-running it merges a second time.
df_task_aspects = df_task_aspects.merge(df_task_topic, on="taskId", how="inner")
df_task_aspects

Unnamed: 0,taskId,aspectId,description,task_topic
0,14ambh1obhw7TYMQE8lcC1,9639,write short text sentenc past simpl word usual...,0
1,25RGLvb2p0G5zulfX9xQOj,9639,man woman write sentenc posit neg happen pictu...,1
2,18Ccvc8NMJT5xqLv9nAgTH,9937,anim write sentenc,0
3,3Jr6T26XL13aKRh31JX0xi,9633,peopl pictur visit london write day london poi...,0
4,3gbjpjewKN1aa5y4aN20Yw,11401,time mull wine gluhwein german vin chaud frenc...,2
...,...,...,...,...
11217,16MreBTRQqA9TQI3zQU33G,124,battl waterloo,3
11218,16MreBTRQqA9TQI3zQU33G,71160,battl waterloo,3
11219,8gooLcJt0bz7yVHSaWd0MM,71121,read follow articl georg lunch,1
11220,8gooLcJt0bz7yVHSaWd0MM,124,read follow articl georg lunch,1


In [96]:
# Attach each aspect's description and topic label; the inner join drops links whose
# aspectId has no entry in df_aspect_topic. Both frames carry a "description" column,
# so pandas suffixes them with _x (task) and _y (aspect).
df_task_aspects = df_task_aspects.merge(df_aspect_topic, on="aspectId", how="inner")
# df_task_aspects

In [97]:
# Keep only the two topic labels: one row per task-aspect link
df_map = df_task_aspects[["task_topic", "aspect_topic"]]
df_map

Unnamed: 0,task_topic,aspect_topic
0,0,0
1,1,0
2,0,2
3,0,2
4,0,3
...,...,...
11121,1,0
11122,1,0
11123,2,1
11124,2,1


In [98]:
# For every task topic, list the distinct aspect topics it co-occurs with
(
    df_map
    .drop_duplicates()
    .groupby("task_topic")["aspect_topic"]
    .apply(list)
    .reset_index()
)

Unnamed: 0,task_topic,aspect_topic
0,0,"[0, 2, 3, 1]"
1,1,"[0, 3, 2, 1]"
2,2,"[3, 2, 1, 0]"
3,3,"[2, 1, 3, 0]"


Mapping 2.0 

In [99]:
# Merged link table: description_x / task_topic come from the task merge,
# description_y / aspect_topic from the aspect merge
df_task_aspects

Unnamed: 0,taskId,aspectId,description_x,task_topic,description_y,aspect_topic
0,14ambh1obhw7TYMQE8lcC1,9639,write short text sentenc past simpl word usual...,0,word right order,0
1,25RGLvb2p0G5zulfX9xQOj,9639,man woman write sentenc posit neg happen pictu...,1,word right order,0
2,18Ccvc8NMJT5xqLv9nAgTH,9937,anim write sentenc,0,good answer orang utan strong social bond,2
3,4i811yJMsQja4C3GII7BHS,9937,anim write sentenc,0,good answer orang utan strong social bond,2
4,3Jr6T26XL13aKRh31JX0xi,9633,peopl pictur visit london write day london poi...,0,subject verb congruent,3
...,...,...,...,...,...,...
11121,3Djh3SW7P7p7vZe1JVgsyw,381288,climat chang paragraph,1,impact climat chang widespread affect weather ...,0
11122,68RE5Cw3jm48eggO9ZPAJf,381288,climat chang paragraph,1,impact climat chang widespread affect weather ...,0
11123,2FRQuUvYy7J6fiChXiS0LC,379209,solv type answer,2,learner appli commut associ properti addit,1
11124,2Om9QzEpf5raBfTy7yw79q,379209,type miss fact fact famili,2,learner appli commut associ properti addit,1


In [None]:
# For each task_topic i, count how many task-aspect links fall into each aspect_topic j,
# then divide by the total number of links of task_topic i to obtain the probability
# that task_topic i maps to aspect_topic j.

In [105]:
# Number of links per (task_topic, aspect_topic) pair; the count lands in the "taskId" column
d = (
    df_task_aspects
    .groupby(["task_topic", "aspect_topic"])
    .count()[["taskId"]]
    .reset_index()
)

In [108]:
# Total number of links per task_topic; the count lands in the "aspect_topic" column
d1 = (
    df_map
    .groupby("task_topic")
    .count()
    .reset_index()
)

In [110]:
# Put the per-topic total next to every pair count. Both frames have an "aspect_topic"
# column, so it becomes aspect_topic_x (the aspect topic) and aspect_topic_y (the total).
r = d.merge(d1, on="task_topic", how="left")

In [112]:
# taskId = links for the (task_topic, aspect_topic_x) pair; aspect_topic_y = links for the task_topic
r.head()

Unnamed: 0,task_topic,aspect_topic_x,taskId,aspect_topic_y
0,0,0,1143,5058
1,0,1,1853,5058
2,0,2,1252,5058
3,0,3,810,5058
4,1,0,1351,5101


In [113]:
# Share of a task topic's links that go to each aspect topic
r["probability"] = r["taskId"].div(r["aspect_topic_y"])

In [114]:
# Pair counts, per-topic totals and the resulting probabilities
r

Unnamed: 0,task_topic,aspect_topic_x,taskId,aspect_topic_y,probability
0,0,0,1143,5058,0.225979
1,0,1,1853,5058,0.36635
2,0,2,1252,5058,0.247529
3,0,3,810,5058,0.160142
4,1,0,1351,5101,0.26485
5,1,1,1711,5101,0.335424
6,1,2,1363,5101,0.267203
7,1,3,676,5101,0.132523
8,2,0,149,840,0.177381
9,2,1,312,840,0.371429


<hr>

In [None]:
# Fit a 10-topic LSA model on the German task descriptions
df = pd.read_csv("data/final_tasks_DE.csv")
stop_words = stopwords.words('german')
# The index is deliberately not reset: a later cell looks up a row by its original label.
df = df.dropna(subset=["description"])
data = df["description"].to_list()
num_topics = 10
# lsa() returns five values; the automatically extracted topic terms are kept under
# their own name because the next cell defines `topics` by hand.
tfidf_vectorizer, svd, normalizer, lsa_output, lsa_topics = lsa(data, num_topics=num_topics)

In [None]:
# Hand-written labels for the ten German topics, each listing five terms.
# NOTE(review): these full strings are passed to transform_text() in a later cell, so
# the "Topic N:" prefix and "|" separators go through the TF-IDF vectorizer too —
# presumably harmless, but confirm.
topics = [
    "Topic 1: englisch | horst | schreib | horen | schreiben", 
    "Topic 2: satz | schreib | prateritum | hideaway | passiv", 
    "Topic 3: horen | schreiben | englisch | satz | passiv", 
    "Topic 4: massachusett | institut | technolog | of | infinitiv", 
    "Topic 5: infinitiv | komma | denk | prateritum | zubird", 
    "Topic 6: prateritum | hideaway | hideout | hauptsatz | relativsatz", 
    "Topic 7: frage | indirekt | direkt | zwei | hauptsatz", 
    "Topic 8: hauptsatz | zwei | satzen | relativsatz | schreib", 
    "Topic 9: lair | prateritum | frage | satz | schreib", 
    "Topic 10: frage | indirekt | direkt | schreib | passiv", 
]

In [None]:
# Label-based lookup of one description as a test input; raises KeyError if the row
# labelled 1101 was removed by the dropna above.
test_text = df.loc[1101, "description"]
test_text

In [None]:
# NOTE(review): this English sentence is scored against the model fitted on German
# descriptions in the cells above — confirm that is intended.
test_text = "It's a question of going to Edinburgh in the time of earthquake."

# Transform topics to LSA space
list_topics_normalized = []
for topic in topics:
    list_topics_normalized.append(transform_text(topic, tfidf_vectorizer, svd, normalizer)[0].tolist())
# Build the topic matrix once, after every topic has been projected
topics_lsa_normalized = np.array(list_topics_normalized)

# Assign topic to test text. assign_topic_to_text() expects the RAW text and projects
# it into LSA space itself, so no separate transform of the test text is needed
# (passing an already-projected vector would be fed to the TF-IDF vectorizer).
most_similar_topic_index = assign_topic_to_text(test_text, topics_lsa_normalized)

print(topics[most_similar_topic_index])

<hr>

> The LSA output is a matrix in which each row represents a document and each column represents a latent topic (one SVD component). Each value indicates the strength of the association between that document and that topic.