This notebook explores and implements Latent Semantic Analysis. <hr>

In [1]:
import pandas as pd
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import numpy as np
import spacy
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def latent_semantic_analysis(texts, num_topics=5, top_n_terms=5, random_state=42):
    """Fit an LSA pipeline (TF-IDF -> truncated SVD -> row normalisation) on ``texts``.

    One line per topic is printed in the form ``Topic <n>: term | term | ...``,
    listing the terms that carry the largest weights in that SVD component.

    The stop-word list is read from the notebook-level variable ``stop_words``,
    which must be assigned before this function is called.

    Parameters
    ----------
    texts : iterable of str
        Documents to fit the pipeline on.
    num_topics : int, default 5
        Number of SVD components (topics) to keep.
    top_n_terms : int, default 5
        How many of the highest-weighted terms to print for each topic.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD``. A fixed seed makes the fitted
        components, and therefore the printed topics, repeatable across runs;
        pass ``None`` for an unseeded fit.

    Returns
    -------
    tuple
        ``(tfidf_vectorizer, svd, normalizer, doc_topic_matrix)``: the three
        fitted transformers followed by the normalised SVD output for ``texts``.
    """
    # Step 1: Convert texts to a TF-IDF matrix. max_df=0.5 discards terms that
    # occur in more than half of the documents.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=stop_words)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Step 2: Apply truncated Singular Value Decomposition (SVD).
    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    doc_topic_matrix = svd.fit_transform(tfidf_matrix)

    # Step 3: Normalise the SVD output row by row (copy=False works in place).
    normalizer = Normalizer(copy=False)
    doc_topic_matrix = normalizer.fit_transform(doc_topic_matrix)

    # Step 4: Print each topic with its highest-weighted terms, largest first.
    terms = tfidf_vectorizer.get_feature_names_out()
    for topic_no, component in enumerate(svd.components_, start=1):
        top_terms_idx = component.argsort()[::-1][:top_n_terms]
        print(f"Topic {topic_no}: {' | '.join(terms[idx] for idx in top_terms_idx)}")

    return tfidf_vectorizer, svd, normalizer, doc_topic_matrix

In [3]:
def preprocess_text(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and return it with the resulting matrix.

    The vectorizer uses the same settings as ``latent_semantic_analysis``
    (``max_df=0.5``, ``min_df=1``) and reads its stop-word list from the
    notebook-level variable ``stop_words``, which must already be assigned.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)`` where ``matrix`` is ``vectorizer.fit_transform(texts)``.
    """
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=stop_words)
    return vectorizer, vectorizer.fit_transform(texts)

def transform_text(text, tfidf_vectorizer, svd, normalizer):
    """Project a single text through the already-fitted LSA pipeline.

    ``text`` is wrapped in a one-element list and passed through the
    ``transform`` method of the vectorizer, the SVD and the normalizer, in that
    order. The normalizer's output is returned unchanged.
    """
    projected = [text]  # the transformers receive a batch, so wrap the single text
    for stage in (tfidf_vectorizer, svd, normalizer):
        projected = stage.transform(projected)
    return projected

def assign_topic_to_text(text_lsa_normalized, topics_lsa_normalized):
    """Return the position of the topic vector most similar to the text vector.

    Cosine similarity is computed between ``text_lsa_normalized`` and
    ``topics_lsa_normalized``, and the argmax of the result is returned.
    ``np.argmax`` is called without an axis, so the index refers to the
    flattened similarity matrix; it equals the topic's row index when
    ``text_lsa_normalized`` holds a single row, as it does in this notebook.
    """
    return np.argmax(cosine_similarity(text_lsa_normalized, topics_lsa_normalized))

<hr>

In [4]:
# Load the English task descriptions and discard rows without a description.
df = pd.read_csv("data/final_tasks_EN.csv").dropna(subset=["description"])
data = df["description"].tolist()

# English stop-word list; latent_semantic_analysis() reads this notebook-level variable.
stop_words = stopwords.words('english')

# Fit the LSA pipeline on the English corpus with eight topics.
num_topics = 8
tfidf_vectorizer, svd, normalizer, lsa_output = latent_semantic_analysis(data, num_topics)

Topic 1: like | name | rise | societi | mean
Topic 2: go | sentenc | chri | say | luke
Topic 3: sentenc | write | carolin | edinburgh | question
Topic 4: luke | sherlock | abrihim | cafe | mr
Topic 5: italian | french | textbox | languag | sport
Topic 6: chri | gun | say | gordi | dylan
Topic 7: edinburgh | carolin | move | studi | citi
Topic 8: societi | differ | uniti | develop | lose


In [5]:
# Rebuild the topic labels from the fitted model rather than pasting them from
# a previous run's printed output, so they cannot drift out of sync when the
# SVD is refitted. The format mirrors the lines printed by
# latent_semantic_analysis(): "Topic <n>: " followed by the five
# highest-weighted terms joined with " | ".
feature_names = tfidf_vectorizer.get_feature_names_out()
topics = [
    f"Topic {topic_no}: " + " | ".join(feature_names[idx] for idx in component.argsort()[:-6:-1])
    for topic_no, component in enumerate(svd.components_, start=1)
]

In [26]:
# Use the description stored under index label 1101 as the sample document.
test_text = df["description"].loc[1101]
test_text

'describ esperanza relev name turn find two three question partner answer three quot societi minut becom integr whole look eye see whole differ world togeth natur azizah al hibri american philosoph legal scholar peac uniti similar uniti diver comparison concili differ mikhail gorbachev former russian presid unlik drop water lose ident join ocean man lose societi live man life independ bear develop unlik societi alon develop self b r ambedkar former indian minist law justic hous mango street name english name mean hope spanish mean mani letter mean sad mean wait like number nine muddi color mexican record father play sunday morn shave song like sob great grandmoth name mine hor woman bear like chine year hor suppos bad circumst bear femal think chine lie chine like mexican like woman strong great grandmoth would like know wild hor woman wild marri great grandfath throw sack head carri like fanci chandeli way stori snuff never forgav look window whole life way mani woman sit sad elbow wo

In [27]:
# Transform test text to LSA space
test_text_lsa_normalized = transform_text(test_text, tfidf_vectorizer, svd, normalizer)

# Transform topics to LSA space: one row per topic label. The array is built
# once, after every row has been collected, so it is defined (as an empty
# array) even when `topics` is empty and is not rebuilt on each iteration.
list_topics_normalized = [
    transform_text(topic, tfidf_vectorizer, svd, normalizer)[0].tolist()
    for topic in topics
]
topics_lsa_normalized = np.array(list_topics_normalized)

# Assign topic to test text
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 1: like | name | rise | societi | mean


<hr>

In [28]:
# Load the German task descriptions and discard rows without a description.
df = pd.read_csv("data/final_tasks_DE.csv").dropna(subset=["description"])
data = df["description"].tolist()

# German stop-word list; latent_semantic_analysis() reads this notebook-level variable.
stop_words = stopwords.words('german')

# Fit the LSA pipeline on the German corpus with ten topics.
num_topics = 10
tfidf_vectorizer, svd, normalizer, lsa_output = latent_semantic_analysis(data, num_topics)

Topic 1: englisch | horst | schreib | horen | schreiben
Topic 2: satz | schreib | prateritum | hideaway | passiv
Topic 3: horen | schreiben | englisch | satz | passiv
Topic 4: massachusett | institut | technolog | of | infinitiv
Topic 5: infinitiv | komma | denk | prateritum | zubird
Topic 6: prateritum | hideaway | hideout | hauptsatz | relativsatz
Topic 7: frage | indirekt | direkt | zwei | hauptsatz
Topic 8: hauptsatz | zwei | satzen | relativsatz | schreib
Topic 9: lair | prateritum | frage | schreib | satz
Topic 10: frage | indirekt | direkt | schreib | passiv


In [30]:
# Rebuild the topic labels from the fitted model rather than pasting them from
# a previous run's printed output: the pasted list had already drifted (the
# term order of Topic 9 no longer matched what the fit printed). The format
# mirrors the lines printed by latent_semantic_analysis(): "Topic <n>: "
# followed by the five highest-weighted terms joined with " | ".
feature_names = tfidf_vectorizer.get_feature_names_out()
topics = [
    f"Topic {topic_no}: " + " | ".join(feature_names[idx] for idx in component.argsort()[:-6:-1])
    for topic_no, component in enumerate(svd.components_, start=1)
]

In [31]:
# Use the description stored under index label 1101 as the sample document.
test_text = df["description"].loc[1101]
test_text

'schreib satz konjunktiv geld leih konne'

In [33]:
# NOTE(review): this English, unstemmed sentence replaces the German sample
# selected in the previous cell and is scored against the model fitted on the
# German corpus — confirm that this cross-language check is intended.
test_text = "It's a question of going to Edinburgh in the time of earthquake."

# Transform test text to LSA space
test_text_lsa_normalized = transform_text(test_text, tfidf_vectorizer, svd, normalizer)

# Transform topics to LSA space: one row per topic label. The array is built
# once, after every row has been collected, so it is defined (as an empty
# array) even when `topics` is empty and is not rebuilt on each iteration.
list_topics_normalized = [
    transform_text(topic, tfidf_vectorizer, svd, normalizer)[0].tolist()
    for topic in topics
]
topics_lsa_normalized = np.array(list_topics_normalized)

# Assign topic to test text
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 4: massachusett | institut | technolog | of | infinitiv


<hr>

> The LSA output is a matrix in which each row represents a document and each column represents a topic. Each value indicates the strength of the association between that document and that topic.