In [2]:
import pandas as pd
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import numpy as np
import spacy
from nltk.corpus import stopwords

In [17]:
def preprocess_data(text, remove_verbs=False):
    """Lowercase and tokenize ``text``, then drop stop words.

    Tokenization is delegated to gensim's ``simple_preprocess``. Tokens that
    appear in the module-level ``stop_words`` collection are removed.

    Args:
        text: Raw input string.
        remove_verbs: When True, tokens that spaCy tags as ``VERB`` are removed
            as well. This path needs a spaCy pipeline bound to a module-level
            ``nlp`` name (NOTE(review): ``nlp`` is not defined in the visible
            cells - confirm it is loaded before enabling this). Defaults to
            False, which matches the filter that was previously active.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Hash the excluded words once so every membership test is O(1)
    # instead of a linear scan over the stop-word list.
    excluded = set(stop_words)

    if remove_verbs:
        # Only pay for the spaCy parse when its result is actually used.
        excluded.update(
            token.text.lower() for token in nlp(text) if token.pos_ == "VERB"
        )

    return " ".join(
        word for word in simple_preprocess(text) if word not in excluded
    )

In [13]:
# Load the English task table; the path is relative to the notebook's working directory.
df = pd.read_csv("data/preprocessed_open_tasks_EN.csv")
# NLTK's English stop-word list (the NLTK "stopwords" corpus must be available locally).
stop_words = stopwords.words('english')
# Corpus used by every later cell: one entry per row of the `preprocess_desc` column.
# NOTE(review): presumably these are already-preprocessed task descriptions - confirm
# against whatever produced the CSV.
data = df["preprocess_desc"].to_list()

In [14]:
# df = pd.read_csv("data/preprocessed_open_tasks_DE.csv")
# stop_words = stopwords.words('german')
# data = df["preprocess_desc"].to_list()

In [15]:
def latent_semantic_analysis(texts, num_topics=5, num_top_terms=5, random_state=42):
    """Fit a TF-IDF -> TruncatedSVD -> Normalizer pipeline and print its topics.

    Args:
        texts: Iterable of document strings to fit on.
        num_topics: Number of SVD components (latent topics) to extract.
        num_top_terms: How many of the highest-weighted terms to print per
            topic. Defaults to 5.
        random_state: Seed forwarded to ``TruncatedSVD`` so that repeated runs
            print the same topics. Pass ``None`` for an unseeded fit.

    Returns:
        A tuple ``(tfidf_vectorizer, svd, normalizer, document_topic_matrix)``:
        the three fitted transformers and the normalized document-by-topic
        matrix produced for ``texts``.

    Side effects:
        Prints one ``Topic N: term | term | ...`` line per component.
    """
    # Step 1: Convert texts to a TF-IDF matrix. The stop-word list comes from
    # the module-level ``stop_words``.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=stop_words)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Step 2: Apply Singular Value Decomposition (SVD). The solver is seeded so
    # the topics printed below are stable between runs.
    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    document_topic_matrix = svd.fit_transform(tfidf_matrix)

    # Step 3: Normalize the output of SVD (in place, because copy=False).
    normalizer = Normalizer(copy=False)
    document_topic_matrix = normalizer.fit_transform(document_topic_matrix)

    # Step 4: Print the topics and their most relevant terms.
    terms = tfidf_vectorizer.get_feature_names_out()
    for topic_number, component in enumerate(svd.components_, start=1):
        # Indices of the largest component weights, highest first.
        top_term_indices = np.argsort(component)[::-1][:num_top_terms]
        top_terms = [terms[idx] for idx in top_term_indices]
        print(f"Topic {topic_number}: {' | '.join(top_terms)}")

    return tfidf_vectorizer, svd, normalizer, document_topic_matrix

In [16]:
# Number of latent topics to extract from the corpus.
num_topics = 6
# Fit the LSA pipeline on `data`; the call also prints the top terms of every topic.
tfidf_vectorizer, svd, normalizer, lsa_output = latent_semantic_analysis(data, num_topics=num_topics)

Topic 1: sentenc | write | anim | say | question
Topic 2: italian | textbox | french | sport | languag
Topic 3: describ | interact | girl | pictur | dog
Topic 4: anim | say | write | interact | dog
Topic 5: edinburgh | carolin | move | question | full
Topic 6: one | earthquak | build | behav | write


> The LSA output is a matrix in which each row represents a document and each column represents a topic. Each value indicates the strength of the association between that document and that topic; because the rows are L2-normalized, every value lies between -1 and 1.

In [8]:
# topics = [
#     "Topic 1: grammatik präsens passiv satz ergänze schreibe",
#     "Topic 2: physik geschwindigkeit nenne elektrischen km auto",
# ]

In [18]:
# Topic labels copied from the printed output of the six-topic LSA run; "Topic 2" is
# not included. Each string is later projected into LSA space with transform_text and
# used as the reference vector for that topic.
topics = [
    "Topic 1: sentenc | write | anim | say | question", 
    "Topic 3: describ | interact | girl | pictur | dog", 
    "Topic 4: anim | say | write | interact | dog", 
    "Topic 5: edinburgh | carolin | move | question | full", 
    "Topic 6: one | earthquak | build | behav | write"]

In [10]:
# print("LSA Output:")
# print(lsa_output)

Assign a given text to a topic

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
def preprocess_text(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and vectorize them.

    The vectorizer uses the module-level ``stop_words`` list.

    Args:
        texts: Iterable of document strings.

    Returns:
        A tuple ``(vectorizer, matrix)``: the fitted ``TfidfVectorizer`` and the
        TF-IDF matrix it produced for ``texts``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=1, max_df=0.5)
    return vectorizer, vectorizer.fit_transform(texts)

def transform_text(text, tfidf_vectorizer, svd, normalizer):
    """Push one text through the three fitted pipeline stages.

    Args:
        text: The single document to transform.
        tfidf_vectorizer: Object whose ``transform`` accepts a list of documents.
        svd: Object whose ``transform`` accepts the vectorizer's output.
        normalizer: Object whose ``transform`` accepts the SVD output.

    Returns:
        Whatever ``normalizer.transform`` returns for the single-document batch.
    """
    # The vectorizer works on a collection of documents, so wrap the text in a list.
    tfidf_row = tfidf_vectorizer.transform([text])
    return normalizer.transform(svd.transform(tfidf_row))

def assign_topic_to_text(text_lsa_normalized, topics_lsa_normalized):
    """Return the position of the topic most similar to the given text.

    Args:
        text_lsa_normalized: 2-D array holding the text's LSA vector as a row.
        topics_lsa_normalized: 2-D array with one LSA vector per topic row.

    Returns:
        The ``np.argmax`` of the cosine-similarity matrix. The argmax runs over
        the flattened matrix, so the result is a topic index only when
        ``text_lsa_normalized`` holds a single row.
    """
    return np.argmax(cosine_similarity(text_lsa_normalized, topics_lsa_normalized))

# Example usage:
texts = data

# Fit the TF-IDF vectorizer on the corpus
tfidf_vectorizer, tfidf_matrix = preprocess_text(texts)

# Perform LSA with one component per topic label. The solver is seeded so the
# projection (and therefore the topic assignment) is reproducible between runs.
svd = TruncatedSVD(n_components=len(topics), random_state=42)
lsa = svd.fit_transform(tfidf_matrix)
normalizer = Normalizer(copy=False)
lsa_normalized = normalizer.fit_transform(lsa)

# Transform topics to LSA space
list_topics_normalized = []
for topic in topics:
    # transform_text returns a one-row matrix; keep that row as a plain list.
    list_topics_normalized.append(transform_text(topic, tfidf_vectorizer, svd, normalizer)[0].tolist())

# Build the topic matrix once, after the loop, instead of on every iteration.
topics_lsa_normalized = np.array(list_topics_normalized)

In [21]:
# Sample text to classify
# German-corpus alternative:
# test_text = "berechnen Sie die Strecke, die das Auto in einer Stunde bei einer Geschwindigkeit von 100 km pro Stunde zurücklegt."
test_text = "It's a question of going to Edinburgh in the time of earthquake."

# Project the sample through the fitted vectorizer, SVD and normalizer
test_text_lsa_normalized = transform_text(test_text, tfidf_vectorizer, svd, normalizer)

# Pick the topic whose vector has the highest cosine similarity to the sample
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 5: edinburgh | carolin | move | question | full


In [23]:
# Sample text to classify
# German-corpus alternative:
# test_text = "einen Satz im Präsens mit dem Verb essen schreiben"
test_text = "Describe how earthquakes behave"  # variant tried earlier: append " in Edinburgh"

# Project the sample through the fitted vectorizer, SVD and normalizer
test_text_lsa_normalized = transform_text(test_text, tfidf_vectorizer, svd, normalizer)

# Pick the topic whose vector has the highest cosine similarity to the sample
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 1: sentenc | write | anim | say | question


In [27]:
# Sample text to classify; the two wordings differ only in "one" vs "a"
# test_text = "Write one sentence about how people behave when building after an earthquake" #Topic 6
test_text = "Write a sentence about how people behave when building after an earthquake" #Topic 1

# Project the sample through the fitted vectorizer, SVD and normalizer
test_text_lsa_normalized = transform_text(test_text, tfidf_vectorizer, svd, normalizer)

# Pick the topic whose vector has the highest cosine similarity to the sample
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 1: sentenc | write | anim | say | question
