In [1]:
import pandas as pd
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import numpy as np

In [2]:
# Load the task CSV and keep only the free-text "description" column.
df = pd.read_csv("data/open_tasks_Corn_EKV_DE.csv")
# pandas parses empty CSV cells as NaN (a float); replace those with empty
# strings and cast to str so every entry handed to the text pipeline is a string
# and the Series stays aligned with df's index.
data = df["description"].fillna("").astype(str)

In [3]:
import spacy
from nltk.corpus import stopwords
try:
    stop_words = stopwords.words('german')
except LookupError:
    # The stopword corpus is not shipped with the nltk package itself; fetch it
    # once so the notebook also runs on a fresh environment, then retry.
    import nltk
    nltk.download('stopwords')
    stop_words = stopwords.words('german')

# Load the German language model
nlp = spacy.load("de_core_news_sm")

def preprocess_data(text):
    """Lowercase and tokenize ``text``, dropping verbs and German stopwords.

    The text is tagged with the module-level spaCy pipeline ``nlp`` to find the
    tokens whose part of speech is ``VERB``; it is tokenized separately with
    gensim's ``simple_preprocess``. Tokens that match a detected verb
    (case-insensitively) or appear in the module-level ``stop_words`` are removed.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (e.g. ``None`` or the NaN that
        pandas produces for empty CSV cells) and blank strings are treated as
        empty documents.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` if nothing remains.
    """
    # Guard first: spaCy rejects non-string input, and a blank string cannot
    # yield any tokens, so there is no point running the tagger on it.
    if not isinstance(text, str) or not text.strip():
        return ""

    doc = nlp(text)
    # Sets give constant-time membership checks for every candidate token.
    excluded = {token.text.lower() for token in doc if token.pos_ == "VERB"}
    excluded.update(stop_words)

    return " ".join(
        word for word in simple_preprocess(text) if word not in excluded
    )

In [4]:
# Clean every task description; the result is a plain Python list of strings.
preprocessed_data = [preprocess_data(description) for description in data]

In [5]:
def latent_semantic_analysis(texts, num_topics=5, top_n=5, random_state=42):
    """Fit a TF-IDF + truncated-SVD (LSA) pipeline on ``texts`` and print its topics.

    Parameters
    ----------
    texts : iterable of str
        Documents to model.
    num_topics : int, default 5
        Number of SVD components (topics) to extract.
    top_n : int, default 5
        Number of highest-weighted terms printed for each topic.
    random_state : int or None, default 42
        Seed passed to ``TruncatedSVD`` so repeated runs yield the same topics.
        Pass ``None`` to leave the solver unseeded.

    Returns
    -------
    tuple
        ``(tfidf_vectorizer, svd, normalizer, document_topics)`` where the first
        three are the fitted pipeline stages and ``document_topics`` is the
        row-normalized SVD output (one row per document, one column per topic).
    """
    # Step 1: Convert texts to TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=None)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Step 2: Apply Singular Value Decomposition (SVD)
    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    document_topics = svd.fit_transform(tfidf_matrix)

    # Step 3: Normalize the output of SVD (copy=False rescales the rows in place)
    normalizer = Normalizer(copy=False)
    document_topics = normalizer.fit_transform(document_topics)

    # Step 4: Print the topics and their most relevant terms
    terms = tfidf_vectorizer.get_feature_names_out()
    for i, topic in enumerate(svd.components_):
        # Sort ascending, reverse, and keep the top_n largest weights.
        top_terms_idx = np.argsort(topic)[::-1][:top_n]
        top_terms = [terms[idx] for idx in top_terms_idx]
        print(f"Topic {i+1}: {' | '.join(top_terms)}")

    return tfidf_vectorizer, svd, normalizer, document_topics

In [6]:
# Fit the LSA pipeline on the preprocessed descriptions with two topics; keep
# the fitted stages and the document-topic matrix as notebook-level variables.
tfidf_vectorizer, svd, normalizer, lsa_output = latent_semantic_analysis(
    texts=preprocessed_data,
    num_topics=2,
)

Topic 1: passiv | präsens | satz | ergänze | schreibe
Topic 2: geschwindigkeit | nenne | elektrischen | km | auto


> The LSA output is a matrix in which each row represents a document and each column represents a topic. Each value indicates the strength of the association between that document and that topic. Because the rows are L2-normalized, every value lies between -1 and 1.

In [7]:
# Uncomment to inspect the full document-topic matrix (left disabled to keep the output short):
# print("LSA Output:")
# print(lsa_output)

Assign a given text to a topic

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
def preprocess_text(texts):
    """Fit a TF-IDF vectorizer on ``texts``.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)``: the fitted ``TfidfVectorizer`` and the TF-IDF
        matrix it produced for ``texts``.
    """
    vectorizer = TfidfVectorizer(stop_words=None, min_df=1, max_df=0.5)
    return vectorizer, vectorizer.fit_transform(texts)

def transform_text(text, tfidf_vectorizer, svd, normalizer):
    """Project one text through the fitted TF-IDF -> SVD -> normalizer stages.

    ``text`` is wrapped in a single-element list before vectorizing, so the
    result is whatever ``normalizer.transform`` returns for that one-document
    batch.
    """
    return normalizer.transform(
        svd.transform(
            tfidf_vectorizer.transform([text])
        )
    )

def assign_topic_to_text(text_lsa_normalized, topics_lsa_normalized):
    """Return the index of the topic vector most similar to the text vector.

    Similarity is the cosine similarity between the rows of
    ``text_lsa_normalized`` and the rows of ``topics_lsa_normalized``.

    Note: the argmax is taken over the flattened similarity matrix, so the
    result is a topic index only when ``text_lsa_normalized`` holds a single row.
    """
    return cosine_similarity(text_lsa_normalized, topics_lsa_normalized).argmax()

# Example usage:
texts = preprocessed_data

# Seed descriptions, one per topic. They are projected into the LSA space below
# and serve as the reference vectors that new texts are compared against.
topics = [
    "Topic 1: grammatik präsens passiv satz ergänze schreibe",
    "Topic 2: physik geschwindigkeit nenne elektrischen km auto",
]

# Preprocess texts and topics
tfidf_vectorizer, tfidf_matrix = preprocess_text(texts)

# Perform LSA
# A fixed random_state keeps the SVD solver reproducible across runs.
svd = TruncatedSVD(n_components=len(topics), random_state=42)
lsa = svd.fit_transform(tfidf_matrix)
normalizer = Normalizer(copy=False)  # copy=False: rows of `lsa` are rescaled in place
lsa_normalized = normalizer.fit_transform(lsa)

# Transform topics to LSA space
list_topics_normalized = [
    transform_text(topic, tfidf_vectorizer, svd, normalizer)[0].tolist()
    for topic in topics
]
# Build the array once, after every topic has been projected (one row per topic).
topics_lsa_normalized = np.array(list_topics_normalized)

In [9]:
# Test text
test_text = "berechnen Sie die Strecke, die das Auto in einer Stunde bei einer Geschwindigkeit von 100 km pro Stunde zurücklegt."

# Transform test text to LSA space.
# The raw text goes through preprocess_data first so it receives the same
# verb/stopword filtering as the documents the vectorizer was fitted on.
test_text_lsa_normalized = transform_text(
    preprocess_data(test_text), tfidf_vectorizer, svd, normalizer
)

# Assign topic to test text
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 2: physik geschwindigkeit nenne elektrischen km auto


In [10]:
# Test text
test_text = "einen Satz im Präsens mit dem Verb essen schreiben"

# Transform test text to LSA space.
# The raw text goes through preprocess_data first so it receives the same
# verb/stopword filtering as the documents the vectorizer was fitted on.
test_text_lsa_normalized = transform_text(
    preprocess_data(test_text), tfidf_vectorizer, svd, normalizer
)

# Assign topic to test text
most_similar_topic_index = assign_topic_to_text(test_text_lsa_normalized, topics_lsa_normalized)

print(topics[most_similar_topic_index])

Topic 1: grammatik präsens passiv satz ergänze schreibe
