# Preprocessing

In [31]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Fix encoding problems

In [32]:
df_data = pd.read_csv("data/abgeordnetenwatch_data_long.csv", sep=";")

In [33]:
df_data = df_data.dropna(subset="answer")
df_data = df_data.drop_duplicates(subset=["answer", "question_text"])
len(df_data)

46018

In [34]:
df_sample = pd.read_csv("data/stratified_sample.csv", sep=";")
df_sample = df_sample.drop_duplicates(subset=["answer", "question_text"])
df_sample = df_sample.dropna(subset=["answer", "answer_encoded"])
len(df_sample)

1867

In [None]:
df_merged = pd.merge(df_data, df_sample, on="question_id", how="left")
df_merged = df_merged[["party_x", "last_name_x", "gender_x", "answer_x", "topic_x", "question_text_x", "question_teaser_x", "answer_encoded"]]
df_merged = df_merged.dropna(subset="answer_encoded")
df_merged.rename(columns={
    "party_x" : "party",
    "last_name_x" : "last_name",
    "gender_x" : "gender",
    "answer_x" : "answer",
    "topic_x" : "topic",
    "question_text_x" : "question_text",
    "question_teaser_x" : "question_teaser"
}, inplace=True)

## Data cleaning

In [37]:
nlp = spacy.load("de_core_news_sm")

def text_lemmatization(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_punct]
    return " ".join(lemmas)

In [38]:
def text_preprocessing(text):
    # remove links, punctuation, special letters
    text = re.sub(r"[^a-zA-öZÖäÄüÜß]|\bhttps?://\S*|&\w+;|[\.,]", " ", text)
    
    # replace single characters
    text = re.sub(r" [a-zA-Z] ", " ", text)
    
    # remove additional whitespaces
    text = re.sub(r"\s+", " ", text)
    
    # lemmatize texts
    text = text_lemmatization(text)

    # lower text
    text = text.lower()
    
    # tokenization of words
    text = text.split()
    
    # remove stopwords
    german_stopwords = set(stopwords.words("german"))
    text = [w for w in text if w not in german_stopwords]
    
    # return joined text
    return " ".join(text)

In [39]:
df_merged["clean_answer"] = df_merged["answer"].apply(text_preprocessing)

In [None]:
# show most frequent words
vectorizer = CountVectorizer(
)
dtm = vectorizer.fit_transform(df_merged["clean_answer"])

frequencies = dtm.sum(axis=0).tolist()[0]

df_freq = pd.DataFrame(
    dict(frequencies=frequencies,
         index=vectorizer.get_feature_names_out()
    )
)

df_freq.sort_values("frequencies", ascending=False).head(n=20)

Unnamed: 0,frequencies,index
0,1,_origin
80266,1,medienriese
80265,1,medienreferent
80264,1,medienrechtlich
80263,1,medienrecht
80262,1,medienquelle
80260,1,medienpädagoginne
80259,1,medienpräzenz
80256,1,medienpolitiker
80254,1,medienpluralität


In [41]:
# remove stopwords with refined list and after lemmatization to catch 
def remove_individual_stopwords(text):
    individual_stopwords = [
        "geehrt",
        "frau",
        "vieler",
        "dank",
        "herr",
        "danke",
        "anfrage",
        "frage",
        "nachricht",
        "freundlich",
        "sollen",
        "müssen",
        "mehr",
        "grüße",
        "daher",
        "immer",
        "dafür"
    ]
    text = text.split()
    text = [w for w in text if w not in individual_stopwords]
    
    return " ".join(text)

df_merged["clean_answer"] = df_merged["clean_answer"].apply(remove_individual_stopwords)

In [42]:
df_merged["clean_answer"]

0        brauchen familienrecht höhe zeit zählen insbes...
1        neufassung abs postg heben durchschnittlich br...
2        klar wer freiwillig arbeiten netto brutto wer ...
3        klar unabhängigkeit justiz wesentlich baustein...
4        frei demokrat treten verantwortungsvoll haltun...
                               ...                        
46013    gehen davon wirtschaft transformieren konzept ...
46014    israel anweisung urteil igh befolg kommen durc...
46015    mittlerweile nahezu parteiübergreifend einigke...
46016    möglichkeit beantworten gerne zuschrifto hessi...
46017    möglichkeit beantworten gerne zuschrifto hessi...
Name: clean_answer, Length: 44332, dtype: object

In [43]:
# transform data for fasttext
df_merged["answer_encoded"] = df_merged["answer_encoded"].apply(lambda x: x.replace(" ", "_"))
df_merged["answer_encoded"] = df_merged["answer_encoded"].apply(lambda x: "__label__" + x)
df_merged["answer_encoding_combined"] = df_merged["answer_encoded"]+ " " + df_merged["clean_answer"]

AttributeError: 'float' object has no attribute 'replace'

In [43]:
# export data
df_merged.to_csv("data/stratified_sample_cleaned.csv", index=False, sep=";", )
df_merged["answer_encoding_combined"].to_csv("data/fasttext_data.csv", sep=";", index=False, header=False)