# Preprocessing

In [None]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Fix encoding problems

In [3]:
df_data = pd.read_csv("data/abgeordnetenwatch_data_long.csv", sep=";")

In [4]:
df_data = df_data.dropna(subset="answer")
df_data = df_data.drop_duplicates(subset=["answer", "question_text"])
len(df_data)

46018

In [6]:
df_sample = pd.read_csv("data/stratified_sample.csv", sep=";")
df_sample = df_sample.drop_duplicates(subset=["answer", "question_text"])
df_sample = df_sample.dropna(subset=["answer", "answer_encoded"])
len(df_sample)

1867

In [7]:
df_merged = pd.merge(df_data, df_sample, on="question_id", how="left")
df_merged = df_merged[["party_x", "last_name_x", "gender_x", "answer_x", "topic_x", "question_text_x", "question_teaser_x", "answer_encoded"]]
df_merged = df_merged.dropna(subset="answer_encoded")
df_merged.rename(columns={
    "party_x" : "party",
    "last_name_x" : "last_name",
    "gender_x" : "gender",
    "answer_x" : "answer",
    "topic_x" : "topic",
    "question_text_x" : "question_text",
    "question_teaser_x" : "question_teaser"
}, inplace=True)

## Data cleaning

In [8]:
nlp = spacy.load("de_core_news_sm")

def text_lemmatization(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_punct]
    return " ".join(lemmas)

In [9]:
def text_preprocessing(text):
    # remove links, punctuation, special letters
    text = re.sub(r"[^a-zA-öZÖäÄüÜß]|\bhttps?://\S*|&\w+;|[\.,]", " ", text)
    
    # replace single characters
    text = re.sub(r" [a-zA-Z] ", " ", text)
    
    # remove additional whitespaces
    text = re.sub(r"\s+", " ", text)
    
    # lemmatize texts
    text = text_lemmatization(text)

    # lower text
    text = text.lower()
    
    # tokenization of words
    text = text.split()
    
    # remove stopwords
    german_stopwords = set(stopwords.words("german"))
    text = [w for w in text if w not in german_stopwords]
    
    # return joined text
    return " ".join(text)

In [10]:
df_merged["clean_answer"] = df_merged["answer"].apply(text_preprocessing)

In [40]:
# show most frequent words
vectorizer = CountVectorizer(
)
dtm = vectorizer.fit_transform(df_merged["clean_answer"])

frequencies = dtm.sum(axis=0).tolist()[0]

df_freq = pd.DataFrame(
    dict(frequencies=frequencies,
         index=vectorizer.get_feature_names_out()
    )
)

df_freq.sort_values("frequencies", ascending=True).head(n=20)

Unnamed: 0,frequencies,index
11569,1,konstituiert
9138,1,gymnasium
16895,1,schulbereich
9140,1,gähnend
16894,1,schulbedarf
16893,1,schulausschuss
16892,1,schulaufgab
16891,1,schulalltag
9146,1,gärtner
9147,1,gäubahn


In [30]:
# remove stopwords with refined list and after lemmatization to catch 
def remove_individual_stopwords(text):
    individual_stopwords = [
        "geehrt",
        "frau",
        "vieler",
        "dank",
        "herr",
        "danke",
        "anfrage",
        "frage",
        "nachricht",
        "freundlich",
        "sollen",
        "müssen",
        "mehr",
        "grüße",
        "daher",
        "immer",
        "dafür"
    ]
    text = text.split()
    text = [w for w in text if w not in individual_stopwords]
    
    return " ".join(text)

df_merged["clean_answer"] = df_merged["clean_answer"].apply(remove_individual_stopwords)

In [31]:
df_merged["clean_answer"]

43       name person groß bedeutung eigen identität ind...
68       bürokratiekostenindex wichtig maß erfassung bü...
106      namensrecht bundestag änderung ehenamen geburt...
112      eckpunktepapier reform abstammungsrecht veröff...
143      seit anfang unionsbürgerinn unionsbürger sowie...
                               ...                        
45894    verhandlungsbereitschaft erkennbar gegenteil h...
45924    natürlich michael bloss gesetz wiederherstellu...
45945    genau beschreiben kontrolle bedeuten direkt ko...
45974    frei demokrat stehen freiheit einzeln mensch v...
45988    nature restoration law letzter woche europäisc...
Name: clean_answer, Length: 1686, dtype: object