# Preprocessing

In [None]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords

## Fix encoding problems

In [None]:
# Read the semicolon-separated CSV into df_data.
df_data = pd.read_csv("data/abgeordnetenwatch_data_long.csv", sep=";")

106012

In [None]:
# Keep only rows that have an answer, then keep one row per distinct
# answer text (drop_duplicates retains the first occurrence by default).
df_data = (
    df_data
    .dropna(subset=["answer"])
    .drop_duplicates(subset=["answer"])
)

45642

In [None]:
# Load the sample file, keep one row per distinct answer text, and drop
# rows that lack either the answer or its `answer_encoded` value.
df_sample = (
    pd.read_csv("data/stratified_sample.csv", sep=";")
    .drop_duplicates(subset=["answer"])
    .dropna(subset=["answer", "answer_encoded"])
)

1867

In [55]:
# Attach `answer_encoded` from the sample to the full data via question_id.
# Columns present in both frames receive pandas' default "_x" (df_data) and
# "_y" (df_sample) suffixes; only the df_data ("_x") versions are kept and
# then renamed back to their plain names.
# NOTE(review): presumably the text columns of the sample file are the ones
# with encoding problems, which is why the df_data versions are used — confirm.
df_merged = (
    pd.merge(df_data, df_sample, on="question_id", how="left")
    .loc[:, [
        "party_x",
        "last_name_x",
        "gender_x",
        "answer_x",
        "topic_x",
        "question_text_x",
        "question_teaser_x",
        "answer_encoded",
    ]]
    # Rows of df_data without a match in the sample have no answer_encoded.
    .dropna(subset=["answer_encoded"])
    .rename(columns={
        "party_x": "party",
        "last_name_x": "last_name",
        "gender_x": "gender",
        "answer_x": "answer",
        "topic_x": "topic",
        "question_text_x": "question_text",
        "question_teaser_x": "question_teaser",
    })
)

## Data cleaning

In [49]:
# spaCy pipeline used for lemmatization.
nlp = spacy.load("de_core_news_sm")


def text_lemmatization(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline; tokens that
    spaCy flags as punctuation are skipped.

    Args:
        text: The string to lemmatize.

    Returns:
        A string containing the lemma of every non-punctuation token,
        separated by one space each.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

In [50]:
# Links and HTML entities (named or numeric) have to be removed as whole
# units *before* the generic non-letter filter runs; otherwise the filter
# consumes the leading "&" and the entity name (e.g. "amp") leaks into the text.
_LINK_OR_ENTITY_RE = re.compile(r"\bhttps?://\S*|&#?\w+;")
# Everything that is not an ASCII letter, a German umlaut or "ß".
# (The earlier class contained the accidental range "A-ö", which kept
# characters such as "[", "]", "_", "{", "}" and most Latin-1 symbols.)
_NON_LETTER_RE = re.compile(r"[^a-zA-ZäöüÄÖÜß]")
# A lone ASCII letter delimited by whitespace or the string edges. Lookarounds
# do not consume the delimiters, so runs like " a b c " are fully removed.
_SINGLE_LETTER_RE = re.compile(r"(?<!\S)[a-zA-Z](?!\S)")
_WHITESPACE_RE = re.compile(r"\s+")

# Filled on first use so the NLTK corpus is read once, not once per text.
_GERMAN_STOPWORDS = None


def _german_stopwords():
    """Return the cached set of NLTK German stopwords."""
    global _GERMAN_STOPWORDS
    if _GERMAN_STOPWORDS is None:
        _GERMAN_STOPWORDS = frozenset(stopwords.words("german"))
    return _GERMAN_STOPWORDS


def text_preprocessing(text):
    """Clean a raw text and return it as a space-joined string of tokens.

    Steps, in order:
      1. Replace links (``http://`` / ``https://``) and HTML entities by a space.
      2. Replace every character that is not ``a-z``, ``A-Z``, an umlaut or
         ``ß`` by a space.
      3. Remove stand-alone single ASCII letters.
      4. Collapse runs of whitespace into one space.
      5. Lemmatize via ``text_lemmatization``.
      6. Lower-case, split on whitespace and drop NLTK German stopwords.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (may be empty).
    """
    # remove links and HTML entities first, then punctuation / special letters
    text = _LINK_OR_ENTITY_RE.sub(" ", text)
    text = _NON_LETTER_RE.sub(" ", text)

    # replace single characters
    text = _SINGLE_LETTER_RE.sub(" ", text)

    # remove additional whitespaces
    text = _WHITESPACE_RE.sub(" ", text)

    # lemmatize texts
    text = text_lemmatization(text)

    # lower text and tokenize; stopwords are matched after lower-casing
    # because the NLTK list is lower-case
    tokens = text.lower().split()

    # remove stopwords
    german_stopwords = _german_stopwords()
    return " ".join(w for w in tokens if w not in german_stopwords)

In [56]:
# Run the cleaning pipeline on every answer and store the result in a new column.
df_merged["clean_answer"] = df_merged["answer"].map(text_preprocessing)

In [54]:
# Display the cleaned answers (pandas abbreviates the repr of a long Series).
df_merged["clean_answer"]

43       geehrt frau vieler dank anfrage name person gr...
68       geehrt herr vieler dank anfrage bürokratiekost...
105      geehrt frau vieler dank anfrage namensrecht bu...
111      geehrt frau vieler dank anfrage eckpunktepapie...
142      geehrt herr vieler dank anfrage seit anfang un...
                               ...                        
45518    verhandlungsbereitschaft erkennbar gegenteil i...
45548    danke frage natürlich michael bloss gesetz wie...
45569    genau beschreiben kontrolle bedeuten direkt ko...
45598    geehrt herr vieler dank frage frei demokrat st...
45612    geehrt frau vieler dank nachricht frage nature...
Name: clean_answer, Length: 1669, dtype: object