# Preprocessing

In [None]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords

## Fix encoding problems

In [None]:
df_data = pd.read_csv("data/abgeordnetenwatch_data_long.csv", sep=";")

106012

In [None]:
df_data = df_data.dropna(subset="answer")
df_data = df_data.drop_duplicates(subset="answer")

45642

In [None]:
df_sample = pd.read_csv("data/stratified_sample.csv", sep=";")
df_sample = df_sample.drop_duplicates(subset="answer")
df_sample = df_sample.dropna(subset=["answer", "answer_encoded"])

1867

In [None]:
df_merged = pd.merge(df_data, df_sample, on="question_id", how="left")
df_merged = df_merged[["party_x", "last_name_x", "gender_x", "answer_x", "topic_x", "question_text_x", "question_teaser_x", "answer_encoded"]]
df_merged = df_merged.dropna(subset="answer_encoded")

## Data cleaning

In [39]:
nlp = spacy.load("de_core_news_sm")

def text_lemmatization(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_punct]
    return " ".join(lemmas)

In [45]:
def text_preprocessing(text):
    # remove links, punctuation, special letters
    text = re.sub(r"[^a-zA-öZÖäÄüÜß]|\bhttps?://\S*|&\w+;|[\.,]", " ", text)
    
    # replace single characters
    text = re.sub(r" [a-zA-Z] ", " ", text)
    
    # remove additional whitespaces
    text = re.sub(r"\s+", " ", text)
    
    # lemmatize texts
    text = text_lemmatization(text)

    # lower text
    text = text.lower()
    
    # tokenization of words
    text = text.split()
    
    # remove stopwords
    german_stopwords = set(stopwords.words("german"))
    text = [w for w in text if w not in german_stopwords]
    
    # return joined text
    return " ".join(text)

In [47]:
df_merged["clean_answers"] = df_merged["answer"].apply(text_preprocessing)

KeyError: 'answer'

In [None]:
df_merged["clean_answers"]