## Data Preparation

In [None]:
import pandas as pd
import seaborn as sns
import re
import json

from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.split_text import SplitText
from studienarbeit.utils.load import EDataTypes
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# Pull environment variables from a .env file, if one can be found.
load_dotenv()

# Plot defaults shared by every figure drawn in this notebook.
sns.set(
    style="white",
    palette="muted",
    rc={"figure.figsize": (20, 8)},
)

# Register tqdm's `progress_apply` on pandas objects (used further below).
tqdm.pandas()

In [None]:
# Directory holding the speeches dataset; the path is relative, so it is
# resolved against the notebook's current working directory.
data_dir = Path("../../data/speeches")
# Dataset selector from the project's EDataTypes enum.
# NOTE(review): not referenced in the visible cells — presumably consumed
# elsewhere; confirm it is still needed.
data_type = EDataTypes.SPEECHES

In [None]:
# Parse the contents of party_colors.json (UTF-8) into `party_palette`.
party_palette = json.loads(
    Path("../../data/party_colors.json").read_text(encoding="utf-8")
)

# Project helpers used by the cells below.
cleaning = Cleaning()
split_text = SplitText()

In [None]:
# Load the speeches table as it was stored before cleaning.
df_prep = pd.read_parquet(data_dir.joinpath("speeches_before_cleaning.parquet"))

In [None]:
# Discard rows whose politicianId equals -1
# (presumably the marker for an unknown politician — TODO confirm).
df_prep = df_prep[df_prep["politicianId"].ne(-1)]
df_prep

In [None]:
# Keep only rows with a word_count of at least 200 ...
df_prep = df_prep[df_prep["word_count"].ge(200)]
# ... and renumber the remaining rows from zero.
df_prep = df_prep.reset_index(drop=True)
df_prep

In [None]:
# Remove the three length-statistic columns.
df_prep = df_prep.drop(
    ["char_count", "word_count", "sentence_count"],
    axis="columns",
)
df_prep

In [None]:
# Number of rows whose speechContent repeats that of an earlier row.
df_prep["speechContent"].duplicated().sum()

In [None]:
# Keep the first row for each distinct speechContent and renumber from zero.
df_prep = df_prep.drop_duplicates(subset="speechContent", ignore_index=True)
df_prep

In [None]:
# Sanity check: count rows whose speechContent still repeats an earlier row.
df_prep["speechContent"].duplicated().sum()

In [None]:
# Split the speechContent texts with the project's SplitText helper.
# NOTE(review): 512 is presumably the maximum length of a resulting chunk;
# its unit (tokens, words or characters) is not visible here — confirm in
# SplitText.split_dataframe_texts.
df_prep = split_text.split_dataframe_texts(df_prep, "speechContent", 512)
df_prep

In [None]:
# The patterns are raw strings: "\s", "\d", "\w" and "\(" are not valid
# escapes in ordinary string literals and trigger a SyntaxWarning on recent
# Python versions. The "\uXXXX" escapes are still understood because the `re`
# module resolves them itself. Compiling once at module level avoids
# repeating the pattern lookup for every row the function is applied to.
_BULLET_RE = re.compile(r"[\u2022\u2023\u25E6\u2043\u2219\uf0b7\u25fc]\s")
_BRACED_NUMBER_RE = re.compile(r"\{\d*\}")
_PAREN_WORD_RE = re.compile(r"\(\w*\)")
_WHITESPACE_RE = re.compile(r"\s+")


def initial_cleaning(text: str) -> str:
    """Strip layout artefacts from a raw text and normalise its whitespace.

    The steps are applied in this order:

    1. A bullet character followed by one whitespace character becomes a
       single space.
    2. Digits wrapped in curly braces, e.g. ``{12}`` (also ``{}``), are
       removed.
    3. A parenthesised run of word characters without spaces, e.g.
       ``(Beifall)`` (also ``()``), is removed.
    4. The sequence ``". –"`` is replaced by ``". "``.
    5. Every run of whitespace (including newlines and tabs) collapses to one
       space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-line text.
    """
    text = _BULLET_RE.sub(" ", text)
    text = _BRACED_NUMBER_RE.sub("", text)
    text = _PAREN_WORD_RE.sub("", text)
    text = text.replace(". –", ". ")

    # "\s+" already matches newlines and tabs, so one pass is enough.
    return _WHITESPACE_RE.sub(" ", text).strip()

In [None]:
# Step 1: run initial_cleaning on every speech, then the project cleaner with
# the keep_punctuation and keep_upper flags switched on.
df_prep["clean_text"] = (
    df_prep["speechContent"]
    .progress_apply(
        lambda speech: cleaning.clean_text(
            initial_cleaning(speech),
            keep_punctuation=True,
            keep_upper=True,
        )
    )
    .astype("string[pyarrow]")
)

# The raw text and the politician id are dropped from the frame here.
df_prep = df_prep.drop(["speechContent", "politicianId"], axis="columns")

# Step 2: derive tokenized_text by passing the cleaned text through the
# project's lemma_text and then filter_text.
df_prep["tokenized_text"] = (
    df_prep["clean_text"]
    .progress_apply(
        lambda cleaned: cleaning.filter_text(cleaning.lemma_text(cleaned))
    )
    .astype("string[pyarrow]")
)

df_prep

In [None]:
# Parse the contents of party_encoding.json (UTF-8) into a lookup dict.
party_encoding = json.loads(
    Path("../../data/party_encoding.json").read_text(encoding="utf-8")
)

# Translate each party value through the lookup; values that are not keys of
# the mapping become NaN.
df_prep["party"] = df_prep["party"].map(party_encoding)
df_prep

In [None]:
# Write the prepared frame to speeches.parquet in data_dir, without the index.
df_prep.to_parquet(data_dir.joinpath("speeches.parquet"), index=False)