In [None]:
import pandas as pd
import json
import seaborn as sns

from dotenv import load_dotenv
from pathlib import Path
from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.load import EDataTypes
from tqdm import tqdm

In [None]:
# Notebook-wide setup. The three calls below do not depend on each other.

# Pull variables from a .env file into the process environment.
load_dotenv()

# Global seaborn look: white background, muted palette, wide 20x8 default figures.
figure_rc = {"figure.figsize": (20, 8)}
sns.set(style="white", palette="muted", rc=figure_rc)

# Register `progress_apply` on pandas objects; the cleaning cell relies on it.
tqdm.pandas()

In [None]:
# Which corpus this notebook handles. Not referenced again in the cells shown
# here -- NOTE(review): confirm whether it is still needed.
data_type = EDataTypes.PARTY_PROGRAMS

# Folder (relative to the working directory) that holds the party-program
# parquet files read and written by this notebook.
data_dir = Path("../../data/party_programs")

In [None]:
# Party colour lookup, parsed from JSON. Presumably party name -> colour for
# plots; it is not used in the cells shown here.
palette_path = Path("../../data/party_colors.json")
with palette_path.open("r", encoding="utf-8") as palette_file:
  party_palette = json.load(palette_file)

# Text-cleaning helper shared by the preprocessing cell.
cleaning = Cleaning()

In [None]:
# Load the not-yet-cleaned party programs; later cells read the `text_orig`
# and `party` columns from this frame.
uncleaned_path = data_dir / "party_programs_before_cleaning.parquet"
df_prep = pd.read_parquet(uncleaned_path)

In [None]:
# Stage 1: clean the original text. `clean_text` is called with
# keep_punctuation=True and keep_upper=True -- presumably so punctuation and
# casing are still available to the lemmatiser; confirm against `Cleaning`.
cleaned = df_prep["text_orig"].progress_apply(
  lambda raw_text: cleaning.clean_text(raw_text, keep_punctuation=True, keep_upper=True)
)
df_prep["clean_text"] = cleaned.astype("string[pyarrow]")

# Stage 2: lemmatise the cleaned text, then run it through `filter_text`.
# Both derived columns are stored as Arrow-backed strings.
tokenized = df_prep["clean_text"].progress_apply(
  lambda clean_text: cleaning.filter_text(cleaning.lemma_text(clean_text))
)
df_prep["tokenized_text"] = tokenized.astype("string[pyarrow]")

df_prep

In [None]:
# How many rows repeat an earlier row's cleaned text (first occurrence not counted)?
duplicate_mask = df_prep.duplicated(subset=["clean_text"])
duplicate_mask.sum()

In [None]:
# Keep only the first row for every distinct cleaned text, then renumber the
# index so it is contiguous again.
deduplicated = df_prep.drop_duplicates(subset=["clean_text"])
df_prep = deduplicated.reset_index(drop=True)

In [None]:
# Sanity check after de-duplication: this count is expected to be 0.
remaining_duplicates = df_prep.duplicated(subset=["clean_text"])
remaining_duplicates.sum()

In [None]:
# Columns that are not carried over into the exported dataset.
dropped_columns = ["text_orig", "election_type", "election"]

# `reset_index` already hands back a new frame, so df_prep itself is left untouched.
df_final = df_prep.reset_index(drop=True).drop(columns=dropped_columns)

In [None]:
# Load the party lookup used to encode the `party` column. The file handle is
# only needed while parsing, so the mapping happens after the file is closed.
with open("../../data/party_encoding.json", "r", encoding="utf-8") as f:
  party_encoding = json.load(f)

# `Series.map` silently turns every value that is missing from the lookup into
# NaN. Fail loudly instead of exporting rows whose label was lost. This also
# catches an accidental second run of this cell, when the column already holds
# encoded values (assuming those values are not themselves keys of the lookup).
party_codes = df_final["party"].map(party_encoding)
unmapped = df_final.loc[df_final["party"].notna() & party_codes.isna(), "party"].unique()
if len(unmapped) > 0:
  raise ValueError(f"No entry in party_encoding.json for: {list(unmapped)}")

df_final["party"] = party_codes
df_final

In [None]:
# Persist the cleaned, encoded dataset next to its input; the index is not written.
output_path = data_dir / "party_programs.parquet"
df_final.to_parquet(output_path, index=False)