In [None]:
import pandas as pd
import seaborn as sns
import json

from studienarbeit.utils.plots import Plots
from studienarbeit.utils.load import EDataTypes
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

In [None]:
# Load environment variables via python-dotenv.
load_dotenv()

# Register `progress_apply` on pandas objects; the text-statistics cell relies on it.
tqdm.pandas()

# One shared seaborn look (style, palette, default figure size) for all figures.
sns.set(
    style="white",
    palette="muted",
    rc={"figure.figsize": (20, 8)},
)

In [None]:
# Which dataset this notebook works on, and the directory its files are read from.
data_type = EDataTypes.SPEECHES
data_dir = Path("../../data/speeches")

In [None]:
# Read the palette JSON (presumably party -> colour; verify against the file) and
# hand it, together with the dataset type and directory, to the plotting helper.
palette_path = Path("../../data/party_colors.json")
party_palette = json.loads(palette_path.read_text(encoding="utf-8"))
plots = Plots(data_type, data_dir, party_palette)

In [None]:
# Load the speeches table from the data directory and display it as the cell output.
df_speeches = pd.read_parquet(data_dir.joinpath("speeches.parquet"))
df_speeches

In [None]:
# The JSON file is loaded and inverted (values become keys, keys become values), so
# `party_encoding` ends up mapping the values stored in the "party" column back to
# the file's keys. Entries of the column without a match in the lookup become NaN.
# NOTE: this overwrites "party" in place, so the cell is meant to run exactly once
# per load of `df_speeches`.
with open("../../data/party_encoding.json", "r", encoding="utf-8") as f:
    party_encoding = {value: key for key, value in json.load(f).items()}

df_speeches["party"] = df_speeches["party"].map(party_encoding)

In [None]:
# Project plotting helper, called on the speeches frame after its "party" column
# was remapped; presumably shows the number of speeches per party — confirm in Plots.
plots.party_count(df_speeches)

In [None]:
# Per-speech length statistics derived from `clean_text`:
#   char_count     - number of characters
#   word_count     - number of whitespace-separated tokens
#   sentence_count - number of sentences found by NLTK's `sent_tokenize`
#
# The counts are stored as int32, not int16: int16 tops out at 32,767 and casting a
# larger count wraps around silently (40,000 would become -25,536). The character
# count of a long speech can exceed that limit, which would corrupt the statistics
# and every plot built on them.
df_speeches["char_count"] = df_speeches["clean_text"].progress_apply(len).astype("int32")
df_speeches["word_count"] = (
    df_speeches["clean_text"].progress_apply(lambda text: len(text.split())).astype("int32")
)
df_speeches["sentence_count"] = (
    df_speeches["clean_text"].progress_apply(lambda text: len(sent_tokenize(text))).astype("int32")
)

In [None]:
# Plot the "word_count" column via the project plotting helper. The German strings
# are passed through as-is (presumably figure title and axis label — confirm in Plots).
# NOTE(review): the meaning of 500 is not visible here (axis limit? bin setting?);
# check Plots.word_count before changing it.
plots.word_count(df_speeches, "word_count", "Anzahl an Wörtern nach Partei", 500, "Anzahl an Wörtern")

In [None]:
# Project plotting helper called on the full speeches frame; presumably renders
# word clouds (per party?) — confirm in Plots.wordclouds.
plots.wordclouds(df_speeches)