# Plotting the Tweets

This notebook plots the tweets after they have been cleaned and preprocessed in `./preparation.ipynb`.

## Config and Imports

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import seaborn as sns
from ydata_profiling import ProfileReport

from studienarbeit.utils.load import EDataTypes, Load
from studienarbeit.utils.plots import Plots

In [None]:
# Input file, data type and data folder used throughout this notebook.
file_name = "prep_tweets_sent_full.parquet"
data_type = EDataTypes.TWEETS
data_dir = Path("../../data/")

# Palette parsed from party_colors.json; it is handed to the plotting helper below.
party_palette = json.loads((data_dir / "party_colors.json").read_text(encoding="utf-8"))

# Project helpers for loading and plotting, both bound to the selected data type.
load = Load(data_type=data_type)
plot = Plots(data_type=data_type, party_palette=party_palette)

In [None]:
# Read the prepared tweets file through the project loader.
df = load.load_dataframe(file_name)

In [None]:
# Quick sanity check: dimensions of the loaded data.
df.shape

In [None]:
# Preview the first ten rows.
df.head(10)

## Plots

In [None]:
# Build a ydata-profiling report over the whole frame and save it as an HTML file.
# The target is a bare relative file name, so it is written to the current working directory.
profile = ProfileReport(df, title="Profiling Report")
profile.to_file("03_plotting.html")

In [None]:
# Monthly tweet counts per party: count the parties per timestamp, spread the
# parties into columns, then sum the counts into monthly bins.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is
# kept here so that older pandas versions keep working.
monthly_counts = df.groupby("created_at")["party"].value_counts().unstack().resample("M").sum()

fig = px.area(
    monthly_counts,
    facet_col="party",
    title="Anzahl pro Monat nach Partei",
    facet_col_wrap=2,
    height=1000,
    labels={"value": "Anzahl", "created_at": "Datum"},
)

# Derive the output folder from data_dir instead of repeating the path as a string,
# and create it up front so the export does not fail on a fresh checkout.
image_dir = data_dir / "images" / "tweets"
image_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(image_dir / "anzahl_pro_monat_nach_partei.png", width=1500, scale=2)
fig.show()

In [None]:
# Party-count plot from the project's Plots helper
# (presumably the number of tweets per party; confirm in studienarbeit.utils.plots).
plot.party_count(df)

Print the per-party sentiment distribution of the tweets as a LaTeX table for the paper.

In [None]:
# Same guard as the sentiment plot: files without a sentiment column are skipped
# instead of raising a KeyError.
if "sentiment" in df.columns:
    # normalize=True yields each sentiment's share within its party directly,
    # so no manual division by the per-party total is needed.
    sentiment_distribution = df.groupby("party")["sentiment"].value_counts(normalize=True).unstack()

    # Printed (not displayed) on purpose: the LaTeX source is meant to be copied into the paper.
    print(f"\n{sentiment_distribution.round(2).style.to_latex()}")

In [None]:
# Only draw the sentiment plot when the loaded file actually has a sentiment column.
if "sentiment" in df.columns:
    plot.sentiment(df)

In [None]:
# Word-count plot based on the "init_word_count" column; the title labels it as the
# count before cleaning. x_lim is passed to the helper (presumably an x-axis limit; confirm).
plot.word_count(df, title="Wortanzahl pro Partei vor Bereinigung", column="init_word_count", x_lim=60)

In [None]:
# Same plot without an explicit `column`, so the helper's default column is used;
# the title labels it as the count after cleaning.
plot.word_count(df, title="Wortanzahl pro Partei nach Bereinigung", x_lim=40)

In [None]:
# Gender plot from the project's Plots helper
# (presumably the gender split of the tweet authors; confirm in studienarbeit.utils.plots).
plot.gender(df)

In [None]:
# User-count plot from the project's Plots helper
# (presumably the number of distinct accounts per party; confirm in studienarbeit.utils.plots).
plot.user_count(df)

In [None]:
# Word clouds built from the "lemma_text" column.
# NOTE(review): the title says "vor Bereinigung" (before cleaning), but "lemma_text" looks like
# the lemmatised, i.e. already processed, text. Confirm whether the title or the column is intended.
plot.wordclouds(df, title="Wortwolke pro Partei vor Bereinigung", column="lemma_text")

In [None]:
# Pairwise correlations over the columns that remain after dropping the
# text, categorical, datetime and boolean dtypes.
numeric_columns = df.select_dtypes(exclude=["object", "category", "datetime64[ns]", "bool"])
corr = numeric_columns.corr(numeric_only=True)

# k=1 starts the mask one step above the main diagonal: the redundant upper
# triangle is hidden while the diagonal itself stays visible.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Diverging colour map centred on zero, with every cell annotated to two decimals.
heatmap_options = {
    "mask": mask,
    "cmap": "coolwarm",
    "center": 0,
    "square": True,
    "cbar_kws": {"shrink": 0.5},
    "annot": True,
    "annot_kws": {"fontsize": 10},
    "fmt": ".2f",
}

fig, ax = plt.subplots(figsize=(10, 10))
corr_plot = sns.heatmap(corr, ax=ax, **heatmap_options)