# Tweets (Sältzer)


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from loguru import logger

from studienarbeit.config import gender_encoding, party_encoding
from studienarbeit.utils.load import EDataTypes, Load
from studienarbeit.utils.plots import Plots

# Load environment variables from a local .env file, if one exists (python-dotenv)
load_dotenv()


In [None]:
# FAST_MODE = True : reuse the cached prepared dataframe when it exists; if the data
#                    has to be processed, the sentiment analysis is skipped.
# FAST_MODE = False: always process the data, including the sentiment analysis.
FAST_MODE = True
# Number of tweets to sample before processing; None processes all tweets.
SAMPLE_SIZE = 10_000

file_name = "prep_tweets_fast_full.parquet"
data_type = EDataTypes.TWEETS
data_dir = Path("../../data") / data_type.value

# Project helpers for reading/writing dataframes and for the standard plots
load = Load(data_type=data_type)
plot = Plots(document_type="Tweets")

## Data Understanding

---


### Import


In [None]:
# Restrict the load to the columns this notebook actually works with
base_columns = ["screen_name", "created_at", "is_retweet", "text", "party", "birthyear", "gender"]
df_base = load.load_dataframe("tweets.parquet", columns=base_columns)

In [None]:
# (rows, columns) of the raw frame as loaded
df_base.shape

In [None]:
# Strings that are treated as "no value" in every column
NA_PLACEHOLDERS = ("", "NA", "NA, NA", "NA, NA, NA, NA, NA, NA, NA, NA")


def _placeholder_to_none(value):
    """Return None for a placeholder string, otherwise the value unchanged."""
    return None if value in NA_PLACEHOLDERS else value


# Map the placeholders to None so that isna()/dropna() recognise them as missing
for col in df_base.columns:
    df_base[col] = df_base[col].apply(_placeholder_to_none)

Check for missing values


In [None]:
# Missing values per column (placeholder strings were mapped to None in the previous cell)
df_base.isna().sum()

In the cell above we can see that there are about 11k missing values in the `text` column. Regarding the `is_retweet` column, about 3k entries have missing values.

Next, we drop the rows in which `text` or `is_retweet` is missing.


In [None]:
# Keep only rows that have both a tweet text and a retweet flag; other columns may still be missing
df_base = df_base.dropna(subset=["text", "is_retweet"])

Next, we look at the number of unique values per column to see which columns represent categorical data.


In [None]:
# Distinct values per column; columns with few distinct values are candidates for the category dtype
df_base.nunique()

Remove duplicated rows (some tweets seem to have been scraped twice on different days)


In [None]:
# `created_at` is not part of the key, so rows that differ only in their
# timestamp count as duplicates; the last occurrence is kept.
duplicate_key = ["screen_name", "is_retweet", "text", "party", "birthyear", "gender"]
df_base = df_base.drop_duplicates(subset=duplicate_key, keep="last")

In [None]:
# Number of tweets per party after de-duplication
df_base["party"].value_counts()

In [None]:
# Number of distinct accounts (screen_name) per gender
df_base.groupby("gender")["screen_name"].nunique()

In [None]:
# Target dtype for every column of the raw frame (passed to DataFrame.astype)
convert_dict = dict(
    screen_name="category",
    created_at="datetime64[ns]",
    is_retweet="category",
    text="string[pyarrow]",
    party="category",
    birthyear="datetime64[ns]",
    gender="category",
)

In [None]:
# Cast every column to the dtype declared in convert_dict
df_base = df_base.astype(convert_dict)

In [None]:
# Dtypes and non-null counts; memory_usage="deep" measures the real size of string/object data
df_base.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns, with datetime columns summarised numerically.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — drop the keyword when upgrading pandas.
df_base.describe(include="all", datetime_is_numeric=True)

In [None]:
# Preview the first rows of the typed frame
df_base.head()

## Data Preparation

---


In [None]:
import itertools
from collections import Counter

from nltk import ngrams
from tqdm import tqdm

from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.sentiment import Sentiment

# Register `progress_apply` on pandas objects so long-running applies show a progress bar
tqdm.pandas()


In [None]:
# Project helpers used by prep_pipeline: text cleaning/lemmatisation/filtering and sentiment prediction
clean = Cleaning()
sentiment = Sentiment()

In [None]:
# Try the cleaning pipeline on one raw example tweet containing a mention, hashtags
# and an escaped code point (<U+...>) to inspect its output
clean.pipeline(
    "Ehemalige @AfD-Vorsitzende #Petry muss wegen Meineid vor Gericht. Kein Einzelfall: gegen circa 10% aller AfD-Abgeordneten bundesweit laufen oder liefen Strafverfahren. Kriminelle Asylbewerber? Fehlanzeige. Kriminelle AfD-Hetzer trifft den Nagel eher auf den Kopf <U+0001F602> #AfD"
)

### Cleaning


In [None]:
def prep_pipeline(df: pd.DataFrame, min_word_count: int = 5) -> pd.DataFrame:
    """Prepare the raw tweets for the downstream analysis.

    Steps, in order: merge CDU/CSU into ``UNION``, convert the retweet flag to
    ``bool``, drop retweets and tweets of ``Parteilos`` authors, integer-encode
    party and gender, derive the cleaned / lemmatised / filtered texts together
    with their word and character counts, and drop tweets whose filtered text
    has fewer than ``min_word_count`` words. The sentiment is only predicted
    when the module-level ``FAST_MODE`` flag is off.

    Args:
        df: Raw tweets with at least the columns ``party`` (categorical),
            ``is_retweet``, ``text`` and ``gender``. The ``party`` and
            ``is_retweet`` columns of this frame are overwritten in place, so
            pass a copy if the original must stay untouched.
        min_word_count: Minimum number of words in ``filter_text`` a tweet
            needs in order to be kept.

    Returns:
        A new dataframe holding the remaining tweets and the derived columns.
    """
    if FAST_MODE:
        logger.info("Fast mode is enabled, skipping sentiment analysis...")

    # Group CDU and CSU as Union
    for sister_party in ("CSU", "CDU"):
        df["party"] = df["party"].replace(sister_party, "UNION")
    df["party"] = df["party"].cat.remove_unused_categories()

    # Fix labels for retweets (string labels "TRUE"/"FALSE" -> real booleans)
    df["is_retweet"] = df["is_retweet"].replace("FALSE", False)
    df["is_retweet"] = df["is_retweet"].replace("TRUE", True)
    df["is_retweet"] = df["is_retweet"].astype("bool")

    # A tweet counts as a retweet if it is flagged as one or its text starts with "RT"
    retweet_mask = df["is_retweet"] | df["text"].str.startswith("RT")
    print(f"The dataset contains {int(retweet_mask.sum())} retweets...")

    # Remove tweets from parties that are not in the Bundestag and/or retweets.
    # .copy() detaches the result from the input frame, so the column
    # assignments below never operate on a slice (SettingWithCopyWarning).
    df = df.loc[(df["party"] != "Parteilos") & ~retweet_mask].copy()

    # Encode party and gender
    df["party"] = df["party"].map(party_encoding).astype("int8")
    df["gender"] = df["gender"].map(gender_encoding).astype("int8")

    # Apply cleaning pipeline; each stage builds on the output of the previous one
    df["clean_text"] = df["text"].progress_apply(clean.clean_text).astype("string[pyarrow]")
    df["lemma_text"] = df["clean_text"].progress_apply(clean.lemma_text).astype("string[pyarrow]")
    df["filter_text"] = df["lemma_text"].progress_apply(clean.filter_text).astype("string[pyarrow]")

    # Count the number of words and characters in the tweet
    df["clean_word_count"] = df["clean_text"].progress_apply(lambda x: len(x.split())).astype("int16")
    df["clean_symbol_count"] = df["clean_text"].progress_apply(len).astype("int16")
    df["filter_word_count"] = df["filter_text"].progress_apply(lambda x: len(x.split())).astype("int16")
    df["filter_symbol_count"] = df["filter_text"].progress_apply(len).astype("int16")

    # Filter out tweets that are too short
    too_short = df["filter_word_count"] < min_word_count
    print(f"Found {int(too_short.sum())} tweets with less than {min_word_count} words...")
    # Copy again so that adding the sentiment column below does not write into a slice
    df = df.loc[~too_short].copy()

    # Calculate the sentiment of the tweets
    if not FAST_MODE:
        df["sentiment"] = df["clean_text"].progress_apply(sentiment.predict_sentiment).astype("category")

    return df

Either load the cached data or process the raw tweets


In [None]:
# Build the cache file name from the run configuration so that runs with a
# different configuration do not share (and silently overwrite) a cache file.
suffix = []

if FAST_MODE:
    suffix.append("fast")

# Sample sizes are bucketed; all sizes within one bucket share a cache file.
if not SAMPLE_SIZE:
    # None (or 0) processes the whole dataset, see the sampling logic below
    suffix.append("full")
elif SAMPLE_SIZE < 25000:
    suffix.append("sm")
elif SAMPLE_SIZE < 50000:
    suffix.append("md")
elif SAMPLE_SIZE < 100000:
    suffix.append("lg")
else:
    # Without this bucket, samples of 100k+ tweets would get no size suffix at all
    suffix.append("xl")

file_path = f"prep_tweets_{'_'.join(suffix)}.parquet"

# The cache is only read in fast mode; otherwise the data is always reprocessed
if FAST_MODE and load.check_file_exists(file_path):
    df_prep = load.load_dataframe(file_path)
else:
    if SAMPLE_SIZE:
        # Clamp the sample size: DataFrame.sample raises if more rows are
        # requested than are available (sampling without replacement)
        n_sample = min(SAMPLE_SIZE, len(df_base))
        df_input = df_base.sample(n_sample, random_state=42).copy()
    else:
        df_input = df_base.copy()

    # prep_pipeline overwrites columns of its input, hence the copies above
    df_prep = prep_pipeline(df_input)
    load.save_dataframe(df_prep, file_path)


Check for n-grams


In [None]:
# Count every bigram of the whitespace-tokenised filtered texts and show the 50 most frequent
bigram_counts = Counter(
    bigram for tokens in df_prep["filter_text"].str.split() for bigram in ngrams(tokens, 2)
)
bigram_counts.most_common(50)

In [None]:
# Dtypes, non-null counts and deep memory usage of the prepared frame
df_prep.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns of the prepared frame.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — drop the keyword when upgrading pandas.
df_prep.describe(include="all", datetime_is_numeric=True)

In [None]:
# Preview the first 10 prepared tweets
df_prep.head(10)

### Plotting


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Invert the encoding (code -> party name) so the plots are labelled with party names
party_decoding = {code: name for name, code in party_encoding.items()}

# Work on a copy so the encoded df_prep stays untouched
df_plot = df_prep.copy()
df_plot["party"] = df_plot["party"].map(party_decoding)


In [None]:
# Project plot helper; presumably shows the number of tweets per party — see Plots.party_count
plot.party_count(df_plot)

In [None]:
# prep_pipeline only adds the `sentiment` column when FAST_MODE is off, so guard the plot
if "sentiment" in df_plot.columns:
    plot.sentiment(df_plot)

In [None]:
# Project plot helper; presumably visualises the word counts of the tweets — see Plots.word_count
plot.word_count(df_plot)

In [None]:
# Project plot helper; presumably visualises the gender distribution — see Plots.gender
plot.gender(df_plot)

In [None]:
# Project plot helper; presumably visualises the number of accounts — see Plots.user_count
plot.user_count(df_plot)

In [None]:
# Pairwise correlations between the numeric columns of the prepared frame
non_numeric_dtypes = ["object", "category", "datetime64[ns]", "bool"]
corr = df_prep.select_dtypes(exclude=non_numeric_dtypes).corr(numeric_only=True)

# Hide the strict upper triangle (k=1 keeps the diagonal visible); the matrix
# is symmetric, so the lower triangle already carries all the information.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = {
    "mask": mask,
    "cmap": "coolwarm",
    "center": 0,
    "square": True,
    "cbar_kws": {"shrink": 0.5},
    "annot": True,
    "annot_kws": {"fontsize": 10},
    "fmt": ".2f",
}
corr_plot = sns.heatmap(corr, ax=ax, **heatmap_options)