# Preprocessing the Tweets

In this notebook we will preprocess the tweets. We will remove the stopwords, the punctuation and the links. We will also lemmatize the words.

For the initial data understanding, see [01_understanding.ipynb](01_understanding.ipynb). For the visual exploration of the cleaned data, see [03_plotting.ipynb](03_plotting.ipynb).


## Config and Imports

In [None]:
import itertools
import json
from collections import Counter
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from loguru import logger
from nltk import ngrams
from tqdm import tqdm

from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.load import EDataTypes, Load
from studienarbeit.utils.sentiment import Sentiment


# Read environment variables from a local .env file (python-dotenv)
load_dotenv()
# Register tqdm with pandas so that `Series.progress_apply` is available below
tqdm.pandas()

In [None]:
# --- Switches -------------------------------------------------------------
# True: reuse the cached, preprocessed dataframe if one exists; otherwise process the data
FAST_MODE = True
# True: run the sentiment analysis as part of the preprocessing
SENTIMENT_ANALYSIS = True
# Number of tweets to sample to limit the processing effort; None processes all data
SAMPLE_SIZE = None

# --- Input data -----------------------------------------------------------
file_name = "tweets_understanding.parquet"
data_type = EDataTypes.TWEETS
data_dir = Path("../../data")

# --- Label encodings (JSON files in the data directory) -------------------
with (data_dir / "gender_encoding.json").open(mode="r", encoding="utf-8") as f:
    gender_encoding = json.load(f)
with (data_dir / "party_encoding.json").open(mode="r", encoding="utf-8") as f:
    party_encoding = json.load(f)

# --- Helper objects used throughout the notebook --------------------------
load = Load(data_type=data_type)
clean = Cleaning()
sentiment = Sentiment()

In [None]:
# Load the input dataframe configured above (`file_name`) through the project loader
df = load.load_dataframe(file_name)

## Clean the Data


In [None]:
def prep_pipeline(df: pd.DataFrame, min_word_count: int = 5) -> pd.DataFrame:
    """Filter, encode and clean the tweets and derive length (and optionally sentiment) features.

    Steps, in order:
      1. Merge the parties "CSU" and "CDU" into "UNION".
      2. Normalise ``is_retweet`` to a boolean column.
      3. Drop retweets (flagged or starting with "RT") and tweets of party "Parteilos".
      4. Encode ``party`` and ``gender`` with ``party_encoding`` / ``gender_encoding``.
      5. Add ``clean_text``, ``lemma_text`` and ``filter_text`` via the ``clean`` helper.
      6. Add word and symbol counts for the initial, clean and filtered text.
      7. Drop tweets whose filtered text has fewer than ``min_word_count`` words.
      8. If ``SENTIMENT_ANALYSIS`` is set, add a ``sentiment`` column.

    Note:
        The ``party`` and ``is_retweet`` columns of the frame passed in are
        overwritten before the first filter, so pass a copy if the original
        frame is still needed.

    Args:
        df: Tweets with at least the columns ``party`` (categorical),
            ``is_retweet``, ``text`` and ``gender``.
        min_word_count: Minimum number of words the filtered text must have
            for a tweet to be kept.

    Returns:
        A new dataframe with the filtered rows and the added feature columns.
    """
    # Sentiment analysis is controlled by SENTIMENT_ANALYSIS only (not by FAST_MODE)
    if not SENTIMENT_ANALYSIS:
        logger.info("Sentiment analysis is disabled, skipping sentiment analysis...")

    # Group CDU and CSU as Union
    df["party"] = df["party"].replace("CSU", "UNION")
    df["party"] = df["party"].replace("CDU", "UNION")
    df["party"] = df["party"].cat.remove_unused_categories()

    # Fix labels for retweets
    df["is_retweet"] = df["is_retweet"].replace("FALSE", False)
    df["is_retweet"] = df["is_retweet"].replace("TRUE", True)
    df["is_retweet"] = df["is_retweet"].astype("bool")

    # Remove tweets from parties that are not in the Bundestag and/or retweets.
    # The mask is built once and reused; na=False keeps a missing text from producing a NaN mask.
    retweet_mask = df["is_retweet"] | df["text"].str.startswith("RT", na=False)
    print(f"The dataset contains {int(retweet_mask.sum())} retweets...")
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning)
    df = df.loc[(df["party"] != "Parteilos") & ~retweet_mask].copy()

    # Encode party and gender
    df["party"] = df["party"].map(party_encoding).astype("int8")
    df["gender"] = df["gender"].map(gender_encoding).astype("int8")

    # Apply cleaning pipeline
    df["clean_text"] = df["text"].progress_apply(clean.clean_text).astype("string[pyarrow]")
    df["lemma_text"] = df["clean_text"].progress_apply(clean.lemma_text).astype("string[pyarrow]")
    df["filter_text"] = df["lemma_text"].progress_apply(clean.filter_text).astype("string[pyarrow]")

    # Count the number of words and symbols of the initial, clean and filtered text
    for prefix, column in (("init", "text"), ("clean", "clean_text"), ("filter", "filter_text")):
        df[f"{prefix}_word_count"] = df[column].progress_apply(lambda x: len(x.split())).astype("int16")
        df[f"{prefix}_symbol_count"] = df[column].progress_apply(len).astype("int16")

    # Filter out tweets that are too short
    long_enough = df["filter_word_count"] >= min_word_count
    print(f"Found {int((~long_enough).sum())} tweets with less than {min_word_count} words...")
    df = df.loc[long_enough].copy()

    # Calculate the sentiment of the tweets
    if SENTIMENT_ANALYSIS:
        df["sentiment"] = df["clean_text"].progress_apply(sentiment.predict_sentiment).astype("category")

    return df

Either load the cached, preprocessed data or process the raw tweets. The name of the cache file is derived from `SENTIMENT_ANALYSIS` and `SAMPLE_SIZE`.


In [None]:
# Build the cache file name from the active options so that differently
# configured runs do not share a cache file.
suffix = []

if SENTIMENT_ANALYSIS:
    suffix.append("sent")

if SAMPLE_SIZE is None:
    suffix.append("full")
elif SAMPLE_SIZE <= 25000:
    suffix.append("sm")
elif SAMPLE_SIZE <= 50000:
    suffix.append("md")
elif SAMPLE_SIZE <= 100000:
    suffix.append("lg")
else:
    # Samples above 100000 used to get no size suffix at all, which produced
    # ambiguous names such as "prep_tweets_sent.parquet" or "prep_tweets_.parquet"
    suffix.append("xl")

file_path = f"prep_tweets_{'_'.join(suffix)}.parquet"

if FAST_MODE and load.check_file_exists(file_path):
    df_prep = load.load_dataframe(file_path)
else:
    if SAMPLE_SIZE is None:
        df_input = df.copy()
    else:
        # Same None test as for the suffix above; cap the sample at the number
        # of available rows because DataFrame.sample raises for larger values
        df_input = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42).copy()
    df_prep = prep_pipeline(df_input)
    load.save_dataframe(df_prep, file_path)

In [None]:
# For test purposes
# TODO: Remove this after rerunning the dataframes
df_prep["init_word_count"] = df_prep["text"].progress_apply(lambda x: len(x.split())).astype("int16")
df_prep["init_symbol_count"] = df_prep["text"].progress_apply(lambda x: len(x)).astype("int16")
df_prep["screen_name"] = df_prep["screen_name"].astype("string[pyarrow]")
# Encode the party only while the column does not consist of codes yet.
# prep_pipeline already returns integer codes, and mapping those through the
# name -> code dict again (e.g. when this cell is re-run) would turn every
# value into NaN.
if not df_prep["party"].isin(list(party_encoding.values())).all():
    df_prep["party"] = df_prep["party"].map(party_encoding)

In [None]:
# prep_pipeline stores the party as an integer code, so the string replacements
# below would never match. Translate codes back to party names first; values
# that are not codes (already names, or missing) are passed through unchanged,
# which also keeps this cell safe to re-run.
party_decoding = {code: name for name, code in party_encoding.items()}
df_prep["party"] = (
    df_prep["party"]
    .map(lambda value: party_decoding.get(value, value))
    .replace("DIE GRÜNEN", "Grüne")
    .replace("DIE LINKE", "Linke")
    .replace("UNION", "Union")
)

## Analyse the Data

Below is an example tweet that we run through the data cleaning pipeline to illustrate its effect.

In [None]:
# Example tweet in its raw form, passed through the cleaning pipeline
example_tweet = "Ehemalige @AfD-Vorsitzende #Petry muss wegen Meineid vor Gericht. Kein Einzelfall: gegen circa 10% aller AfD-Abgeordneten bundesweit laufen oder liefen Strafverfahren. Kriminelle Asylbewerber? Fehlanzeige. Kriminelle AfD-Hetzer trifft den Nagel eher auf den Kopf <U+0001F602> #AfD"
clean.pipeline(example_tweet)

In [None]:
# Show the shape of the preprocessed dataframe as (rows, columns)
df_prep.shape

In [None]:
# Count the number of unique users (distinct values in the `screen_name` column)
df_prep["screen_name"].nunique()

In [None]:
# Number of distinct politicians (screen names) for each party
screen_names_by_party = df_prep.groupby("party")["screen_name"]
screen_names_by_party.nunique()

In [None]:
# Tweets per individual politician, then summary statistics of those counts for each party
tweets_per_politician = df_prep.groupby("party")["screen_name"].value_counts()
tweets_per_politician.groupby("party").describe()

In [None]:
# Count the number of rows (tweets) per party
df_prep["party"].value_counts()

In [None]:
# Check for n-grams
Counter(
    list(itertools.chain.from_iterable(df_prep["filter_text"].str.split().apply(lambda x: list(ngrams(x, 3)))))
).most_common(50)

In [None]:
# Per-column overview of the dataframe; memory_usage="deep" requests the memory figure based on the actual contents of the columns
df_prep.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns
try:
    df_summary = df_prep.describe(include="all", datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed `datetime_is_numeric` (datetime columns are always
    # summarised as numeric there), so the argument is dropped
    df_summary = df_prep.describe(include="all")
df_summary