# Tweets (Sältzer)


In [None]:
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from studienarbeit.utils.plots import Plots

load_dotenv()

In [None]:
# When True, the data-preparation step loads the cached result (if the cache file
# exists) instead of re-running the cleaning pipeline.
FAST_MODE = True

# Directory holding the tweet data, relative to the notebook's working directory.
data_dir = Path("../../data/tweets")
# Project plotting helper used in the "Plotting" section.
plot = Plots()

## Business Understanding

---

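**TODO:** Describe the business goal. This notebook loads tweets together with the author's party, gender and birth year, cleans them, and exports the cleaned text and party label for modeling.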


## Data Understanding

---


### Import


In [None]:
# Load only the columns that are used in this notebook.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 (superseded by
# `dtype_backend`) — confirm the pinned pandas version before upgrading.
df = pd.read_parquet(
    data_dir / "tweets.parquet",
    columns=["screen_name", "created_at", "is_retweet", "text", "party", "birthyear", "gender"],
    use_nullable_dtypes=True,
)

In [None]:
# Placeholder strings that stand for "no value" in the raw data.
NA_PLACEHOLDERS = ["", "NA", "NA, NA", "NA, NA, NA, NA, NA, NA, NA, NA"]

# Turn the placeholders into real missing values, column by column.
# `isin` is vectorized and reports False for entries that are already missing; an
# element-wise `x == ""` would evaluate to `pd.NA` for such entries and raise
# "boolean value of NA is ambiguous" as soon as it is used in an `or` chain.
for col in df.columns:
    # Work on an object-dtype copy so the missing marker is a plain NaN/None-like value
    # in every column; the final dtypes are assigned explicitly via `astype` afterwards.
    values = df[col].astype(object)
    df[col] = values.mask(values.isin(NA_PLACEHOLDERS))

Check for missing values


In [None]:
# Number of missing values per column.
df.isna().sum()

In the cell above we can see that there are about 11k missing values in the `text` column. Regarding the `is_retweet` column, about 3k entries have missing values.

Next, we drop the rows that have a missing value in `text` or `is_retweet`.


In [None]:
# Drop every row that lacks the tweet text or the retweet flag.
df = df.dropna(subset=["text", "is_retweet"])

Next, we look at the number of unique values per column to see which columns represent categorical data.


In [None]:
# Number of distinct values per column.
df.nunique()

Remove duplicated rows (some tweets seem to have been scraped twice on different days).

In [None]:
# Rows count as duplicates when all listed columns match; `created_at` is not part of
# the key, so rows differing only in that column collapse to one (the last is kept).
# NOTE(review): presumably `created_at` differs between repeated scrapes — confirm.
df = df.drop_duplicates(subset=["screen_name", "is_retweet", "text", "party", "birthyear", "gender"], keep="last")

In [None]:
# Number of rows per party.
df["party"].value_counts()

In [None]:
# Target dtype for every column; applied to the frame with `astype`.
convert_dict = {
    "screen_name": "category",
    "created_at": "datetime64[ns]",
    # Kept as a category here; converted to bool inside the preparation pipeline.
    "is_retweet": "category",
    "text": "string[pyarrow]",
    "party": "category",
    # NOTE(review): the birth year is stored as a timestamp — presumably a bare year
    # parses to January 1st of that year; confirm against the raw values.
    "birthyear": "datetime64[ns]",
    "gender": "category",
}

In [None]:
# Apply the target dtypes defined in `convert_dict`.
df = df.astype(convert_dict)

In [None]:
# Column dtypes, non-null counts and the deep (actual) memory footprint.
df.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — this call requires pandas 1.x.
df.describe(include="all", datetime_is_numeric=True)

In [None]:
# Preview the first rows.
df.head()

## Data Preparation

---


In [None]:
from tqdm import tqdm
from pandarallel import pandarallel
from collections import Counter
import itertools
from nltk import ngrams
from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.sentiment import Sentiment

# Register `progress_apply` on pandas objects (used for the sentiment and n-gram steps).
tqdm.pandas()
# Set up pandarallel so that `parallel_apply` is available (used by the cleaning pipeline).
pandarallel.initialize(progress_bar=True, verbose=1)

In [None]:
# Project helpers for text cleaning and sentiment prediction.
clean = Cleaning()
sentiment = Sentiment()

# Location of the cached output of the preparation pipeline.
cache_file = data_dir / "cache/tweets_prep.parquet"

# Integer codes for the party label. CDU and CSU are merged into "UNION" by the
# preparation pipeline before this mapping is applied.
party_encoding = {
    "AfD": 0,
    "FDP": 1,
    "DIE GRÜNEN": 2,
    "DIE LINKE": 3,
    "SPD": 4,
    "UNION": 5,
}
# Integer codes for the gender label.
gender_encoding = {
    "male": 0,
    "female": 1,
}

### Cleaning


In [None]:
def _count_tokens(tokens) -> int:
    """Return the number of tokens in a token sequence or a whitespace-separated token string."""
    if isinstance(tokens, str):
        return len(tokens.split())
    return len(tokens)


def prep_pipeline(df: pd.DataFrame, min_word_count: int = 5) -> pd.DataFrame:
    """Clean, filter and enrich the raw tweets.

    Steps: merge CDU/CSU into "UNION", convert the retweet flag to bool, drop retweets
    and tweets of "Parteilos" authors, integer-encode party and gender, clean and
    tokenize the text, drop tweets with fewer than ``min_word_count`` words, and
    predict a sentiment label.

    Relies on the notebook-level objects ``clean``, ``sentiment``, ``party_encoding``
    and ``gender_encoding`` and on ``parallel_apply`` / ``progress_apply`` having been
    registered (pandarallel / tqdm).

    Args:
        df: Tweets with at least the columns ``party`` (categorical), ``is_retweet``,
            ``gender`` and ``text``. The frame is not modified.
        min_word_count: Minimum number of whitespace-separated words in the cleaned
            text for a tweet to be kept.

    Returns:
        A new frame with the additional columns ``clean_text``, ``tokenized_text``,
        ``word_count``, ``token_count`` and ``sentiment``. The index is not reset.
    """
    # Work on a copy so that the caller's frame is never mutated.
    df = df.copy()

    # Group CDU and CSU as Union
    df["party"] = df["party"].replace("CSU", "UNION")
    df["party"] = df["party"].replace("CDU", "UNION")
    df["party"] = df["party"].cat.remove_unused_categories()

    # Fix labels for retweets
    df["is_retweet"] = df["is_retweet"].replace("FALSE", False)
    df["is_retweet"] = df["is_retweet"].replace("TRUE", True)
    df["is_retweet"] = df["is_retweet"].astype("bool")

    # Remove tweets from parties that are not in the Bundestag and/or retweets
    print(f"The dataset contains {df['is_retweet'].sum()} retweets.")
    # `.copy()` makes the filtered frame independent, so the column assignments below
    # do not trigger a SettingWithCopyWarning.
    df = df.loc[(df["party"] != "Parteilos") & ~df["is_retweet"]].copy()

    # Encode party and gender
    df["party"] = df["party"].map(party_encoding).astype("int8")
    df["gender"] = df["gender"].map(gender_encoding).astype("int8")

    # Apply cleaning pipeline
    df["clean_text"] = df["text"].parallel_apply(lambda x: clean.clean_text(x)).astype("string[pyarrow]")
    # Keep the raw tokenizer output around: it is needed for the token count, whereas
    # the stored column is cast to a string.
    tokens = df["clean_text"].parallel_apply(lambda x: clean.remove_stopwords(clean.stemm_text(x)))
    df["tokenized_text"] = tokens.astype("string[pyarrow]")

    # Count the number of words and tokens in the tweet. The token count is taken from
    # the tokenizer output; `len()` of the string column would count characters.
    df["word_count"] = df["clean_text"].parallel_apply(lambda x: len(x.split())).astype("int16")
    df["token_count"] = tokens.map(_count_tokens).astype("int16")

    # Filter out tweets that are too short
    too_short = df["word_count"] < min_word_count
    print(f"Found {too_short.sum()} tweets with less than {min_word_count} words...")
    df = df.loc[~too_short].copy()

    # Calculate the sentiment of the tweets
    df["sentiment"] = df["clean_text"].progress_apply(sentiment.predict_sentiment).astype("category")

    return df

Either load the cached data or process the raw tweets


In [None]:
# Number of raw tweets that are run through the (slow) preparation pipeline.
SAMPLE_SIZE = 10_000

if FAST_MODE and cache_file.exists():
    df_prep = pd.read_parquet(cache_file)
else:
    # Cap the sample size so that a frame with fewer rows than SAMPLE_SIZE does not
    # make `sample` raise; the fixed seed keeps the sample reproducible.
    df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    df_prep = prep_pipeline(df_sample.copy()).reset_index(drop=True)

    # Create the cache directory (including missing parents); no error if it exists.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    df_prep.to_parquet(cache_file)

Check the 20 most common trigrams in the raw tweet text


In [None]:
# One trigram generator per tweet, built from the whitespace-split raw text.
trigrams_per_tweet = df_prep["text"].str.split().progress_apply(lambda words: ngrams(words, 3))
# Flatten all tweets into a single list of trigrams and show the 20 most frequent ones.
all_trigrams = list(itertools.chain.from_iterable(trigrams_per_tweet))
Counter(all_trigrams).most_common(20)

In [None]:
# Column dtypes, non-null counts and the deep (actual) memory footprint of the prepared data.
df_prep.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns of the prepared data.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 — this call requires pandas 1.x.
df_prep.describe(include="all", datetime_is_numeric=True)

In [None]:
# Preview the first ten prepared rows.
df_prep.head(10)

### Plotting


In [None]:
# Project plotting helper; presumably shows the number of tweets per party — TODO confirm.
plot.party_count(df_prep)

In [None]:
# Project plotting helper; presumably visualizes the predicted `sentiment` column — TODO confirm.
plot.sentiment(df_prep)

In [None]:
# Project plotting helper; presumably visualizes the `word_count` column — TODO confirm.
plot.word_count(df_prep)

In [None]:
# Project plotting helper; presumably visualizes the (encoded) `gender` column — TODO confirm.
plot.gender(df_prep)

In [None]:
# Project plotting helper; presumably shows tweet counts per user (`screen_name`) — TODO confirm.
plot.user_count(df_prep)

## Modeling

---


In [None]:
# Keep only the cleaned text, the tokenized text and the encoded party label.
df_modeling = df_prep[["clean_text", "tokenized_text", "party"]]

In [None]:
# Persist the modeling frame next to the preparation cache.
# NOTE: `to_parquet` does not create directories, so the `cache` directory must already exist here.
df_modeling.to_parquet(data_dir / "cache/tweets_modeling.parquet")