# Tweets (Sältzer)


In [None]:
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from studienarbeit.utils.plots import Plot

# Read variables from a .env file (if one is found) into the process environment.
load_dotenv()

In [None]:
# When True, the preprocessing cell reuses its cached result (if the cache file
# exists) instead of re-running the cleaning pipeline on the raw tweets.
FAST_MODE = False

# Directory that holds the raw tweet data and the cache files written by this notebook.
data_dir = Path("../../data/tweets")
# Project plotting helper used by the plotting cells.
plot = Plot()

## Business Understanding

---

Lorem


## Data Understanding

---


### Import


In [None]:
# Load only the columns this notebook works with, using pandas' nullable dtypes.
tweet_columns = ["screen_name", "created_at", "is_retweet", "text", "party", "birthyear", "gender"]
df = pd.read_parquet(data_dir / "tweets.parquet", columns=tweet_columns, use_nullable_dtypes=True)


In [None]:
# Strings that are treated as placeholders for a missing value.
MISSING_MARKERS = ("", "NA", "NA, NA", "NA, NA, NA, NA, NA, NA, NA, NA")


def _marker_to_none(value):
    """Return None for missing values and placeholder strings, otherwise the value unchanged."""
    # Already-missing values have to be handled first: comparing pd.NA with a
    # string yields pd.NA, and using that in a boolean context raises a TypeError.
    if pd.isna(value):
        return None
    return None if value in MISSING_MARKERS else value


for col in df.columns:
    df[col] = df[col].apply(_marker_to_none)


Check for missing values


In [None]:
# Number of missing values per column.
df.isna().sum()

In the cell above we can see that there are about 11k missing values in the `text` column. Regarding the `is_retweet` column, about 3k entries have missing values.

Next, we drop the rows in which either of these two values is missing.


In [None]:
# Keep only rows that have both a tweet text and a retweet flag.
df = df.dropna(subset=["text", "is_retweet"])

Next, we count the distinct values per column to see which columns hold categorical data.


In [None]:
# Number of distinct values per column; a low count points to a categorical column.
df.nunique()

In [None]:
# Target dtype for every column; applied to the frame with DataFrame.astype.
# NOTE(review): birthyear is cast to a timestamp - confirm that the raw values
# parse as dates and are not interpreted as integer offsets from the epoch.
convert_dict = {
    "screen_name": "category",
    "created_at": "datetime64[ns]",
    "is_retweet": "category",
    "text": "string",
    "party": "category",
    "birthyear": "datetime64[ns]",
    "gender": "category",
}


In [None]:
# Cast every column to the dtype declared for it in convert_dict.
df = df.astype(convert_dict)

In [None]:
# Column dtypes, non-null counts and the actual ("deep") memory footprint.
df.info(verbose=True, memory_usage="deep")

In [None]:
# Summary statistics for all columns; datetime columns are summarised like numeric ones.
df.describe(include="all", datetime_is_numeric=True)

In [None]:
# Preview the first rows of the typed frame.
df.head()

### Plotting


In [None]:
# Project plotting helper; judging by its name it shows the number of tweets per party (raw data).
plot.plot_party_count(df)

## Data Preparation

---


In [None]:
from pandarallel import pandarallel
from studienarbeit.utils.cleaning import CleanText
from textblob_de.sentiments import PatternAnalyzer

# Set up pandarallel, which provides the .parallel_apply calls used below; show a progress bar.
pandarallel.initialize(progress_bar=True, verbose=1)

### Cleaning


In [None]:
# Project text-cleaning helper used by prep_pipeline.
clean = CleanText()
# textblob-de PatternAnalyzer; used further down to compute the "sentiment_2" column.
sentiment = PatternAnalyzer()

In [None]:
def prep_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the tweets and derive the text columns used downstream.

    Steps: drop tweets labelled "Parteilos", merge CSU and CDU into "UNION",
    turn the retweet label into a boolean, build the ``clean_text`` and
    ``tokenized_text`` columns and count the words/tokens of each tweet.
    Sentiment scores are not computed here.

    Args:
        df: Tweets with at least the columns ``party``, ``is_retweet`` and
            ``text``; ``party`` must be categorical.

    Returns:
        A new DataFrame with the added columns ``clean_text``,
        ``tokenized_text``, ``word_count`` and ``token_count``.
    """
    # Remove tweets from parties that are not in the Bundestag.
    # The explicit copy ensures the column assignments below write to an
    # independent frame instead of a slice of the input (SettingWithCopyWarning).
    df = df[df["party"] != "Parteilos"].copy()

    # Group CDU and CSU as Union
    df["party"] = df["party"].replace("CSU", "UNION")
    df["party"] = df["party"].replace("CDU", "UNION")
    df["party"] = df["party"].cat.remove_unused_categories()

    # Fix labels for retweets
    df["is_retweet"] = df["is_retweet"].replace("FALSE", False)
    df["is_retweet"] = df["is_retweet"].replace("TRUE", True)
    df["is_retweet"] = df["is_retweet"].astype("bool")

    # Apply cleaning pipeline
    df["clean_text"] = df["text"].parallel_apply(lambda x: clean.clean_text(x)).astype("string")
    df["tokenized_text"] = (
        df["clean_text"].parallel_apply(lambda x: clean.remove_stopwords(clean.stemm_text(x))).astype("string")
    )

    # Count the number of words and tokens in the tweet.
    # tokenized_text is stored as a string, so len(x) would count characters;
    # split on whitespace to count tokens instead.
    # NOTE(review): assumes tokens are whitespace-separated - confirm against
    # CleanText.remove_stopwords.
    df["word_count"] = df["clean_text"].parallel_apply(lambda x: len(x.split())).astype("int16")
    df["token_count"] = df["tokenized_text"].parallel_apply(lambda x: len(x.split())).astype("int16")

    return df


Either load the cached data or process the raw tweets


In [None]:
prep_cache = data_dir / "cache/tweets_prep.parquet"

if FAST_MODE and prep_cache.exists():
    # Fast path: reuse the result of an earlier run.
    df_prep = pd.read_parquet(prep_cache)
else:
    df_prep = prep_pipeline(df.copy())

    # Create the cache directory if it is missing; exist_ok replaces the
    # separate existence check.
    prep_cache.parent.mkdir(parents=True, exist_ok=True)
    df_prep.to_parquet(prep_cache)


In [None]:
# NOTE(review): on a fresh top-to-bottom run df_prep has no sentiment columns
# at this point (they are added by the cells further down), so the file written
# here does not contain the sentiment data its name suggests. Confirm whether
# this save belongs after the sentiment cells.
(data_dir / "cache").mkdir(parents=True, exist_ok=True)
df_prep.to_parquet(data_dir / "cache/tweets_prep_w_sent.parquet")

In [None]:
# Read back the frame stored in tweets_prep_w_sent.parquet.
df_prep = pd.read_parquet(data_dir / "cache/tweets_prep_w_sent.parquet")

In [None]:
# Polarity of each cleaned tweet as reported by the PatternAnalyzer instance.
pattern_polarity = df_prep["clean_text"].parallel_apply(lambda text: sentiment.analyze(text).polarity)
df_prep["sentiment_2"] = pattern_polarity.astype("float32")

In [None]:
from germansentiment import SentimentModel
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Sentiment classifier from the germansentiment package; used below for "sentiment_3".
model = SentimentModel()

In [None]:
# Classify every cleaned tweet on its own and keep the single label returned for it.
sentiment_labels = df_prep["clean_text"].progress_apply(lambda text: model.predict_sentiment([text])[0])
df_prep["sentiment_3"] = sentiment_labels.astype("category")

In [None]:
import spacy
import numpy as np
from spacy_sentiws import spaCySentiWS

In [None]:
# German spaCy pipeline without the components listed in `exclude`,
# extended with the "sentiws" component configured with the lexicon path.
excluded_components = ["tagger", "parser", "senter", "ner"]
spacy_nlp_ger = spacy.load("de_core_news_md", exclude=excluded_components)
spacy_nlp_ger.add_pipe("sentiws", config={"sentiws_path": "../../data/sentiws/"})

In [None]:
def _mean_sentiws(text):
    """Return the mean SentiWS score of the tokens in ``text``, or NaN if no token has a score."""
    scores = [token._.sentiws for token in spacy_nlp_ger(text) if token._.sentiws is not None]
    # np.mean([]) also yields NaN, but emits a RuntimeWarning for every such tweet.
    return np.mean(scores) if scores else np.nan


df_prep["sentiment"] = df_prep["clean_text"].parallel_apply(_mean_sentiws).astype("float32")

In [None]:
# Summary statistics of the prepared frame, including the sentiment columns added above.
df_prep.describe(include="all", datetime_is_numeric=True)

### Filtering


Count the number of tweets with fewer than 5 words or fewer than 5 tokens


In [None]:
# Number of words in the cleaned text and of tokens in the tokenized text.
# tokenized_text holds strings, so len(x) alone would count characters;
# split on whitespace to count tokens instead.
# NOTE(review): assumes tokens are whitespace-separated - confirm against
# CleanText.remove_stopwords.
df_prep["word_count"] = df_prep["clean_text"].parallel_apply(lambda x: len(x.split())).astype("int16")
df_prep["token_count"] = df_prep["tokenized_text"].parallel_apply(lambda x: len(x.split())).astype("int16")

In [None]:
# Tweets that fall below the threshold in either of the two counts.
short_tweets = df_prep.loc[(df_prep["token_count"] < 5) | (df_prep["word_count"] < 5)]
print(f"Found {len(short_tweets)} tweets with less than 5 words...")

Count the number of tweets labeled as retweets

In [None]:
# Rows whose retweet flag is set.
retweets = df_prep.loc[df_prep["is_retweet"] == True]
print(f"The dataset contains {len(retweets)} retweets.")

In [None]:
def filtering_pipeline(df, min_count=5):
    """Drop very short tweets and retweets.

    Args:
        df: Frame with the columns ``token_count``, ``word_count`` and ``is_retweet``.
        min_count: Minimum number of tokens and of words a tweet needs to be kept.

    Returns:
        The rows of ``df`` that have at least ``min_count`` tokens and words
        and are not flagged as retweets.
    """
    # Build the mask from the frame that was passed in rather than from the
    # global df_prep, so the function works for any frame.
    long_enough = (df["token_count"] >= min_count) & (df["word_count"] >= min_count)
    df = df[long_enough]
    df = df[df["is_retweet"] == False]

    return df

In [None]:
# Run the filters on a copy and continue with the filtered frame.
df_prep = filtering_pipeline(df_prep.copy())

### Encoding


In [None]:
# Integer code per party; the position in the tuple is the code.
party_labels = ("AfD", "FDP", "DIE GRÜNEN", "DIE LINKE", "SPD", "UNION")
party_map = {label: code for code, label in enumerate(party_labels)}

encoded_party = df_prep["party"].map(party_map)
df_prep["party"] = encoded_party.astype("int8")

In [None]:
# Integer code per gender label.
gender_map = dict(male=0, female=1)

encoded_gender = df_prep["gender"].map(gender_map)
df_prep["gender"] = encoded_gender.astype("int8")

In [None]:
# Dtypes, non-null counts and memory footprint after filtering and encoding.
df_prep.info(verbose=True, memory_usage="deep")

### Plotting


In [None]:
# Same project plotting helper as for the raw data, now on the prepared frame.
plot.plot_party_count(df_prep)

In [None]:
# Project plotting helper, called for the PatternAnalyzer-based "sentiment_2" column.
plot.plot_sentiment(df_prep, "sentiment_2")

In [None]:
# Project plotting helper; presumably visualises the word_count column - confirm in Plot.
plot.plot_word_count(df_prep)

In [None]:
# Project plotting helper; presumably visualises the gender column - confirm in Plot.
plot.plot_gender(df_prep)

### Evaluation


In [None]:
# Preview the first rows of the prepared frame.
df_prep.head()

In [None]:
# Summary statistics of the prepared frame after filtering and encoding.
df_prep.describe(include="all", datetime_is_numeric=True)

## Modeling

---


In [None]:
# Keep only the two text variants and the integer-encoded party label for modeling.
df_modeling = df_prep[["clean_text", "tokenized_text", "party"]]

In [None]:
# Write the modeling frame to the cache directory.
df_modeling.to_parquet(data_dir / "cache/tweets_modeling.parquet")