# Tweets (Sältzer)

In [None]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import json
from studienarbeit.utils.cleaning import CleanText
from dotenv import load_dotenv
from pandarallel import pandarallel

# Load environment variables via python-dotenv before anything else needs them.
load_dotenv()

# Notebook-wide seaborn look: white background, muted palette, wide 20x8 figures.
sns.set(
    style="white",
    palette="muted",
    rc={"figure.figsize": (20, 8)},
)

# Initialise pandarallel; the preprocessing step relies on `parallel_apply`.
pandarallel.initialize()

In [None]:
# When True, the data-preparation step reuses the cached parquet file if it exists.
FAST_MODE = True

# Directory holding the raw tweets, relative to this notebook.
data_dir = Path("..") / ".." / "data" / "tweets"
# Shared line styling keyword arguments (red, semi-transparent, thick).
line_kws = dict(color="r", alpha=0.7, lw=5)

## Business Understanding

---

**TODO:** Describe the business question this notebook is meant to answer.

## Data Understanding

---

In [None]:
# Read only the columns used in this notebook and request pandas' nullable dtypes.
df = pd.read_parquet(
    data_dir / "tweets.parquet",
    columns=[
        "screen_name",
        "user_id",
        "created_at",
        "text",
        "is_retweet",
        "fullname",
        "faction",
        "name",
        "social",
        "economic",
        "hashtags",
        "party",
        "birthyear",
        "followers_count",
        "list",
        "gender",
    ],
    use_nullable_dtypes=True,
)

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
def _na_placeholder_to_none(value):
    """Map the textual "NA" placeholders of the raw data to None.

    Values that are already missing (None or pd.NA) are returned unchanged
    without being compared: `pd.NA == "NA"` evaluates to pd.NA, and using that
    result in an `or` chain raises "boolean value of NA is ambiguous".
    Every other value is returned as-is unless it equals one of the
    placeholder spellings below.
    """
    if value is None or value is pd.NA:
        return value
    is_placeholder = (
        value == "NA"
        or value == ["NA"]
        or value == "NA, NA"
        or value == "NA, NA, NA, NA, NA, NA, NA, NA"
    )
    return None if is_placeholder else value


# Normalise the placeholders in every column so that the missing-value checks
# further down see them as real missing values.
for col in df.columns:
    df[col] = df[col].apply(_na_placeholder_to_none)

In [None]:
# Split the comma-separated hashtag string into a list; missing entries are skipped.
df["hashtags"] = df["hashtags"].map(
    lambda raw_tags: raw_tags.split(", "),
    na_action="ignore",
)

Check for missing values

In [None]:
# Number of missing entries per column.
df.isnull().sum()

Drop rows with a missing `text` value, as well as tweets that have no `is_retweet` indicator.

In [None]:
# Keep only rows that have both a tweet text and a retweet flag.
# Reassigning (instead of inplace=True) avoids mutating the frame in place.
df = df.dropna(subset=["text", "is_retweet"])

Next, we check which columns represent categorical data, i.e. have at most 15 distinct values.

In [None]:
# Collect value counts for every column with at most 15 distinct values.
# `hashtags` is excluded from the scan (presumably because it holds lists
# after the split step - TODO confirm).
category_col = {}
for col in df.columns.drop("hashtags"):
    if df[col].nunique() > 15:
        continue
    category_col[col] = df[col].value_counts().to_dict()

# Optional: persist the counts for later inspection.
# with open("out/category_col_values.json", "w") as f:
#     json.dump(category_col, f)

print(category_col)

In [None]:
# Target dtype for each column, used to cast the whole frame in one `astype` call.
convert_dict = dict(
    screen_name="string",
    user_id="string",
    created_at="string",
    text="string",
    is_retweet="bool",
    fullname="string",
    faction="float32",
    name="string",
    social="float32",
    economic="float32",
    hashtags="string",
    followers_count="float32",
    party="category",
    birthyear="float32",
    list="string",
    gender="category",
)

In [None]:
# Cast every column to the dtype listed in `convert_dict`.
df = df.astype(dtype=convert_dict)

In [None]:
# Full column listing with the exact ("deep") memory footprint.
df.info(memory_usage="deep", verbose=True)

## Data Preparation

---

In [None]:
# Work on an independent copy so the typed frame `df` stays untouched.
df_prep = df.copy(deep=True)

Either load the cached data or process the raw tweets

In [None]:
# Location of the cached, fully preprocessed tweets.
cache_file = data_dir / "cache" / "tweets_prep.parquet"

if FAST_MODE and cache_file.exists():
    # Fast path: reuse the result of an earlier preprocessing run.
    df_prep = pd.read_parquet(cache_file)
else:
    clean = CleanText()

    # Step 1: clean the raw tweet text. The meaning of the second argument is
    # defined by CleanText.clean_text - TODO confirm.
    df_prep["clean_text"] = df_prep["text"].parallel_apply(
        lambda x: clean.clean_text(x, True)
    )
    # Step 2: stem the cleaned text, then remove stopwords.
    df_prep["tokenized_text"] = df_prep["clean_text"].parallel_apply(
        lambda x: clean.remove_stopwords(clean.stemm_text(x))
    )

    # Create the cache directory if needed. exist_ok=True replaces the
    # check-then-create sequence; parents=True covers a missing parent folder.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    df_prep.to_parquet(cache_file)

In [None]:
# Preview the first five rows of the prepared frame.
df_prep.head(n=5)

In [None]:
# Number of distinct screen names; a missing value counts as its own entry.
df_prep["screen_name"].nunique(dropna=False)

In [None]:
# Bar chart of the number of tweets per party.
sns.countplot(data=df_prep, x="party")