# Tweets (Sältzer)

In [None]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import json

# Location of the tweet data relative to this notebook.
# NOTE(review): the cells visible here use hard-coded "tweets/..." paths and
# never reference data_dir — confirm which location is intended.
data_dir = Path("../../data/tweets")

# Global seaborn theme: white background, muted palette, 20x8 inch figures.
sns.set(
    style="white",
    palette="muted",
    rc={"figure.figsize": (20, 8)},
)

# Line styling (red, semi-transparent, 5pt wide); presumably meant to be passed
# as `line_kws` to seaborn plots — not used in the cells visible here.
line_kws = {"color": "r", "alpha": 0.7, "lw": 5}

## Business Understanding

---

Lorem

## Data Understanding

---

In [None]:
# Read the raw tweet table from its Feather file into memory.

df = pd.read_feather(path="tweets/tweets.feather")

### Split Dataframe

In [None]:
# Dump the first rows of the raw frame to a text file for manual inspection.
# The display limits are lifted so that no row or column is elided.

with open("tweets/tweets_head.txt", "w") as head_file:
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        head_file.write(str(df.head()))

In [None]:
# Dump the first rows and the column dtypes of the frame to text files for
# manual inspection, with the display limits lifted so nothing is elided.
# NOTE(review): the file names say "tweets_core", but the data written here
# comes from `df` — confirm whether the filtered frame was intended.

with open("tweets/tweets_core_head.txt", "w") as head_file:
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        head_file.write(str(df.head()))

with open("tweets/tweets_core_types.txt", "w") as types_file:
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        types_file.write(str(df.dtypes))

In [None]:
# Identifier columns that are prepended to every topic-specific sub-frame
# so each split can still be related back to its tweet and author.

meta_data = [
    "screen_name",
    "user_id",
    "status_id",
]

In [None]:
# Read the names of the Abstimmungen (votes) columns, one name per line.

with open("tweets/tweets_filter_abstimmungen.txt", "r") as filter_file:
    file_content = filter_file.read()
    lines_abstimmung = file_content.splitlines()

In [None]:
# Select the identifier columns plus the Abstimmungen columns and pickle
# the resulting sub-frame.

abstimmungen_columns = meta_data + lines_abstimmung
tweets_abstimmungen = df[abstimmungen_columns]

tweets_abstimmungen.to_pickle(Path("tweets/tweets_abstimmungen.pkl"))

In [None]:
# Read the names of the Ausschuss (committee) columns, one name per line.

with open("tweets/tweets_filter_ausschuss.txt", "r") as filter_file:
    file_content = filter_file.read()
    lines_ausschuss = file_content.splitlines()

In [None]:
# Select the identifier columns plus the Ausschuss columns and pickle the
# resulting sub-frame.

ausschuss_columns = meta_data + lines_ausschuss
tweets_ausschuss = df[ausschuss_columns]

tweets_ausschuss.to_pickle(Path("tweets/tweets_ausschuss.pkl"))

In [None]:
# Read the names of the meta-data columns, one name per line.

with open("tweets/tweets_filter_meta_data.txt", "r") as filter_file:
    file_content = filter_file.read()
    lines_meta_data = file_content.splitlines()

In [None]:
# Select the identifier columns plus the meta-data columns and pickle the
# resulting sub-frame.

meta_data_columns = meta_data + lines_meta_data
tweets_meta_data = df[meta_data_columns]

tweets_meta_data.to_pickle(Path("tweets/tweets_meta_data.pkl"))

In [None]:
# Read the names of the columns listed as containing only empty ("NA")
# values, one name per line.

with open("tweets/empty_columns.txt", "r") as empty_file:
    file_content = empty_file.read()
    empty_columns = file_content.splitlines()

In [None]:
# Build the core frame by removing every column that was split off into a
# topic-specific frame or is listed as empty, then pickle it.
# `columns=` already selects the column axis, so no `axis` argument is needed.

columns_to_drop = lines_abstimmung + lines_ausschuss + empty_columns + lines_meta_data
tweets_core = df.drop(columns=columns_to_drop)

tweets_core.to_pickle(Path("tweets/tweets_core.pkl"))

In [None]:
# Record the distinct values of every column that has at most 20 of them
# (None for columns with more), and save the mapping as JSON so columns
# holding only placeholder values can be spotted by inspection.

unique_values = {}
for col in tweets_core:
    # Compute the distinct values once per column instead of twice.
    distinct = tweets_core[col].unique()
    # .tolist() converts NumPy scalars (e.g. int64, bool_) into native Python
    # objects; list(...) would keep them and json.dump would raise TypeError.
    unique_values[col] = distinct.tolist() if len(distinct) <= 20 else None

with open("tweets/tweets_unique_values.json", "w") as f:
    json.dump(unique_values, f)

### Replace and Convert Values

In [None]:
# Replace the textual "NA" placeholders (single, pair and eight-fold forms)
# with None in every column of the core frame; other cells are kept as-is.

na_markers = ("NA", "NA, NA", "NA, NA, NA, NA, NA, NA, NA, NA")

for column_name in tweets_core.columns:
    tweets_core[column_name] = tweets_core[column_name].apply(
        lambda cell: None if cell in na_markers else cell
    )

In [None]:
# Target dtype for each column of the core frame, passed to DataFrame.astype.
# Commented-out entries are columns that are deliberately left unconverted.
# The nullable pandas extension dtypes (Int8Dtype, Int64Dtype, ...) are used
# for numeric columns so that missing values survive the conversion.
#
# NOTE(review): astype(str) renders a missing value (None) as the literal
# string "None", and astype(bool) maps None to False and any non-empty string
# (including "FALSE") to True — confirm this is acceptable for the columns
# below, or consider pd.StringDtype() / pd.BooleanDtype().
convert_dict = {
    "screen_name": str,
    "user_id": str,
    "status_id": str,
    "created_at": str,  # TODO: Convert to datetime
    "text": str,
    "is_retweet": bool,
    "lang": str,
    "fullname": str,
    "faction": pd.Int8Dtype(),
    "name": str,
    "social": pd.Int8Dtype(),
    "economic": pd.Int8Dtype(),
    # "source": str,
    "reply_to_status_id": pd.Int64Dtype(),
    "reply_to_user_id": pd.Int64Dtype(),
    "reply_to_screen_name": str,
    # "is_quote": bool,
    "favorite_count": pd.Int16Dtype(),
    "retweet_count": pd.Int16Dtype(),
    "hashtags": str,  # TODO: Convert to list
    # "urls_url": str,
    # "urls_t.co": str,
    # "urls_expanded_url": str,
    # "media_url": str,
    # "media_t.co": str,
    # "media_expanded_url": str,
    # "media_type": str,
    # "ext_media_url": str,
    # "ext_media_t.co": str,
    # "ext_media_expanded_url": str,
    "mentions_user_id": str,  # TODO: Convert to list
    "retweet_status_id": pd.Int64Dtype(),
    "retweet_text": str,
    "retweet_created_at": str,  # TODO: Convert to datetime
    # "retweet_source": str,
    "retweet_favorite_count": pd.Int32Dtype(),
    "retweet_retweet_count": pd.Int16Dtype(),
    # "place_url": str,
    "place_name": str,
    "place_full_name": str,
    "place_type": str,
    "country": str,
    "country_code": str,
    # "geo_coords": list[str], # TODO: Convert to list
    # "coords_coords": list[str], # TODO: Convert to list
    # "bbox_coords": list[str], # TODO: Convert to list
    # "status_url": str,
    "name.1": str,
    "location": str,
    # The original listed "description" twice; a dict keeps only one entry per
    # key, so the redundant duplicate has been removed.
    "description": str,
    # "protected": bool,
    "followers_count": pd.Int32Dtype(),
    "friends_count": pd.Int16Dtype(),
    "listed_count": pd.Int16Dtype(),
    "statuses_count": pd.Int32Dtype(),
    "favourites_count": pd.Int32Dtype(),
    "account_created_at": str,  # TODO: Convert to datetime
    # "verified": bool,
    # "profile_url": str,
    # "profile_expanded_url": str,
    # "profile_banner_url": str,
    # "profile_background_url": str,
    # "profile_image_url": str,
    # "dups": bool,
    # "edited": str, # TODO: Convert to datetime
    "parliament": str,
    "party": str,
    "id": str,
    "lastname": str,
    "firstname": str,
    "birthyear": pd.Int16Dtype(),
    "list": str,
    "position": pd.Int8Dtype(),
    "won": bool,
    "district": str,
    "won.1": bool,
    "number": pd.Int16Dtype(),
    "result": pd.Float64Dtype(),
    "profession": str,
    "gender": str,
    "education": str,
    # "min": bool,
    # "partyoffice": bool,
}

In [None]:
# Work on an independent (deep) copy so the untyped core frame stays intact.
tweets_core_typed = tweets_core.copy(deep=True)

In [None]:
# Cast each column named in convert_dict to its target dtype; columns not
# listed in the mapping keep their current dtype.
tweets_core_typed = tweets_core_typed.astype(dtype=convert_dict)

In [None]:
# Show the dtype of every column of the typed frame, with the display limits
# lifted so the listing is not truncated.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    typed_dtypes = tweets_core_typed.dtypes
    print(typed_dtypes)

In [None]:
# Collect the distinct values of every typed column, rendered as strings,
# and print them so the result of the conversion can be inspected.

unique_values = {}
for column_name, column_data in tweets_core_typed.items():
    as_text = column_data.astype(str)
    unique_values[column_name] = as_text.unique()

print(unique_values)

In [None]:
# Persist the typed core frame as a pickle file.
typed_pickle_path = Path("tweets/tweets_core_typed.pkl")
tweets_core_typed.to_pickle(typed_pickle_path)