# Hourly Data Processing

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
from datetime import datetime
from glob import glob
from typing import Dict, List

import numpy as np
import pandas as pd

## About

Walkthrough of processing hourly data using in-memory `DataFrame`s for a single hour of streamed tweets.

## User Inputs

In [3]:
# Directory holding the processed hourly parquet files
processed_data_dir = "data/processed"
# Columns to read from each hourly file
use_cols = [
    "id",
    "source_text",
    "created_at",
    "user_name",
    "user_screen_name",
    "user_joined",
    "text",
]

# Wanted terms. A tweet is a candidate if it matches at least one term from
# any of the three lists below.
# - matched against the tweet text as-is (case-sensitive)
case_sensitive_tweet_search_terms = ["NASA", "ESA", "CSA", "Kepler"]
# - matched against the lowercased tweet text
tweet_search_terms = [
    "aeronautics",
    "webb",
    "goddard",
    "propulsion",
    # "telescope",  # picks up topics not related to space
    "exoplanet",
    # 'launch',
    # 'astronomy',  # picks up random astronomers
    # 'astrophysics',  # picks up random astrophysicists
    "laboratory",
    "jwst",
    "exploration",
    " mission",
    "spacecraft",
]
# - multi-word phrases; spaces are stripped from both the phrase and the
#   lowercased tweet text before matching, so spacing inside a phrase is
#   irrelevant
joined_tweet_search_terms = [
    "james webb",
    "webb space",
    "webb telescope",
    "jet propulsionlab",
    "canadian space agency",
    "european space agency",
    "national aeronautics",
    "shuttle launch",
    "space shuttle",
    "goddard space flightcenter",
    "johnson space center",
    "ames research center",
    "marshall space flightcenter",
    "glenn research center",
    "stephen hawking",
    "dark matter",
    "dark energy",
    "hubble space",
    "hubble telescope",
]

# Unwanted terms. These are joined with "|" and used as a substring pattern
# on the raw (not lowercased) tweet text, so every entry matches anywhere
# inside a word and only in the exact case written here.
crypto_terms = [
    "crypto",
    # "token",
    "koistarter",
    "daostarter",
    "decentralized",
    # "services",
    # "pancakeswap",
    # "eraxnft",
    # "browsing",
    # "kommunitas",
    # "hosting",
    # "internet",
    # "exipofficial",
    # "servers",
    # "wallet",
    # "liquidity",
    # "rewards",
    # "floki",
    # "10000000000000linkstelegram",
    "dogecoin",
    "czbinance",
    # "watch",
    "binance",
    "dogelonmars",
    "cryptocurrency",
    # "money",
    # "danheld",
    # "cybersecurity",
    "ethereum",
]
video_games_terms = [
    # "gamejoin",
    "arcade",
    "dreamcast",
    "sega",
    "xbox",
    "wii",
    "ps4",
]
# NOTE(review): "ethereum" and "pay someone" also appear in `crypto_terms` and
# `misc_unwanted_terms` respectively, so a matching tweet is counted under two
# categories in the per-category statistics - confirm this is intended.
non_english_terms = [
    "webuye",
    "bungoma",
    "ethereum",
    "pay someone",
    "seungkwan",
    "woozi",
    "hoshi",
    "kasama",
    "nung",
    "lahat",
    "jinsoul",
    "brunisoul",
    "loona",
    "taas",
]
misc_unwanted_terms = [
    "nft",
    "volcano detected",
    "block-2",
    "tanzanite",
    "vvsorigin",
    "gemstonecarat",
    "popescu",
    "breeding",
    "nairobi",
    "pay someone",
    "bitcrush",
    "homeworkpay",
    "homework",
    "photocards",
    "essay",
    # "hbomax",
]
religious_terms = [
    "scriptures",
    "methusealah",
    "testament",
    "yahweh",
    # NOTE(review): as a substring pattern "god" also matches lowercase
    # "goddard", which is one of the wanted terms above - confirm intended.
    "god",
    "mullah",
    "allah",
    "clergy",
    "mercy",
    "morality",
    "muslims",
    "hindus",
    "buddhist",
    "catholics",
    "christians",
    "atheist",
]
inappropriate_terms = [
    "prostitution",
    "musembe",
    "mo-greene",
    "running scared2012",
    "running scared 2012",
    "massacres",
    "eric ephriam chavez",
    "drugs",
    "bin laden",
    "saddam",
    "perished",
    "whore",
    "nasty",
    "nazist",
    "antifa",
    "proud boys",
]
# Minimum number of space-separated tokens a tweet must have to be kept
min_num_words_tweet = 10

# columns for extracting insights after making predictions
vcols = [
    "is_quote_status",
    "quote_count",
    "reply_count",
    # "retweet_count",
    "favorite_count",
    "favorited",
    "retweeted",
    "source_text",
    "user_followers",
    "user_friends",
    "user_favourites",
    "user_verified",
    "retweeted_tweet",
]

In [4]:
# Sort the matches: glob() returns paths in arbitrary, filesystem-dependent
# order, so without sorting `files_list[0]` may pick a different hourly file
# from one run (or machine) to the next.
files_list = sorted(glob(f"{processed_data_dir}/*.parquet.gzip"))

# Multi-word phrases with spaces removed, for matching against tweet text
# that has had its spaces removed as well
joined_tweet_search_terms_no_spaces = [
    t.replace(" ", "") for t in joined_tweet_search_terms
]
# Every unwanted term, across all categories, in a single list
unwanted_partial_strings_list = (
    crypto_terms
    + religious_terms
    + inappropriate_terms
    + video_games_terms
    + misc_unwanted_terms
    + non_english_terms
)

In [5]:
def drop_blank_tweets(df: pd.DataFrame, subset: List[str] = ["text"]) -> pd.DataFrame:
    """Remove rows that have a missing value in any of the ``subset`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Tweets to clean. It is not modified.
    subset : List[str]
        Columns checked for missing values (the tweet text by default).

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` with no missing value in ``subset``.
    """
    kept = df.dropna(subset=subset)
    # Report how many rows were discarded
    print(f"Dropped {len(df) - len(kept):,} tweets from raw data")
    return kept


def get_raw_masks(
    df: pd.DataFrame,
    tweet_search_terms_list: List[str],
    case_sensitive_tweet_search_terms_list: List[str],
    joined_tweet_search_terms_no_spaces_list: List[str],
) -> List[pd.Series]:
    """Get masks for tweets with types of wanted terms in text.

    Each list of terms is joined with ``|`` and used as a regular-expression
    pattern, so a row is flagged when its text contains at least one term.

    Parameters
    ----------
    df : pd.DataFrame
        Tweets, with the tweet text in the ``text`` column.
    tweet_search_terms_list : List[str]
        Terms matched against the lowercased text.
    case_sensitive_tweet_search_terms_list : List[str]
        Terms matched against the text as-is.
    joined_tweet_search_terms_no_spaces_list : List[str]
        Terms matched against the lowercased text with all spaces removed.

    Returns
    -------
    List[pd.Series]
        ``[lowercase_mask, case_mask, joined_case_mask]``, each a boolean
        ``Series`` indexed like ``df``.
    """

    def _contains_any(text: pd.Series, terms: List[str]) -> pd.Series:
        """Flag the rows of ``text`` that match at least one of ``terms``."""
        if not terms:
            # An empty list would produce the pattern "", which matches every
            # row; no terms must mean no matches.
            return pd.Series(False, index=text.index, name=text.name, dtype=bool)
        # na=False keeps the mask strictly boolean if a text value is missing
        return text.str.contains("|".join(terms), regex=True, na=False)

    text_lower = df["text"].str.lower()
    lowercase_mask = _contains_any(text_lower, tweet_search_terms_list)
    case_mask = _contains_any(df["text"], case_sensitive_tweet_search_terms_list)
    joined_case_mask = _contains_any(
        # literal replacement; does not depend on the pandas default for `regex`
        text_lower.str.replace(" ", "", regex=False),
        joined_tweet_search_terms_no_spaces_list,
    )
    print("Created masks to filter raw data based on wanted text in tweets")
    return [lowercase_mask, case_mask, joined_case_mask]


def add_search_term_boolean_columns(
    df: pd.DataFrame,
    lowercase_mask: pd.Series,
    case_mask: pd.Series,
    joined_case_mask: pd.Series,
    crypto_terms_list: List[str],
    religious_terms_list: List[str],
    inappropriate_terms_list: List[str],
    video_games_terms_list: List[str],
    misc_unwanted_terms_list: List[str],
    non_english_terms_list: List[str],
) -> pd.DataFrame:
    """Add boolean columns based on presence of wanted and unwanted terms in tweet text.

    The three wanted-term masks are stored as-is. For every unwanted category
    the terms are joined with ``|`` and matched as a pattern against the raw
    ``text`` column (the text is not lowercased here, so the match is
    case-sensitive). The share of rows flagged in each ``*_terms`` column is
    printed as a percentage.

    Parameters
    ----------
    df : pd.DataFrame
        Tweets, with the tweet text in the ``text`` column.
    lowercase_mask, case_mask, joined_case_mask : pd.Series
        Wanted-term masks, stored in ``contains_wanted_text``,
        ``contains_wanted_text_case_sensitive`` and
        ``contains_multi_word_wanted_text`` respectively.
    crypto_terms_list, religious_terms_list, inappropriate_terms_list,
    video_games_terms_list, misc_unwanted_terms_list, non_english_terms_list : List[str]
        Unwanted terms per category; each produces a
        ``contains_<category>_terms`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the nine ``contains_*`` columns appended.
    """
    text = df["text"]

    def _contains_any(terms: List[str]) -> pd.Series:
        """Flag the rows whose text matches at least one of ``terms``."""
        if not terms:
            # An empty list would produce the pattern "", which matches every
            # row; no terms must mean no matches.
            return pd.Series(False, index=text.index, name=text.name, dtype=bool)
        # na=False keeps the column strictly boolean if a text value is missing
        return text.str.contains("|".join(terms), na=False)

    # Output column -> terms of that category (insertion order = column order)
    unwanted_terms_by_column = {
        "contains_crypto_terms": crypto_terms_list,
        "contains_religious_terms": religious_terms_list,
        "contains_inappropriate_terms": inappropriate_terms_list,
        "contains_video_games_terms": video_games_terms_list,
        "contains_misc_unwanted_terms": misc_unwanted_terms_list,
        "contains_non_english_terms": non_english_terms_list,
    }
    df = df.assign(
        contains_wanted_text=lowercase_mask,
        contains_wanted_text_case_sensitive=case_mask,
        contains_multi_word_wanted_text=joined_case_mask,
        **{
            column: _contains_any(terms)
            for column, terms in unwanted_terms_by_column.items()
        },
    )
    print("Created boolean columns to indicate presence of unwanted terms in tweets")

    num_rows = len(df)
    terms_str = []
    pcts_total = []
    for c in df.columns[df.columns.str.endswith("_terms")]:
        # Guard against dividing by zero when there are no tweets at all
        pct_of_total = (df[c].sum() / num_rows) * 100 if num_rows else 0.0
        term_type = c.replace("contains_", "").replace("_terms", "")
        terms_str.append(f"{term_type}={pct_of_total:.3f}")
        pcts_total.append(pct_of_total)
    # NOTE: the total is the sum of the per-category percentages, so a tweet
    # flagged in several categories is counted once per category.
    term_str_full = " | ".join(terms_str) + f" | total unwanted={sum(pcts_total):.3f}"
    print(term_str_full)
    return df


def apply_masks(
    df: pd.DataFrame,
    case_mask: pd.Series,
    lowercase_mask: pd.Series,
    joined_case_mask: pd.Series,
    unwanted_partial_strings_list: List[str],
) -> pd.DataFrame:
    """Apply masks for only keeping tweets based on wanted terms in text.

    A tweet is kept when at least one of the three wanted-term masks is true
    and its raw text matches none of the unwanted terms. The unwanted terms
    are joined with ``|`` into a single pattern and matched against the text
    as-is (the text is not lowercased here, so the match is case-sensitive).

    Parameters
    ----------
    df : pd.DataFrame
        Tweets, with the tweet text in the ``text`` column.
    case_mask, lowercase_mask, joined_case_mask : pd.Series
        Boolean masks indexed like ``df``; they are combined with OR.
    unwanted_partial_strings_list : List[str]
        Terms that disqualify a tweet. If empty, no tweet is disqualified.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that pass both filters.
    """
    df = df.loc[case_mask | lowercase_mask | joined_case_mask]
    # An empty list would produce the pattern "", which matches every row and
    # would drop every tweet; with nothing to exclude, skip this step.
    if unwanted_partial_strings_list:
        unwanted_mask = df["text"].str.contains(
            "|".join(unwanted_partial_strings_list), na=False
        )
        df = df.loc[~unwanted_mask]
    print(f"Kept {len(df):,} tweets after filtering raw data with masks")
    return df


def filter_by_num_words_in_tweet(
    df: pd.DataFrame, min_num_tweet_words_wanted: int
) -> pd.DataFrame:
    """Keep tweets whose text has enough space-separated tokens.

    The text is split on single spaces, so the token count is only an
    approximation of the number of words (consecutive spaces produce empty
    tokens, which are counted too).

    Parameters
    ----------
    df : pd.DataFrame
        Tweets, with the tweet text in the ``text`` column.
    min_num_tweet_words_wanted : int
        Smallest token count for which a tweet is kept.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` with at least ``min_num_tweet_words_wanted`` tokens.
    """
    tokens_per_tweet = df["text"].str.split(" ").str.len()
    df_long_enough = df.loc[tokens_per_tweet >= min_num_tweet_words_wanted]
    print(
        f"Kept {len(df_long_enough):,} tweets with more than "
        f"approximately {min_num_tweet_words_wanted:,} words per tweet"
    )
    return df_long_enough


def filter_tweets_based_on_content(
    df_raw: pd.DataFrame,
    tweet_search_terms: List[str],
    case_sensitive_tweet_search_terms: List[str],
    joined_tweet_search_terms_no_spaces: List[str],
    crypto_terms: List[str],
    religious_terms: List[str],
    inappropriate_terms: List[str],
    video_games_terms: List[str],
    misc_unwanted_terms: List[str],
    non_english_terms: List[str],
    min_num_words_tweet: int,
) -> pd.DataFrame:
    """Filter tweets based on terms in text and approximate number in words.

    Steps, in order:

    1. drop tweets with missing text
    2. build the three wanted-term masks
    3. add the ``contains_*`` boolean columns
    4. keep tweets that match a wanted term and no unwanted term
    5. keep tweets with at least ``min_num_words_tweet`` space-separated words

    Start time, end time and duration are printed.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw tweets, with the tweet text in the ``text`` column.
    tweet_search_terms : List[str]
        Wanted terms matched against the lowercased text.
    case_sensitive_tweet_search_terms : List[str]
        Wanted terms matched against the text as-is.
    joined_tweet_search_terms_no_spaces : List[str]
        Wanted terms matched against the lowercased text with spaces removed.
    crypto_terms, religious_terms, inappropriate_terms, video_games_terms,
    misc_unwanted_terms, non_english_terms : List[str]
        Unwanted terms per category. Together they form the list of terms
        that disqualify a tweet.
    min_num_words_tweet : int
        Minimum number of space-separated words for a tweet to be kept.

    Returns
    -------
    pd.DataFrame
        The filtered tweets, with the ``contains_*`` columns appended.
    """
    start = datetime.now()
    print(
        "Filtering Tweets - Starting time = "
        f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    # Build the exclusion list from the arguments rather than reading a
    # notebook-level variable, so the tweets that are dropped always agree
    # with the per-category terms this function was called with.
    unwanted_terms = [
        *crypto_terms,
        *religious_terms,
        *inappropriate_terms,
        *video_games_terms,
        *misc_unwanted_terms,
        *non_english_terms,
    ]
    df_raw = df_raw.pipe(drop_blank_tweets, subset=["text"])
    lowercase_mask, case_mask, joined_case_mask = get_raw_masks(
        df_raw,
        tweet_search_terms,
        case_sensitive_tweet_search_terms,
        joined_tweet_search_terms_no_spaces,
    )
    df = (
        df_raw.pipe(
            add_search_term_boolean_columns,
            lowercase_mask=lowercase_mask,
            case_mask=case_mask,
            joined_case_mask=joined_case_mask,
            crypto_terms_list=crypto_terms,
            religious_terms_list=religious_terms,
            inappropriate_terms_list=inappropriate_terms,
            video_games_terms_list=video_games_terms,
            misc_unwanted_terms_list=misc_unwanted_terms,
            non_english_terms_list=non_english_terms,
        )
        .pipe(
            apply_masks,
            case_mask=case_mask,
            lowercase_mask=lowercase_mask,
            joined_case_mask=joined_case_mask,
            unwanted_partial_strings_list=unwanted_terms,
        )
        .pipe(
            filter_by_num_words_in_tweet, min_num_tweet_words_wanted=min_num_words_tweet
        )
    )
    end = datetime.now()
    duration = (end - start).total_seconds()
    print(
        "Done filtering at "
        f"{end.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} ({duration:.3f} seconds)."
    )
    return df

## Get Raw Hourly Data

In [6]:
%%time
# Read the first file in `files_list`, loading only the columns in `use_cols`
df_raw = pd.read_parquet(files_list[0], columns=use_cols)
# Report the size of the raw data
num_rows, num_cols = df_raw.shape
print(f"Raw Data contains {num_rows:,} rows and {num_cols:,} columns")

Raw Data contains 2,055 rows and 7 columns
CPU times: user 18.4 ms, sys: 10 ms, total: 28.4 ms
Wall time: 19.2 ms


## Process Data

Filter tweets based on wanted and unwanted terms found in the tweet text, then drop tweets with fewer than `min_num_words_tweet` space-separated words.

In [7]:
# Run the full content filter on the raw tweets, using the term lists and
# minimum word count defined in the User Inputs section
df = df_raw.pipe(
    filter_tweets_based_on_content,
    tweet_search_terms=tweet_search_terms,
    case_sensitive_tweet_search_terms=case_sensitive_tweet_search_terms,
    joined_tweet_search_terms_no_spaces=joined_tweet_search_terms_no_spaces,
    crypto_terms=crypto_terms,
    religious_terms=religious_terms,
    inappropriate_terms=inappropriate_terms,
    video_games_terms=video_games_terms,
    misc_unwanted_terms=misc_unwanted_terms,
    non_english_terms=non_english_terms,
    min_num_words_tweet=min_num_words_tweet,
)

Filtering Tweets - Starting time = 2022-08-17 13:40:29.434...
Dropped 0 tweets from raw data
Created masks to filter raw data based on wanted text in tweets
Created boolean columns to indicate presence of unwanted terms in tweets
crypto=0.438 | religious=0.146 | inappropriate=0.292 | video_games=0.097 | misc_unwanted=0.097 | non_english=0.146 | total unwanted=1.217
Kept 525 tweets after filtering raw data with masks
Kept 489 tweets with more than approximately 10 words per tweet
Done filtering at 2022-08-17 13:40:29.515 (0.081 seconds).


In [8]:
%%time
# One-row summary: number of missing values per column in `use_cols`, plus
# the number of rows in the filtered data
display(
    df[use_cols]
    .isna()
    .sum()
    .to_frame()
    .T.add_suffix("__nans")
    .assign(num_rows=len(df))
)
# Per-column summary for all columns: number of missing values and dtype
display(
    df.isna()
    .sum()
    .rename("nans")
    .to_frame()
    .merge(
        df.dtypes.rename("dtype").to_frame(),
        left_index=True,
        right_index=True,
        how="left",
    )
)
# The filtered tweets, limited to the columns in `use_cols`
display(df[use_cols])

Unnamed: 0,id__nans,source_text__nans,created_at__nans,user_name__nans,user_screen_name__nans,user_joined__nans,text__nans,num_rows
0,0,0,0,0,0,0,0,489


Unnamed: 0,nans,dtype
id,0,object
source_text,0,object
created_at,0,"datetime64[ns, UTC]"
user_name,0,object
user_screen_name,0,object
user_joined,0,"datetime64[ns, UTC]"
text,0,object
contains_wanted_text,0,bool
contains_wanted_text_case_sensitive,0,bool
contains_multi_word_wanted_text,0,bool


Unnamed: 0,id,source_text,created_at,user_name,user_screen_name,user_joined,text
7,1476608009669201922,Twitter Web App,2021-12-30 17:35:58+00:00,Radio Justice 📻🎙⚖,justiceputnam,2009-07-14 05:10:36+00:00,"NASA: It wasn't a strike, it was just a work s..."
9,1476608015965040643,Buckshee Forum,2021-12-30 17:35:59+00:00,The Buckshee,BucksheeForum,2017-11-03 12:41:31+00:00,Webb telescope is captured soaring through sp...
13,1476608024521453573,Twitter for iPhone,2021-12-30 17:36:01+00:00,Fabricio F. Costa,ffalconi,2009-03-06 05:49:14+00:00,NASA just dropped an exciting update about the...
16,1476608030330724357,Twitter for Android,2021-12-30 17:36:03+00:00,Dr. James O'Donoghue,physicsJ,2010-12-22 22:01:32+00:00,Great footage from camera of NASA/ESA/CSA J...
18,1476608036873682952,Twitter for iPhone,2021-12-30 17:36:04+00:00,Beyond Blue Aerospace,beyondblueaero,2013-11-03 02:35:32+00:00,Photo of James Webb before he transformed into...
...,...,...,...,...,...,...,...
2026,1476611740083261445,Twitter for iPhone,2021-12-30 17:50:47+00:00,Lord,hizir06013769,2020-10-21 14:15:27+00:00,"We've got a plan for keeping our resolution, t..."
2034,1476611757061910533,Twitter Web App,2021-12-30 17:50:51+00:00,Turgut Guliyev,Turgut_Guliyev,2015-03-29 07:00:31+00:00,"We've got a plan for keeping our resolution, t..."
2035,1476611758529777675,Twitter for Android,2021-12-30 17:50:52+00:00,Anti-Wife Equation,Ren_Chandler4,2017-01-31 04:36:10+00:00,Today is the 49th anniversary of the Skylab st...
2050,1476611783007866883,Twitter Web App,2021-12-30 17:50:57+00:00,SSP | Snormanda,SnormandaSSB,2015-04-19 16:42:26+00:00,You just explore the different planets in the ...


CPU times: user 25.4 ms, sys: 1.03 ms, total: 26.4 ms
Wall time: 23.8 ms


In [9]:
# Show how many filtered tweets are True/False in each `contains_*` column
for c in df.columns[df.columns.str.startswith("contains_")]:
    display(df[c].value_counts().to_frame())

Unnamed: 0,contains_wanted_text
True,283
False,206


Unnamed: 0,contains_wanted_text_case_sensitive
True,300
False,189


Unnamed: 0,contains_multi_word_wanted_text
False,358
True,131


Unnamed: 0,contains_crypto_terms
False,489


Unnamed: 0,contains_religious_terms
False,489


Unnamed: 0,contains_inappropriate_terms
False,489


Unnamed: 0,contains_video_games_terms
False,489


Unnamed: 0,contains_misc_unwanted_terms
False,489


Unnamed: 0,contains_non_english_terms
False,489
