# Data Filtering

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import zipfile
from datetime import datetime
from glob import glob
from typing import Dict, List, Union

import boto3
import pandas as pd

In [3]:
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [4]:
%aimport file_utils
from file_utils import create_zip_file

%aimport s3_utils
from s3_utils import (
    download_file_from_s3,
    extract_zip_file,
    upload_file_to_s3,
)

%aimport text_filter_utils
from text_filter_utils import filter_files_per_hour

## About

As part of this hourly data processing, we will now
- filter out tweets with unwanted text in the tweet, where these texts include terms related to the following topics
  - cryptocurrency (see the `crypto_terms` list below)
  - video games (`video_game_terms`)
  - religion (`religious_terms`)
  - non-English words (`non_english_terms`)
  - miscellaneous unwanted words (`misc_unwanted_terms`)
  - inappropriate terms (profanity, etc. in `inappropriate_terms`)
- only keep tweets that contain terms specific to the subject of this project and these terms are defined below in the following Python lists
  - `tweet_search_terms`
  - `case_sensitive_tweet_search_terms`
  - `joined_tweet_search_terms`

  which were built up using the sampled dataset
  - created using `3-combine-data.ipynb` and then manually labeled
  - filtered in `4_0_create_filter_data_list.ipynb` to pick up as many tweets as possible

**Pre-Requisites**
1. In order to access data stored on Amazon S3, this data preparation notebook must be provided with the following environment variables
   - `AWS_S3_BUCKET_NAME`

## User Inputs

In [5]:
# raw data on S3
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
processed_data_dir = "../data/processed"  # (S3) "../data/processed" or (locally) "datasets/twitter/kinesis-demo/processed"
proc_zip_fname = "combined_data.zip"
filtered_zip_fname = "processed_data.zip"

# keep tweets with text containing terms
case_sensitive_tweet_search_terms = [
    # "NASA",  # picks up topics not related to the mission
    # "ESA",  # picks up topics not related to the mission
    # "CSA",  # picks up topics not related to the mission
    # "Kepler",  # picks up topics not related to the mission
]
tweet_search_terms = [
    # "aeronautics",
    "webb",
    # "goddard",  # picks up topics not related to the mission
    # "propulsion",  # picks up topics not related to the mission
    # "telescope",  # picks up topics not related to the mission
    # "exoplanet",  # picks up topics not related to the mission
    # 'launch',  # picks up topics not related to space exploration
    # 'astronomy',  # picks up random astronomers unrelated to the mission
    # 'astrophysics',  # picks up random astrophysicists unrelated to the mission
    # "laboratory",  # picks up random astronomers unrelated to the mission
    "jwst",
    "livestream",
    "congratulations",
    "congrats",
    "unfolding",
    "tightening",
    "tensioning",
    "sunshield",
    "sunshade",
    "l2",
    # "exploration",  # picks up topics not related to space exploration
    # " mission",  # picks up topics not related to space exploration
    # "spacecraft",  # picks up topics not related to the mission
]
joined_tweet_search_terms = [
    "james webb",
    # "space telescope",
    "webb space",
    "webb telescope",
    "mirror deploy",
    "space telescope",
    "telescope launch",
    "new telescope",
    "live coverage",
    "live stream",
    "heat shield",
    "primary mirror",
    "secondary mirror",
    # "jet propulsion lab",
    # "canadian space agency",
    # "european space agency",
    # "national aeronautics",
    # "shuttle launch",
    # "space shuttle",
    # "goddard space flight center",
    # "johnson space center",
    # "ames research center",
    # "marshall space flight center",
    # "glenn research center",
    # "ball aerospace",
    # "harris corporation",
    # "space telescope science institute",
    # "billochs",
    # "johndurning",
    # "johnmather",
    # "jonathangardner",
    # "northrop grumman",
    # "lockheed martin",
    # "stephen hawking",  # picks up topics not related to the mission
    # "dark matter",  # picks up topics not related to the mission
    # "dark energy",  # picks up topics not related to the mission
    # "hubble space",  # picks up topics not related to the mission
    # "hubble telescope",  # picks up topics not related to the mission
]
# remove tweets with text containing terms
crypto_terms = [
    "crypto",
    # "token",
    "koistarter",
    "daostarter",
    "decentralized",
    # "services",
    # "pancakeswap",
    # "eraxnft",
    # "browsing",
    # "kommunitas",
    # "hosting",
    # "internet",
    # "exipofficial",
    # "servers",
    # "wallet",
    # "liquidity",
    # "rewards",
    # "floki",
    # "10000000000000linkstelegram",
    "dogecoin",
    "czbinance",
    # "watch",
    "binance",
    "dogelonmars",
    "cryptocurrency",
    # "money",
    # "danheld",
    # "cybersecurity",
    "ethereum",
    "bitcrush",
    "vvsorigin",
]
video_games_terms = [
    # "gamejoin",
    "arcade",
    "dreamcast",
    "sega",
    "xbox",
    "wii",
    "ps4",
]
non_english_terms = [
    "webuye",
    "bungoma",
    "ethereum",
    "pay someone",
    "seungkwan",
    "woozi",
    "hoshi",
    "kasama",
    "nung",
    "lahat",
    "jinsoul",
    "brunisoul",
    "loona",
    "taas",
    "nung",
]
misc_unwanted_terms = [
    "nft",
    "volcano detected",
    "block-2",
    "tanzanite",
    "gemstonecarat",
    "popescu",
    "breeding",
    "nairobi",
    "pay someone",
    "homeworkpay",
    "homework",
    "photocards",
    "essay",
    # "hbomax",
]
religious_terms = [
    "scriptures",
    "methusealah",
    "testament",
    "yahweh",
    "god",
    "mullah",
    "allah",
    "clergy",
    "mercy",
    "morality",
    "muslims,",
    "hindus",
    "buddhist",
    "catholics",
    "christians",
    "atheist",
]
inappropriate_terms = [
    "prostitution",
    "musembe",
    "mo-greene",
    "running scared2012",
    "running scared 2012",
    "massacres",
    "eric ephriam chavez",
    "drugs",
    "bin laden",
    "saddam",
    "perished",
    "whore",
    "nasty",
    "nazist",
    "antifa",
    "proud boys",
]
min_num_words_tweet = 3

upload_to_s3 = True
cleanup_local_files = True

# columns for EDA
vcols = [
    "is_quote_status",
    "quote_count",
    "reply_count",
    # "retweet_count",
    "favorite_count",
    "favorited",
    "retweeted",
    "source_text",
    "user_followers",
    "user_friends",
    "user_favourites",
    "user_verified",
    "retweeted_tweet",
]

In [6]:
joined_tweet_search_terms_no_spaces = [
    t.replace(" ", "") for t in joined_tweet_search_terms
]
unwanted_partial_strings_list = (
    crypto_terms
    + religious_terms
    + inappropriate_terms
    + video_games_terms
    + misc_unwanted_terms
    + non_english_terms
)
dtypes_dict = {
    "id": pd.StringDtype(),
    "geo": pd.StringDtype(),
    "coordinates": pd.StringDtype(),
    "place": pd.StringDtype(),
    "contributors": pd.StringDtype(),  # pd.BooleanDtype(),
    "is_quote_status": pd.StringDtype(),  # pd.BooleanDtype(),
    "quote_count": pd.Int32Dtype(),
    "reply_count": pd.Int32Dtype(),
    "retweet_count": pd.Int32Dtype(),
    "favorite_count": pd.Int32Dtype(),
    "favorited": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted": pd.StringDtype(),  # pd.BooleanDtype(),
    "source": pd.StringDtype(),
    "in_reply_to_user_id": pd.StringDtype(),
    "in_reply_to_screen_name": pd.StringDtype(),
    "source_text": pd.StringDtype(),
    "place_id": pd.StringDtype(),
    "place_url": pd.StringDtype(),
    "place_place_type": pd.StringDtype(),
    "place_name": pd.StringDtype(),
    "place_full_name": pd.StringDtype(),
    "place_country_code": pd.StringDtype(),
    "place_country": pd.StringDtype(),
    "place_bounding_box_type": pd.StringDtype(),
    "place_bounding_box_coordinates": pd.StringDtype(),
    "place_attributes": pd.StringDtype(),
    "coords_type": pd.StringDtype(),
    "coords_lon": pd.StringDtype(),
    "coords_lat": pd.StringDtype(),
    "geo_type": pd.StringDtype(),
    "geo_lon": pd.StringDtype(),
    "geo_lat": pd.StringDtype(),
    "user_name": pd.StringDtype(),
    "user_screen_name": pd.StringDtype(),
    "user_followers": pd.Int32Dtype(),
    "user_friends": pd.Int32Dtype(),
    "user_listed": pd.Int32Dtype(),
    "user_favourites": pd.Int32Dtype(),
    "user_statuses": pd.Int32Dtype(),
    "user_protected": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_verified": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_contributors_enabled": pd.StringDtype(),
    "user_location": pd.StringDtype(),
    "retweeted_tweet": pd.StringDtype(),
    "tweet_text_urls": pd.StringDtype(),
    "tweet_text_hashtags": pd.StringDtype(),
    "tweet_text_usernames": pd.StringDtype(),
    "num_urls_in_tweet_text": pd.Int32Dtype(),
    "num_users_in_tweet_text": pd.Int32Dtype(),
    "num_hashtags_in_tweet_text": pd.Int32Dtype(),
    "text": pd.StringDtype(),
    "contains_wanted_text": pd.BooleanDtype(),
    "contains_wanted_text_case_sensitive": pd.BooleanDtype(),
    "contains_multi_word_wanted_text": pd.BooleanDtype(),
    "contains_crypto_terms": pd.BooleanDtype(),
    "contains_religious_terms": pd.BooleanDtype(),
    "contains_inappropriate_terms": pd.BooleanDtype(),
    "contains_video_games_terms": pd.BooleanDtype(),
    "contains_misc_unwanted_terms": pd.BooleanDtype(),
    "contains_non_english_terms": pd.BooleanDtype(),
}

In [7]:
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

try:
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    if str(e) == "The config profile (default) could not be found":
        aws_region = os.getenv("AWS_REGION")
        s3_client = boto3.client("s3", region_name=aws_region)
        print("Retrieved AWS credentials from .env file")

Retrieved AWS credentials from .env file


## Get Hourly Combined Data

In [8]:
%%time
if not os.path.exists(os.path.join(processed_data_dir, proc_zip_fname)):
    download_file_from_s3(
        s3_client,
        s3_bucket_name,
        processed_data_dir,
        proc_zip_fname,
        aws_region,
        f"{path_to_folder[1:]}processed/",
    )
    extract_zip_file(os.path.join(processed_data_dir, proc_zip_fname), processed_data_dir)
combined_files = glob(f"{processed_data_dir}/*.parquet.gzip")

CPU times: user 393 µs, sys: 426 µs, total: 819 µs
Wall time: 659 µs


## Filter Data Per Hour

Run an ETL workflow to process hourly file objects in the S3 bucket
- extract all hourly file objects into a single `pandas` `DataFrame`
- process the hourly data
  - exclude tweets that contain terms related to
    - cryptocurrency
    - video games
    - non-English
    - religious
    - miscellaneous unwanted
    - inappropriate

    subjects
  - include tweets that contain terms related to the subject of this project
  - set appropriate datatypes
- export the processed `DataFrame` to a `.parquet` file

In [9]:
%%time
df_summary = filter_files_per_hour(
    combined_files,
    tweet_search_terms,
    case_sensitive_tweet_search_terms,
    joined_tweet_search_terms_no_spaces,
    crypto_terms,
    religious_terms,
    inappropriate_terms,
    video_games_terms,
    misc_unwanted_terms,
    non_english_terms,
    unwanted_partial_strings_list,
    min_num_words_tweet,
    dtypes_dict,
    processed_data_dir,
)
display(df_summary)

Filtering Tweets from combined data file 1/225 (2022010603.parquet.gzip)
Starting time = 2022-10-25 23:55:29.498...
Read 4,172 rows of data from parquet file 2022010603.parquet.gzip
Created masks to filter raw data based on wanted text in tweets
Created boolean columns to indicate presence of unwanted terms in tweets
crypto=0.240 | religious=1.294 | inappropriate=0.048 | video_games=0.048 | misc_unwanted=0.144 | non_english=0.336 | total unwanted=2.109
Kept 108 tweets after filtering raw data with masks
Kept 108 tweets with more than approximately 3 words per tweet
Done filtering at 2022-10-25 23:55:29.745 (0.247 seconds).
Saving to parquet file filtered__2022010603.parquet.gzip...
Done.

Filtering Tweets from combined data file 2/225 (2022010921.parquet.gzip)
Starting time = 2022-10-25 23:55:29.752...
Read 5,702 rows of data from parquet file 2022010921.parquet.gzip
Created masks to filter raw data based on wanted text in tweets
Created boolean columns to indicate presence of unwanted

Unnamed: 0,filename,num_rows_combined,num_rows_filtered
0,../data/processed/2022010603.parquet.gzip,4172,108
1,../data/processed/2022010921.parquet.gzip,5702,339
2,../data/processed/2022010204.parquet.gzip,4021,165
3,../data/processed/2022010105.parquet.gzip,5763,203
4,../data/processed/2022010510.parquet.gzip,4129,141
...,...,...,...
220,../data/processed/2022010623.parquet.gzip,5705,305
221,../data/processed/2022010613.parquet.gzip,4431,168
222,../data/processed/2022010722.parquet.gzip,6534,337
223,../data/processed/2021123118.parquet.gzip,3555,113


CPU times: user 53 s, sys: 877 ms, total: 53.9 s
Wall time: 51.9 s


**Notes**
1. Like the previous notebook, this ETL workflow will also give a single `.parquet` file for every hour of every day on which tweets were streamed. Since the number of hourly tweets for this subject are small enough to fit into memory, it is possible to use `pandas` (in-memory) for filtering the hourly data.

## Compress to `.zip` and Upload to S3

All `.parquet` files are now
- combined into a single `.zip` file with a `filtered__` prefix in the filename, which is then
  - uploaded to the `datasets/twitter/kinesis-demo/processed` prefix in the S3 bucket
  - deleted locally
- deleted locally

In [10]:
%%time
if upload_to_s3:
    # zip all processed data files, upload to S3, delete local files
    # create zip of all .parquet.gzip processed data files
    curr_dir = os.getcwd()
    create_zip_file("filtered__*.parquet.gzip", processed_data_dir, filtered_zip_fname)
    # upload zip file to S3 bucket
    try:
        assert os.getcwd() == curr_dir
        upload_file_to_s3(
            aws_region,
            processed_data_dir,
            filtered_zip_fname,
            s3_bucket_name,
            f"{path_to_folder[1:-1]}/processed/{filtered_zip_fname}",
        )
        print("\nUploaded filtered zipped file to S3 bucket")
    except AssertionError as e:
        print(
            f"\n{str(e)}: Incorrect working directory. "
            "Did not upload filtered zipped file to S3 bucket."
        )

if cleanup_local_files:
    # 1. delete local combined parquet files and locally exported processed parquet files
    list(
        map(
            os.remove,
            glob(f"{processed_data_dir}/*.parquet.gzip"),
        )
    )
    print("Deleted local .parquet.gzip files with combined and filtered data.")

    # 3. delete local processed zip file
    os.remove(os.path.join(processed_data_dir, proc_zip_fname))
    print("Deleted local .zip file created from all combined data files.")

    # 4. delete local filtered zip file
    os.remove(os.path.join(processed_data_dir, filtered_zip_fname))
    print("Deleted local .zip file created from all filtered data files.")

Created zip file at ../data/processed/processed_data.zip

Uploaded filtered zipped file to S3 bucket
Deleted local .parquet.gzip files with combined and filtered data.
Deleted local .zip file created from all combined data files.
Deleted local .zip file created from all filtered data files.
CPU times: user 631 ms, sys: 60.5 ms, total: 691 ms
Wall time: 1.78 s


---

<span style="float:left;">
    <a href="./3-combine-data/notebooks/3_combine_data.ipynb"><< 3 - Combine Hourly Streamed Tweets</a>
</span>

<span style="float:right;">
    <a href="./5-process-data/notebooks/5_process_data.ipynb">5 - Data Processing >></a>
</span>