# Data Filtering

In [1]:
!pip3 freeze | grep -E 'boto3|s3fs|pandas=='

boto3==1.24.61
pandas==1.4.3
s3fs==0.4.2


In [2]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [3]:
import os
import re
import zipfile
from datetime import datetime
from glob import glob
from typing import Dict, List, Union

import boto3
import pandas as pd

## About

As part of this hourly data processing, we will now
- filter out tweets whose text contains unwanted terms related to the following topics
  - cryptocurrency (see the `crypto_terms` list below)
  - video games (`video_games_terms`)
  - religion (`religious_terms`)
  - non-English words (`non_english_terms`)
  - miscellaneous unwanted words (`misc_unwanted_terms`)
  - inappropriate terms (profanity, etc. in `inappropriate_terms`)
- only keep tweets that contain terms specific to the subject of this project; these terms are defined below in the following Python lists
  - `tweet_search_terms`
  - `case_sensitive_tweet_search_terms`
  - `joined_tweet_search_terms`

**Pre-Requisites**
1. In order to access data stored on Amazon S3, this data preparation notebook must be provided with the following environment variables
   - `AWS_S3_BUCKET_NAME`

## User Inputs

In [4]:
# raw data on S3
# S3 key prefix of the project folder; the leading "/" is stripped with
# `path_to_folder[1:]` wherever an S3 key is built from it
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
# local directory that holds the downloaded combined files and receives the
# filtered files and zip archives
processed_data_dir = "data/processed"  # (S3) "data/processed" or (locally) "datasets/twitter/kinesis-demo/processed"
# name of the zip archive with the hourly combined data (input of this notebook)
proc_zip_fname = "combined_data.zip"
# name of the zip archive created from the filtered files (output of this notebook)
filtered_zip_fname = "processed_data.zip"

# keep tweets with text containing terms
# matched against the tweet text exactly as written (case-sensitive)
case_sensitive_tweet_search_terms = ["NASA", "ESA", "CSA", "Kepler"]
# matched against the lower-cased tweet text
tweet_search_terms = [
    "aeronautics",
    # "webb",  # picks up random astronomers unrelated to the mission
    "goddard",
    "propulsion",
    # "telescope",  # picks up topics not related to the mission
    "exoplanet",
    # 'launch',  # picks up topics not related to space exploration
    # 'astronomy',  # picks up random astronomers unrelated to the mission
    # 'astrophysics',  # picks up random astrophysicists unrelated to the mission
    # "laboratory",  # picks up random astronomers unrelated to the mission
    "jwst",
    # "exploration",  # picks up topics not related to space exploration
    # " mission",  # picks up topics not related to space exploration
    "spacecraft",
]
# multi-word terms; spaces are removed from both these terms and the lower-cased
# tweet text before matching, so "james webb" also matches "JamesWebb"
joined_tweet_search_terms = [
    "james webb",
    "space telescope",
    "webb space",
    "webb telescope",
    "jet propulsion lab",
    "canadian space agency",
    "european space agency",
    "national aeronautics",
    "shuttle launch",
    "space shuttle",
    "goddard space flight center",
    "johnson space center",
    "ames research center",
    "marshall space flight center",
    "glenn research center",
    "ball aerospace",
    "harris corporation",
    "space telescope science institute",
    "billochs",
    "johndurning",
    "johnmather",
    "jonathangardner",
    "northrop grumman",
    "lockheed martin",
    # "stephen hawking",  # picks up topics not related to the mission
    # "dark matter",  # picks up topics not related to the mission
    # "dark energy",  # picks up topics not related to the mission
    "hubble space",
    "hubble telescope",
]
# remove tweets with text containing terms
# NOTE: every unwanted-term list below is matched as a case-sensitive substring
# of the raw (not lower-cased) tweet text, so an entry only matches text written
# in exactly this casing.
crypto_terms = [
    "crypto",
    # "token",
    "koistarter",
    "daostarter",
    "decentralized",
    # "services",
    # "pancakeswap",
    # "eraxnft",
    # "browsing",
    # "kommunitas",
    # "hosting",
    # "internet",
    # "exipofficial",
    # "servers",
    # "wallet",
    # "liquidity",
    # "rewards",
    # "floki",
    # "10000000000000linkstelegram",
    "dogecoin",
    "czbinance",
    # "watch",
    "binance",
    "dogelonmars",
    "cryptocurrency",
    # "money",
    # "danheld",
    # "cybersecurity",
    "ethereum",
    "bitcrush",
    "vvsorigin",
]
video_games_terms = [
    # "gamejoin",
    "arcade",
    "dreamcast",
    "sega",
    "xbox",
    "wii",
    "ps4",
]
# NOTE(review): "ethereum" and "pay someone" also appear in `crypto_terms` and
# `misc_unwanted_terms`; they are kept here so the per-category flag column is
# unchanged - confirm whether they belong in this category.
non_english_terms = [
    "webuye",
    "bungoma",
    "ethereum",
    "pay someone",
    "seungkwan",
    "woozi",
    "hoshi",
    "kasama",
    "nung",
    "lahat",
    "jinsoul",
    "brunisoul",
    "loona",
    "taas",
]
misc_unwanted_terms = [
    "nft",
    "volcano detected",
    "block-2",
    "tanzanite",
    "gemstonecarat",
    "popescu",
    "breeding",
    "nairobi",
    "pay someone",
    "homeworkpay",
    "homework",
    "photocards",
    "essay",
    # "hbomax",
]
religious_terms = [
    "scriptures",
    # NOTE(review): possibly a misspelling of "methuselah" - confirm against the data
    "methusealah",
    "testament",
    "yahweh",
    # NOTE(review): as a substring, "god" also matches the wanted term "goddard"
    # when a tweet writes it in lower case - confirm this is acceptable
    "god",
    "mullah",
    "allah",
    "clergy",
    "mercy",
    "morality",
    # the trailing comma that used to be inside this string limited matches to
    # "muslims" immediately followed by a comma
    "muslims",
    "hindus",
    "buddhist",
    "catholics",
    "christians",
    "atheist",
]
inappropriate_terms = [
    "prostitution",
    "musembe",
    "mo-greene",
    "running scared2012",
    "running scared 2012",
    "massacres",
    "eric ephriam chavez",
    "drugs",
    "bin laden",
    "saddam",
    "perished",
    "whore",
    "nasty",
    "nazist",
    "antifa",
    "proud boys",
]
# minimum number of space-separated tokens a tweet must have (>=) to be kept
min_num_words_tweet = 10

# whether the last cell zips the filtered files and uploads the archive to S3
upload_to_s3 = True
# whether the last cell deletes the local parquet files and zip archives
cleanup_local_files = True

# columns for EDA
# NOTE(review): not referenced by any other visible cell of this notebook
vcols = [
    "is_quote_status",
    "quote_count",
    "reply_count",
    # "retweet_count",
    "favorite_count",
    "favorited",
    "retweeted",
    "source_text",
    "user_followers",
    "user_friends",
    "user_favourites",
    "user_verified",
    "retweeted_tweet",
]

In [5]:
# name of the S3 bucket; falls back to an empty string when the environment
# variable AWS_S3_BUCKET_NAME is not set
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

# multi-word wanted terms with their spaces removed, for matching against tweet
# text that has also had its spaces removed
joined_tweet_search_terms_no_spaces = [
    t.replace(" ", "") for t in joined_tweet_search_terms
]
# single list of all unwanted terms, used to drop tweets in one pass
unwanted_partial_strings_list = (
    crypto_terms
    + religious_terms
    + inappropriate_terms
    + video_games_terms
    + misc_unwanted_terms
    + non_english_terms
)
# column -> pandas nullable dtype, applied to every filtered DataFrame before it
# is saved; the trailing `# pd.BooleanDtype()` comments mark columns that are
# currently stored as strings instead of booleans
dtypes_dict = {
    "id": pd.StringDtype(),
    "geo": pd.StringDtype(),
    "coordinates": pd.StringDtype(),
    "place": pd.StringDtype(),
    "contributors": pd.StringDtype(),  # pd.BooleanDtype(),
    "is_quote_status": pd.StringDtype(),  # pd.BooleanDtype(),
    "quote_count": pd.Int32Dtype(),
    "reply_count": pd.Int32Dtype(),
    "retweet_count": pd.Int32Dtype(),
    "favorite_count": pd.Int32Dtype(),
    "favorited": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted": pd.StringDtype(),  # pd.BooleanDtype(),
    "source": pd.StringDtype(),
    "in_reply_to_user_id": pd.StringDtype(),
    "in_reply_to_screen_name": pd.StringDtype(),
    "source_text": pd.StringDtype(),
    "place_id": pd.StringDtype(),
    "place_url": pd.StringDtype(),
    "place_place_type": pd.StringDtype(),
    "place_name": pd.StringDtype(),
    "place_full_name": pd.StringDtype(),
    "place_country_code": pd.StringDtype(),
    "place_country": pd.StringDtype(),
    "place_bounding_box_type": pd.StringDtype(),
    "place_bounding_box_coordinates": pd.StringDtype(),
    "place_attributes": pd.StringDtype(),
    "coords_type": pd.StringDtype(),
    "coords_lon": pd.StringDtype(),
    "coords_lat": pd.StringDtype(),
    "geo_type": pd.StringDtype(),
    "geo_lon": pd.StringDtype(),
    "geo_lat": pd.StringDtype(),
    "user_name": pd.StringDtype(),
    "user_screen_name": pd.StringDtype(),
    "user_followers": pd.Int32Dtype(),
    "user_friends": pd.Int32Dtype(),
    "user_listed": pd.Int32Dtype(),
    "user_favourites": pd.Int32Dtype(),
    "user_statuses": pd.Int32Dtype(),
    "user_protected": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_verified": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_contributors_enabled": pd.StringDtype(),
    "user_location": pd.StringDtype(),
    "retweeted_tweet": pd.StringDtype(),
    "tweet_text_urls": pd.StringDtype(),
    "tweet_text_hashtags": pd.StringDtype(),
    "tweet_text_usernames": pd.StringDtype(),
    "num_urls_in_tweet_text": pd.Int32Dtype(),
    "num_users_in_tweet_text": pd.Int32Dtype(),
    "num_hashtags_in_tweet_text": pd.Int32Dtype(),
    "text": pd.StringDtype(),
    # flag columns added during filtering
    "contains_wanted_text": pd.BooleanDtype(),
    "contains_wanted_text_case_sensitive": pd.BooleanDtype(),
    "contains_multi_word_wanted_text": pd.BooleanDtype(),
    "contains_crypto_terms": pd.BooleanDtype(),
    "contains_religious_terms": pd.BooleanDtype(),
    "contains_inappropriate_terms": pd.BooleanDtype(),
    "contains_video_games_terms": pd.BooleanDtype(),
    "contains_misc_unwanted_terms": pd.BooleanDtype(),
    "contains_non_english_terms": pd.BooleanDtype(),
}

In [6]:
# boto3 session built from the "default" AWS profile; `session.region_name` is
# passed to the S3 upload/download helpers
session = boto3.Session(profile_name="default")
# module-level S3 client; `download_file_from_s3` reads this global to list keys
s3_client = session.client("s3")

In [7]:
def save_to_parquet(
    df: pd.DataFrame, filepath: str, storage_options: Union[None, Dict[str, str]] = None
) -> None:
    """Write a DataFrame to a parquet file without its index.

    Parameters
    ----------
    df : DataFrame to write.
    filepath : destination path; only its base name is shown in the log line.
    storage_options : forwarded unchanged to ``DataFrame.to_parquet``.

    NOTE(review): no ``compression`` argument is passed, so the codec is
    whatever pandas uses by default, regardless of the ``.gzip`` suffix used by
    callers - confirm this is intended.
    """
    target_name = os.path.basename(filepath)
    print(f"Saving to parquet file {target_name}...")
    write_kwargs = {"index": False, "storage_options": storage_options}
    df.to_parquet(filepath, **write_kwargs)
    print("Done.")


def read_parquet(
    filepath: str,
    columns: Union[None, List[str]] = None,
    storage_options: Union[None, Dict[str, str]] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """Load a parquet file into a DataFrame.

    Parameters
    ----------
    filepath : path of the parquet file to read.
    columns : optional subset of columns, forwarded to ``pd.read_parquet``.
    storage_options : forwarded unchanged to ``pd.read_parquet``.
    verbose : when True, print the number of rows read and the file's base name.

    Returns
    -------
    The DataFrame returned by ``pd.read_parquet``.
    """
    loaded = pd.read_parquet(
        filepath, columns=columns, storage_options=storage_options
    )
    if not verbose:
        return loaded
    num_rows = len(loaded)
    source_name = os.path.basename(filepath)
    print(f"Read {num_rows:,} rows of data from parquet " f"file {source_name}")
    return loaded


def create_zip_file(
    file_search_pattern: str, processed_data_dir: str, proc_data_zip_fname: str
):
    """Zip every file in a directory that matches a glob pattern.

    The archive is written to ``processed_data_dir/proc_data_zip_fname`` and
    each member is stored under its path relative to ``processed_data_dir``
    (i.e. the same member names as globbing from inside that directory).

    Unlike a ``chdir``-based implementation, the current working directory is
    never changed, so the function is safe for a ``processed_data_dir`` of any
    depth and leaves the notebook's working directory intact even on failure.

    Parameters
    ----------
    file_search_pattern : glob pattern, relative to ``processed_data_dir``.
    processed_data_dir : directory that is searched and receives the archive.
    proc_data_zip_fname : file name of the archive to create.

    Raises
    ------
    FileNotFoundError
        If ``processed_data_dir`` does not exist.
    """
    zip_path = os.path.join(processed_data_dir, proc_data_zip_fname)
    matched_paths = sorted(glob(os.path.join(processed_data_dir, file_search_pattern)))
    # the context manager closes the archive even if writing a member fails
    with zipfile.ZipFile(zip_path, "w") as archive:
        for member_path in matched_paths:
            # a broad pattern may match the archive being written; never add it to itself
            if os.path.abspath(member_path) == os.path.abspath(zip_path):
                continue
            archive.write(
                member_path,
                arcname=os.path.relpath(member_path, processed_data_dir),
                compress_type=zipfile.ZIP_DEFLATED,
            )


def upload_file_to_s3(
    aws_region: str,
    processed_data_dir: str,
    fname: str,
    s3_bucket_name: str,
    s3_key: str,
) -> None:
    """Upload a local file to a key in an S3 bucket.

    Parameters
    ----------
    aws_region : region used to create the boto3 S3 resource.
    processed_data_dir : local directory containing the file.
    fname : name of the file inside ``processed_data_dir``.
    s3_bucket_name : destination bucket.
    s3_key : destination key inside the bucket.
    """
    local_path = f"{processed_data_dir}/{fname}"
    transfer_client = boto3.resource("s3", region_name=aws_region).meta.client
    transfer_client.upload_file(local_path, s3_bucket_name, s3_key)


def download_file_from_s3(
    s3_bucket_name: str,
    path_to_folder: str,
    data_dir: str,
    fname: str,
    aws_region: str,
) -> None:
    """Download ``fname`` from the ``processed/`` prefix of an S3 bucket.

    The keys directly under ``{path_to_folder[1:]}processed/`` are listed with
    the module-level ``s3_client`` and the key whose file name equals ``fname``
    is downloaded to ``data_dir/fname``. If no key has that file name, the
    first listed key is downloaded instead (the previous behaviour) and a
    warning is printed, because the prefix can hold more than one archive.

    Parameters
    ----------
    s3_bucket_name : bucket to read from.
    path_to_folder : project folder prefix; its first character (the leading
        "/") is dropped when the key prefix is built.
    data_dir : local destination directory.
    fname : file name to look for under the prefix and to save as locally.
    aws_region : region used to create the boto3 S3 resource for the download.

    Raises
    ------
    FileNotFoundError
        If nothing is stored under the prefix.
    """
    dest_filepath = os.path.join(data_dir, fname)
    s3_prefix = f"{path_to_folder[1:]}processed/"
    # NOTE(review): a single list call is made; results beyond the first page
    # are not inspected - confirm the prefix never holds that many objects
    listing = s3_client.list_objects_v2(
        Bucket=s3_bucket_name,
        Delimiter="/",
        Prefix=s3_prefix,
    )
    listed_keys = [obj["Key"] for obj in listing.get("Contents", [])]
    if not listed_keys:
        raise FileNotFoundError(
            f"No objects found under s3://{s3_bucket_name}/{s3_prefix}"
        )
    matching_keys = [key for key in listed_keys if key.rsplit("/", 1)[-1] == fname]
    if matching_keys:
        s3_filepath_key = matching_keys[0]
    else:
        s3_filepath_key = listed_keys[0]
        print(
            f"Warning: no key named {fname} under {s3_prefix}; "
            f"falling back to first listed key {s3_filepath_key}"
        )
    start = datetime.now()
    print(
        f"Started downloading processed data zip file from {s3_filepath_key} to "
        f"{dest_filepath} at {start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    s3 = boto3.resource("s3", region_name=aws_region)
    s3.meta.client.download_file(
        s3_bucket_name,
        s3_filepath_key,
        dest_filepath,
    )
    duration = (datetime.now() - start).total_seconds()
    print(f"Done downloading in {duration:.3f} seconds.")


def extract_zip_file(dest_filepath: str, data_dir: str) -> None:
    """Extract every member of a zip archive into a directory.

    Prints the start time and the elapsed time of the extraction.

    Parameters
    ----------
    dest_filepath : path of the zip archive to read.
    data_dir : directory the members are extracted into.
    """
    start = datetime.now()
    print(
        "Started extracting filtered data parquet files from "
        f"processed data zip file to {data_dir} at "
        f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    # the context manager closes the archive even if extraction fails part-way
    with zipfile.ZipFile(dest_filepath) as zip_ref:
        zip_ref.extractall(data_dir)
    duration = (datetime.now() - start).total_seconds()
    print(f"Done extracting in {duration:.3f} seconds.")

In [8]:
def _contains_any_term(text: pd.Series, terms: List[str]) -> pd.Series:
    """Flag the rows of ``text`` that contain at least one of ``terms``.

    Terms are treated as literal substrings (regex metacharacters are escaped),
    missing text counts as "no match", and an empty term list matches nothing
    (an empty regex pattern would otherwise match every row).
    """
    usable_terms = [term for term in terms if term]
    if not usable_terms:
        return pd.Series(False, index=text.index)
    pattern = "|".join(re.escape(term) for term in usable_terms)
    return text.str.contains(pattern, regex=True, na=False)


def get_raw_masks(
    df,
    tweet_search_terms_list: List[str],
    case_sensitive_tweet_search_terms_list: List[str],
    joined_tweet_search_terms_no_spaces_list: List[str],
) -> List[pd.Series]:
    """Get masks for tweets with types of wanted terms in text.

    Parameters
    ----------
    df : DataFrame with a ``text`` column.
    tweet_search_terms_list : terms matched against the lower-cased text.
    case_sensitive_tweet_search_terms_list : terms matched against the text as
        written.
    joined_tweet_search_terms_no_spaces_list : terms (already without spaces)
        matched against the lower-cased text with all spaces removed.

    Returns
    -------
    ``[lowercase_mask, case_mask, joined_case_mask]`` - three boolean Series
    aligned with ``df``; rows with missing text are False in all of them.
    """
    text = df["text"]
    lowered_text = text.str.lower()
    lowercase_mask = _contains_any_term(lowered_text, tweet_search_terms_list)
    case_mask = _contains_any_term(text, case_sensitive_tweet_search_terms_list)
    # regex=False: the space is a literal, not a pattern
    joined_case_mask = _contains_any_term(
        lowered_text.str.replace(" ", "", regex=False),
        joined_tweet_search_terms_no_spaces_list,
    )
    print("Created masks to filter raw data based on wanted text in tweets")
    return [lowercase_mask, case_mask, joined_case_mask]


def add_search_term_boolean_columns(
    df: pd.DataFrame,
    lowercase_mask: pd.Series,
    case_mask: pd.Series,
    joined_case_mask: pd.Series,
    crypto_terms_list: List[str],
    religious_terms_list: List[str],
    inappropriate_terms_list: List[str],
    video_games_terms_list: List[str],
    misc_unwanted_terms_list: List[str],
    non_english_terms_list: List[str],
) -> pd.DataFrame:
    """Add boolean columns based on presence of wanted and unwanted terms in tweet text.

    The three wanted-term masks are stored as given. Each unwanted-term list
    is matched as a case-sensitive substring of the raw ``text`` column and
    stored in its own ``contains_<category>_terms`` column. The share of rows
    flagged per unwanted category is printed; "total unwanted" is the plain sum
    of those shares, so a tweet flagged in several categories is counted once
    per category.

    Returns
    -------
    A new DataFrame with the nine flag columns appended.
    """
    unwanted_terms_by_column = {
        "contains_crypto_terms": crypto_terms_list,
        "contains_religious_terms": religious_terms_list,
        "contains_inappropriate_terms": inappropriate_terms_list,
        "contains_video_games_terms": video_games_terms_list,
        "contains_misc_unwanted_terms": misc_unwanted_terms_list,
        "contains_non_english_terms": non_english_terms_list,
    }
    text = df["text"]
    df = df.assign(
        contains_wanted_text=lowercase_mask,
        contains_wanted_text_case_sensitive=case_mask,
        contains_multi_word_wanted_text=joined_case_mask,
        **{
            column: _contains_any_term(text, terms)
            for column, terms in unwanted_terms_by_column.items()
        },
    )
    print("Created boolean columns to indicate presence of unwanted terms in tweets")
    num_rows = len(df)
    terms_str = []
    pcts_total = []
    for c in df.columns[df.columns.str.endswith("_terms")]:
        # guard against an empty file, which would otherwise divide by zero
        pct_of_total = (df[c].sum() / num_rows) * 100 if num_rows else 0.0
        term_type = c.replace("contains_", "").replace("_terms", "")
        terms_str.append(f"{term_type}={pct_of_total:.3f}")
        pcts_total.append(pct_of_total)
    term_str_full = " | ".join(terms_str) + f" | total unwanted={sum(pcts_total):.3f}"
    print(term_str_full)
    return df


def apply_masks(
    df: pd.DataFrame,
    case_mask: pd.Series,
    lowercase_mask: pd.Series,
    joined_case_mask: pd.Series,
    unwanted_partial_strings_list: List[str],
) -> pd.DataFrame:
    """Apply masks for only keeping tweets based on terms in text.

    A tweet is kept when at least one of the three wanted-term masks is True
    and its raw text contains none of ``unwanted_partial_strings_list``
    (case-sensitive substring match). An empty unwanted list removes nothing.

    Returns
    -------
    The filtered DataFrame.
    """
    wanted_mask = case_mask | lowercase_mask | joined_case_mask
    df = df.loc[wanted_mask]
    unwanted_mask = _contains_any_term(df["text"], unwanted_partial_strings_list)
    df = df.loc[~unwanted_mask]
    print(f"Kept {len(df):,} tweets after filtering raw data with masks")
    return df


def filter_by_num_words_in_tweet(
    df: pd.DataFrame, min_num_tweet_words_wanted: int
) -> pd.DataFrame:
    """Keep only tweets whose text has enough space-separated tokens.

    The text is split on single spaces, so the token count is only an
    approximation of the word count. A tweet is kept when its count is greater
    than or equal to ``min_num_tweet_words_wanted``.

    Returns
    -------
    The rows of ``df`` that satisfy the threshold.
    """
    token_counts = df["text"].str.split(" ").str.len()
    has_enough_tokens = token_counts >= min_num_tweet_words_wanted
    df_kept = df.loc[has_enough_tokens]
    num_kept = len(df_kept)
    print(
        f"Kept {num_kept:,} tweets with more than "
        f"approximately {min_num_tweet_words_wanted:,} words per tweet"
    )
    return df_kept

In [9]:
def filter_files_per_hour(
    proc_files: List[str],
    tweet_search_terms: List[str],
    case_sensitive_tweet_search_terms: List[str],
    joined_tweet_search_terms_no_spaces: List[str],
    crypto_terms: List[str],
    religious_terms: List[str],
    inappropriate_terms: List[str],
    video_games_terms: List[str],
    misc_unwanted_terms: List[str],
    non_english_terms: List[str],
    unwanted_partial_strings_list: List[str],
    min_num_words_tweet: int,
    dtypes_dict: Dict,
    processed_data_dir: str,
) -> pd.DataFrame:
    """Filter files combined per hour.

    Each file in ``proc_files`` is processed on its own: it is read, the
    wanted/unwanted flag columns are added, tweets are reduced to those that
    pass the term masks and the minimum-word filter, the result is cast with
    ``dtypes_dict`` and saved to
    ``{processed_data_dir}/filtered__<name>.parquet.gzip``, where ``<name>`` is
    the input's base name up to its first ".".

    Parameters
    ----------
    proc_files : paths of the hourly combined-data parquet files.
    tweet_search_terms, case_sensitive_tweet_search_terms,
    joined_tweet_search_terms_no_spaces : wanted-term lists for the three masks.
    crypto_terms, religious_terms, inappropriate_terms, video_games_terms,
    misc_unwanted_terms, non_english_terms : per-category unwanted-term lists
        used for the flag columns.
    unwanted_partial_strings_list : all unwanted terms, used to drop tweets.
    min_num_words_tweet : minimum token count a tweet needs to be kept.
    dtypes_dict : column -> dtype mapping applied to the filtered data.
    processed_data_dir : directory the filtered files are written to.

    Returns
    -------
    One row per input file with the columns ``filename``,
    ``num_rows_combined`` and ``num_rows_filtered``. When ``proc_files`` is
    empty the summary is an empty DataFrame with those columns.
    """
    summary_columns = ["filename", "num_rows_combined", "num_rows_filtered"]
    records = []
    for k, f in enumerate(proc_files, 1):
        start = datetime.now()
        print(
            f"Filtering Tweets from combined data file {k}/{len(proc_files):,} "
            f"({os.path.basename(f)})\nStarting time = "
            f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
        )
        # extract
        df_combined = read_parquet(f, None, None, True)
        # transform
        lowercase_mask, case_mask, joined_case_mask = get_raw_masks(
            df_combined,
            tweet_search_terms,
            case_sensitive_tweet_search_terms,
            joined_tweet_search_terms_no_spaces,
        )
        df_filtered = (
            df_combined.pipe(
                add_search_term_boolean_columns,
                lowercase_mask=lowercase_mask,
                case_mask=case_mask,
                joined_case_mask=joined_case_mask,
                crypto_terms_list=crypto_terms,
                religious_terms_list=religious_terms,
                inappropriate_terms_list=inappropriate_terms,
                video_games_terms_list=video_games_terms,
                misc_unwanted_terms_list=misc_unwanted_terms,
                non_english_terms_list=non_english_terms,
            )
            .pipe(
                apply_masks,
                case_mask=case_mask,
                lowercase_mask=lowercase_mask,
                joined_case_mask=joined_case_mask,
                unwanted_partial_strings_list=unwanted_partial_strings_list,
            )
            .pipe(
                filter_by_num_words_in_tweet,
                min_num_tweet_words_wanted=min_num_words_tweet,
            )
            .astype(dtypes_dict)
        )
        end = datetime.now()
        duration = (end - start).total_seconds()
        print(
            "Done filtering at "
            f"{end.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} ({duration:.3f} seconds)."
        )
        # load
        file_name = (
            f"filtered__{os.path.basename(f).split('.')[0]}"
            # f"__{end.strftime('%Y%m%d_%H%M%S')}"
        )
        filepath = f"{processed_data_dir}/{file_name}.parquet.gzip"
        save_to_parquet(df_filtered, filepath, None)
        records.append(
            {
                "filename": f,
                "num_rows_combined": len(df_combined),
                "num_rows_filtered": len(df_filtered),
            }
        )
        # blank line between the logs of consecutive files
        if k < len(proc_files):
            print()
    # summarize
    # explicit columns keep the summary well-formed when no file was processed
    df_summary = pd.DataFrame(records, columns=summary_columns)
    num_combined_rows = int(df_summary["num_rows_combined"].sum())
    num_filtered_rows = int(df_summary["num_rows_filtered"].sum())
    print(
        f"\nCombined data contained {num_combined_rows:,} rows\n"
        f"Filtered data contains {num_filtered_rows:,} rows"
    )
    return df_summary

## Get Hourly Combined Data

In [10]:
%%time
# download and unpack the combined-data archive only if it is not already local
combined_zip_path = os.path.join(processed_data_dir, proc_zip_fname)
if not os.path.exists(combined_zip_path):
    # make sure the download target directory exists on a fresh checkout
    os.makedirs(processed_data_dir, exist_ok=True)
    download_file_from_s3(
        s3_bucket_name, path_to_folder, processed_data_dir, proc_zip_fname, session.region_name
    )
    extract_zip_file(combined_zip_path, processed_data_dir)
# filtered outputs are written to the same directory with a "filtered__" prefix;
# exclude them so a re-run does not filter its own previous output, and sort so
# files are processed in a deterministic order
combined_files = sorted(
    f
    for f in glob(f"{processed_data_dir}/*.parquet.gzip")
    if not os.path.basename(f).startswith("filtered__")
)

CPU times: user 851 µs, sys: 174 µs, total: 1.03 ms
Wall time: 994 µs


## Filter Data Per Hour

Run an ETL workflow to process the hourly files that were downloaded from the S3 bucket, one file at a time
- read each hourly file into a `pandas` `DataFrame`
- process the hourly data
  - exclude tweets that contain terms related to the following subjects
    - cryptocurrency
    - video games
    - non-English
    - religious
    - miscellaneous unwanted
    - inappropriate
  - include tweets that contain terms related to the subject of this project
  - exclude tweets with fewer than `min_num_words_tweet` words
  - set appropriate datatypes
- export each filtered `DataFrame` to a `.parquet.gzip` file with a `filtered__` prefix in the filename

In [11]:
%%time
df_summary = filter_files_per_hour(
    combined_files,
    tweet_search_terms,
    case_sensitive_tweet_search_terms,
    joined_tweet_search_terms_no_spaces,
    crypto_terms,
    religious_terms,
    inappropriate_terms,
    video_games_terms,
    misc_unwanted_terms,
    non_english_terms,
    unwanted_partial_strings_list,
    min_num_words_tweet,
    dtypes_dict,
    processed_data_dir,
)
display(df_summary)

Filtering Tweets from combined data file 1/225 (2022010308.parquet.gzip)
Starting time = 2022-08-28 18:14:01.804...
Read 3,537 rows of data from parquet file 2022010308.parquet.gzip
Created masks to filter raw data based on wanted text in tweets
Created boolean columns to indicate presence of unwanted terms in tweets
crypto=0.311 | religious=0.339 | inappropriate=0.057 | video_games=0.198 | misc_unwanted=0.028 | non_english=0.396 | total unwanted=1.329
Kept 494 tweets after filtering raw data with masks
Kept 455 tweets with more than approximately 10 words per tweet
Done filtering at 2022-08-28 18:14:02.313 (0.508 seconds).
Saving to parquet file filtered__2022010308.parquet.gzip...
Done.

Filtering Tweets from combined data file 2/225 (2022010310.parquet.gzip)
Starting time = 2022-08-28 18:14:02.329...
Read 4,015 rows of data from parquet file 2022010310.parquet.gzip
Created masks to filter raw data based on wanted text in tweets
Created boolean columns to indicate presence of unwante

Unnamed: 0,filename,num_rows_combined,num_rows_filtered
0,data/processed/2022010308.parquet.gzip,3537,455
1,data/processed/2022010310.parquet.gzip,4015,403
2,data/processed/2022010312.parquet.gzip,4314,463
3,data/processed/2022010314.parquet.gzip,4932,622
4,data/processed/2022010315.parquet.gzip,6292,1716
...,...,...,...
220,data/processed/2022010921.parquet.gzip,5702,878
221,data/processed/2022010922.parquet.gzip,5471,675
222,data/processed/2022010923.parquet.gzip,6158,583
223,data/processed/2022011000.parquet.gzip,5838,616


CPU times: user 1min 21s, sys: 1.5 s, total: 1min 23s
Wall time: 1min 23s


**Notes**
1. Like the previous notebook, this ETL workflow will also give a single `.parquet` file for every hour of every day on which tweets were streamed. Since the number of hourly tweets for this subject is small enough to fit into memory, it is possible to use `pandas` (in-memory) for filtering the hourly data.

## Compress to `.zip` and Upload to S3

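All filtered `.parquet` files (those with a `filtered__` prefix in the filename) are now
- combined into a single `.zip` file (`processed_data.zip`), which is then
  - uploaded to the `datasets/twitter/kinesis-demo/processed` prefix in the S3 bucket
  - deleted locally
- deleted locally, together with the combined-data `.parquet` files and the combined-data `.zip` file they were extracted from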

In [None]:
%%time
# True only once the filtered archive has actually been uploaded
uploaded_ok = False
if upload_to_s3:
    # zip all processed data files, upload to S3, delete local files
    # create zip of all .parquet.gzip processed data files
    curr_dir = os.getcwd()
    create_zip_file("filtered__*.parquet.gzip", processed_data_dir, filtered_zip_fname)
    # upload zip file to S3 bucket, but only if zipping left the working
    # directory unchanged (the relative paths below depend on it)
    if os.getcwd() == curr_dir:
        upload_file_to_s3(
            session.region_name,
            processed_data_dir,
            filtered_zip_fname,
            s3_bucket_name,
            f"{path_to_folder[1:-1]}/processed/{filtered_zip_fname}",
        )
        uploaded_ok = True
        print("\nUploaded filtered zipped file to S3 bucket")
    else:
        print(
            f"\nIncorrect working directory ({os.getcwd()} instead of {curr_dir}). "
            "Did not upload filtered zipped file to S3 bucket."
        )

if cleanup_local_files and upload_to_s3 and not uploaded_ok:
    # never delete the only copy of the filtered data when the upload was skipped
    print("Skipped local cleanup because the filtered data was not uploaded.")
elif cleanup_local_files:
    filtered_parquet_files = glob(f"{processed_data_dir}/filtered__*.parquet.gzip")
    filtered_parquet_set = set(filtered_parquet_files)
    combined_parquet_files = [
        f
        for f in glob(f"{processed_data_dir}/*.parquet.gzip")
        if f not in filtered_parquet_set
    ]

    # 1. delete local combined parquet files
    for f in combined_parquet_files:
        os.remove(f)
    print("Deleted local .parquet.gzip files with combined data.")

    # 2. delete locally exported processed parquet files
    for f in filtered_parquet_files:
        os.remove(f)
    print("Deleted local .parquet.gzip files with filtered data.")

    # 3. delete local processed zip file (it may be absent, e.g. after a partial run)
    combined_zip_path = os.path.join(processed_data_dir, proc_zip_fname)
    if os.path.exists(combined_zip_path):
        os.remove(combined_zip_path)
        print("Deleted local .zip file created from all combined data files.")

    # 4. delete local filtered zip file (only exists when it was created above)
    filtered_zip_path = os.path.join(processed_data_dir, filtered_zip_fname)
    if os.path.exists(filtered_zip_path):
        os.remove(filtered_zip_path)
        print("Deleted local .zip file created from all filtered data files.")

---

<span style="float:left;">
    <a href="./3_combine_data.ipynb"><< 3 - Combine Hourly Data</a>
</span>

<span style="float:right;">
    <a href="./5_process_data.ipynb">5 - Big Data Processing >></a>
</span>