# Data Processing

In [1]:
# Load the lab_black extension (presumably Black auto-formatting of cells — confirm)
%load_ext lab_black
# Load autoreload and switch it to mode 2 so imported modules are re-imported
# before each cell execution (the helper modules imported below are edited alongside this notebook)
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import shutil
from datetime import datetime
from functools import reduce
from glob import glob
import zipfile

import boto3
import dask.dataframe as dd
import pandas as pd
from pyspark import SparkConf, SparkFiles
import pyspark.pandas as ps
from pyspark.sql import Column, SparkSession, functions as F, types as T
from pyspark.sql.dataframe import DataFrame as pdf
from pyspark.sql.window import Window



In [3]:
# The project root is the parent of the current working directory; the local
# helper modules imported in the next cell live in its "src" sub-folder.
PROJ_ROOT = os.pardir
src_dir = os.path.join(PROJ_ROOT, "src")
# Make the helper modules importable from this notebook
sys.path.append(src_dir)

In [4]:
%aimport file_utils
from file_utils import create_zip_file_from_folder

%aimport s3_utils
from s3_utils import (
    download_file_from_s3,
    extract_zip_file,
    upload_file_to_s3,
)

## About

The streamed tweet data and metadata will  now be loaded with PySpark and processed for use in sentiment classification with a big-data ML framework
- load all the hourly `.parquet` data files saved to S3 objects in the `/processed` prefix, into a single PySpark DataFrame
- perform the following data processing on the `text` of the tweet using PySpark
  - download the `.zip` archive of `.parquet` files from S3
  - extract all contained `.parquet` files (without subfolders)
  - read all `.parquet` files into single PySpark DataFrame
  - process data using PySpark
    - drop leading and trailing whitespaces
    - replace occurrences of multiple consecutive whitespaces by a single whitespace
    - drop tweets with missing (`NaN`s) or blank (`''`) `text`
    - change to lowercase
    - remove special characters
    - remove numbers
    - remove punctuation
  - filter the data to remove short tweets (number of words below a user-defined threshold)
    - tweets with fewer than three words will be removed (the threshold is set by `min_num_words_tweet` in the **User Inputs** section)

  and save the processed tweet `text` in a separate column (named `text_processed`) from the original `text`
- save the data after processing and filtering to a `.parquet` file on S3 in the `/processed` prefix

## User Inputs

In [5]:
# --- S3 layout ---
# key prefix of the project inside the bucket (the leading "/" is dropped when keys are built)
path_to_folder = "/datasets/twitter/kinesis-demo/"

# --- input: data with no unwanted terms in tweets ---
processed_data_dir = "../data/processed"  # local folder holding the archive and its contents
proc_zip_fname = "processed_data.zip"  # name of the archive to download

# --- output: processed data ---
processed_file_name = "processed_text"  # stem of the exported file
min_num_words_tweet = 3  # tweets with fewer words than this are filtered out

# --- switches ---
upload_to_s3 = True
cleanup_local_files = True

In [6]:
# Bucket name is read from the environment; it falls back to an empty string when unset
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

try:
    # Preferred path: build the client from the local "default" AWS profile
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    # Only a missing "default" profile has a fallback. Any other failure is
    # re-raised, so s3_client / aws_region are never silently left undefined
    # (which would surface later as a confusing NameError).
    if str(e) != "The config profile (default) could not be found":
        raise
    # Fallback: region from the environment, credentials resolved by boto3 itself
    aws_region = os.getenv("AWS_REGION")
    s3_client = boto3.client("s3", region_name=aws_region)
    print("Retrieved AWS credentials from .env file")

Retrieved AWS credentials from .env file


In [7]:
# Column -> dtype mapping applied after loading, grouped by target dtype.
dtypes_dict = {
    # Text columns. Several boolean-like flags (contributors, is_quote_status,
    # favorited, retweeted, user_protected, user_verified) are kept as strings
    # rather than pd.BooleanDtype().
    **{
        column: pd.StringDtype()
        for column in (
            "id",
            "geo",
            "coordinates",
            "place",
            "contributors",
            "is_quote_status",
            "favorited",
            "retweeted",
            "source",
            "in_reply_to_user_id",
            "in_reply_to_screen_name",
            "source_text",
            "place_id",
            "place_url",
            "place_place_type",
            "place_name",
            "place_full_name",
            "place_country_code",
            "place_country",
            "place_bounding_box_type",
            "place_bounding_box_coordinates",
            "place_attributes",
            "coords_type",
            "coords_lon",
            "coords_lat",
            "geo_type",
            "geo_lon",
            "geo_lat",
            "user_name",
            "user_screen_name",
            "user_protected",
            "user_verified",
            "user_contributors_enabled",
            "user_location",
            "retweeted_tweet",
            "tweet_text_urls",
            "tweet_text_hashtags",
            "tweet_text_usernames",
            "text",
        )
    },
    # Nullable 32-bit integer counters
    **{
        column: pd.Int32Dtype()
        for column in (
            "quote_count",
            "reply_count",
            "retweet_count",
            "favorite_count",
            "user_followers",
            "user_friends",
            "user_listed",
            "user_favourites",
            "user_statuses",
            "num_urls_in_tweet_text",
            "num_users_in_tweet_text",
            "num_hashtags_in_tweet_text",
        )
    },
    # Nullable boolean flags
    **{
        column: pd.BooleanDtype()
        for column in (
            "contains_wanted_text",
            "contains_wanted_text_case_sensitive",
            "contains_multi_word_wanted_text",
            "contains_crypto_terms",
            "contains_religious_terms",
            "contains_inappropriate_terms",
            "contains_video_games_terms",
            "contains_misc_unwanted_terms",
            "contains_non_english_terms",
        )
    },
    # Timestamps
    **{column: "datetime64[ns]" for column in ("created_at", "user_joined")},
}

# Zip-archive file name derived from processed_file_name
# (not referenced in the cells visible here — presumably used when zipping/uploading the export; confirm)
proc_text_zip_fname = f"{processed_file_name}.zip"

In [8]:
def show_pyspark_df(df: pdf, nrows: int = 5) -> pd.DataFrame:
    """Return a Pandas preview of a PySpark DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame to preview.
    nrows : int, default 5
        Maximum number of rows to collect.

    Returns
    -------
    pd.DataFrame
        At most ``nrows`` rows of ``df``, converted with ``toPandas()``.
    """
    # Apply the limit before converting so only the preview rows are collected
    preview = df.limit(nrows)
    return preview.toPandas()

## PySpark Setup

In [9]:
%%time
# Default Spark configuration; no settings are overridden here
conf = SparkConf()

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 10.7 µs


Start a Spark session

In [10]:
%%time
# Configure the session builder first, then create (or reuse) the session
session_builder = SparkSession.builder.config(conf=conf).appName("schema_test")
spark = session_builder.getOrCreate()

CPU times: user 8.3 ms, sys: 10.2 ms, total: 18.5 ms
Wall time: 3.48 s


## Get Data

We will start by downloading the processed `.zip` file from S3 and extracting all the contained `.parquet` files into the same directory as the `.zip` file

In [11]:
%%time
# Local path of the archive; download and extraction are skipped when it already exists
proc_zip_fpath = os.path.join(processed_data_dir, proc_zip_fname)
if not os.path.exists(proc_zip_fpath):
    download_file_from_s3(
        s3_client,
        s3_bucket_name,
        processed_data_dir,
        proc_zip_fname,
        aws_region,
        # S3 key: the folder prefix without its leading "/", plus the processed/ sub-prefix
        f"{path_to_folder[1:]}processed/{proc_zip_fname}",
    )
    extract_zip_file(proc_zip_fpath, processed_data_dir)
# All gzip-compressed parquet files sitting next to the archive
proc_files = glob(f"{processed_data_dir}/*.parquet.gzip")

Started downloading processed data zip file from datasets/twitter/kinesis-demo/processed/processed_data.zip to ../data/processed/processed_data.zip at 2022-10-26 00:00:53.332...
Done downloading in 0.618 seconds.
Started extracting filtered data parquet files from processed data zip file to ../data/processed at 2022-10-26 00:00:53.951...
Done extracting in 0.101 seconds.
CPU times: user 189 ms, sys: 94.5 ms, total: 283 ms
Wall time: 898 ms


We will now use the `pandas` API on Spark to load all the `.parquet` files into a single PySpark DataFrame. This could equivalently be done using native PySpark capabilities with
```python
def Zconcat(dfs):
    """Concatenate two PySpark DataFrames."""
    return reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)


def union_all(dfs):
    """Concatenate multiple PySpark DataFrames."""
    columns = reduce(lambda x, y: set(x).union(set(y)), [i.columns for i in dfs])

    for i in range(len(dfs)):
        d = dfs[i]
        for c in columns:
            if c not in d.columns:
                d = d.withColumn(c, F.lit(None))
        dfs[i] = d

    return Zconcat(dfs)


df = union_all([spark.read.parquet(f) for f in proc_files])
```

but we will instead perform this using the `pandas` API on Spark, as shown below

In [12]:
%%time
# Read every parquet file with "id" as the index, then turn it back into a
# regular column and apply the declared dtypes
df_indexed = ps.read_parquet(proc_files, index_col=["id"])
df = df_indexed.reset_index().astype(dtypes_dict)
df_head = df.head()
with pd.option_context("display.max_columns", None):
    display(df_head)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False
2,1479875735644897283,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:44,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,David M. Comfort,DavidmComfort,570,3585,44,6917,7535,False,False,False,2010-03-22 03:53:53,"Los Angeles, California",no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False
3,1479875741395431424,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:46,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Andy 💙,andrewtug,56,430,0,5860,4479,False,False,False,2009-10-31 06:30:41,,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False
4,1479875752443138051,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:48,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Sarah Gott 💙,SarahGott7,55,28,0,19689,6634,False,False,False,2019-02-07 18:46:37,"Grantham, England",no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False


CPU times: user 341 ms, sys: 54.8 ms, total: 396 ms
Wall time: 6.03 s


The column datatypes are shown below

In [13]:
%%time
# One row per column with its dtype; lift the row limit so none are elided
column_dtypes = df.dtypes.rename("dtype").to_frame()
with pd.option_context("display.max_rows", None):
    display(column_dtypes)

Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 15.6 ms, sys: 3.58 ms, total: 19.2 ms
Wall time: 20.3 ms


Get the number of rows (retrieved tweets) in the data, number of pyspark `DataFrame` partitions and the number of workers on the host (single-node) cluster

In [14]:
%%time
# Size of the loaded data and the number of CPUs this process may run on
raw_summary = (
    f"Raw data contains {len(df):,} rows and {len(df.columns):,} columns "
    f"on a host with {len(os.sched_getaffinity(0))} CPUs"
)
print(raw_summary)

Raw data contains 65,639 rows and 62 columns on a host with 12 CPUs
CPU times: user 767 µs, sys: 479 µs, total: 1.25 ms
Wall time: 398 ms


## Process Data

Process the tweet text using the `pandas` API on `PySpark`

In [15]:
%%time
# Clean the tweet text in two stages:
#   text_stripped  - original text with outer whitespace removed and inner
#                    whitespace runs collapsed to one space
#   text_processed - lower-cased text_stripped with everything except letters removed
# All patterns are raw strings and are passed with regex=True explicitly, so they
# do not rely on Python's handling of invalid escapes or on the default `regex` value.
df_processed = (
    df
    # drop tweets with NaN in the text column
    .dropna(subset=["text"])
    # remove leading, trailing spaces and multiple spaces with single space
    .assign(
        text_stripped=lambda df: (
            df["text"].str.strip().str.replace(r"\s+", " ", regex=True)
        )
    )
    # remove blank tweets
    .query("text_stripped != ''")
    .assign(
        text_processed=lambda df: (
            df["text_stripped"]
            # lowercase
            .str.lower()
            # remove special characters; the class was previously written as
            # "A-z", a range that also covers [ \ ] ^ _ and the backtick, so
            # those characters were not treated as special
            .str.replace(r"[^a-zA-Z]", " ", regex=True)
            # remove numbers
            .str.replace(r"\d+", " ", regex=True)
            # remove punctuation
            .str.replace(r"[^\w\s]", "", regex=True)
        )
    )
)
print(
    f"Processed data contains {len(df_processed):,} rows and "
    f"{len(df_processed.columns):,} columns."
)
with pd.option_context("display.max_columns", None, "display.max_colwidth", None):
    display(
        df_processed[
            ["id", "source_text", "text", "text_stripped", "text_processed"]
        ].head(2)
    )

Processed data contains 65,639 rows and 64 columns.


Unnamed: 0,id,source_text,text,text_stripped,text_processed
0,1479875654157885440,Twitter for Android,The honeycomb is almost complete!Tune in at around ~9am ET (14:00 UTC) as our team unfolds the final wing of Webb's massive primary mirror:,The honeycomb is almost complete!Tune in at around ~9am ET (14:00 UTC) as our team unfolds the final wing of Webb's massive primary mirror:,the honeycomb is almost complete tune in at around am et utc as our team unfolds the final wing of webb s massive primary mirror
1,1479875728195862529,Twitter Web App,"The James Webb Space Telescope is now fully deployed. This is a remarkable engineering achievement that 99 percent of the world will not appreciate. But those of us who know, know. And we are in awe.","The James Webb Space Telescope is now fully deployed. This is a remarkable engineering achievement that 99 percent of the world will not appreciate. But those of us who know, know. And we are in awe.",the james webb space telescope is now fully deployed this is a remarkable engineering achievement that percent of the world will not appreciate but those of us who know know and we are in awe


CPU times: user 97.9 ms, sys: 15.9 ms, total: 114 ms
Wall time: 10 s


The distribution of original tweets and re-tweets is shown below

In [16]:
%%time
# Absolute and relative frequency of each retweeted_tweet value, shown side by side
retweet_flags = df_processed["retweeted_tweet"]

# number of tweets per value
num_tweets_by_flag = (
    retweet_flags.value_counts()
    .rename("num_tweets")
    .reset_index()
    .rename(columns={"index": "retweeted_tweet"})
)
# fraction of tweets per value
frac_tweets_by_flag = (
    retweet_flags.value_counts(normalize=True)
    .rename("frac_tweets")
    .reset_index()
    .rename(columns={"index": "retweeted_tweet"})
)

display(
    num_tweets_by_flag.merge(frac_tweets_by_flag, on="retweeted_tweet", how="left")
)

Unnamed: 0,retweeted_tweet,num_tweets,frac_tweets
0,no,65580,0.999101
1,yes,59,0.000899


CPU times: user 49 ms, sys: 12.3 ms, total: 61.3 ms
Wall time: 4.2 s


**Notes**
1. More than 99% of valid tweets for this use-case are original tweets. For the current use-case, we can keep the retweets, but retweets are not needed when extracting the sentiment (in the `7_*.ipynb` notebook). Based on the `retweeted_tweet` column, we'll need to drop these tweets. For now, we will leave the retweets in the processed data.

We'll now extract the approximate number of words in each *original* tweet (not the processed tweet). We will use this to optionally filter the dataset to remove short tweets (whose length is below some threshold we specify). To get the words, we'll split the text on whitespace. Since the `pandas` API on PySpark does not support splitting a string based on multiple occurrences of a separator, we'll convert this `DataFrame` to PySpark and use PySpark-native methods to perform this split.

Below, we convert the `pandas` on PySpark DataFrame to a PySpark DataFrame

In [17]:
%%time
# Put "id" in the index and name it via index_col so to_spark() emits it as a column
df_processed_by_id = df_processed.set_index("id")
dfpy = df_processed_by_id.to_spark(index_col="id")
dfpy_preview = show_pyspark_df(dfpy, 2)
with pd.option_context("display.max_columns", None):
    display(dfpy_preview)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...


CPU times: user 50.6 ms, sys: 6.54 ms, total: 57.1 ms
Wall time: 3 s


We now use a PySpark native function `.split()` to extract the words from the `text` of the tweet

In [18]:
%%time
# Tokenise the original tweet text: trim both ends, then split on runs of whitespace.
# The pattern is a raw string: "\s" in a normal string literal is an invalid escape
# sequence that Python only tolerates with a warning.
# NOTE(review): F.trim is documented as trimming space characters only — confirm
# that tabs/newlines at either end of a tweet cannot add empty tokens to `words`.
dfpy = dfpy.withColumn("text_trimmed", F.trim(F.col("text"))).withColumn(
    "words", F.split("text_trimmed", r"\s+")
)
print(f"Number of rows in processed data = {dfpy.count():,}")

Number of rows in processed data = 65,639
CPU times: user 6.2 ms, sys: 0 ns, total: 6.2 ms
Wall time: 1.29 s


In [19]:
%%time
# Size, partitioning and CPU summary for the PySpark DataFrame
processed_summary = (
    f"Processed data contains {dfpy.count():,} rows and {len(dfpy.columns):,} columns "
    f"in {dfpy.rdd.getNumPartitions()} partitions, on a host with "
    f"{len(os.sched_getaffinity(0))} CPUs"
)
print(processed_summary)

# Two-row preview with every column visible
dfpy_words_preview = show_pyspark_df(dfpy, 2)
with pd.option_context("display.max_columns", None):
    display(dfpy_words_preview)

Processed data contains 65,639 rows and 66 columns in 12 partitions, on a host with 12 CPUs


  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in..."
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ..."


CPU times: user 56.2 ms, sys: 2.71 ms, total: 58.9 ms
Wall time: 4.73 s


We'll now convert the DataFrame back to the `pandas` API on PySpark and continue with data processing where we will count the number of words in the `words` column

In [20]:
%%time
# Back to the pandas API on Spark; num_words is the length of each `words` entry
df_words = dfpy.pandas_api().assign(
    num_words=lambda frame: frame["words"].str.len()
)
df_words_preview = df_words.head(4)
display(df_words_preview)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
2,1479875735644897283,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:44,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,David M. Comfort,DavidmComfort,570,3585,44,6917,7535,False,False,False,2010-03-22 03:53:53,"Los Angeles, California",no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
3,1479875741395431424,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:46,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Andy 💙,andrewtug,56,430,0,5860,4479,False,False,False,2009-10-31 06:30:41,,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36


CPU times: user 149 ms, sys: 1.76 ms, total: 150 ms
Wall time: 10.2 s


We can now optionally filter the data based on the number of words. We will choose to keep tweets that are of a minimum length of 3 words (controlled by the `min_num_words_tweet` variable in the **User Inputs** section) and drop shorter ones

In [21]:
%%time
# Keep only tweets whose word count reaches the user-defined minimum
df_words_long = df_words.query(f"num_words >= {min_num_words_tweet}")

filter_summary = (
    "Number of rows in processed data, after filtering out tweets based on "
    f"length of text = {len(df_words_long):,}"
)
print(filter_summary)

df_words_long_preview = df_words_long.head(6)
display(df_words_long_preview)

Number of rows in processed data, after filtering out tweets based on length of text = 65,609


  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
2,1479875735644897283,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:44,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,David M. Comfort,DavidmComfort,570,3585,44,6917,7535,False,False,False,2010-03-22 03:53:53,"Los Angeles, California",no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
3,1479875741395431424,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:46,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Andy 💙,andrewtug,56,430,0,5860,4479,False,False,False,2009-10-31 06:30:41,,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
4,1479875752443138051,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:48,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Sarah Gott 💙,SarahGott7,55,28,0,19689,6634,False,False,False,2019-02-07 18:46:37,"Grantham, England",no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36
5,1479875776551993349,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:54,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,алекса (renaissance man)),globalism0,126,701,0,14802,10356,False,False,False,2021-02-12 10:04:33,глобалистички естаблишмент,no,https://t.co/UtiH2Wn0Zs|https://t.co/yGRWxKYyro,,,2,0,0,Watch the James Webb Space Telescope soar thro...,True,True,True,False,False,False,False,False,False,Watch the James Webb Space Telescope soar thro...,watch the james webb space telescope soar thro...,Watch the James Webb Space Telescope soar thro...,"[Watch, the, James, Webb, Space, Telescope, so...",13


CPU times: user 116 ms, sys: 12.7 ms, total: 129 ms
Wall time: 6.44 s


With data processing and filtering now complete, we'll update the datatype for the processed text columns that were added in this notebook

In [22]:
# Declare dtypes for the columns created in this notebook. Note that `words`
# (the token list produced by the split step) is declared as a string dtype.
dtypes_dict.update(
    text_trimmed=pd.StringDtype(),
    text_stripped=pd.StringDtype(),
    text_processed=pd.StringDtype(),
    words=pd.StringDtype(),
    num_words=pd.Int32Dtype(),
)

In [23]:
%%time
# Apply the full dtype mapping (original and newly added columns), then list the result
df_words_long = df_words_long.astype(dtypes_dict)
final_dtypes = df_words_long.dtypes.rename("dtype").to_frame()
with pd.option_context("display.max_rows", None):
    display(final_dtypes)

Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 154 ms, sys: 38.4 ms, total: 193 ms
Wall time: 452 ms


## Export Processed Data

The data is now processed and ready for machine learning model development using big-data ML frameworks. This processed data will now be exported locally to a separate `.parquet` file, which will then be uploaded to the S3 bucket. The filepath of the `.parquet` file is shown below.

In [24]:
# Local destination of the processed-text parquet export.
filepath = "{}/{}.parquet.gzip".format(processed_data_dir, processed_file_name)
print(filepath)

../data/processed/processed_text.parquet.gzip


### Save Processed and Filtered Data to `.parquet` File

In [25]:
%%time
# Move `id` into the index, then write the frame to `filepath` as
# gzip-compressed parquet with the index stored under the `id` column name.
# NOTE(review): `index_col` is an argument of the pandas-on-Spark `to_parquet`;
# this assumes `df_words_long` is a pandas-on-Spark DataFrame - confirm.
df_words_indexed_by_id = df_words_long.set_index("id")
df_words_indexed_by_id.to_parquet(filepath, index_col="id", compression="gzip")

CPU times: user 9.84 ms, sys: 851 µs, total: 10.7 ms
Wall time: 5.78 s


Next, we will demonstrate loading this `.parquet` file with processed data into a `DataFrame` using two big-data frameworks - PySpark (with both its native API and its `pandas` API) and Dask.

### Demonstrate Reloading Saved `.parquet` File with PySpark

In [26]:
%%time
# Read the exported parquet data back in through the Spark session and preview
# it with every column visible.
spark_reader = spark.read
df_words_long_reloaded = spark_reader.parquet(filepath)
with pd.option_context("display.max_columns", None):
    # NOTE(review): the second argument is presumably the number of rows to
    # preview - confirm against `show_pyspark_df`.
    reloaded_preview = show_pyspark_df(df_words_long_reloaded, 2)
    display(reloaded_preview)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36


CPU times: user 111 ms, sys: 4.44 ms, total: 116 ms
Wall time: 269 ms


Get a `pandas` `DataFrame` version of the Spark schema (the information shown by `df.printSchema()`) for the PySpark `DataFrame`

In [27]:
%%time
# Tabulate the Spark schema: one record per schema field holding its name, its
# Spark data type and whether it is nullable, indexed by the field name.
schema_records = [
    dict(
        name=schema_field.name,
        dtype=schema_field.dataType,
        nullable=schema_field.nullable,
    )
    for schema_field in df_words_long_reloaded.schema.fields
]
df_dtypes_pyspark = pd.DataFrame.from_records(schema_records).set_index("name")
with pd.option_context("display.max_rows", None):
    display(df_dtypes_pyspark)

Unnamed: 0_level_0,dtype,nullable
name,Unnamed: 1_level_1,Unnamed: 2_level_1
id,StringType(),True
geo,StringType(),True
coordinates,StringType(),True
place,StringType(),True
contributors,StringType(),True
is_quote_status,StringType(),True
quote_count,IntegerType(),True
reply_count,IntegerType(),True
retweet_count,IntegerType(),True
favorite_count,IntegerType(),True


CPU times: user 6.9 ms, sys: 0 ns, total: 6.9 ms
Wall time: 6.98 ms


### Demonstrate Reloading Saved `.parquet` File with `pandas` API on PySpark

In [28]:
%%time
# Reload with the pandas API on Spark: read `id` in as the index, turn it back
# into a regular column and re-apply the dtypes registered in `dtypes_dict`.
df_words_long_reloaded_ps = (
    ps.read_parquet(filepath, index_col="id")
    .reset_index()
    .astype(dtypes_dict)
)
# Preview the first two rows with every column visible.
with pd.option_context("display.max_columns", None):
    display(df_words_long_reloaded_ps.head(2))
# List the dtype of every column without truncating the rows of the listing.
reloaded_ps_dtypes = df_words_long_reloaded_ps.dtypes.rename("dtype").to_frame()
with pd.option_context("display.max_rows", None):
    display(reloaded_ps_dtypes)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36


Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 297 ms, sys: 36.4 ms, total: 333 ms
Wall time: 1.02 s


### Demonstrate Reloading Saved `.parquet` File with `dask`

In [29]:
%%time
# Reload with Dask and re-apply the dtypes registered in `dtypes_dict`.
dask_frame_as_read = dd.read_parquet(filepath)
df_words_long_reloaded_dd = dask_frame_as_read.astype(dtypes_dict)
# Preview the first two rows with every column visible.
with pd.option_context("display.max_columns", None):
    display(df_words_long_reloaded_dd.head(2))
# List the dtype of every column without truncating the rows of the listing.
with pd.option_context("display.max_rows", None):
    reloaded_dd_dtypes = df_words_long_reloaded_dd.dtypes.rename("dtype").to_frame()
    display(reloaded_dd_dtypes)

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479875654157885440,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:25,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Bibrata Kr Das,bibrata_das2,0,22,0,6,5,False,False,False,2022-01-07 06:43:02,,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,True,True,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24
1,1479875728195862529,,,,,False,0,0,0,0,False,False,2022-01-08 18:00:43,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Jon Pennycook,jonpsp,170,898,28,8085,52406,False,False,False,2012-02-29 22:48:53,The South of England,no,,,,0,0,0,The James Webb Space Telescope is now fully de...,True,True,True,False,False,False,False,False,False,The James Webb Space Telescope is now fully de...,the james webb space telescope is now fully de...,The James Webb Space Telescope is now fully de...,"[The, James, Webb, Space, Telescope, is, now, ...",36


Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 218 ms, sys: 13.7 ms, total: 232 ms
Wall time: 288 ms


## Zip and Upload Processed Data `.parquet` Files to S3 Bucket

We'll now
- create a `.zip` file from the `.parquet` file (PySpark creates a folder of `.parquet` files, not a single file like `pandas` did in the previous notebook)
- upload the `.zip` file to S3 in the `/processed` prefix
- delete the local `.parquet` folder with processed data
- delete the local `.zip` file prepared from the `.parquet` folder

In [30]:
# Preview the local paths and the S3 object key involved in zipping and
# uploading the processed data.
print(filepath)
print(path_to_folder)
print(processed_data_dir)
print(proc_text_zip_fname)
# Strip the slashes surrounding the folder prefix rather than slicing off its
# first and last characters, so that the previewed key is still correct when
# `path_to_folder` lacks a leading or trailing "/".
print(f"{path_to_folder.strip('/')}/processed/{proc_text_zip_fname}")
print(os.path.join(processed_data_dir, proc_text_zip_fname))

../data/processed/processed_text.parquet.gzip
/datasets/twitter/kinesis-demo/
../data/processed
processed_text.zip
datasets/twitter/kinesis-demo/processed/processed_text.zip
../data/processed/processed_text.zip


In [31]:
%%time
# Zip the exported processed data and upload the archive to S3 (when
# `upload_to_s3` is set), then delete the local copies (when
# `cleanup_local_files` is set).
if upload_to_s3:
    # create zip of all .parquet.gzip processed data files
    create_zip_file_from_folder(processed_data_dir, proc_text_zip_fname, filepath)

    # upload zip file to S3 bucket; the slashes surrounding the folder prefix are
    # stripped (rather than slicing off its first and last characters) so that
    # the object key is still correct when `path_to_folder` lacks a leading or
    # trailing "/"
    upload_file_to_s3(
        aws_region,
        processed_data_dir,
        proc_text_zip_fname,
        s3_bucket_name,
        f"{path_to_folder.strip('/')}/processed/{proc_text_zip_fname}",
    )
    print("\nUploaded zipped file to S3 bucket")

if cleanup_local_files:
    # delete folder with locally exported parquet files containing processed data
    # exported by PySpark
    shutil.rmtree(filepath)
    print("Deleted local .parquet.gzip files with processed data.")
    # delete local zip file (it is only created above when uploading to S3)
    if upload_to_s3:
        os.remove(os.path.join(processed_data_dir, proc_text_zip_fname))
        print("Deleted local .zip file created from all filtered data files.")


Uploaded zipped file to S3 bucket
Deleted local .parquet.gzip files with processed data.
Deleted local .zip file created from all filtered data files.
CPU times: user 147 ms, sys: 44.1 ms, total: 191 ms
Wall time: 1.39 s


## Cleanup

We'll now
- delete the local `.parquet` folder with prepared data that was processed in this notebook
- delete the local `.zip` file (containing the individual `.parquet` files of prepared data) that was downloaded from S3

In [32]:
# Show the local path of the zip file and the first five of the parquet files
# that the cleanup step below will delete.
downloaded_zip_path = os.path.join(processed_data_dir, proc_zip_fname)
print(downloaded_zip_path)
proc_files[:5]

../data/processed/processed_data.zip


['../data/processed/filtered__2022010120.parquet.gzip',
 '../data/processed/filtered__2022010408.parquet.gzip',
 '../data/processed/filtered__2022010401.parquet.gzip',
 '../data/processed/filtered__2022010306.parquet.gzip',
 '../data/processed/filtered__2022010103.parquet.gzip']

In [33]:
%%time
# Remove the local input files of this notebook when cleanup is enabled.
if cleanup_local_files:
    # delete locally exported parquet files, one at a time
    for proc_file in proc_files:
        os.remove(proc_file)
    print("Deleted local .parquet.gzip files with filtered data.")

    # delete local zip file
    downloaded_zip_path = os.path.join(processed_data_dir, proc_zip_fname)
    os.remove(downloaded_zip_path)
    print("Deleted local .zip file created from all filtered data files.")

Deleted local .parquet.gzip files with filtered data.
Deleted local .zip file created from all filtered data files.
CPU times: user 2.14 ms, sys: 3.31 ms, total: 5.45 ms
Wall time: 7.41 ms


---

<span style="float:left;">
    <a href="./4-filter-data/notebooks/4_filter_data.ipynb"><< 4 - Filter Data</a>
</span>

<span style="float:right;">
    <a href="./6-split-data/notebooks/6_split_data.ipynb">6 - Create Data Splits >></a>
</span>