# Data Processing

In [1]:
%%time
!pip3 freeze | grep -E 'boto3|s3fs|nltk|distributed|dask==|dask-m|black==|jupyter-server|pandas'
!conda list -n spark | grep -E 'ipykernel|openjdk|pyspark'

black==22.6.0
boto3==1.24.55
dask==2022.8.0
dask-ml==2022.5.27
distributed==2022.8.0
nb-black==1.0.7
nltk==3.7
pandas==1.4.3
s3fs==0.4.2
ipykernel                 6.15.1             pyh210e3f2_0    conda-forge
openjdk                   17.0.3               h1e1ecb3_1    conda-forge
pyspark                   3.3.0              pyhd8ed1ab_0    conda-forge
CPU times: user 48.6 ms, sys: 11 ms, total: 59.5 ms
Wall time: 2.5 s


In [2]:
%load_ext lab_black

In [3]:
import os
import shutil
from datetime import datetime
from functools import reduce
from glob import glob
import zipfile

import boto3
import dask.dataframe as dd
import pandas as pd
from pyspark import SparkConf, SparkFiles
import pyspark.pandas as ps
from pyspark.sql import Column, SparkSession, functions as F, types as T
from pyspark.sql.dataframe import DataFrame as pdf
from pyspark.sql.window import Window



## About

The streamed tweet data and metadata will now be loaded with PySpark and processed for use in sentiment classification with a big-data ML framework:
- load all the hourly `.parquet` data files saved to S3 objects in the `/processed` prefix, into a single PySpark DataFrame
- perform the following data processing on the `text` of the tweet using PySpark
  - download the `.zip` file of processed data from S3
  - extract all contained `.parquet` files (without subfolders)
  - read all `.parquet` files into single PySpark DataFrame
  - process data using PySpark
    - drop leading and trailing whitespaces
    - replace occurrences of multiple consecutive whitespaces by a single whitespace
    - drop tweets with missing (`NaN`s) or blank (`''`) `text`
    - change to lowercase
    - remove special characters
    - remove numbers
    - remove punctuation
  - (optional) filter the data to remove short tweets (number of words below a user-defined threshold)

  and save the processed tweet `text` in a separate column (named `text_processed`) from the original `text`
- save the data after processing and filtering to a `.parquet` file on S3 in the `/processed` prefix

### Pre-Requisites
1. This notebook has been run from inside a `conda` environment on a system Python version of 3.10, with the following Python libraries installed
   ```bash
   - pip==22.2.2
   - pip:
     - nb_black==1.0.7
     - boto3
     - s3fs
     - nltk==3.7
     - distributed==2022.8.0
     - dask[dataframe]==2022.8.0
     - dask[distributed]==2022.8.0
     - dask-ml==2022.5.27
   - conda==4.3.16
   - conda:
     - ipykernel==6.15.1
     - openjdk==17.0.3
     - pyspark==3.3.0
    ```
2. If this notebook is being run on a Sagemaker notebook instance, then only `nltk` should need to be installed manually using
   ```bash
   !pip3 install nltk==3.7
   ```

   in the first code cell of this notebook.

## User Inputs

In [4]:
# S3 key prefix of the project data; the leading "/" is dropped before the
# prefix is used to list objects in the bucket
path_to_folder = "/datasets/twitter/kinesis-demo/"

# data with no unwanted terms in tweets
# local folder that receives the downloaded zip file and its extracted contents
processed_data_dir = "data/processed"
# local file name under which the downloaded zip file is stored
proc_zip_fname = "processed_data.zip"
# (currently unused) subset of columns that could be loaded instead of all columns
# cols_to_load = [
#     "id",
#     "contributors",
#     "source",
#     "in_reply_to_screen_name",
#     "source_text",
#     "created_at",
#     "place_id",
#     "place_url",
#     "place_place_type",
#     "place_country_code",
#     "place_country",
#     "user_name",
#     "user_screen_name",
#     "user_followers",
#     "user_friends",
#     "user_listed",
#     "user_favourites",
#     "user_statuses",
#     "user_protected",
#     "user_verified",
#     "user_joined",
#     "user_location",
#     "retweeted_tweet",
#     "is_quote_status",
#     "quote_count",
#     "reply_count",
#     "favorite_count",
#     "favorited",
#     "text",
# ]

# processed data
# base name (without extension) used to build the processed-text zip file name
processed_file_name = "processed_text"
# minimum number of whitespace-separated words a tweet needs in order to be kept
min_num_words_tweet = 10

In [5]:
# name of the S3 bucket, read from the environment (None when the variable is unset)
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME")
# boto3 session created from the AWS profile named "default"
session = boto3.Session(profile_name="default")
# S3 client; download_file_from_s3() reads this module-level name to list objects
s3_client = session.client("s3")

# Target dtype of every column, written as consecutive runs of columns that
# share a dtype so that the key order of the resulting dict is preserved.
# The columns contributors, is_quote_status, favorited, retweeted,
# user_protected and user_verified are deliberately typed as strings here,
# with pd.BooleanDtype() as the noted alternative.
_dtype_runs = [
    (
        pd.StringDtype,
        [
            "id",
            "geo",
            "coordinates",
            "place",
            "contributors",
            "is_quote_status",
        ],
    ),
    (
        pd.Int32Dtype,
        [
            "quote_count",
            "reply_count",
            "retweet_count",
            "favorite_count",
        ],
    ),
    (
        pd.StringDtype,
        [
            "favorited",
            "retweeted",
            "source",
            "in_reply_to_user_id",
            "in_reply_to_screen_name",
            "source_text",
            "place_id",
            "place_url",
            "place_place_type",
            "place_name",
            "place_full_name",
            "place_country_code",
            "place_country",
            "place_bounding_box_type",
            "place_bounding_box_coordinates",
            "place_attributes",
            "coords_type",
            "coords_lon",
            "coords_lat",
            "geo_type",
            "geo_lon",
            "geo_lat",
            "user_name",
            "user_screen_name",
        ],
    ),
    (
        pd.Int32Dtype,
        [
            "user_followers",
            "user_friends",
            "user_listed",
            "user_favourites",
            "user_statuses",
        ],
    ),
    (
        pd.StringDtype,
        [
            "user_protected",
            "user_verified",
            "user_contributors_enabled",
            "user_location",
            "retweeted_tweet",
            "tweet_text_urls",
            "tweet_text_hashtags",
            "tweet_text_usernames",
        ],
    ),
    (
        pd.Int32Dtype,
        [
            "num_urls_in_tweet_text",
            "num_users_in_tweet_text",
            "num_hashtags_in_tweet_text",
        ],
    ),
    (pd.StringDtype, ["text"]),
    (
        pd.BooleanDtype,
        [
            "contains_wanted_text",
            "contains_wanted_text_case_sensitive",
            "contains_multi_word_wanted_text",
            "contains_crypto_terms",
            "contains_religious_terms",
            "contains_inappropriate_terms",
            "contains_video_games_terms",
            "contains_misc_unwanted_terms",
            "contains_non_english_terms",
        ],
    ),
]

# instantiate a separate dtype object for every column
dtypes_dict = {
    column: make_dtype() for make_dtype, columns in _dtype_runs for column in columns
}

# name of the zip archive for the processed text (processed_file_name + ".zip")
proc_text_zip_fname = f"{processed_file_name}.zip"

In [6]:
def show_pyspark_df(df: pdf, nrows: int = 5) -> pd.DataFrame:
    """Return the leading rows of a PySpark DataFrame as a Pandas DataFrame.

    Parameters
    ----------
    df : PySpark DataFrame to preview
    nrows : number of rows passed to ``DataFrame.limit()`` (default 5)

    Returns
    -------
    Pandas DataFrame holding the limited selection of rows.
    """
    # restrict the row count before converting, so only the preview is converted
    preview = df.limit(nrows)
    return preview.toPandas()


def create_zip_file_from_folder(
    processed_data_dir: str, zip_fname: str, filepath: str
) -> None:
    """Create a flat zip archive from all files found under a folder.

    Parameters
    ----------
    processed_data_dir : folder in which the zip archive is created
    zip_fname : file name of the zip archive
    filepath : folder that is walked recursively to collect the files

    Notes
    -----
    Every file is stored under its base name only (no sub-folders inside the
    archive), so files with the same name in different sub-folders end up as
    duplicate members. The archive being written is never added to itself,
    which matters when ``processed_data_dir`` lies inside ``filepath``.
    """
    zip_path = os.path.join(processed_data_dir, zip_fname)
    # resolved once so that every walked file can be compared against it
    zip_realpath = os.path.realpath(zip_path)
    with zipfile.ZipFile(zip_path, "w") as zip_obj:
        # iterate over all the files in the folder and its sub-folders
        for folder_name, _, filenames in os.walk(filepath):
            for filename in filenames:
                file_path = os.path.join(folder_name, filename)
                # the archive already exists on disk at this point, so the
                # walk can find it; skip it instead of zipping it into itself
                if os.path.realpath(file_path) == zip_realpath:
                    continue
                # store the file under its base name only
                zip_obj.write(file_path, os.path.basename(file_path))


def upload_file_to_s3(
    aws_region: str,
    processed_data_dir: str,
    fname: str,
    s3_bucket_name: str,
    s3_key: str,
) -> None:
    """Upload a local file to the given key of an S3 bucket.

    Parameters
    ----------
    aws_region : region name used to create the boto3 S3 resource
    processed_data_dir : local folder containing the file
    fname : name of the file inside ``processed_data_dir``
    s3_bucket_name : name of the destination bucket
    s3_key : destination key inside the bucket
    """
    # the upload goes through the client attached to the S3 resource
    transfer_client = boto3.resource("s3", region_name=aws_region).meta.client
    local_filepath = f"{processed_data_dir}/{fname}"
    transfer_client.upload_file(local_filepath, s3_bucket_name, s3_key)


def download_file_from_s3(
    s3_bucket_name: str,
    path_to_folder: str,
    data_dir: str,
    fname: str,
    aws_region: str,
) -> None:
    """Download a file from the ``processed/`` prefix of an S3 bucket.

    The objects directly under ``<path_to_folder>processed/`` are listed using
    the module-level ``s3_client``. The object whose file name equals
    ``fname`` is downloaded; when no object has that name, the first listed
    object is downloaded instead. The file is stored locally as
    ``<data_dir>/<fname>`` and progress messages with timings are printed.

    Parameters
    ----------
    s3_bucket_name : name of the bucket to download from
    path_to_folder : key prefix of the project data; its first character (the
        leading ``/``) is dropped before it is used as an S3 prefix
    data_dir : local folder in which the file is stored (created if missing)
    fname : local file name, also used to pick the matching S3 object
    aws_region : region name used to create the boto3 S3 resource

    Raises
    ------
    FileNotFoundError
        If no object exists under the ``processed/`` prefix.
    """
    dest_filepath = os.path.join(data_dir, fname)
    s3_prefix = f"{path_to_folder[1:]}processed/"
    listing = s3_client.list_objects_v2(
        Bucket=s3_bucket_name,
        Delimiter="/",
        Prefix=s3_prefix,
    )
    # the response has no "Contents" entry when nothing matches the prefix
    object_keys = [obj["Key"] for obj in listing.get("Contents", [])]
    if not object_keys:
        raise FileNotFoundError(
            f"No objects found under s3://{s3_bucket_name}/{s3_prefix}"
        )
    # prefer the object that is actually named fname; fall back to the first
    # listed object so that differently named archives can still be fetched
    keys_named_fname = [key for key in object_keys if key.rsplit("/", 1)[-1] == fname]
    s3_filepath_key = keys_named_fname[0] if keys_named_fname else object_keys[0]
    # the download cannot create missing parent folders of the target file
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    start = datetime.now()
    print(
        f"Started downloading processed data zip file from {s3_filepath_key} to "
        f"{dest_filepath} at {start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    s3 = boto3.resource("s3", region_name=aws_region)
    s3.meta.client.download_file(
        s3_bucket_name,
        s3_filepath_key,
        dest_filepath,
    )
    duration = (datetime.now() - start).total_seconds()
    print(f"Done downloading in {duration:.3f} seconds.")


def extract_zip_file(dest_filepath: str, data_dir: str) -> None:
    """Extract all members of a zip archive into a folder.

    Progress messages with the start time and the duration are printed.

    Parameters
    ----------
    dest_filepath : path to the zip archive on the local filesystem
    data_dir : folder into which all members of the archive are extracted
    """
    start = datetime.now()
    print(
        "Started extracting filtered data parquet files from "
        f"processed data zip file to {data_dir} at "
        f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    # the context manager closes the archive even when extraction fails
    with zipfile.ZipFile(dest_filepath) as zip_ref:
        zip_ref.extractall(data_dir)
    duration = (datetime.now() - start).total_seconds()
    print(f"Done extracting in {duration:.3f} seconds.")

## PySpark Setup

In [7]:
%%time
# Spark configuration object with no settings applied on top of the defaults
conf = SparkConf()

CPU times: user 12 µs, sys: 3 µs, total: 15 µs
Wall time: 18.8 µs


In [8]:
# .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars()))

Start a Spark session

In [9]:
%%time
# build the session from the configuration above, or reuse a running session
spark_builder = SparkSession.builder.config(conf=conf).appName("schema_test")
spark = spark_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/19 03:31:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
CPU times: user 23 ms, sys: 9.58 ms, total: 32.6 ms
Wall time: 3.42 s


## Get Data

We will start by downloading the processed `.zip` file from S3 and extracting all the contained `.parquet` files into the same directory as the `.zip` file

In [10]:
%%time
# the download and extraction are skipped when the zip file is already present
proc_zip_filepath = os.path.join(processed_data_dir, proc_zip_fname)
zip_already_downloaded = os.path.exists(proc_zip_filepath)
if not zip_already_downloaded:
    download_file_from_s3(
        s3_bucket_name,
        path_to_folder,
        processed_data_dir,
        proc_zip_fname,
        session.region_name,
    )
    extract_zip_file(proc_zip_filepath, processed_data_dir)
# collect the extracted data files
proc_files = glob(f"{processed_data_dir}/*.parquet.gzip")

Started downloading processed data zip file from datasets/twitter/kinesis-demo/processed/processed_data.zip to data/processed/processed_data.zip at 2022-08-19 03:31:44.800...
Done downloading in 0.420 seconds.
Started extracting filtered data parquet files from processed data zip file to data/processed at 2022-08-19 03:31:45.222...
Done extracting in 0.367 seconds.
CPU times: user 537 ms, sys: 98.2 ms, total: 635 ms
Wall time: 834 ms


We will now use the `pandas` API on Spark to load all the `.parquet` files into a single PySpark DataFrame. This could equivalently be done using native PySpark capabilities with
```python
def Zconcat(dfs):
    """Concatenate two PySpark DataFrames."""
    return reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)


def union_all(dfs):
    """Concatenate multiple PySpark DataFrames."""
    columns = reduce(lambda x, y: set(x).union(set(y)), [i.columns for i in dfs])

    for i in range(len(dfs)):
        d = dfs[i]
        for c in columns:
            if c not in d.columns:
                d = d.withColumn(c, F.lit(None))
        dfs[i] = d

    return Zconcat(dfs)


df = union_all([spark.read.parquet(f) for f in proc_files])
with pd.option_context("display.max_columns", None):
    display(show_pyspark_df(df))
```

but we will instead perform this using the `pandas` API on Spark, as shown below

In [11]:
%%time
# read all files with `id` as the index, then move it back into a regular column
tweets_indexed = ps.read_parquet(proc_files, index_col=["id"])
# cast the columns listed in dtypes_dict to their target dtypes
df = tweets_indexed.reset_index().astype(dtypes_dict)
with pd.option_context("display.max_columns", None):
    display(df.head())

                                                                                

22/08/19 03:31:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False
2,1479845404762152969,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Jorge Betancourt _ _ _ _ _ _⚗️🔭🔬🪐🚀🏛️📡🛰️🧬,zemblanity00,1423,3050,130,38992,83840,False,False,False,2010-04-27 20:37:51,Earth...between Venus and Mars,no,https://t.co/XXoREQFEYu|https://t.co/Scl4SLxw0Z,JWST|today,,2,0,2,Nearly halfway through its flight to L2 (vs ti...,True,False,False,False,False,False,False,False,False
3,1479845407085629445,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,NAMS,KALANTHURAIADAL,186,2037,11,1558,19656,False,False,False,2010-10-15 05:26:34,,no,https://t.co/Ae2w6KesjU,,YouTube,1,1,0,Time and life of Stephen Hawking | News18 Tam...,False,False,True,False,False,False,False,False,False
4,1479845409014964230,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:14,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Adrak_Wali_Chai,Adrak_Wali_T,188,97,0,948,3069,False,False,False,2021-11-18 13:47:26,,no,https://t.co/BRRDprM46n,स,,1,0,1,"""The scientists at National Aeronautics and Sp...",True,True,True,False,False,False,False,False,False


CPU times: user 534 ms, sys: 80.9 ms, total: 615 ms
Wall time: 13.1 s


The column datatypes are shown below

In [12]:
%%time
# one row per column, with the column's dtype in a column named "dtype"
dtype_table = df.dtypes.rename("dtype").to_frame()
with pd.option_context("display.max_rows", None):
    display(dtype_table)

Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 17.8 ms, sys: 6.28 ms, total: 24.1 ms
Wall time: 39.8 ms


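Get the number of rows (retrieved tweets) and columns in the data and the number of CPUs available on the host (single-node) cluster. The number of PySpark `DataFrame` partitions is reported further below, once the data has been converted to a native PySpark `DataFrame`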

In [13]:
%%time
# size of the raw data and number of CPUs this process is allowed to run on
n_rows_raw = len(df)
n_cols_raw = len(df.columns)
n_cpus = len(os.sched_getaffinity(0))
print(
    f"Raw data contains {n_rows_raw:,} rows and {n_cols_raw:,} columns "
    f"on a host with {n_cpus} CPUs"
)

Raw data contains 229,960 rows and 62 columns on a host with 4 CPUs
CPU times: user 4.39 ms, sys: 85 µs, total: 4.47 ms
Wall time: 945 ms


## Process Data

Process the tweet text using the `pandas` API on `PySpark`

In [14]:
%%time
df_processed = (
    df
    # drop tweets with NaN in the text column
    .dropna(subset=["text"])
    # remove leading and trailing whitespace, then replace every run of
    # whitespace with a single space
    .assign(
        text_stripped=lambda tweets: tweets["text"].str.strip().str.replace(r"\s+", " ")
    )
    # remove blank tweets
    .query("text_stripped != ''")
    .assign(
        text_processed=lambda tweets: tweets["text_stripped"]
        # lowercase
        .str.lower()
        # replace special characters by a space; the class must be written as
        # A-Z, since the range A-z also spans the characters [ \ ] ^ _ `
        .str.replace(r"[^a-zA-Z]", " ")
        # remove numbers
        .str.replace(r"\d+", " ")
        # remove punctuation (raw string, since \w and \s are regex escapes)
        .str.replace(r"[^\w\s]", "")
    )
)
print(
    f"Processed data contains {len(df_processed):,} rows and "
    f"{len(df_processed.columns):,} columns."
)
with pd.option_context("display.max_columns", None, "display.max_colwidth", None):
    display(
        df_processed[
            ["id", "source_text", "text", "text_stripped", "text_processed"]
        ].head(2)
    )

                                                                                

Processed data contains 229,960 rows and 64 columns.


                                                                                

Unnamed: 0,id,source_text,text,text_stripped,text_processed
0,1479845397946380290,Twitter for iPhone,LIVE from mission control: experts give real-time updates as the telescope's golden honeycomb-like mirror takes its final shape in space. This marks the end of an unprecedented 14-day deployment process! Use for questions.,LIVE from mission control: experts give real-time updates as the telescope's golden honeycomb-like mirror takes its final shape in space. This marks the end of an unprecedented 14-day deployment process! Use for questions.,live from mission control experts give real time updates as the telescope s golden honeycomb like mirror takes its final shape in space this marks the end of an unprecedented day deployment process use for questions
1,1479845401289179139,The Social Jukebox,This was taken when we covered the NASA Night Launch! It was a WoW Experience! CapeCanaveral Florida travel luxurytravel adventuretravel,This was taken when we covered the NASA Night Launch! It was a WoW Experience! CapeCanaveral Florida travel luxurytravel adventuretravel,this was taken when we covered the nasa night launch it was a wow experience capecanaveral florida travel luxurytravel adventuretravel


CPU times: user 221 ms, sys: 38.9 ms, total: 260 ms
Wall time: 18.8 s


The distribution of original tweets and re-tweets is shown below

In [15]:
%%time
retweet_flags = df_processed["retweeted_tweet"]
# number of tweets per value of the retweet flag
retweet_counts = (
    retweet_flags.value_counts()
    .rename("num_tweets")
    .reset_index()
    .rename(columns={"index": "retweeted_tweet"})
)
# fraction of tweets per value of the retweet flag
retweet_fractions = (
    retweet_flags.value_counts(normalize=True)
    .rename("frac_tweets")
    .reset_index()
    .rename(columns={"index": "retweeted_tweet"})
)
# show the counts and the fractions side by side
display(retweet_counts.merge(retweet_fractions, on="retweeted_tweet", how="left"))

                                                                                

Unnamed: 0,retweeted_tweet,num_tweets,frac_tweets
0,no,229717,0.998943
1,yes,243,0.001057


CPU times: user 99.6 ms, sys: 26.5 ms, total: 126 ms
Wall time: 16.2 s


**Notes**
1. More than 99% of valid tweets for this use-case are original tweets. For classifying sentiment, we can keep the retweets, but retweets are not needed when extracting the sentiment (in the next notebook), so we'll need to drop them during sentiment extraction. For now, we will leave the retweets in the processed data.

We'll now extract the approximate number of words in each *original* tweet (not the processed tweet). We will use this to optionally filter the dataset to remove short tweets (whose length is below some threshold we specify). To get the words, we'll split the text on whitespace. Since the `pandas` API on PySpark does not support splitting a string based on multiple occurrences of a separator, we'll convert this `DataFrame` to PySpark and use PySpark-native methods to perform this split.

Below, we convert the `pandas` on PySpark DataFrame to a PySpark DataFrame

In [16]:
%%time
# switch from the pandas API on Spark to a native PySpark DataFrame
dfpy = df_processed.to_spark()
dfpy_preview = show_pyspark_df(dfpy, nrows=2)
with pd.option_context("display.max_columns", None):
    display(dfpy_preview)

                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...


CPU times: user 64.6 ms, sys: 7.99 ms, total: 72.6 ms
Wall time: 10.4 s


We now use a PySpark native function `.split()` to extract the words from the `text` of the tweet

In [17]:
%%time
# trim the original text and split it on runs of whitespace; the pattern is a
# raw string because "\s" is a regex escape, not a valid Python string escape
dfpy = dfpy.withColumn("text_trimmed", F.trim(F.col("text"))).withColumn(
    "words", F.split("text_trimmed", r"\s+")
)
print(f"Number of rows in processed data = {dfpy.count():,}")



Number of rows in processed data = 229,960
CPU times: user 14.7 ms, sys: 1.74 ms, total: 16.5 ms
Wall time: 8.4 s


                                                                                

In [18]:
%%time
# size of the processed data, its partitioning and the CPUs available
n_rows_processed = dfpy.count()
n_cols_processed = len(dfpy.columns)
n_partitions = dfpy.rdd.getNumPartitions()
n_cpus = len(os.sched_getaffinity(0))
print(
    f"Processed data contains {n_rows_processed:,} rows and {n_cols_processed:,} columns "
    f"in {n_partitions} partitions, on a host with "
    f"{n_cpus} CPUs"
)
dfpy_preview = show_pyspark_df(dfpy, 2)
with pd.option_context("display.max_columns", None):
    display(dfpy_preview)

                                                                                

Processed data contains 229,960 rows and 66 columns in 8 partitions, on a host with 4 CPUs


                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,..."
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS..."


CPU times: user 111 ms, sys: 8.25 ms, total: 119 ms
Wall time: 18.9 s


We'll now convert the DataFrame back to the `pandas` API on PySpark and continue with data processing where we will count the number of words in the `words` column

In [19]:
%%time
# return to the pandas API on Spark
tweets_with_words = dfpy.pandas_api()
# number of entries in the `words` column of every tweet
df_words = tweets_with_words.assign(
    num_words=lambda tweets: tweets["words"].str.len()
)
display(df_words.head(4))

                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,...",33
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS...",20
2,1479845404762152969,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Jorge Betancourt _ _ _ _ _ _⚗️🔭🔬🪐🚀🏛️📡🛰️🧬,zemblanity00,1423,3050,130,38992,83840,False,False,False,2010-04-27 20:37:51,Earth...between Venus and Mars,no,https://t.co/XXoREQFEYu|https://t.co/Scl4SLxw0Z,JWST|today,,2,0,2,Nearly halfway through its flight to L2 (vs ti...,True,False,False,False,False,False,False,False,False,Nearly halfway through its flight to L2 (vs ti...,nearly halfway through its flight to l vs ti...,Nearly halfway through its flight to L2 (vs ti...,"[Nearly, halfway, through, its, flight, to, L2...",38
3,1479845407085629445,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,NAMS,KALANTHURAIADAL,186,2037,11,1558,19656,False,False,False,2010-10-15 05:26:34,,no,https://t.co/Ae2w6KesjU,,YouTube,1,1,0,Time and life of Stephen Hawking | News18 Tam...,False,False,True,False,False,False,False,False,False,Time and life of Stephen Hawking | News18 Tami...,time and life of stephen hawking news tami...,Time and life of Stephen Hawking | News18 Tam...,"[Time, and, life, of, Stephen, Hawking, |, New...",11


CPU times: user 396 ms, sys: 57.1 ms, total: 453 ms
Wall time: 41.6 s


We can now optionally filter the data based on the number of words. We will choose to keep tweets that are of a minimum length of 10 words (controlled by the `min_num_words_tweet` variable in the **User Inputs** section) and drop shorter ones

In [20]:
%%time
# keep only the tweets with at least the required number of words
min_words_condition = f"num_words >= {min_num_words_tweet}"
df_words_long = df_words.query(min_words_condition)
n_rows_long = len(df_words_long)
print(
    "Number of rows in processed data, after filtering out tweets based on "
    f"length of text = {n_rows_long:,}"
)
display(df_words_long.head(6))

                                                                                

Number of rows in processed data, after filtering out tweets based on length of text = 227,794




22/08/19 03:34:19 WARN MemoryStore: Not enough space to cache rdd_368_6 in memory! (computed 29.6 MiB so far)
22/08/19 03:34:19 WARN BlockManager: Persisting block rdd_368_6 to disk instead.
22/08/19 03:34:19 WARN MemoryStore: Not enough space to cache rdd_368_4 in memory! (computed 20.1 MiB so far)
22/08/19 03:34:19 WARN BlockManager: Persisting block rdd_368_4 to disk instead.


                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,...",33
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS...",20
2,1479845404762152969,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Jorge Betancourt _ _ _ _ _ _⚗️🔭🔬🪐🚀🏛️📡🛰️🧬,zemblanity00,1423,3050,130,38992,83840,False,False,False,2010-04-27 20:37:51,Earth...between Venus and Mars,no,https://t.co/XXoREQFEYu|https://t.co/Scl4SLxw0Z,JWST|today,,2,0,2,Nearly halfway through its flight to L2 (vs ti...,True,False,False,False,False,False,False,False,False,Nearly halfway through its flight to L2 (vs ti...,nearly halfway through its flight to l vs ti...,Nearly halfway through its flight to L2 (vs ti...,"[Nearly, halfway, through, its, flight, to, L2...",38
3,1479845407085629445,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:13,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,NAMS,KALANTHURAIADAL,186,2037,11,1558,19656,False,False,False,2010-10-15 05:26:34,,no,https://t.co/Ae2w6KesjU,,YouTube,1,1,0,Time and life of Stephen Hawking | News18 Tam...,False,False,True,False,False,False,False,False,False,Time and life of Stephen Hawking | News18 Tami...,time and life of stephen hawking news tami...,Time and life of Stephen Hawking | News18 Tam...,"[Time, and, life, of, Stephen, Hawking, |, New...",11
4,1479845409014964230,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:14,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,Adrak_Wali_Chai,Adrak_Wali_T,188,97,0,948,3069,False,False,False,2021-11-18 13:47:26,,no,https://t.co/BRRDprM46n,स,,1,0,1,"""The scientists at National Aeronautics and Sp...",True,True,True,False,False,False,False,False,False,"""The scientists at National Aeronautics and Sp...",the scientists at national aeronautics and sp...,"""The scientists at National Aeronautics and Sp...","[""The, scientists, at, National, Aeronautics, ...",17
5,1479845414413090819,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:15,"<a href=""http://twitter.com/download/android"" ...",,,Twitter for Android,,,,,,,,,[[]],{},,,,,,,𝒦. 🗝️,KeiiChoi,211,240,4,6772,64469,False,False,False,2012-06-20 08:16:08,Somewhere out there,no,https://t.co/JmpGDw25JG|https://t.co/pt0lsJs4y6,UnfoldTheUniverse,NASAWebb,2,1,1,The honeycomb is almost complete!Tune in at ar...,True,False,False,False,False,False,False,False,False,The honeycomb is almost complete!Tune in at ar...,the honeycomb is almost complete tune in at ar...,The honeycomb is almost complete!Tune in at ar...,"[The, honeycomb, is, almost, complete!Tune, in...",24


CPU times: user 251 ms, sys: 14.2 ms, total: 265 ms
Wall time: 27 s


With data processing and filtering now complete, we'll update the datatype for the processed text columns that were added in this notebook

In [21]:
# register datatypes for the columns added in this notebook: the four
# text-derived columns become nullable strings and the word count becomes a
# nullable 32-bit integer
dtypes_dict.update(
    **{
        text_col: pd.StringDtype()
        for text_col in ("text_trimmed", "text_stripped", "text_processed", "words")
    },
    num_words=pd.Int32Dtype(),
)

In [22]:
%%time
# cast every column to the datatype registered in dtypes_dict
df_words_long = df_words_long.astype(dtypes_dict)

# list the resulting datatype of every column (no row truncation)
with pd.option_context("display.max_rows", None):
    display(df_words_long.dtypes.to_frame(name="dtype"))

Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 318 ms, sys: 38.3 ms, total: 356 ms
Wall time: 696 ms


## Export Processed Data

The data is now processed and ready for machine learning model development using big-data ML frameworks. This processed data will now be exported to a separate `.parquet` file and then uploaded to the S3 bucket. The filepath of the `.parquet` file is shown below

In [23]:
# Build the export path from the configured processed-data directory instead
# of repeating the directory as a literal, so that the export location stays
# in sync with the folder that is zipped and cleaned up later in the notebook
filepath = os.path.join(processed_data_dir, f"{processed_file_name}.parquet.gzip")
print(filepath)

data/processed/processed_text.parquet.gzip


### Save Processed and Filtered Data to `.parquet` File

In [24]:
%%time
# Export with the pandas API on Spark, which writes a folder of gzip-compressed
# parquet part-files at `filepath`. Its to_parquet() has no `index` parameter
# (the index is not written unless `index_col` is given), so none is passed.
# The write mode is spelled out so that re-running this cell replaces the
# previous export.
df_words_long.to_parquet(filepath, mode="overwrite", compression="gzip")

                                                                                

CPU times: user 42 ms, sys: 8.94 ms, total: 50.9 ms
Wall time: 32.1 s


Next, we will demonstrate loading this `.parquet` file with processed data into a `DataFrame` using two big-data frameworks - PySpark and Dask.

### Demonstrate Reloading Saved `.parquet` File with PySpark

In [25]:
%%time
# read the exported parquet data back into a PySpark DataFrame
df_words_long_reloaded = spark.read.format("parquet").load(filepath)

# preview the first two rows without truncating the columns
with pd.option_context("display.max_columns", None):
    display(show_pyspark_df(df_words_long_reloaded, 2))

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,...",33
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS...",20


CPU times: user 54.5 ms, sys: 6.73 ms, total: 61.2 ms
Wall time: 369 ms


Get a `DataFrame` version of the Spark Schema (`df.printSchema()`) for the PySpark `DataFrame`

In [26]:
%%time
# one record per field of the Spark schema: column name, Spark datatype and
# whether the column is nullable
schema_records = [
    dict(
        name=schema_field.name,
        dtype=schema_field.dataType,
        nullable=schema_field.nullable,
    )
    for schema_field in df_words_long_reloaded.schema.fields
]
df_dtypes_pyspark = pd.DataFrame.from_records(schema_records).set_index("name")

# show the full schema table (no row truncation)
with pd.option_context("display.max_rows", None):
    display(df_dtypes_pyspark)

Unnamed: 0_level_0,dtype,nullable
name,Unnamed: 1_level_1,Unnamed: 2_level_1
id,StringType(),True
geo,StringType(),True
coordinates,StringType(),True
place,StringType(),True
contributors,StringType(),True
is_quote_status,StringType(),True
quote_count,IntegerType(),True
reply_count,IntegerType(),True
retweet_count,IntegerType(),True
favorite_count,IntegerType(),True


CPU times: user 10.6 ms, sys: 296 µs, total: 10.9 ms
Wall time: 10.2 ms


### Demonstrate Reloading Saved `.parquet` File with `pandas` API on PySpark

In [27]:
%%time
# read the exported parquet data with the pandas API on Spark, move `id` back
# from the index into a regular column and re-apply the registered datatypes
df_words_long_reloaded_ps = (
    ps.read_parquet(filepath, index_col="id")
    .reset_index()
    .astype(dtypes_dict)
)

# preview the first two rows without truncating the columns
with pd.option_context("display.max_columns", None):
    display(df_words_long_reloaded_ps.head(2))

# list the datatype of every column (no row truncation)
with pd.option_context("display.max_rows", None):
    display(df_words_long_reloaded_ps.dtypes.to_frame(name="dtype"))

                                                                                

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,...",33
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS...",20


Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 547 ms, sys: 47.5 ms, total: 594 ms
Wall time: 4.19 s


### Demonstrate Reloading Saved `.parquet` File with `dask`

In [28]:
%%time
# read the exported parquet data with Dask and re-apply the registered datatypes
df_words_long_reloaded_dd = dd.read_parquet(filepath).astype(dtypes_dict)

# preview the first two rows without truncating the columns
with pd.option_context("display.max_columns", None):
    display(df_words_long_reloaded_dd.head(2))

# list the datatype of every column (no row truncation)
with pd.option_context("display.max_rows", None):
    display(df_words_long_reloaded_dd.dtypes.to_frame(name="dtype"))

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,1479845397946380290,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:11,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Looking Up #FBPE,lookingup322,700,1022,35,629,41373,False,False,False,2015-10-26 13:31:35,,no,https://t.co/9ObGVUZvdG,UnfoldTheUniverse,NASAWebb,1,1,1,LIVE from mission control: experts give real...,True,False,False,False,False,False,False,False,False,LIVE from mission control: experts give real-t...,live from mission control experts give real t...,LIVE from mission control: experts give real-...,"[LIVE, from, mission, control:, experts, give,...",33
1,1479845401289179139,,,,,False,0,0,0,0,False,False,2022-01-08 16:00:12,"<a href=""https://www.socialjukebox.com"" rel=""n...",,,The Social Jukebox,,,,,,,,,[[]],{},,,,,,,#LoveThatYacht #Luxury #Tech #Trends #CES2022 ...,LoriMoreno,175705,111415,8920,5842,797307,False,False,False,2008-06-07 16:26:14,SocialMedia RedCarpet #Influencer 🍷 🍷 🍷,no,https://t.co/noVH2Y34Z7|https://t.co/hBvXcosXJH,,,2,0,0,This was taken when we covered the NASA Night...,False,True,False,False,False,False,False,False,False,This was taken when we covered the NASA Night ...,this was taken when we covered the nasa night ...,This was taken when we covered the NASA Night...,"[This, was, taken, when, we, covered, the, NAS...",20


Unnamed: 0,dtype
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32
favorite_count,Int32


CPU times: user 692 ms, sys: 184 ms, total: 876 ms
Wall time: 874 ms


## Zip and Upload Processed Data `.parquet` Files to S3 Bucket

We'll now
- create a `.zip` file from the `.parquet` file (PySpark creates a folder of `.parquet` files, not a single file like `pandas` did in the previous notebook)
- upload the `.zip` file to S3 in the `/processed` prefix
- delete the local `.parquet` folder with processed data
- delete the local `.zip` file prepared from the `.parquet` folder

In [29]:
# show, one per line, the local paths and the S3 object key used by the
# zip-and-upload step
print(
    filepath,
    path_to_folder,
    processed_data_dir,
    proc_text_zip_fname,
    f"{path_to_folder[1:-1]}/processed/{proc_text_zip_fname}",
    os.path.join(processed_data_dir, proc_text_zip_fname),
    sep="\n",
)

data/processed/processed_text.parquet.gzip
/datasets/twitter/kinesis-demo/
data/processed
processed_text.zip
datasets/twitter/kinesis-demo/processed/processed_text.zip
data/processed/processed_text.zip


In [31]:
%%time
# bundle the exported .parquet.gzip files with processed data into one zip file
create_zip_file_from_folder(processed_data_dir, proc_text_zip_fname, filepath)

# send the zip file to the S3 bucket, under the processed/ prefix
proc_text_zip_s3_key = f"{path_to_folder[1:-1]}/processed/{proc_text_zip_fname}"
upload_file_to_s3(
    session.region_name,
    processed_data_dir,
    proc_text_zip_fname,
    s3_bucket_name,
    proc_text_zip_s3_key,
)
print("\nUploaded zipped file to S3 bucket")

# remove the local folder of parquet files (processed data) exported by PySpark
shutil.rmtree(filepath)
print("Deleted local .parquet.gzip files with processed data.")

# remove the local copy of the zip file
proc_text_zip_path = os.path.join(processed_data_dir, proc_text_zip_fname)
os.remove(proc_text_zip_path)
print("Deleted local .zip file created from all filtered data files.")


Uploaded zipped file to S3 bucket
Deleted local .parquet.gzip files with processed data.
Deleted local .zip file created from all filtered data files.
CPU times: user 560 ms, sys: 479 ms, total: 1.04 s
Wall time: 2.16 s


## Cleanup

We'll now
- delete the local `.parquet` folder with prepared data that was processed in this notebook
- delete the local `.zip` file (containing the individual `.parquet` files of prepared data) that was downloaded from S3

In [32]:
# path of the local zip file that the next cell deletes
print(os.path.join(processed_data_dir, proc_zip_fname))
# first five of the local .parquet.gzip files that the next cell deletes
proc_files[:5]

data/processed/processed_data.zip


['data/processed/2022010119.parquet.gzip',
 'data/processed/2022010419.parquet.gzip',
 'data/processed/2022010506.parquet.gzip',
 'data/processed/2022010420.parquet.gzip',
 'data/processed/2022010612.parquet.gzip']

In [33]:
%%time
# Delete the local .parquet.gzip files listed in proc_files. A plain loop is
# used instead of map(): the calls are made only for their side effect, so no
# throwaway list is built and IPython's `_` (last output) is not overwritten.
for proc_file in proc_files:
    os.remove(proc_file)
print("Deleted local .parquet.gzip files with filtered data.")

# delete local zip file
os.remove(os.path.join(processed_data_dir, proc_zip_fname))
print("Deleted local .zip file created from all filtered data files.")

Deleted local .parquet.gzip files with filtered data.
Deleted local .zip file created from all filtered data files.
CPU times: user 275 µs, sys: 18.4 ms, total: 18.6 ms
Wall time: 53.5 ms


---

<span style="float:left;">
    <a href="./3_prepare_data.ipynb"><< 3 - Data Preparation</a>
</span>

<span style="float:right;">
    <a href="./5_get_sample_to_label.ipynb">5 - Get Sample of Data for Labeling >></a>
</span>