# Combine Raw Data Files using PySpark

In [None]:
import os
from functools import reduce
from typing import List

import boto3
import pandas as pd
from pyspark.sql import SparkSession, functions as F, types as T

In [None]:
# os.environ["AWS_ACCESS_KEY_ID"] = ""
# os.environ["AWS_SECRET_ACCESS_KEY"] = ""
# os.environ["AWS_REGION"] = ""
# os.environ["AWS_S3_BUCKET_NAME"] = ""

## About

In this notebook we will walk through how to use PySpark to perform the combination of the raw streamed data that was stored in S3.

Previously, in `3_combine_raw_data.ipynb`, we used `pandas` to combine raw hourly streamed data into CSV files and then loaded all CSV files into a single PySpark `DataFrame` in `4_data_processing.ipynb` before performing quantitative analysis. Here, we will load the raw hourly streamed data directly into a single PySpark `DataFrame` and apply the same filters we applied in `3_combine_raw_data.ipynb`.

Although we won't perform the quantitative (ML) analysis here, this notebook will show how we could use PySpark to work with raw data from S3.

**Requirements**

1. This notebook must be run on Databricks.
2. Required Python libraries are
   - `boto3`
3. Four environment variables must be accessible for this notebook
   - `AWS_S3_BUCKET_NAME`
     - the name of the S3 bucket containing the raw streamed Twitter data
   - `AWS_REGION`
     - [AWS region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/) in which the S3 bucket was created
     - [as a reminder, S3 buckets must have a globally unique name but are created in a specific region](https://www.quora.com/Why-are-S3-buckets-in-the-global-region)
   - `AWS_ACCESS_KEY_ID`
     - [AWS credential](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html)
   - `AWS_SECRET_ACCESS_KEY`
     - [AWS credential](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html)

The Python package requirements to run this notebook are different to those listed in the `requirements.txt` file for this project.

## User Inputs

In [None]:
# Name of the S3 bucket to read from (None if AWS_S3_BUCKET_NAME is unset)
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME")
# Folder inside the bucket under which the per-year folders are listed;
# the year is appended directly to this path when listing the bucket
path_to_folder = "/datasets/twitter/kinesis-demo/"

# Column names for the streamed tweet attributes, in the order the fields
# appear in each tab-separated record. Used as the pandas DataFrame columns
# and, with an extra "blank" column appended, as the PySpark column names.
headers = [
    "id",
    "geo",
    "coordinates",
    "place",
    "contributors",
    "is_quote_status",
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "favorited",
    "retweeted",
    "created_at",
    "source",
    "in_reply_to_user_id",
    "in_reply_to_screen_name",
    "source_text",
    "place_id",
    "place_url",
    "place_place_type",
    "place_name",
    "place_full_name",
    "place_country_code",
    "place_country",
    "place_bounding_box_type",
    "place_bounding_box_coordinates",
    "place_attributes",
    "coords_type",
    "coords_lon",
    "coords_lat",
    "geo_type",
    "geo_lon",
    "geo_lat",
    "user_name",
    "user_screen_name",
    "user_followers",
    "user_friends",
    "user_listed",
    "user_favourites",
    "user_statuses",
    "user_protected",
    "user_verified",
    "user_contributors_enabled",
    "user_joined",
    "user_location",
    "retweeted_tweet",
    "tweet_text_urls",
    "tweet_text_hashtags",
    "tweet_text_usernames",
    "num_urls_in_tweet_text",
    "num_users_in_tweet_text",
    "num_hashtags_in_tweet_text",
    "text",
]

# List of partial strings to use to filter out unwanted tweets
# - tweets containing sensitive tweet texts that were found retrospectively
#   and should be excluded from the CSV files
# - every entry is lower-case because it is matched against the lower-cased
#   tweet text; a tweet is excluded if its text contains any one of them
unwanted_partial_strings_list = [
    # specific to crypto mining
    "crypto",
    "token",
    "koistarter",
    "daostarter",
    "decentralized",
    "services",
    "pancakeswap",
    "eraxnft",
    "browsing",
    "kommunitas",
    "hosting",
    "internet",
    "exipofficial",
    "servers",
    "wallet",
    "liquidity",
    "rewards",
    "floki",
    "10000000000000linkstelegram",
    "dogecoin",
    "czbinance",
    "watch",
    "binance",
    "dogelonmars",
    "cryptocurrency",
    "hbomax",
    "money",
    "danheld",
    "cybersecurity",
    # others
    "prostitution",
    "nairobi",
    "musembe",
    "volcano detected",
    "block-2",
    "mo-greene",
    "running scared2012",
    "running scared 2012",
    "massacres",
    "eric ephriam chavez",
    "drugs",
    "tanzanite",
    "vvsorigin",
    "gemstonecarat",
    "bin laden",
    "saddam",
    "webuye",
    "bungoma",
    "perished",
    "popescu",
    "whore",
    "nasty",
    "ethereum",
    "pay someone",
    "gamejoin",
    "nft",
    "breeding",
    "seungkwan",
    "woozi",
    "hoshi",
    "bitcrush",
    "arcade",
    "homeworkpay",
    "homework",
    "photocards",
    "deta",
    "marketing",
    "dreamcast",
    "sega",
    "xbox",
    "wii",
    "ps4",
    "kasama",
    "nung",
    "lahat",
    "jinsoul",
    "brunisoul",
    "loona",
    "taas",
    # NOTE(review): "nung" already appears a few entries above; the duplicate
    # is harmless but redundant
    "nung",
    "essay",
    # religious
    "scriptures",
    "methusealah",
    "testament",
    "yahweh",
    "god",
    "mullah",
    "allah",
    "clergy",
    "mercy",
    "morality",
    # NOTE(review): the comma is inside the string, so this only matches
    # "muslims" when it is immediately followed by a comma - presumably a
    # typo for "muslims"; confirm before changing, as it alters the filter
    "muslims,",
    "hindus",
    "buddhist",
    "catholics",
    "christians",
    "atheist",
    # inappropriate
    "nazist",
    "antifa",
    "proud boys",
]

In [None]:
# Column names for the PySpark DataFrame: the tweet attributes plus one extra
# "blank" column (presumably for the empty field that follows the trailing
# tab of each record - TODO confirm)
pyspark_df_column_names = headers + ["blank"]

# AWS region name read from the environment (None if AWS_REGION is unset)
region = os.getenv("AWS_REGION")

In [None]:
# S3 client shared (as a global) by the helper functions below that list
# folders and read objects from the bucket
s3_client = boto3.client("s3", region_name=region)

In [None]:
def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
    """Mount an S3 bucket (or a folder inside it) to the Databricks filesystem.

    Any existing mount at ``/mnt/<mount_folder>`` is unmounted first, so the
    function can be re-run safely.

    Parameters
    ----------
    access_key : str
        AWS access key ID placed in the ``s3a://`` URI.
    secret_key : str
        AWS secret access key; every "/" is replaced by "%2F" before it is
        placed in the ``s3a://`` URI.
    bucket_name : str
        Bucket name, optionally followed by "/<path inside the bucket>".
    mount_folder : str
        Name of the mount point to create under ``/mnt/``.

    Notes
    -----
    Relies on the ``dbutils`` global that Databricks provides in notebooks.
    The credentials are embedded in the mount URI, so avoid printing it.
    """
    # A raw "/" in the secret would be read as a path separator in the URI
    encoded_secret_key = secret_key.replace("/", "%2F")
    mount_point = "/mnt/%s" % mount_folder

    # Path inside the bucket, used only in the log messages. It is derived
    # from the `bucket_name` argument so the messages describe what is
    # actually being mounted.
    s3_path_wout_bucket_name = bucket_name.split("/", 1)[-1]
    print("Mounting", s3_path_wout_bucket_name)

    try:
        # Unmount the data in case it was already mounted.
        dbutils.fs.unmount(mount_point)
    except Exception:
        # If it fails to unmount it most likely wasn't mounted in the first
        # place. Only ordinary errors are tolerated here: an interrupt
        # propagates and no longer triggers the mount below.
        print("Directory not unmounted: ", mount_folder)

    # Lastly, mount our bucket.
    dbutils.fs.mount(
        "s3a://%s:%s@%s" % (access_key, encoded_secret_key, bucket_name),
        mount_point,
    )
    print("The bucket path", s3_path_wout_bucket_name, "was mounted to", mount_folder, "\n")
## Get List of Hourly S3 Folders Containing Streamed Data

We'll use the [`boto3` S3 client](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#id221) to get a [flat list](https://coderwall.com/p/rcmaea/flatten-a-list-of-lists-in-one-line-in-python) of hourly folders for each day (during December of 2021) on which tweets were streamed by Kinesis into the S3 bucket

In [None]:
def _list_common_prefixes(bucket: str, prefix: str) -> List[str]:
    """Return the sub-folder prefixes found directly under `prefix` in `bucket`."""
    sub_prefixes = []
    request = {"Bucket": bucket, "Prefix": prefix, "Delimiter": "/"}
    while True:
        response = s3_client.list_objects_v2(**request)
        # "CommonPrefixes" is absent from the response when nothing matches
        sub_prefixes.extend(cp["Prefix"] for cp in response.get("CommonPrefixes", []))
        if not response.get("IsTruncated"):
            return sub_prefixes
        # Results are paged; ask for the next page of this listing
        request["ContinuationToken"] = response["NextContinuationToken"]


def get_hourly_folders_per_day(s3_bucket_name: str, path_to_folder: str, years_wanted: List[int]) -> List[str]:
    """Get list of hourly data folders in S3 bucket, for each day of streamed data.

    The bucket is walked one level at a time below
    ``<path_to_folder>/<year>/``; the folders found at the third level are
    returned. In this notebook those levels are month, day and hour.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the S3 bucket to list.
    path_to_folder : str
        Folder inside the bucket that holds the per-year folders. Leading and
        trailing "/" are optional.
    years_wanted : List[int]
        Years whose folders should be listed.

    Returns
    -------
    List[str]
        Flat list of hourly folder prefixes, each ending in "/". Folders with
        no sub-folders contribute nothing instead of raising ``KeyError``.

    Notes
    -----
    Uses the module-level ``s3_client``.
    """
    # S3 keys never start with "/"; normalise both ends so that the year can
    # be appended safely whether or not the caller included the slashes
    root = path_to_folder.strip("/")
    hourly_dirs = []
    for year in years_wanted:
        year_prefix = f"{root}/{year}/" if root else f"{year}/"
        for monthly_prefix in _list_common_prefixes(s3_bucket_name, year_prefix):
            for daily_prefix in _list_common_prefixes(s3_bucket_name, monthly_prefix):
                hourly_dirs.extend(_list_common_prefixes(s3_bucket_name, daily_prefix))
    return hourly_dirs

In [None]:
# get list of hourly folders, per day (only folders under the 2021 folder
# are requested)
list_of_hourly_dirs_flat = get_hourly_folders_per_day(s3_bucket_name, path_to_folder, [2021])
print(f"Found {len(list_of_hourly_dirs_flat):,} hourly folders")
# print one folder prefix per line
for hourly_dirs in list_of_hourly_dirs_flat:
    print(hourly_dirs)

datasets/twitter/kinesis-demo/2021/12/30/17/
datasets/twitter/kinesis-demo/2021/12/31/18/
datasets/twitter/kinesis-demo/2021/12/31/19/
datasets/twitter/kinesis-demo/2021/12/31/20/
datasets/twitter/kinesis-demo/2021/12/31/21/
datasets/twitter/kinesis-demo/2021/12/31/22/
datasets/twitter/kinesis-demo/2021/12/31/23/


In the next two sections, we'll use PySpark and `pandas` to read files from all these folders into a single PySpark and `pandas` `DataFrame` respectively. In the case of PySpark, we will first mount each of the hourly folders to a separate Databricks dataset and then read in these files from the Databricks filesystem. For `pandas`, we will use Python to read these files into a nested list and then convert this into a single `pandas.DataFrame`.

## Load Data with `pandas`

We'll now load all the files listed above into a `pandas` `DataFrame` and perform some basic filtering of the data. We'll then repeat this using a PySpark `DataFrame`. This data can fit into memory in a single `pandas` `DataFrame` since we have only selected the tweet files (above) that were streamed in 2021, which covered a total of seven hours on December 30 and 31.

This will allow us to compare the data filtering performed with PySpark and `pandas`.

### Get List of Contents of Streamed Data Files

We'll get a flat list of the contents of all files in each hourly folder

In [None]:
def read_files_per_hour(s3_bucket_name: str, flat_list_of_hourly_dirs: List[str]) -> List[bytes]:
    """Read individual files in each hourly folder in the S3 bucket.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the S3 bucket to read from.
    flat_list_of_hourly_dirs : List[str]
        Key prefixes (hourly folders) whose objects should be read.

    Returns
    -------
    List[bytes]
        Raw body of every object found, in listing order and flattened across
        all folders. A folder with no objects contributes nothing instead of
        raising ``KeyError``.

    Notes
    -----
    Uses the module-level ``s3_client`` and prints the number of objects read
    from each folder.
    """
    file_contents_all_flat = []
    for hourly_dir in flat_list_of_hourly_dirs:
        num_files_in_dir = 0
        request = {"Bucket": s3_bucket_name, "Prefix": hourly_dir}
        while True:
            page = s3_client.list_objects_v2(**request)
            # "Contents" is absent from the response when nothing matches
            for file_obj_dict in page.get("Contents", []):
                file_body = s3_client.get_object(
                    Bucket=s3_bucket_name, Key=file_obj_dict["Key"]
                )["Body"].read()
                file_contents_all_flat.append(file_body)
                num_files_in_dir += 1
            if not page.get("IsTruncated"):
                break
            # Results are paged; ask for the next page of this listing
            request["ContinuationToken"] = page["NextContinuationToken"]
        print(f"{hourly_dir} contains {num_files_in_dir:,} file objects")
    return file_contents_all_flat

In [None]:
# Read all files from hourly directories into a flat list holding the raw
# body of each file object
file_contents_all_flat = read_files_per_hour(s3_bucket_name, list_of_hourly_dirs_flat)
print(f"Total number of file objects = {len(file_contents_all_flat):,}")

datasets/twitter/kinesis-demo/2021/12/30/17/ contains 15 file objects
datasets/twitter/kinesis-demo/2021/12/31/18/ contains 40 file objects
datasets/twitter/kinesis-demo/2021/12/31/19/ contains 58 file objects
datasets/twitter/kinesis-demo/2021/12/31/20/ contains 59 file objects
datasets/twitter/kinesis-demo/2021/12/31/21/ contains 59 file objects
datasets/twitter/kinesis-demo/2021/12/31/22/ contains 58 file objects
datasets/twitter/kinesis-demo/2021/12/31/23/ contains 59 file objects
Total number of file objects = 348


### Convert List of Streamed Data File Contents into `pandas.DataFrame`

We'll now
- load all files in all hourly folders listed above into a single `pandas` `DataFrame`
- (filter 1) drop blank rows and then drop rows with no text
- (filter 2) remove rows where the text of the tweet (in the `text` column) contains words that are not indicative of the subject (tweets related to space news) we are trying to explore here

In [None]:
# Decode each file body and split it into records, one per line. The [:-1]
# discards whatever follows the final newline of the file.
nested_list_of_records = []
for file_body in file_contents_all_flat:
    list_of_records = file_body.decode("utf-8").split("\n")[:-1]
    nested_list_of_records.append(list_of_records)
# Split every record on tabs and drop its last field (presumably the empty
# string after a trailing tab - TODO confirm), then build one DataFrame from
# the records of all files
dfp = pd.DataFrame(
    [record.split("\t")[:-1] for sl in nested_list_of_records for record in sl],
    columns=headers
)

print(f"Number of rows after loading data = {len(dfp):,}")

# filter 1
# drop rows in which every column is missing
dfp = dfp.dropna(how="all")

# drop rows in which the tweet text is missing
dfp = dfp.dropna(subset=["text"])

print(f"Number of rows after dropping missing values = {len(dfp):,}")

# filter 2
# Join the unwanted strings with "|" into one pattern and keep only the rows
# whose lower-cased text does not match it. This pattern is reused later by
# the pandas UDF filter.
unwanted_partial_strings = "|".join(unwanted_partial_strings_list)
dfp = dfp[~dfp["text"].str.lower().str.contains(unwanted_partial_strings)]

print(f"Number of rows after removing unwanted partial tweets = {len(dfp):,}")

# sort by the id column so the preview can be compared with the PySpark output
dfp = dfp.sort_values(by=["id"], ascending=[True])

display(dfp.head())

Number of rows after loading data = 29,006
Number of rows after dropping missing values = 28,942
Number of rows after removing unwanted partial tweets = 25,971


id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text
1476607990027407364,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:53 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Happy New Years 2022,GuessWho122021,216,255,0,1165,3214,False,False,False,Sat Nov 27 20:38:14 +0000 2021,World Wide,no,https://t.co/yH8cBVuMj3,,,1,0,0,"Space colonists may turn to cannibalism, scientists warn:The first generations to occupy space outposts may be left with few nutrition choices, scholars believe"
1476607992586018824,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:54 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,TheToysRusKid,JC1of1,202,313,0,18454,53653,False,False,False,Tue Sep 04 17:18:46 +0000 2018,Southern California,no,,,,0,0,0,1. Beat Street2. Harlem Knights 3. National Lampoons Vacation 4. Bachelor Party 5. Do The right Thing6. How High7. Weird Science 8. Clerks 9. Beverly Hills Cop10. Goodfellas
1476607997405184000,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,WordPress.com,,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,Wed May 25 10:54:46 +0000 2016,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope Deployments
1476607997925330945,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,2138,4999,157,79973,109065,False,False,False,Sat Feb 21 14:48:52 +0000 2009,International,no,https://t.co/cRcHiFCHS0,,,1,0,0,"Exciting times!Back in 1915, Einstein published his theory of General Relativity. 1917 the Bolsheviks led the worlds’ first proletarian revolution. Humanity is about to peer back to the beginning of time, and the world is poised once again for revolution."
1476608000114802695,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Alan Stern,AlanStern,43743,1719,1163,33719,40081,False,False,False,Sat Jul 26 17:50:37 +0000 2008,"Niwot, CO",no,,,ng_rippel|rhwells79|AlanStern|NBCNews,0,4,0,"Obviously, since science is a mental mode of nature and Pluto doesn’t have a mind. But having it is the very essence of science and that requires functionally useful concepts like “planet”. Non-functional concepts hurt the progress of science and the public’s insights into it."


## Mount Hourly S3 folders to Databricks File System

We'll now mount the folders of hourly data on S3 to the Databricks file system, so that we can use PySpark to read all the individual files (inside each hourly folder) into a single PySpark `DataFrame`. Each folder of hourly data files will be mounted to a separate [Databricks dataset](https://docs.databricks.com/data/databricks-datasets.html), and we will then get a flat list of the filepaths to all the files in the mounted folders.

In [None]:
def mount_all_hourly_files(s3_bucket_name: str, flat_list_of_hourly_dirs: List[str]) -> List[str]:
    """Mount each hourly S3 folder under its own name and return those names.

    The mount name of a folder is its path without the first two components
    and without the final character, with the remaining "/" replaced by "_".
    AWS credentials are read from the `AWS_ACCESS_KEY_ID` and
    `AWS_SECRET_ACCESS_KEY` environment variables for every mount.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the S3 bucket that contains the folders.
    flat_list_of_hourly_dirs : List[str]
        Paths (inside the bucket) of the folders to mount.

    Returns
    -------
    List[str]
        The mount names, in the same order as the input folders.
    """
    mounted_names = []
    for s3_folder in flat_list_of_hourly_dirs:
        # Build the mount name from the tail of the folder path
        path_tail = s3_folder.split("/", 2)[-1]
        mount_name = path_tail[:-1].replace("/", "_")
        print(f"Mounting all files in {s3_folder} to {mount_name}")

        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
        s3_location = f"{s3_bucket_name}/{s3_folder}"
        mount_s3_bucket(aws_access_key_id, aws_secret_access_key, s3_location, mount_name)

        mounted_names.append(mount_name)
    return mounted_names


def get_dbfs_mounted_filepaths(dataset_names: List[str]) -> List[List[str]]:
    """List the files under ``/mnt/<dataset_name>/`` for every given dataset.

    Parameters
    ----------
    dataset_names : List[str]
        Names of the mounted folders under ``/mnt/``.

    Returns
    -------
    list
        Two-element list: first the ``path`` attribute of every listed file,
        then the file entries themselves as returned by ``dbutils.fs.ls``.
        Both are flattened across datasets and keep the listing order.
    """
    mounted_files = []
    for mounted_dataset in dataset_names:
        mounted_files.extend(dbutils.fs.ls(f"/mnt/{mounted_dataset}/"))

    mounted_paths = []
    for mounted_file in mounted_files:
        mounted_paths.append(mounted_file.path)

    return [mounted_paths, mounted_files]

We'll first mount each hourly folder, from the list we found earlier, to its own Databricks dataset. We'll then get a flat list of the filepaths to all the files in each mounted folder

In [None]:
# Mount folders to databricks datasets (one mount per hourly folder)
dataset_names = mount_all_hourly_files(s3_bucket_name, list_of_hourly_dirs_flat)
print(f"Finished mounting {len(dataset_names):,} datasets")

# Get filepaths to files in all folders; `files_full` holds the listing
# entries and `file_paths_full` holds only their paths
file_paths_full, files_full = get_dbfs_mounted_filepaths(dataset_names)
print("Printing attributes for first 10 mounted files:")
display(files_full[:10])

print("Printing first five filepaths:")
for file_path_full in file_paths_full[:5]:
    print(file_path_full)

Mounting all files in datasets/twitter/kinesis-demo/2021/12/30/17/ to kinesis-demo_2021_12_30_17
Mounting datasets/kinesis/2021
/mnt/kinesis-demo_2021_12_30_17 has been unmounted.
The bucket path datasets/kinesis/2021 was mounted to kinesis-demo_2021_12_30_17 

Mounting all files in datasets/twitter/kinesis-demo/2021/12/31/18/ to kinesis-demo_2021_12_31_18
Mounting datasets/kinesis/2021
/mnt/kinesis-demo_2021_12_31_18 has been unmounted.
The bucket path datasets/kinesis/2021 was mounted to kinesis-demo_2021_12_31_18 

Mounting all files in datasets/twitter/kinesis-demo/2021/12/31/19/ to kinesis-demo_2021_12_31_19
Mounting datasets/kinesis/2021
/mnt/kinesis-demo_2021_12_31_19 has been unmounted.
The bucket path datasets/kinesis/2021 was mounted to kinesis-demo_2021_12_31_19 

Mounting all files in datasets/twitter/kinesis-demo/2021/12/31/20/ to kinesis-demo_2021_12_31_20
Mounting datasets/kinesis/2021
/mnt/kinesis-demo_2021_12_31_20 has been unmounted.
The bucket path datasets/kinesis/2

path,name,size
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe,68390
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e,twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e,75672
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748,twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748,73484
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4,twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4,93691
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1,twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1,93241
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-41-03-b8becce2-f393-42a9-bc9c-f8847fb4a7d2,twitter_delivery_stream-1-2021-12-30-17-41-03-b8becce2-f393-42a9-bc9c-f8847fb4a7d2,71521
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-42-05-d4790ec9-be7b-443a-9109-3d3614310fd9,twitter_delivery_stream-1-2021-12-30-17-42-05-d4790ec9-be7b-443a-9109-3d3614310fd9,68828
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-43-05-bbf49b5b-a7a8-4ed7-a7ef-27e366aa82c0,twitter_delivery_stream-1-2021-12-30-17-43-05-bbf49b5b-a7a8-4ed7-a7ef-27e366aa82c0,71076
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-44-07-2bf0e38a-9da5-4437-b696-995b1abd0bea,twitter_delivery_stream-1-2021-12-30-17-44-07-2bf0e38a-9da5-4437-b696-995b1abd0bea,69835
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-45-07-78f1c731-826b-456c-b6c5-51f4d06ca11b,twitter_delivery_stream-1-2021-12-30-17-45-07-78f1c731-826b-456c-b6c5-51f4d06ca11b,77615


We'll now perform a basic sanity check, verifying that the numbers of files found by
- mounting with the databricks file system
- `boto3`

agree with each other

In [None]:
# Number of file bodies read with boto3 must equal the number of file paths
# listed from the mounted folders
assert len(file_contents_all_flat) == len(file_paths_full)

**Observations**
1. The two approaches to list all files in all folders of the S3 bucket agree with each other. This means we have the same number of files mounted locally as we do in the hourly folders in the AWS S3 bucket.

## Load Data with PySpark

### Spark Set-up

To prepare for working with PySpark, we'll now create the [Spark session](https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/SparkSession.html) and define a Spark context ([1](https://www.educba.com/sparkcontext/), [2](https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html))

In [None]:
# Reuse the active Spark session if one exists, otherwise create one
# NOTE(review): 'Application Name' looks like a placeholder app name
spark = (
    SparkSession.builder
    .appName('Application Name')
    .getOrCreate()
)
print('Spark Session created')

# Spark context of the session
sc = spark.sparkContext

Spark Session created


### Load Data into single PySpark `DataFrame` and Apply Filters Using Built-in Spark DataFrame Methods

As we did with the `pandas` approach, we will now
- load all data into a PySpark `DataFrame`
- (filter 1) drop blank rows and then drop rows with no text
- (filter 2) remove rows where the tweet text (`text` column) contains words that are not indicative of the subject (tweets related to space news) we are trying to explore here

and we will print the number of rows in the `DataFrame` after each step, for comparison to the `pandas` approach

In [None]:
# Read every mounted file as headerless, tab-separated text into a single
# DataFrame, letting Spark infer the column types
df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", False)
    .option("encoding", "utf-8")
    .option("delimiter", "\t")
    .option("multiline", "false")
    .load(file_paths_full)
)

# Replace the auto-generated column names with the tweet attribute names
df = df.toDF(*pyspark_df_column_names)

# Cache, since this DataFrame is counted and filtered several times below
df = df.cache()

print(f"Number of rows after loading data = {df.count():,}")

# filter 1
# drop rows in which every column is missing
df = df.dropna(how='all')

# drop rows in which the tweet text is missing
df = df.dropna(subset=["text"])

print(f"Number of rows after dropping missing values = {df.count():,}")

# filter 2
# A tweet is unwanted if its lower-cased text contains ANY of the unwanted
# strings, so the per-string matches are OR-ed together and the combined
# condition is negated once. OR-ing the individually negated matches instead
# (as this cell originally did) only drops a tweet that contains EVERY
# unwanted string, which removes nothing - that is what the previously
# recorded row count of 28,936 and the observation below reflect.
# None of the unwanted strings contains a LIKE wildcard ("%" or "_"), so each
# one is matched literally.
lowercase_text = F.lower(F.col("text"))
contains_unwanted_str = reduce(
    lambda a, b: a | b,
    (
        lowercase_text.like('%' + pat + "%")
        for pat in unwanted_partial_strings_list
    ),
)
df1 = df.where(~contains_unwanted_str)

print(
    "Number of rows after removing unwanted partial tweets using "
    f"built-in PySpark functions = {df1.count():,}"
)

# sort by the id column so the preview can be compared with the pandas output
df1 = df1.orderBy(F.col("id").asc())

display(df1.limit(10))

Number of rows after loading data = 29,005
Number of rows after dropping missing values = 28,936
Number of rows after removing unwanted partial tweets using built-in PySpark functions = 28,936


id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,blank
1476607990027407364,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:53 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Happy New Years 2022,GuessWho122021,216,255,0,1165,3214,False,False,False,Sat Nov 27 20:38:14 +0000 2021,World Wide,no,https://t.co/yH8cBVuMj3,,,1,0,0,"Space colonists may turn to cannibalism, scientists warn:The first generations to occupy space outposts may be left with few nutrition choices, scholars believe",
1476607992586018824,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:54 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,TheToysRusKid,JC1of1,202,313,0,18454,53653,False,False,False,Tue Sep 04 17:18:46 +0000 2018,Southern California,no,,,,0,0,0,1. Beat Street2. Harlem Knights 3. National Lampoons Vacation 4. Bachelor Party 5. Do The right Thing6. How High7. Weird Science 8. Clerks 9. Beverly Hills Cop10. Goodfellas,
1476607997405184000,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,WordPress.com,,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,Wed May 25 10:54:46 +0000 2016,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope Deployments,
1476607997925330945,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,2138,4999,157,79973,109065,False,False,False,Sat Feb 21 14:48:52 +0000 2009,International,no,https://t.co/cRcHiFCHS0,,,1,0,0,"Exciting times!Back in 1915, Einstein published his theory of General Relativity. 1917 the Bolsheviks led the worlds’ first proletarian revolution. Humanity is about to peer back to the beginning of time, and the world is poised once again for revolution.",
1476608000114802695,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Alan Stern,AlanStern,43743,1719,1163,33719,40081,False,False,False,Sat Jul 26 17:50:37 +0000 2008,"Niwot, CO",no,,,ng_rippel|rhwells79|AlanStern|NBCNews,0,4,0,"Obviously, since science is a mental mode of nature and Pluto doesn’t have a mind. But having it is the very essence of science and that requires functionally useful concepts like “planet”. Non-functional concepts hurt the progress of science and the public’s insights into it.",
1476608001402277893,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Nhada Naim,kognomee_,115,403,0,14149,19509,False,False,False,Mon Sep 12 07:35:14 +0000 2016,Im there but not too far,no,,,,0,0,0,Jupiter is in Penis and Vagina is stationed retrograde. Wear a condom ya’ll,
1476608002782203915,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Themistoclea (Bonnie DeVarco),Themistoclea,358,514,101,1004,7869,False,False,False,Tue Dec 23 19:34:02 +0000 2008,Ubiquitous,no,https://t.co/czsEhxCaVk|https://t.co/n65TKzRAHV,,NASAWebb,2,1,0,"How 'bout we squeeze in some good news before the year ends? Not only did deploy its tower assembly, the team also said the telescope's launch was so precise that we should expect the science to continue well beyond 10 years!",
1476608009669201922,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Radio Justice 📻🎙⚖,justiceputnam,2599,1395,84,93424,193934,False,False,False,Tue Jul 14 05:10:36 +0000 2009,"Rogue River, Oregon #homebase",no,,,,0,0,0,"NASA: It wasn't a strike, it was just a work slowdown.",
1476608010202066949,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Kwaku Barcelona,o_theophilus1,3845,1011,3,25207,23203,False,False,False,Sun Apr 03 20:59:45 +0000 2016,Kumerican/Ghana/WestAfrica/USA,no,,,FabrizioRomano,0,1,0,"I was being interviewed by Elon Musk. He asked, ""where are you from"", and I said Portugal. He replied, ""so you are a fellow country man of a Pen merchant whose freekick ball broke my rover on Mars. Get out!"". Tears ran down my face. Shame on you Pendu for costing me my dream job",
1476608015965040643,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:59 +0000 2021,Buckshee Forum,,,Buckshee Forum,,,,,,,,,[[]],{},,,,,,,The Buckshee,BucksheeForum,17,1,0,0,329009,False,False,False,Fri Nov 03 12:41:31 +0000 2017,,no,https://t.co/V1zNCNlkQPJames,,,1,0,0,Webb telescope is captured soaring through space from Earth,


**Observations**
1. There appears to be a difference between the two approaches in terms of the number of rows with blank values that are removed. In this notebook, we will not further explore the differences between these two approaches of removing missing values.
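2. As we can see, the number of rows before and after applying the filter to remove tweets containing words in the list of unwanted words is the same. This filter was created using PySpark DataFrame's built-in methods but, unfortunately, in the run recorded above this approach did not work. The likely cause is that the negated per-word conditions were combined with a logical OR, which only removes a tweet if it contains *every* unwanted word; removing a tweet that contains *any* unwanted word requires combining the negated conditions with a logical AND (or, equivalently, negating the OR of the un-negated conditions).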

### Load Data into single PySpark `DataFrame` and Apply Filters Using `pandas` User-Defined Functions

The workaround for the filter to remove unwanted tweets, based on their words, will be to use a [PySpark `pandas` User Defined Function (UDF)](https://spark.apache.org/docs/3.0.0/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs) that applies the `pandas` string method `.contains()` to remove the unwanted list of substrings in the tweet text column (`text`). The `pandas` UDF is defined below, wrapped inside a Python function in order to pass a [keyword argument](https://docs.python.org/3/glossary.html) to the `pandas` UDF

In [None]:
def exclude_by_partial_str_wrapper(
    column: "pyspark.sql.Column", unwanted_partial_str: str = 'one|two'
) -> "pyspark.sql.Column":
    """Build a boolean Spark column flagging rows free of unwanted substrings.

    A pandas UDF only receives column data, so the search pattern is captured
    by closure: this wrapper accepts the pattern as a keyword argument and
    defines the UDF around it.

    Parameters
    ----------
    column : pyspark.sql.Column
        String column to inspect (e.g. ``F.col("text")``).
    unwanted_partial_str : str
        Pattern passed to ``pandas.Series.str.contains`` (interpreted as a
        regular expression by default, so ``'one|two'`` matches either word).
        The text is lower-cased before matching, so the pattern should be
        lower-case as well.

    Returns
    -------
    pyspark.sql.Column
        Boolean column that is ``True`` where the value does NOT match the
        pattern and ``False`` where it does. Null values are treated as not
        matching, so they yield ``True``.
    """

    @F.pandas_udf(T.BooleanType(), F.PandasUDFType.SCALAR)
    def exclude_by_partial_str(column: pd.Series) -> pd.Series:
        # na=False keeps the intermediate result strictly boolean; without it
        # a null value produces NaN and the ``~`` inversion raises a TypeError
        contains_unwanted = column.str.lower().str.contains(
            unwanted_partial_str, na=False
        )
        return ~contains_unwanted

    return exclude_by_partial_str(column)

We'll now call this `pandas` UDF to exclude tweets whose text column contains any of the unwanted substrings from the PySpark `DataFrame`

In [None]:
# filter 2: flag each row with the pandas UDF, then keep only the rows whose
# tweet text contains none of the unwanted substrings
df2 = (
    df
    .withColumn(
        "no_unwanted_strs",
        exclude_by_partial_str_wrapper(F.col("text"), unwanted_partial_strings),
    )
    .filter(F.col("no_unwanted_strs") == 1)
)
print(
    f"Number of rows after removing unwanted partial tweets using pandas UDF = {df2.count():,}"
)

# sort by ascending tweet id before previewing the first ten rows
df2 = df2.orderBy(F.asc("id"))
display(df2.limit(10))

Number of rows after removing unwanted partial tweets using pandas UDF = 25,965


id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,blank,no_unwanted_strs
1476607990027407364,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:53 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Happy New Years 2022,GuessWho122021,216,255,0,1165,3214,False,False,False,Sat Nov 27 20:38:14 +0000 2021,World Wide,no,https://t.co/yH8cBVuMj3,,,1,0,0,"Space colonists may turn to cannibalism, scientists warn:The first generations to occupy space outposts may be left with few nutrition choices, scholars believe",,True
1476607992586018824,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:54 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,TheToysRusKid,JC1of1,202,313,0,18454,53653,False,False,False,Tue Sep 04 17:18:46 +0000 2018,Southern California,no,,,,0,0,0,1. Beat Street2. Harlem Knights 3. National Lampoons Vacation 4. Bachelor Party 5. Do The right Thing6. How High7. Weird Science 8. Clerks 9. Beverly Hills Cop10. Goodfellas,,True
1476607997405184000,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,WordPress.com,,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,Wed May 25 10:54:46 +0000 2016,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope Deployments,,True
1476607997925330945,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,2138,4999,157,79973,109065,False,False,False,Sat Feb 21 14:48:52 +0000 2009,International,no,https://t.co/cRcHiFCHS0,,,1,0,0,"Exciting times!Back in 1915, Einstein published his theory of General Relativity. 1917 the Bolsheviks led the worlds’ first proletarian revolution. Humanity is about to peer back to the beginning of time, and the world is poised once again for revolution.",,True
1476608000114802695,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Alan Stern,AlanStern,43743,1719,1163,33719,40081,False,False,False,Sat Jul 26 17:50:37 +0000 2008,"Niwot, CO",no,,,ng_rippel|rhwells79|AlanStern|NBCNews,0,4,0,"Obviously, since science is a mental mode of nature and Pluto doesn’t have a mind. But having it is the very essence of science and that requires functionally useful concepts like “planet”. Non-functional concepts hurt the progress of science and the public’s insights into it.",,True
1476608001402277893,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Nhada Naim,kognomee_,115,403,0,14149,19509,False,False,False,Mon Sep 12 07:35:14 +0000 2016,Im there but not too far,no,,,,0,0,0,Jupiter is in Penis and Vagina is stationed retrograde. Wear a condom ya’ll,,True
1476608002782203915,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Themistoclea (Bonnie DeVarco),Themistoclea,358,514,101,1004,7869,False,False,False,Tue Dec 23 19:34:02 +0000 2008,Ubiquitous,no,https://t.co/czsEhxCaVk|https://t.co/n65TKzRAHV,,NASAWebb,2,1,0,"How 'bout we squeeze in some good news before the year ends? Not only did deploy its tower assembly, the team also said the telescope's launch was so precise that we should expect the science to continue well beyond 10 years!",,True
1476608009669201922,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Radio Justice 📻🎙⚖,justiceputnam,2599,1395,84,93424,193934,False,False,False,Tue Jul 14 05:10:36 +0000 2009,"Rogue River, Oregon #homebase",no,,,,0,0,0,"NASA: It wasn't a strike, it was just a work slowdown.",,True
1476608010202066949,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Kwaku Barcelona,o_theophilus1,3845,1011,3,25207,23203,False,False,False,Sun Apr 03 20:59:45 +0000 2016,Kumerican/Ghana/WestAfrica/USA,no,,,FabrizioRomano,0,1,0,"I was being interviewed by Elon Musk. He asked, ""where are you from"", and I said Portugal. He replied, ""so you are a fellow country man of a Pen merchant whose freekick ball broke my rover on Mars. Get out!"". Tears ran down my face. Shame on you Pendu for costing me my dream job",,True
1476608015965040643,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:59 +0000 2021,Buckshee Forum,,,Buckshee Forum,,,,,,,,,[[]],{},,,,,,,The Buckshee,BucksheeForum,17,1,0,0,329009,False,False,False,Fri Nov 03 12:41:31 +0000 2017,,no,https://t.co/V1zNCNlkQPJames,,,1,0,0,Webb telescope is captured soaring through space from Earth,,True


**Notes**
1. The discrepancy between the number of rows remaining after filtering is due to the difference in how `pandas` and PySpark appear to drop rows with missing values, which we identified earlier.

**Observations**
1. As we can see, the filter to remove the list of unwanted words from the tweets is now being applied and the number of rows after filtering is less than before filtering.

### Load and Filter All Data with PySpark

We'll now repeat the entire PySpark workflow with the files from both 2021 (used above) and 2022.

First, we'll get a flat list with the Databricks filesystem filepaths covering tweets streamed during both 2021 and 2022

In [None]:
# Collect the hourly folders for both 2021 and 2022 as a single flat list
# (helper is defined earlier in the notebook; the last argument is presumably
# the list of years to include - TODO confirm against the helper)
list_of_hourly_dirs_flat_all = get_hourly_folders_per_day(s3_bucket_name, path_to_folder, [2021, 2022])
print(f"Found {len(list_of_hourly_dirs_flat_all):,} hourly folders")

# Mount the hourly folders as Databricks datasets; the returned dataset names
# are used by the next cell to look up the mounted files
dataset_names_all = mount_all_hourly_files(s3_bucket_name, list_of_hourly_dirs_flat_all)
print(f"Finished mounting {len(dataset_names_all):,} datasets")

In [None]:
# Get filepaths to files in all mounted folders. The helper returns two
# objects: the file paths (loaded into Spark further below) and per-file
# entries which, going by the displayed output, carry path, name and size
file_paths_full_all, files_full_all = get_dbfs_mounted_filepaths(dataset_names_all)
print("Printing attributes for first 10 mounted files:")
# preview only the first ten entries to keep the cell output short
display(files_full_all[:10])

Printing attributes for first 10 mounted files:


path,name,size
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe,twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe,68390
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e,twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e,75672
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748,twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748,73484
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4,twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4,93691
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1,twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1,93241
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-41-03-b8becce2-f393-42a9-bc9c-f8847fb4a7d2,twitter_delivery_stream-1-2021-12-30-17-41-03-b8becce2-f393-42a9-bc9c-f8847fb4a7d2,71521
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-42-05-d4790ec9-be7b-443a-9109-3d3614310fd9,twitter_delivery_stream-1-2021-12-30-17-42-05-d4790ec9-be7b-443a-9109-3d3614310fd9,68828
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-43-05-bbf49b5b-a7a8-4ed7-a7ef-27e366aa82c0,twitter_delivery_stream-1-2021-12-30-17-43-05-bbf49b5b-a7a8-4ed7-a7ef-27e366aa82c0,71076
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-44-07-2bf0e38a-9da5-4437-b696-995b1abd0bea,twitter_delivery_stream-1-2021-12-30-17-44-07-2bf0e38a-9da5-4437-b696-995b1abd0bea,69835
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-45-07-78f1c731-826b-456c-b6c5-51f4d06ca11b,twitter_delivery_stream-1-2021-12-30-17-45-07-78f1c731-826b-456c-b6c5-51f4d06ca11b,77615


In [None]:
# Preview only the first five DBFS filepaths, one per line
print("Printing first five filepaths:")
for file_path_full_all_first_five in file_paths_full_all[:5]:
    print(file_path_full_all_first_five)

Printing first five filepaths:
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-35-58-b98c4593-2164-445f-9c09-b80d647c80fe
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-37-00-03050126-c5a3-457a-9141-44c723e9a16e
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-38-01-ee043884-55a1-4696-afe2-ae744d931748
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-39-01-45daac32-de3f-4189-b312-8cd405214fe4
dbfs:/mnt/kinesis-demo_2021_12_30_17/twitter_delivery_stream-1-2021-12-30-17-40-03-9c7445d1-3438-4c6c-96df-4629d801bbd1


We'll now read all these files into a single PySpark `DataFrame` and apply the same basic filters as those from above (remove missing values and unwanted tweets based on their contents)

In [None]:
# Read every mounted file as header-less, tab-delimited text with inferred
# column types, assign the expected column names and cache the result so the
# filters below reuse it.
# NOTE: no custom quote/escape options are set, so Spark's CSV defaults apply.
df = (
    spark.read
    .format("csv")
    .options(
        inferSchema="true",
        header=False,
        encoding="utf-8",
        delimiter="\t",
        multiline="false",
    )
    .load(file_paths_full_all)
    .toDF(*pyspark_df_column_names)
    .cache()
)

Show the number of tweets that were streamed using AWS Kinesis

In [None]:
# count() is a Spark action, so this is also the first full pass over the
# cached DataFrame
print(
    "Number of rows after loading data = "
    f"{df.count():,}"
)

Number of rows after loading data = 1,162,432


**Observations**
1. As we can see, this is significantly larger than the number of rows in the earlier PySpark `DataFrame` since this one covers the full tweets dataset (all hourly files of streamed Twitter data) stored in S3.

We'll drop rows from this `DataFrame` with missing values in
- all columns
- the `text` column (text of the tweet)

In [None]:
# filter 1: drop rows that are empty in every column, then rows that have no
# tweet text
df = (
    df
    .dropna(how="all")
    .dropna(subset=["text"])
)

print(f"Number of rows after dropping missing values = {df.count():,}")

Number of rows after dropping missing values = 1,155,445


We'll now apply the second filter to exclude tweets based on the words used

In [None]:
# filter 2: flag each row with the pandas UDF, then keep only the rows whose
# tweet text contains none of the unwanted substrings
df2 = (
    df
    .withColumn(
        "no_unwanted_strs",
        exclude_by_partial_str_wrapper(F.col("text"), unwanted_partial_strings),
    )
    .filter(F.col("no_unwanted_strs") == 1)
)
print(
    "Number of rows after removing unwanted partial tweets "
    f"using pandas UDF = {df2.count():,}"
)

Number of rows after removing unwanted partial tweets using pandas UDF = 989,822


Finally, we'll sort the resulting `DataFrame` as we did earlier

In [None]:
# sort by ascending tweet id, then preview the first ten rows
df2 = df2.orderBy(F.asc("id"))
display(df2.limit(10))

id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,blank,no_unwanted_strs
1476607990027407364,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:53 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Happy New Years 2022,GuessWho122021,216,255,0,1165,3214,False,False,False,Sat Nov 27 20:38:14 +0000 2021,World Wide,no,https://t.co/yH8cBVuMj3,,,1,0,0,"Space colonists may turn to cannibalism, scientists warn:The first generations to occupy space outposts may be left with few nutrition choices, scholars believe",,True
1476607992586018824,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:54 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,TheToysRusKid,JC1of1,202,313,0,18454,53653,False,False,False,Tue Sep 04 17:18:46 +0000 2018,Southern California,no,,,,0,0,0,1. Beat Street2. Harlem Knights 3. National Lampoons Vacation 4. Bachelor Party 5. Do The right Thing6. How High7. Weird Science 8. Clerks 9. Beverly Hills Cop10. Goodfellas,,True
1476607997405184000,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,WordPress.com,,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,Wed May 25 10:54:46 +0000 2016,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope Deployments,,True
1476607997925330945,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,2138,4999,157,79973,109065,False,False,False,Sat Feb 21 14:48:52 +0000 2009,International,no,https://t.co/cRcHiFCHS0,,,1,0,0,"Exciting times!Back in 1915, Einstein published his theory of General Relativity. 1917 the Bolsheviks led the worlds’ first proletarian revolution. Humanity is about to peer back to the beginning of time, and the world is poised once again for revolution.",,True
1476608000114802695,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:55 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Alan Stern,AlanStern,43743,1719,1163,33719,40081,False,False,False,Sat Jul 26 17:50:37 +0000 2008,"Niwot, CO",no,,,ng_rippel|rhwells79|AlanStern|NBCNews,0,4,0,"Obviously, since science is a mental mode of nature and Pluto doesn’t have a mind. But having it is the very essence of science and that requires functionally useful concepts like “planet”. Non-functional concepts hurt the progress of science and the public’s insights into it.",,True
1476608001402277893,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Nhada Naim,kognomee_,115,403,0,14149,19509,False,False,False,Mon Sep 12 07:35:14 +0000 2016,Im there but not too far,no,,,,0,0,0,Jupiter is in Penis and Vagina is stationed retrograde. Wear a condom ya’ll,,True
1476608002782203915,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:56 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Themistoclea (Bonnie DeVarco),Themistoclea,358,514,101,1004,7869,False,False,False,Tue Dec 23 19:34:02 +0000 2008,Ubiquitous,no,https://t.co/czsEhxCaVk|https://t.co/n65TKzRAHV,,NASAWebb,2,1,0,"How 'bout we squeeze in some good news before the year ends? Not only did deploy its tower assembly, the team also said the telescope's launch was so precise that we should expect the science to continue well beyond 10 years!",,True
1476608009669201922,,,,,True,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter Web App,,,Twitter Web App,,,,,,,,,[[]],{},,,,,,,Radio Justice 📻🎙⚖,justiceputnam,2599,1395,84,93424,193934,False,False,False,Tue Jul 14 05:10:36 +0000 2009,"Rogue River, Oregon #homebase",no,,,,0,0,0,"NASA: It wasn't a strike, it was just a work slowdown.",,True
1476608010202066949,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:58 +0000 2021,Twitter for iPhone,,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Kwaku Barcelona,o_theophilus1,3845,1011,3,25207,23203,False,False,False,Sun Apr 03 20:59:45 +0000 2016,Kumerican/Ghana/WestAfrica/USA,no,,,FabrizioRomano,0,1,0,"I was being interviewed by Elon Musk. He asked, ""where are you from"", and I said Portugal. He replied, ""so you are a fellow country man of a Pen merchant whose freekick ball broke my rover on Mars. Get out!"". Tears ran down my face. Shame on you Pendu for costing me my dream job",,True
1476608015965040643,,,,,False,0,0,0,0,False,False,Thu Dec 30 17:35:59 +0000 2021,Buckshee Forum,,,Buckshee Forum,,,,,,,,,[[]],{},,,,,,,The Buckshee,BucksheeForum,17,1,0,0,329009,False,False,False,Fri Nov 03 12:41:31 +0000 2017,,no,https://t.co/V1zNCNlkQPJames,,,1,0,0,Webb telescope is captured soaring through space from Earth,,True


## Summary
With this processing completed, we would now be ready to proceed to perform the quantitative analysis on this data using [PySparkML](https://spark.apache.org/docs/2.3.1/api/python/pyspark.ml.html), as was done in `4_data_processing.ipynb` ([link](https://nbviewer.org/github/elsdes3/big-data-ml/blob/main/4_data_processing.ipynb)).