# Create Data Splits

In [1]:
# Format code cells automatically (lab_black is the JupyterLab Black-formatter extension)
%load_ext lab_black
# Enable the autoreload extension; mode 2 reloads modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from calendar import day_name
from glob import glob
from datetime import datetime
import shutil
import zipfile

import boto3
import dask.dataframe as dd
import pandas as pd
from dask_ml.model_selection import train_test_split
from sklearn.model_selection import train_test_split as sk_train_test_split

In [3]:
# The project root is taken to be the parent of the current working directory
PROJ_ROOT = os.pardir
# <project root>/src is added to the import path so modules stored there can be imported
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path += [src_dir]

In [4]:
# Import s3_utils and register it with the autoreload extension
%aimport s3_utils
# S3 download and zip extraction helpers, called in the "Get Data" section below
from s3_utils import download_file_from_s3, extract_zip_file

## About

### Objective
This notebook will create
- training
- validation
- test

splits that will be used to [fine-tune](https://huggingface.co/docs/transformers/training#train-in-native-pytorch) (train) a pre-trained transformers model ([1](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased?text=I+like+you.+I+love+you), [2](https://arxiv.org/pdf/2002.10957.pdf)) for twitter sentiment classification
  - since `dask-ml` provides a convenient method (`train_test_split()`, [link](https://ml.dask.org/modules/generated/dask_ml.model_selection.train_test_split.html#dask-ml-model-selection-train-test-split)) for creating data splits
- a split to assess business metrics, using the **previously trained NLP model**

### Data
The data used to create the four splits above will be the processed data that was created by the previous notebook (`5-process-data/notebooks/5_process_data.ipynb`).

### Note About Data in ML Model Drift Monitoring
If the performance of the model being developed in this project is adequate, then the fine-tuned model will be deployed to a production end-point to serve inference. After the same number of inference predictions have been made as the size of the test split used during ML model development above, the following will be performed
- all tweets predicted during inference are labeled
  - manually
  - using the **previously trained NLP model**
- deployed ML model predictions, made during inference, will be scored against these labels (previous bullet point) in order to determine if ML model performance has
  - drifted (the ML training pipeline will be triggered)
    - scores are not within some threshold of the scores on the test split during ML model development
    - predictions made by the **previously trained NLP model** will be served to the customer
      - the other option here is to serve the same (poorly scoring) predictions made by the ML model to the customers
    - updated training, validation and testing splits will be created using *all available data*
    - a new ML model will be trained using *all available data* and will then be deployed to production
  - not drifted
    - scores are within some threshold of the scores on the test split during ML model development
    - the currently used ML model will continue to serve inference

## User Inputs

In [5]:
# S3 key prefix of the project data (the leading "/" is stripped when the S3 key is built)
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
# local folder for, and base name (without extension) of, the processed-data archive
processed_data_dir = "../data/processed"
processed_file_name = "processed_text"

# train-test split
# fraction for the test split; the validation fraction is derived from this value
test_split_frac = 0.125

# columns of the processed data that are loaded into memory for filtering
cols_to_use = [
    "id",
    "created_at",
    "contributors",
    "user_joined",
    "source_text",
    "place_country",
    "user_location",
    "user_followers",
    "user_friends",
    "user_listed",
    "user_favourites",
    "user_statuses",
    "user_protected",
    "user_verified",
    "is_quote_status",
    "retweeted",
    "retweeted_tweet",
    "in_reply_to_screen_name",
    "user_screen_name",
    "num_urls_in_tweet_text",
    "num_words",
    "text",
]

# sampling data
# NOTE(review): these three values are not read in the visible part of the
# notebook - presumably used when sampling tweets for the NLP splits; confirm
nlp_sample_size = 0.66667
nlp_no_support_needed_sample_size = 750
sampled_fname = "sampled_data.csv.zip"
# nlp_cols = [
#     "id",
#     "created_at",
#     "text",
#     # columns below were not present when this notebook was run and so
#     # next two notebooks needed merging of
#     # - NLP splits (created at end of this notebook)
#     # - processed data (created at end of 5_process_data.ipynb)
#     # if these columns are present, then the following are not needed
#     # - merging
#     # - dependencies to load processed data (dask)
#     "contributors",
#     "user_joined",
#     "source_text",
#     "place_country",
#     "user_location",
#     "user_followers",
#     "user_friends",
#     "user_listed",
#     "user_favourites",
#     "user_statuses",
#     "user_protected",
#     "user_verified",
#     "is_quote_status",
#     "retweeted",
#     "retweeted_tweet",
#     "in_reply_to_screen_name",
#     "user_screen_name",
#     "num_urls_in_tweet_text",
# ]

# inference
# NOTE(review): not read in the visible part of the notebook - presumably the
# batch size and first timestamp of the simulated inference period; confirm
batch_size = 600
inference_start_date = "2022-01-10 00:00:00"

# feature engineering
# hour-of-day bin edges (b) and their labels (l), passed to pd.cut to derive time_of_day
b = [0, 4, 8, 12, 16, 20, 24]
l = ["Late Night", "Early Morning", "Morning", "Afternoon", "Evening", "Night"]

# workflow switches
# NOTE(review): not read in the visible part of the notebook - confirm where each is honoured
upload_to_s3 = True
create_nlp_splits = True
cleanup_local_files = True

In [6]:
# Name of the S3 bucket holding the project data (empty string when the variable is unset)
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

# Build the S3 client: prefer the "default" AWS profile and fall back to
# environment variables when that profile does not exist
try:
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    # Only the missing-profile error is expected here. Anything else is re-raised
    # instead of being swallowed, which would leave s3_client and aws_region
    # undefined and surface later as a confusing NameError
    if str(e) != "The config profile (default) could not be found":
        raise
    aws_region = os.getenv("AWS_REGION")
    s3_client = boto3.client("s3", region_name=aws_region)
    print("Retrieved AWS credentials from .env file")

Retrieved AWS credentials from .env file


In [7]:
# Target dtype of every column in the processed data, listed in column order as
# consecutive runs of columns that share a dtype. Boolean-like columns
# (contributors, is_quote_status, favorited, retweeted, user_protected,
# user_verified) are deliberately kept as strings rather than pd.BooleanDtype().
dtypes_dict = {
    column: make_dtype()
    for make_dtype, columns in [
        (
            pd.StringDtype,
            ["id", "geo", "coordinates", "place", "contributors", "is_quote_status"],
        ),
        (
            pd.Int32Dtype,
            ["quote_count", "reply_count", "retweet_count", "favorite_count"],
        ),
        (
            pd.StringDtype,
            [
                "favorited",
                "retweeted",
                "source",
                "in_reply_to_user_id",
                "in_reply_to_screen_name",
                "source_text",
                "place_id",
                "place_url",
                "place_place_type",
                "place_name",
                "place_full_name",
                "place_country_code",
                "place_country",
                "place_bounding_box_type",
                "place_bounding_box_coordinates",
                "place_attributes",
                "coords_type",
                "coords_lon",
                "coords_lat",
                "geo_type",
                "geo_lon",
                "geo_lat",
                "user_name",
                "user_screen_name",
            ],
        ),
        (
            pd.Int32Dtype,
            [
                "user_followers",
                "user_friends",
                "user_listed",
                "user_favourites",
                "user_statuses",
            ],
        ),
        (
            pd.StringDtype,
            [
                "user_protected",
                "user_verified",
                "user_contributors_enabled",
                "user_location",
                "retweeted_tweet",
                "tweet_text_urls",
                "tweet_text_hashtags",
                "tweet_text_usernames",
            ],
        ),
        (
            pd.Int32Dtype,
            [
                "num_urls_in_tweet_text",
                "num_users_in_tweet_text",
                "num_hashtags_in_tweet_text",
            ],
        ),
        (pd.StringDtype, ["text"]),
        (
            pd.BooleanDtype,
            [
                "contains_wanted_text",
                "contains_wanted_text_case_sensitive",
                "contains_multi_word_wanted_text",
                "contains_crypto_terms",
                "contains_religious_terms",
                "contains_inappropriate_terms",
                "contains_video_games_terms",
                "contains_misc_unwanted_terms",
                "contains_non_english_terms",
            ],
        ),
        (
            pd.StringDtype,
            ["text_trimmed", "text_stripped", "text_processed", "words"],
        ),
        (pd.Int32Dtype, ["num_words"]),
    ]
    for column in columns
}

# File name of the processed-data zip archive
proc_text_zip_fname = "{}.zip".format(processed_file_name)

# Test fraction re-expressed relative to the data that is not in the test split
# (presumably so the validation split ends up the same size as the test split - TODO confirm)
val_split_frac = test_split_frac / (1.0 - test_split_frac)

## Get Data

We will start by downloading the processed and filtered `.zip` file from S3 and extracting all the contained `.parquet` files into a `.parquet.gzip` folder

In [8]:
%%time
if not os.path.exists(os.path.join(processed_data_dir, proc_text_zip_fname)):
    download_file_from_s3(
        s3_client,
        s3_bucket_name,
        # path_to_folder,
        processed_data_dir,
        proc_text_zip_fname,
        aws_region,
        f"{path_to_folder[1:]}processed/{os.path.splitext(proc_text_zip_fname)[0]}",
    )
    extract_zip_file(
        os.path.join(processed_data_dir, proc_text_zip_fname),
        f"{processed_data_dir}/{os.path.splitext(proc_text_zip_fname)[0]}.parquet.gzip",
    )
proc_files = glob(f"{processed_data_dir}/*.parquet.gzip")

Started downloading processed data zip file from datasets/twitter/kinesis-demo/processed/processed_text.zip to ../data/processed/processed_text.zip at 2022-10-29 22:35:41.733...
Done downloading in 0.612 seconds.
Started extracting filtered data parquet files from processed data zip file to ../data/processed/processed_text.parquet.gzip at 2022-10-29 22:35:42.346...
Done extracting in 0.013 seconds.
CPU times: user 114 ms, sys: 42.6 ms, total: 157 ms
Wall time: 795 ms


Find the number of individual `.gz.parquet` files inside the extracted `.parquet.gzip` folder

In [9]:
# Parquet part-files sit one level inside each extracted *.parquet.gzip folder
part_file_pattern = f"{processed_data_dir}/*.parquet.gzip/*.gz.parquet"
proc_files_all = glob(part_file_pattern)
print(len(proc_files_all))

12


Use Dask to load the `.parquet.gzip` folder (consisting of multiple `.parquet` files) into a single Dask DataFrame

In [10]:
%%time
# Lazily read the extracted parquet data, cast every column to its declared
# dtype, order the rows by posting time and use one partition per part-file
ddf = (
    dd.read_parquet(proc_files)
    .reset_index(drop=True)
    .astype(dtypes_dict)
    .set_index("created_at")  # sorts DataFrame based on this column
    .reset_index()  # turn created_at back into a regular column
    .repartition(npartitions=len(proc_files_all))
)
print(
    f"Loaded processed data from *.parquet.gzip files into Dask DataFrame "
    f"with {ddf.npartitions:,} partitions"
)
# Preview the first rows with every column visible
with pd.option_context("display.max_columns", None):
    display(ddf.head())
# List the dtype of every column without truncating the listing
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    display(ddf.dtypes.rename("dtype").to_frame())

Loaded processed data from *.parquet.gzip files into Dask DataFrame with 12 partitions


Unnamed: 0,created_at,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,text,contains_wanted_text,contains_wanted_text_case_sensitive,contains_multi_word_wanted_text,contains_crypto_terms,contains_religious_terms,contains_inappropriate_terms,contains_video_games_terms,contains_misc_unwanted_terms,contains_non_english_terms,text_stripped,text_processed,text_trimmed,words,num_words
0,2021-12-30 17:35:55,1476607997405184000,,,,,False,0,0,0,0,False,False,"<a href=""http://publicize.wp.com/"" rel=""nofoll...",,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,2016-05-25 10:54:46,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope De...,True,True,True,False,False,False,False,False,False,NASA Plans Coverage of Webb Space Telescope De...,nasa plans coverage of webb space telescope de...,NASA Plans Coverage of Webb Space Telescope De...,"[NASA, Plans, Coverage, of, Webb, Space, Teles...",7
1,2021-12-30 17:35:59,1476608015965040643,,,,,False,0,0,0,0,False,False,"<a href=""http://buckshee.petimer.ru"" rel=""nofo...",,,Buckshee Forum,,,,,,,,,[[]],{},,,,,,,The Buckshee,BucksheeForum,17,1,0,0,329009,False,False,False,2017-11-03 12:41:31,,no,https://t.co/V1zNCNlkQPJames,,,1,0,0,Webb telescope is captured soaring through sp...,True,True,True,False,False,False,False,False,False,Webb telescope is captured soaring through spa...,webb telescope is captured soaring through spa...,Webb telescope is captured soaring through spa...,"[Webb, telescope, is, captured, soaring, throu...",9
2,2021-12-30 17:36:01,1476608024521453573,,,,,False,0,0,0,0,False,False,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Fabricio F. Costa,ffalconi,1623,5002,303,23,30126,False,False,False,2009-03-06 05:49:14,San Francisco Area,no,https://t.co/9NeCWrTrIp,,,1,0,0,NASA just dropped an exciting update about the...,True,True,True,False,False,False,False,False,False,NASA just dropped an exciting update about the...,nasa just dropped an exciting update about the...,NASA just dropped an exciting update about the...,"[NASA, just, dropped, an, exciting, update, ab...",11
3,2021-12-30 17:36:04,1476608036873682952,,,,,False,0,0,0,0,False,False,"<a href=""http://twitter.com/download/iphone"" r...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Beyond Blue Aerospace,beyondblueaero,680,1157,65,5313,21178,False,False,False,2013-11-03 02:35:32,Canada,no,https://t.co/OigaIuriN4|https://t.co/0mebmz62i0,SpaceX,,2,0,1,Photo of James Webb before he transformed into...,True,True,True,False,False,False,False,False,False,Photo of James Webb before he transformed into...,photo of james webb before he transformed into...,Photo of James Webb before he transformed into...,"[Photo, of, James, Webb, before, he, transform...",13
4,2021-12-30 17:36:10,1476608059766194180,,,,,False,0,0,0,0,False,False,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...",,,IFTTT,,,,,,,,,[[]],{},,,,,,,Imperial News,imperial0news,45,0,0,112,1833,False,False,False,2017-10-17 15:46:06,,no,https://t.co/1dpeUisvWd,,,1,0,0,NASA Plans Coverage of Webb Space Telescope De...,True,True,True,False,False,False,False,False,False,NASA Plans Coverage of Webb Space Telescope De...,nasa plans coverage of webb space telescope de...,NASA Plans Coverage of Webb Space Telescope De...,"[NASA, Plans, Coverage, of, Webb, Space, Teles...",10


Unnamed: 0,dtype
created_at,datetime64[ns]
id,string
geo,string
coordinates,string
place,string
contributors,string
is_quote_status,string
quote_count,Int32
reply_count,Int32
retweet_count,Int32


CPU times: user 2.83 s, sys: 319 ms, total: 3.15 s
Wall time: 2.05 s


**Notes**
1. Using `dask.delayed` with `pd.read_parquet` followed by `.sort_values()` errored out
   ```python
   from dask import delayed
   delayed_dfs = [
       delayed(pd.read_parquet)(f).astype(dtypes_dict).sort_values(by=["created_at"])
       for f in proc_files_all
   ]
   ddf = (
       dd.from_delayed(delayed_dfs)
       .set_index('created_at')
       .reset_index(drop=True)
       .repartition(npartitions=len(proc_files_all))
   )
   ```
2. Using `dask.multi.concat` with `pd.read_parquet` followed by `.sort_values()` errored out
   ```python
   ddf = (
       dd.multi.concat(
           [
               dd.from_pandas(
                   pd.read_parquet(f).astype(dtypes_dict).sort_values(by=["created_at"]),
                   npartitions=1
               )
               for f in proc_files_all
           ],
           axis=1,
           interleave_partitions=False,
       ).sort_values(by=["created_at"])
       .repartition(npartitions=len(proc_files_all))
   )
   ```

## Filter Data

Sort the data by the `datetime` when the tweet was posted (i.e. sort by the `created_at` column), while keeping the duplicated tweets (tweets having identical text). Also, for all duplicates, add a column indicating the order in which the tweets were posted (this makes it easier to understand the meaning of tweets that are duplicates)

In [11]:
%%time
df = (
    ddf[cols_to_use].compute()
    .sort_values(by=["text", "created_at"])
    .assign(
        order=lambda df: df.groupby("text")["created_at"]
        .rank(method="first", ascending=True)
        .astype(pd.Int32Dtype())
    )
)

CPU times: user 2.08 s, sys: 161 ms, total: 2.24 s
Wall time: 1.82 s


**Notes**
1. Since we are only using a subset of all available columns for filtering the data, the subset is small enough to fit in memory so we can call `.compute()` in order to hold the subset in an in-memory (`pandas`) DataFrame.
2. (**IMPORTANT**) This project's use-case is identifying tweets that are candidates for receiving support from a mission team member. Only original tweets and replies are such candidates, so only these must be kept and retweets can be dropped. However, for NLP model training, retweets can be kept since this will help increase the size of the training data used to fine-tune the pre-trained transformer model. For this reason, retweets will be kept in the model training data. Since only one occurrence of a unique tweet is needed for training, duplicates can be dropped - this means only the first retweet is needed (`order = 1`) and all subsequent retweets can be dropped (`order > 1`).

Show the tweets that are identified as
- retweets
  - from the `retweeted_tweet` column
- replies or retweets
  - from the `is_quote_status` column

In [12]:
# Tabulate how often each value occurs in the quote and retweet indicator columns
for col in ("is_quote_status", "retweeted_tweet"):
    counts = df[col].value_counts()
    display(counts.to_frame())

Unnamed: 0,is_quote_status
False,63379
True,2230


Unnamed: 0,retweeted_tweet
no,65550
yes,59


**Observations**
1. Since a large number of tweets are not captured by these two columns, we will need other logic in order to capture retweets and replies.

### Get Tweets That Are Replies

We will first capture replies. These are identified using the `in_reply_to_screen_name` column. A non-missing value here indicates a tweet was a reply

In [13]:
# A reply is any tweet whose in_reply_to_screen_name is not the 'None' placeholder
df_replies = df.query("in_reply_to_screen_name != 'None'")
replies_caption = f"Found {len(df_replies):,} tweets that were replies"
# Widen the column display so the full tweet text is readable
with pd.option_context("display.max_colwidth", 1000):
    display(df_replies.head().style.set_caption(replies_caption))

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,text,order
904,1479467933424304135,2022-01-07 15:00:17,,2021-02-24 17:20:35,TweetDeck,,,36,21,2,6394,2806,False,False,False,False,no,TakeThatHistory,Nominal31152751,0,20,And with the James Webb Space Telescope we may be on the eve of knowing even more about this event.,1
3425,1480225197659676678,2022-01-09 17:09:23,,2009-12-17 16:49:59,Twitter for Android,,"Harlow, Essex, England",159,487,3,2766,3801,False,False,False,False,no,telohno,TheRealEggman,0,31,Science exains everything we csn see. The explanation may not be complete but that is forever being worked upon hence things like the James Webb Telescope. What does flat earth have?,1
2376,1478458028177383427,2022-01-04 20:07:16,,2013-03-11 21:03:17,Twitter Web App,,,214,317,2,31545,35111,False,False,False,False,no,TomPlesier,GlennCarr6,0,18,Too late for that. With his BP! Looks like something off a launch data page for James Webb!,1
5077,1479423441551953923,2022-01-07 12:03:29,,2016-08-25 19:46:11,Twitter for iPhone,,Secure in Christ,256,415,2,9666,53057,False,False,False,False,no,WondersNorbit,1pckt,0,8,You’re not excited about the James Webb telescope?,1
4226,1477752555396485121,2022-01-02 21:23:59,,2019-07-27 10:47:35,Twitter for iPhone,,"Australia, Victoria 🇦🇺",9641,10534,5,74120,54877,False,False,False,False,no,Justice_Wins7,SecularSandwich,0,45,John you will get to see the very early universe soon once James Webb is operational.You will see that there was only dust clouds early on even before stars were created.The iron in our blood comes from stars this was created even before the earth.,1


As shown below, `is_quote_status` is `True` for only a small fraction of replies since the original tweet may (`is_quote_status = True`) or may not (`is_quote_status = False`) itself be a reply

In [14]:
# Value counts of the indicator columns, restricted to the tweets that are replies
for col in ("is_quote_status", "retweeted_tweet", "retweeted", "contributors"):
    counts = df_replies[col].value_counts()
    display(counts.to_frame())

Unnamed: 0,is_quote_status
False,2313
True,44


Unnamed: 0,retweeted_tweet
no,2357


Unnamed: 0,retweeted
False,2357


Unnamed: 0,contributors
,2357


### Get Original Tweets

We will now get original tweets. By definition, these are not retweets themselves. To do this, we will have to handle duplicates in the `text` column of the `DataFrame`.

Get all duplicated tweets (identical text)

In [15]:
# keep=False marks every tweet whose text occurs more than once, first occurrence included
has_repeated_text = df.duplicated(subset=["text"], keep=False)
df_duplicated_tweets = df.loc[has_repeated_text]
duplicates_caption = (
    f"Found {len(df_duplicated_tweets):,} duplicates (includes the original tweet)"
)
display(df_duplicated_tweets.head(5).style.set_caption(duplicates_caption))

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,text,order
2333,1479901312170881024,2022-01-08 19:42:22,,2021-01-05 04:12:09,Twitter for iPad,,,642,1051,1,4854,8138,False,False,False,False,no,Ldogls,RockyMtnView,1,32,"Time for a break:JWST status:Distance from Earth: 668k mileProgress to L2: 74.35%Speed: .2461 miles/secHot side: 131FCold side: -327FLast deployment: Starboard Primary Mirror WingJWST is now fully deployed!Read about it below, thenCarry On!",1
2350,1479901400029036546,2022-01-08 19:42:43,,2017-06-29 13:56:48,Twitter for iPhone,,"Oregon, USA",6177,6441,16,158241,166570,False,False,False,False,no,,RegVickers,1,32,"Time for a break:JWST status:Distance from Earth: 668k mileProgress to L2: 74.35%Speed: .2461 miles/secHot side: 131FCold side: -327FLast deployment: Starboard Primary Mirror WingJWST is now fully deployed!Read about it below, thenCarry On!",2
2604,1479903762080698368,2022-01-08 19:52:06,,2016-04-03 06:10:25,Twitter Web App,,"California, USA",6632,7035,7,195146,121519,False,False,False,False,no,,kristyshl,1,32,"Time for a break:JWST status:Distance from Earth: 668k mileProgress to L2: 74.35%Speed: .2461 miles/secHot side: 131FCold side: -327FLast deployment: Starboard Primary Mirror WingJWST is now fully deployed!Read about it below, thenCarry On!",3
2862,1479906707841208323,2022-01-08 20:03:49,,2012-01-26 19:18:21,Twitter for Android,,A Shack in Virginia,10134,10074,9,189119,140229,False,False,False,False,no,,gojoe_joe,1,32,"Time for a break:JWST status:Distance from Earth: 668k mileProgress to L2: 74.35%Speed: .2461 miles/secHot side: 131FCold side: -327FLast deployment: Starboard Primary Mirror WingJWST is now fully deployed!Read about it below, thenCarry On!",4
3750,1479559011112345600,2022-01-07 21:02:11,,2021-01-05 04:12:09,Twitter for iPad,,,640,1050,1,4830,8104,False,False,False,False,no,bobbiejaneV,RockyMtnView,1,32,"Time for a break:JWST status:Distance from Earth: 647k mileProgress to L2: 72.05%Speed: .2592 miles/secHot side temp: 131FCold side temp: -327FCurrent deployment: Port Primary Mirror WingRead about the last deployment below, thenCarry On!",1


Original tweets will have
- `in_reply_to_screen_name == 'None'`
- `order == 1`

We can use these in a filter to extract candidate original tweets. Shown below are tweets for which `in_reply_to_screen_name` is `None` and `order == 1`

In [16]:
# Among tweets with repeated text, show the earliest posting that is not a reply
first_non_reply = "(in_reply_to_screen_name == 'None') & (order == 1)"
with pd.option_context("display.max_colwidth", 1000):
    display(df_duplicated_tweets.query(first_non_reply))

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,...,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,text,order
3740,1478285504420450308,2022-01-04 08:41:44,,2015-11-08 18:11:07,IFTTT,,,102,222,6,...,False,False,False,no,,kwalis6294,2,16,- The James Webb Telescope Lights Up the Sky During Launch of the day via NASA,1
3923,1479918485107089414,2022-01-08 20:50:37,,2009-03-06 18:25:05,Twitter Web App,,New England,30013,26025,91,...,False,False,False,no,,aroseblush,1,30,"The James Webb Space Telescope has completed a final, crucial step: unfolding the last section of its golden, hexagonal mirrors. That amounts to it becoming fully deployed, according to NASA",1
752,1477335532556496897,2022-01-01 17:46:53,,2008-06-06 18:56:59,TweetCaster for Android,,USA,651,637,175,...,False,False,False,no,,rblumel,1,12,Update: NASA Plans Coverage of Webb Space Telescope Deployments | NASA -,1
2597,1478769681142005767,2022-01-05 16:45:40,,2008-10-15 16:50:33,Twitter Web App,,"40.887590000, -74.575584000",15970,12425,751,...,False,False,False,no,,genejm29,1,9,Via - Blueprints of the James Webb Space Telescope,1
7219,1479964413008527360,2022-01-08 23:53:07,,2017-05-30 18:19:23,Twitter Web App,,"He/Him,EU27, #indyref2 = EU28",6145,5563,5,...,False,False,False,no,,zcelticboy,1,4,Webb Telescope be like...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3865,1477710057542324226,2022-01-02 18:35:06,,2021-01-23 19:49:31,Twitter for Android,,,1,321,0,...,False,False,False,no,,DivyaDibbu21,1,42,"“Where James Webb goes, no astronaut can go.” talks us through the tense launch of the most powerful telescope ever built!The will peer back in time 13.5b years &amp; could answer Q’s on the universe’s formation, black holes &amp; life outside Earth",1
2364,1480193576495689731,2022-01-09 15:03:43,,2015-11-13 00:28:11,Twitter for iPad,,"New York, NY",2620,4964,76,...,False,True,False,no,,ma_nyc7,0,43,"“With a primary mirror 21 feet across, the Webb was too big to fit in a rocket, and so the mirror was made in segments, 18 gold-plated hexagons folded together, that would have to pop into position once the telescope was in space.”",1
3628,1477676490166837250,2022-01-02 16:21:43,,2010-12-22 14:20:32,Twitter for iPhone,,"Dublin, Ireland",2628,2559,52,...,False,False,False,no,,ScienceSpinning,1,33,⁦ give Irish built camera that videoed separation of Webb Space Telescope from its rocket seen all over the world - but no mention of this from our own national broadcaster ⁦ ⁦,1
1076,1479759170031140867,2022-01-08 10:17:33,,2021-03-12 07:03:36,dlvr.it,,"London, UK",8980,400,29,...,False,False,False,no,,BHheadlines,0,25,"𝗕𝗥𝗘𝗔𝗞𝗜𝗡𝗚⎜ James Webb, 9, watches James Webb telescope blast into space: The child’s science-loving parents met while studying physics at Nottingham Trent University. 𝑳𝑰𝑽𝑬 𝑼𝑷𝑫𝑨𝑻𝑬𝑺",1


**Observations**
1. From reading the `text` column for some of these tweets, these
  - may or may not be retweets
    - we don't want to keep retweets since these aren't candidates for receiving support per our project use-case
  - could be original tweets (`in_reply_to_screen_name` is `None`)
    - we do want to keep original tweets

To get original tweets, we will need to remove both of the following
- retweets of an original tweet
  - don't want such tweets
  - the original tweet (that is being retweeted) may contain a quote (`is_quote_status` will be `True`)
    - remove such tweets using a filter that only picks up tweets for which `is_quote_status` is `False`
  - the original tweet (that is being retweeted) may or may not contain a URL
    - remove such tweets using a filter that only picks up tweets with zero urls in the text
    - unfortunately, some retweets may not contain a URL so those will not be filtered out
      - **this means some retweets will be left in the dataset and these could be retweets of original tweets made by users whose account was deleted**
- replies to a tweet by a user account that was deleted (`in_reply_to_screen_name` is `None`) at the time
  - don't want such tweets
  - such tweets may contain a quote (`is_quote_status` will be `True`)
    - remove such tweets using a filter that only picks up tweets for which `is_quote_status` is `False`
    - unfortunately, some replies may not contain a quote so those will not be filtered out
      - **this means some replies left in the dataset could be replies to users whose account was deleted**

In [17]:
# Conditions a tweet must satisfy to count as a candidate original tweet
original_tweet_conditions = [
    # remove replies
    "(in_reply_to_screen_name == 'None')",
    # remove (a) replies to deleted user accounts and (b) retweets
    "(is_quote_status == 'False')",
    # remove retweets
    "(retweeted_tweet == 'no')",
    # keep only the earliest tweet among those with identical text
    "(order == 1)",
    # remove (a) replies to deleted user accounts and (b) retweets
    "(num_urls_in_tweet_text == 0)",
]
df_originals = df.query(" & ".join(original_tweet_conditions))

# Sanity checks: every remaining text is unique and none contains a link
assert not df_originals.duplicated(subset=["text"]).any()
assert df_originals.query("text.str.contains('http')").empty

originals_caption = f"Found {len(df_originals):,} candidate original tweets"
with pd.option_context("display.max_colwidth", 1000):
    display(df_originals.head(10).style.set_caption(originals_caption))

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,text,order
1659,1480172860932907009,2022-01-09 13:41:24,,2020-07-31 20:39:09,Twitter for Android,,,26,105,0,3218,248,False,False,False,False,no,,EvinSpace,0,25,"hello NASA , what's first image that James Webb telescope will ever take or have NASA decided what's first image is going to be ?",1
4399,1479880884576739332,2022-01-08 18:21:12,,2008-06-06 18:56:59,TweetCaster for Android,,USA,655,640,175,25009,116610,False,False,False,False,no,,rblumel,0,39,"to Host Coverage, Briefing for Webb Telescope’s Final Unfolding Beginning no earlier than 9 a.m. EST, NASA will air live coverage of the final hours of Webb’s major deployments. After the live broadcast concludes, at approximately 1:30 p.m., (1/2)",1
4468,1478360428224909314,2022-01-04 13:39:27,,2021-12-31 10:51:36,Twitter for Android,,,0,4,0,4,1958,False,False,False,False,no,,Evgeniy06930136,0,24,"The James webbs telescooe is like a big update to change the game, this is such a incredible event i love it so much",1
5873,1479943950567890949,2022-01-08 22:31:48,,2022-01-06 12:07:59,Twitter Web App,,United Kingdom,201,419,0,92,114,False,False,False,False,no,,PFYPOLITICS,0,10,engineers complete the unfolding of the James Webb space telescope,1
1625,1480172008654745602,2022-01-09 13:38:01,,2020-07-31 20:39:09,Twitter for Android,,,26,105,0,3217,248,False,False,False,False,no,,EvinSpace,0,24,"hello NASA , what's first image that James Webb telescope will ever take have NASA decided what's first image is going to be ?",1
1328,1477377739812487176,2022-01-01 20:34:36,,2010-06-05 09:28:43,Twitter Web App,,"Covington, GA",1563,2131,319,18821,66263,False,False,False,False,no,,patrickDurusau,0,22,- Have you added NASAWebb to your spell-checker/thesaurus yet? Correct spelling is: HTST (aka NASAWebb) - Expanded form: Harriett Tubman Space Telescope.,1
573,1479832128833662977,2022-01-08 15:07:28,,2008-12-17 15:59:58,Twitter Web App,,downtown,93,370,1,10382,1293,False,False,False,False,no,,celestespace,0,28,"Dad worked on this James Webb thing on & off for years (2001-2009 ish), he's now retired at 87 (?) but so exciting for him and everyone really.",1
3845,1477222956996698120,2022-01-01 10:19:33,,2010-01-06 12:20:32,Twitter for Android,,,33,348,0,24338,13499,False,False,False,False,no,,morsiemc,0,39,GB £37bn = US $50bnWe could have built AND launched FIVE James WebbSpace Telescopes for £37bn !!!How can a 'failed' SmartPhone App cost 5 times morethan the answer:NASA analysts ALL know an Excel spreadsheet doesn't have 65m rows !,1
252,1479825165575938051,2022-01-08 14:39:47,,2021-08-27 16:33:08,Twitter for iPhone,,,0,202,0,18,10,False,False,False,False,no,,AstronomicalAd,0,12,How cold is the primary mirror of the James Webb Space Telescope,1
304,1478408392137920514,2022-01-04 16:50:02,,2012-09-27 23:53:09,Twitter for iPhone,,"Buffalo Grove, IL",136,408,1,3929,1294,False,False,False,False,no,,StargazerBird,0,13,How is the camera inside the Webb Telescope different than the Hubble Telescope?,1


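Since original tweets are not replies, `is_quote_status` is never `True` for them. The other columns checked here (`retweeted_tweet`, `retweeted` and `contributors`) also each contain a single value, as shown below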

In [18]:
# Show the frequency of every distinct value in each of these columns of the
# original tweets, one table per column
single_valued_cols = ["is_quote_status", "retweeted_tweet", "retweeted", "contributors"]
for col_name in single_valued_cols:
    col_value_counts = df_originals[col_name].value_counts()
    display(col_value_counts.to_frame())

Unnamed: 0,is_quote_status
False,1774


Unnamed: 0,retweeted_tweet
no,1774


Unnamed: 0,retweeted
False,1774


Unnamed: 0,contributors
,1774


### Combine Replies and Original Tweets

Combine above
- tweets that are replies
- tweets that are original tweets

and extract features
- `hour`
- `day`
- `weekday`
- `time_of_day` (morning, afternoon, etc.)

needed to help process data based on the type of tweet

In [19]:
# Stack the replies and the original tweets, then derive time-based features
# from the tweet creation timestamp. A single assign() call is enough because
# keyword arguments are evaluated in order, so `time_of_day` can read the
# `hour` column created just before it.
df_may_need_support = pd.concat([df_replies, df_originals]).assign(
    hour=lambda df: df["created_at"].dt.hour,
    day=lambda df: df["created_at"].dt.day,
    weekday=lambda df: df["created_at"].dt.day_name(),
    # bucket the hour using the bin edges (b) and labels (l) that are defined
    # outside this cell
    time_of_day=lambda df: pd.cut(df["hour"], bins=b, labels=l, include_lowest=True),
)

# Preview 10 randomly chosen rows, without the (long) tweet text column
kept_caption = f"Kept {len(df_may_need_support):,} original tweets or replies"
display(
    df_may_need_support.drop(columns=["text"])
    .sample(10)
    .style.set_caption(kept_caption)
)

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,order,hour,day,weekday,time_of_day
109,1480124148080726017,2022-01-09 10:27:50,,2018-06-30 09:36:08,Twitter Web App,,,356,714,1,48014,20670,False,False,False,False,no,CyrusShares,freeiran1919384,0,49,1,10,9,Sunday,Morning
996,1479093880444182528,2022-01-06 14:13:55,,2019-06-04 07:24:51,Twitter Web App,,,111,375,1,861,2567,False,False,False,False,no,,faza__nugraha,0,16,1,14,6,Thursday,Afternoon
466,1479029106440351746,2022-01-06 09:56:32,,2011-05-24 23:09:37,Twitter Web App,,,281,161,4,6911,4297,False,False,False,False,no,,Majere613,0,27,1,9,6,Thursday,Morning
3357,1478794404731174919,2022-01-05 18:23:55,,2020-06-09 16:38:06,Twitter for iPhone,,,44242,778,431,35562,24921,False,False,False,False,no,,spacex360,0,38,1,18,5,Wednesday,Evening
4144,1478834666576629761,2022-01-05 21:03:54,,2015-08-19 15:09:05,Twitter Web App,,"California, USA",51,264,3,11902,3893,False,False,False,False,no,IGN,bruzosa1,0,27,1,21,5,Wednesday,Night
122,1479452471785230342,2022-01-07 13:58:50,,2014-09-07 01:44:54,Twitter Web App,,,48,231,1,7294,6625,False,False,False,False,no,HUBBLE_space,rtbkc,0,26,1,13,7,Friday,Afternoon
5141,1479934259829911556,2022-01-08 21:53:18,,2019-01-15 21:31:58,Twitter for iPhone,,"Texas, USA",268,397,9,2,2106,False,True,False,False,no,,HalRidleyJr,0,56,1,21,8,Saturday,Night
5032,1480271751472951304,2022-01-09 20:14:22,,2009-09-15 23:56:58,Twitter for iPhone,,Israel,160,193,3,72,9423,False,False,True,False,no,gruber,omerlev,0,13,1,20,9,Sunday,Evening
3480,1479913719224311817,2022-01-08 20:31:40,,2009-11-25 20:13:29,Twitter Web App,,,27,119,2,0,4603,False,False,False,False,no,,prosjek,0,19,1,20,8,Saturday,Evening
331,1478023272906301441,2022-01-03 15:19:43,,2007-11-01 16:21:56,Twitter Web App,,"Denton, TX",76,142,0,20822,4779,False,False,False,False,no,,danielmcdonald,0,37,1,15,3,Monday,Afternoon


## Exploratory Data Analysis

Count number of candidate tweets for support by hour of the day

In [20]:
# Number of tweets created in each hour of the day, as a one-column DataFrame
df_tweets_by_hour_of_day = (
    df_may_need_support.groupby(["hour"]).size().to_frame(name="num_tweets")
)

# Report the median hourly count in the table caption
median_tweets_per_hour = df_tweets_by_hour_of_day["num_tweets"].median()
df_tweets_by_hour_of_day.style.set_caption(
    f"Number of tweets by hour of day (median = {median_tweets_per_hour:.3f})"
)

Unnamed: 0_level_0,num_tweets
hour,Unnamed: 1_level_1
0,164
1,137
2,156
3,129
4,167
5,108
6,94
7,64
8,71
9,95


**Observations**
1. The maximum number of hourly tweets (original tweets or replies) is 320.

Count number of candidate tweets for support by time of the day (morning, evening, etc.)

In [21]:
# For each time-of-day bucket, get the first and last hour it covers and the
# number of tweets created in it. Named aggregation produces the flat column
# names (hour_min, hour_max, num_tweets) directly.
df_tweets_by_time_of_day = df_may_need_support.groupby("time_of_day").agg(
    hour_min=("hour", "min"),
    hour_max=("hour", "max"),
    num_tweets=("created_at", "count"),
)

# Report the median count per bucket in the table caption
median_tweets_per_time_of_day = df_tweets_by_time_of_day["num_tweets"].median()
df_tweets_by_time_of_day.style.set_caption(
    f"Number of tweets by time of day (median = {median_tweets_per_time_of_day:.3f})"
)

Unnamed: 0_level_0,hour_min,hour_max,num_tweets
time_of_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Late Night,0,4,753
Early Morning,5,8,337
Morning,9,12,407
Afternoon,13,16,910
Evening,17,20,1091
Night,21,23,633


**Observations**
1. The maximum number of tweets by time of day (original tweets or replies) is approximately 1,100.

Count number of candidate tweets for support by day of the week

In [22]:
# Number of tweets created on each day of the week
tweets_per_weekday = df_may_need_support.groupby(["weekday"]).size()

# Order the rows Monday through Sunday (instead of alphabetically) using the
# weekday names from the calendar module
weekday_order = list(day_name)
df_tweets_by_day_of_week = tweets_per_weekday.loc[weekday_order].to_frame(
    name="num_tweets"
)

# Report the median count per weekday in the table caption
median_tweets_per_weekday = df_tweets_by_day_of_week["num_tweets"].median()
df_tweets_by_day_of_week.style.set_caption(
    f"Number of tweets by day of week (median = {median_tweets_per_weekday:.3f})"
)

Unnamed: 0_level_0,num_tweets
weekday,Unnamed: 1_level_1
Monday,285
Tuesday,485
Wednesday,447
Thursday,355
Friday,503
Saturday,1173
Sunday,883


**Observations**
1. The maximum number of tweets by day of week (original tweets or replies) is approximately 1,200 (1,173), across all Saturdays combined.

## Create Data Splits for ML Model Development

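We'll now create non-randomized (chronologically ordered) training, validation and testing splits from the sampled data, which will be used during NLP model training and evaluation. After sorting by tweet creation time, the most recent `batch_size` tweets form the test split, the `batch_size` tweets immediately before those form the validation split, and all earlier tweets form the training split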

In [23]:
# Sort the tweets chronologically ONCE and slice the sorted result, instead of
# repeating the identical sort for each of the three splits.
df_sorted_by_created_at = df_may_need_support.sort_values(by=["created_at"])

# Chronological (non-randomized) splits, where batch_size is defined outside
# this cell:
# - test: the last batch_size tweets
# - validation: the batch_size tweets immediately before the test split
# - training: all earlier tweets
df_nlp_train, df_nlp_val, df_nlp_test = [
    df_sorted_by_created_at.iloc[: -(2 * batch_size)],
    df_sorted_by_created_at.iloc[-(2 * batch_size) : -batch_size],
    df_sorted_by_created_at.iloc[-batch_size:],
]

Randomize the order of the rows within each of the training, validation and testing splits (rows are not moved between splits)

In [24]:
# Shuffle the rows inside each split (frac=1.0 keeps every row). The fixed
# random_state makes the shuffled order repeatable for a given input.
df_nlp_train, df_nlp_val, df_nlp_test = [
    df_unshuffled_split.sample(frac=1.0, random_state=88)
    for df_unshuffled_split in (df_nlp_train, df_nlp_val, df_nlp_test)
]

Get start and end dates for training, validation and test splits

In [25]:
# Summarize each split in one row: its name, number of rows, and the earliest
# and latest tweet creation timestamps it contains
timestamp_fmt = "%Y-%m-%d %H:%M:%S"
nlp_splits_by_type = {"train": df_nlp_train, "val": df_nlp_val, "test": df_nlp_test}
df_split_dates = pd.DataFrame.from_records(
    [
        {
            "split": split_type,
            "length": len(df_nlp_split),
            "start": df_nlp_split["created_at"].min().strftime(timestamp_fmt),
            "end": df_nlp_split["created_at"].max().strftime(timestamp_fmt),
        }
        for split_type, df_nlp_split in nlp_splits_by_type.items()
    ]
)
df_split_dates

Unnamed: 0,split,length,start,end
0,train,2931,2021-12-30 17:39:11,2022-01-08 15:14:33
1,val,600,2022-01-08 15:15:45,2022-01-09 01:17:04
2,test,600,2022-01-09 01:18:13,2022-01-10 01:29:01


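Tweets might be created at the same timestamp, so duplicated values are possible in the `created_at` column, meaning that the number of unique values in this column can be less than the total number of tweets in the data (including in the sampled data). Tweet IDs are unique for each tweet, so the number of unique values in the `id` column should match the number of rows in the data; fewer unique IDs than rows indicates that the same tweet appears in more than one row (in the output below, the training split has two fewer unique IDs than rows). These counts are shown below for the training split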

In [26]:
%%time
# Compare the number of distinct tweet IDs and distinct creation timestamps
# in the training split against its row count (one line of output per count)
print(
    f"Number of unique IDs in sampled data = {df_nlp_train['id'].nunique():,}",
    f"Number of unique creation datetimes in sampled data = {df_nlp_train['created_at'].nunique():,}",
    f"Number of rows in sampled data = {len(df_nlp_train):,}",
    sep="\n",
)

Number of unique IDs in sampled data = 2,929
Number of unique creation datetimes in sampled data = 2,916
Number of rows in sampled data = 2,931
CPU times: user 1.06 ms, sys: 264 µs, total: 1.32 ms
Wall time: 1.09 ms


## Export All Data Splits to S3 Bucket

Get the start date for making inference with the trained ML model

In [27]:
# Build a filename-friendly version of the inference start date string:
# "-" and ":" are removed and the space is replaced by "_"
filename_char_map = str.maketrans(" ", "_", "-:")
inference_start_date_str = inference_start_date.translate(filename_char_map)
print(inference_start_date_str)

20220110_000000


**Notes**
1. The inference start date is used in file naming as a crude way to version data used in each round of ML model training. This start date is a `datetime` that captures the hour immediately after the `datetime` of the last tweet that was streamed using AWS Kinesis.

Each of the sampled data splits for NLP model development will now be saved to a separate
- zipped `.CSV` file on S3 (or in the local processed data folder, if uploading to S3 is disabled)
- local `.XLSX` file (for use in the next notebook)

In [28]:
%%time
if create_nlp_splits:
    # The destination (S3 or local folder) and the credentials are the same
    # for every split, so they are resolved once, before the loop
    if upload_to_s3:
        # AWS credentials are read from environment variables
        storage_options = {
            "key": os.getenv("AWS_ACCESS_KEY_ID"),
            "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
        }
        # path_to_folder[1:] drops the first character of path_to_folder
        export_dir = f"{path_to_folder[1:]}processed/nlp_splits"
        uri_root = f"s3://{s3_bucket_name}/"
    else:
        storage_options = None
        export_dir = f"{processed_data_dir}"
        uri_root = ""

    nlp_splits_to_export = {
        "train_nlp": df_nlp_train,
        "val_nlp": df_nlp_val,
        "test_nlp": df_nlp_test,
    }
    for split_name, df_split_to_export in nlp_splits_to_export.items():
        # the inference start date in the file name acts as a crude data version
        fname = f"{split_name}__inference_starts_{inference_start_date_str}.csv.zip"
        prefix = f"{export_dir}/{fname}"
        split_filepath = f"{uri_root}{prefix}"

        # CSV export to the chosen destination
        df_split_to_export.to_csv(
            split_filepath,
            index=False,
            storage_options=storage_options,
        )

        # XLSX export, always written to the local processed data folder
        xlsx_fname = fname.replace(".csv.zip", ".xlsx")
        df_split_to_export.to_excel(
            f"{processed_data_dir}/{xlsx_fname}",
            index=False,
        )
        print(f"Exported {len(df_split_to_export):,} rows to {prefix}")

Exported 2,931 rows to datasets/twitter/kinesis-demo/processed/nlp_splits/train_nlp__inference_starts_20220110_000000.csv.zip
Exported 600 rows to datasets/twitter/kinesis-demo/processed/nlp_splits/val_nlp__inference_starts_20220110_000000.csv.zip
Exported 600 rows to datasets/twitter/kinesis-demo/processed/nlp_splits/test_nlp__inference_starts_20220110_000000.csv.zip
CPU times: user 2.43 s, sys: 59.4 ms, total: 2.49 s
Wall time: 3.37 s


## Cleanup

We'll now delete the local
- `.zip` file (containing the individual `.parquet` files of prepared data) that was downloaded from S3
- `.parquet` folder with prepared data that was processed in the previous notebook (`5-process-data/notebooks/5_process_data.ipynb`) and extracted above

In [29]:
%%time
if cleanup_local_files:
    # Recursively remove the local folder of .parquet files, whose path is the
    # first entry of proc_files
    local_parquet_dir = proc_files[0]
    shutil.rmtree(local_parquet_dir)
    print("Deleted local .parquet.gzip files with filtered data.")

    # Remove the local .zip file from the processed data folder
    local_zip_filepath = os.path.join(processed_data_dir, proc_text_zip_fname)
    os.remove(local_zip_filepath)
    print("Deleted local .zip file created from all filtered data files.")

Deleted local .parquet.gzip files with filtered data.
Deleted local .zip file created from all filtered data files.
CPU times: user 2.18 ms, sys: 419 µs, total: 2.59 ms
Wall time: 3.1 ms


---

<span style="float:left;">
    <a href="./5-process-data/notebooks/5_process_data.ipynb"><< 5 - Data Processing</a>
</span>

<span style="float:right;">
    <a href="./7-train/notebooks/7_train.ipynb">7 - Fine-Tuning Pre-Trained Model using Manually Labeled Data >></a>
</span>