# Create Data Splits

In [None]:
%%time
# Print the installed versions of the main packages used here, as reported by
# `pip3 freeze`, and the ipykernel entry of the conda environment named `spark`.
!pip3 freeze | grep -E 'boto3|s3fs|scikit-learn|distributed|dask==|dask-m|black==|jupyter-server|pandas'
!conda list -n spark | grep -E 'ipykernel'

In [None]:
# Load the `lab_black` IPython extension (presumably auto-formats code cells with black; confirm)
%load_ext lab_black

In [None]:
import os
from glob import glob
from datetime import datetime
import zipfile

import boto3
import dask.dataframe as dd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as sk_train_test_split

## About

### Objective
This notebook will split the processed data into training, validation and test splits that can be used to train a machine learning model for twitter sentiment classification.

### ML Model Development
A random sample of the training split will be further divided into three smaller splits in order to support training an NLP (transformers) model to predict sentiment. This NLP model will be used to label the processed tweets with sentiment (i.e. to extract the sentiment), producing the labeled data used during ML model development. The ML model will then be trained using this labeled data and deployed.

### ML Model Usage in Production
In production, the deployed ML model will be used to predict sentiment for 12 hours of incoming tweets. These predictions will be served to customers.

## User Inputs

In [None]:
# Folder inside the S3 bucket; the leading "/" is stripped (`path_to_folder[1:]`)
# wherever an S3 key prefix is built from it
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
# - local directory holding the downloaded zip and the extracted parquet files
processed_data_dir = "data/processed"
# - base name (without extension) of the processed-data zip archive
processed_file_name = "processed_text"
# - columns passed to `dd.read_parquet(columns=...)`; None loads every column.
#   Example of a restricted selection:
# cols_to_load = [
#     "id",
#     "source_text",
#     "created_at",
#     "user_joined",
#     "retweeted_tweet",
#     "contributors",
#     "text",
# ]
cols_to_load = None

# train-test split (used for the NLP sample)
# - fraction of the sampled data held out as the test split
test_split_frac = 0.125
# - fraction of the remaining (non-test) data that equals `test_split_frac` of
#   the whole sample, i.e. the validation share of the train+validation remainder
val_split_frac = test_split_frac / (1 - test_split_frac)

# sampling data
# - target number of tweets drawn from the training split for NLP model development
num_sampled_tweets = 10_000
# - NOTE(review): not referenced by any cell visible in this notebook; confirm it is still needed
sampled_fname = "sampled_data.csv.zip"

# inference schedule records
# - this notebook only supports one record at a time
# - "k" is the inference counter used in exported file names
# - "infer_starts_range" holds the first and last inference start datetimes
inference_datetime_ranges = [
    {"k": 0, "infer_starts_range": ["2022-01-07 12:00:00", "2022-01-09 12:00:00"]}
]

In [None]:
# Bucket name comes from the environment; `os.getenv` returns None when
# AWS_S3_BUCKET_NAME is unset, so export it before running the notebook
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME")
# boto3 session bound to the "default" AWS profile, and an S3 client created from it
session = boto3.Session(profile_name="default")
s3_client = session.client("s3")

# Columns are grouped by the pandas nullable dtype they are cast to after loading.
_string_cols = [
    "id",
    "geo",
    "coordinates",
    "place",
    # flag-like columns kept as strings for now (candidates for pd.BooleanDtype())
    "contributors",
    "is_quote_status",
    "favorited",
    "retweeted",
    "user_protected",
    "user_verified",
    # tweet source and reply metadata
    "source",
    "in_reply_to_user_id",
    "in_reply_to_screen_name",
    "source_text",
    # place attributes
    "place_id",
    "place_url",
    "place_place_type",
    "place_name",
    "place_full_name",
    "place_country_code",
    "place_country",
    "place_bounding_box_type",
    "place_bounding_box_coordinates",
    "place_attributes",
    # coordinates / geo attributes
    "coords_type",
    "coords_lon",
    "coords_lat",
    "geo_type",
    "geo_lon",
    "geo_lat",
    # user attributes
    "user_name",
    "user_screen_name",
    "user_contributors_enabled",
    "user_location",
    # tweet text and entities extracted from it
    "retweeted_tweet",
    "tweet_text_urls",
    "tweet_text_hashtags",
    "tweet_text_usernames",
    "text",
]
_int32_cols = [
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "user_followers",
    "user_friends",
    "user_listed",
    "user_favourites",
    "user_statuses",
    "num_urls_in_tweet_text",
    "num_users_in_tweet_text",
    "num_hashtags_in_tweet_text",
]
_boolean_cols = [
    "contains_wanted_text",
    "contains_wanted_text_case_sensitive",
    "contains_multi_word_wanted_text",
    "contains_crypto_terms",
    "contains_religious_terms",
    "contains_inappropriate_terms",
    "contains_video_games_terms",
    "contains_misc_unwanted_terms",
    "contains_non_english_terms",
]

# Mapping of column name -> dtype
dtypes_dict = {
    **{col: pd.StringDtype() for col in _string_cols},
    **{col: pd.Int32Dtype() for col in _int32_cols},
    **{col: pd.BooleanDtype() for col in _boolean_cols},
}

# File name of the processed-data zip archive (base name plus ".zip")
proc_text_zip_fname = f"{processed_file_name}.zip"

In [None]:
def highlight_cols(df_cols, cols_to_use):
    """Build a frame of CSS strings that highlights a list of columns in yellow.

    Parameters
    ----------
    df_cols : pd.DataFrame
        DataFrame whose columns should be highlighted. It is not modified.
    cols_to_use : list
        Column labels to highlight. A missing label raises ``KeyError``.

    Returns
    -------
    pd.DataFrame
        Frame with the index of ``df_cols`` and the columns ``cols_to_use``,
        where every cell holds ``"background-color: yellow"`` (presumably for
        use with the pandas Styler; confirm against the caller).
    """
    # select first so that unknown column labels still raise KeyError
    selected = df_cols[cols_to_use]
    # Build the style frame directly instead of writing strings into a typed
    # copy of the data: that assignment relies on dtype upcasting, which newer
    # pandas versions deprecate for numeric/boolean columns.
    return pd.DataFrame(
        "background-color: yellow",
        index=selected.index,
        columns=selected.columns,
    )


def download_file_from_s3(
    s3_bucket_name: str,
    path_to_folder: str,
    data_dir: str,
    fname: str,
    aws_region: str,
    prefix: str,
) -> None:
    """Download the first S3 object matching a key prefix to a local file.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the S3 bucket to search and download from.
    path_to_folder : str
        Not used by this function; kept so existing callers keep working.
    data_dir : str
        Local directory to download into. It is created if it does not exist.
    fname : str
        Name of the local file written inside ``data_dir``.
    aws_region : str
        AWS region used to create the S3 resource that performs the download.
    prefix : str
        Key prefix used to list candidate objects; the first listed key is
        the one downloaded.

    Raises
    ------
    FileNotFoundError
        If no object in the bucket matches ``prefix``.

    Notes
    -----
    The listing uses the notebook-level ``s3_client``.
    """
    dest_filepath = os.path.join(data_dir, fname)
    # the response has no "Contents" entry when nothing matches the prefix
    matching_objects = s3_client.list_objects_v2(
        Bucket=s3_bucket_name,
        Delimiter="/",
        Prefix=prefix,
    ).get("Contents", [])
    if not matching_objects:
        raise FileNotFoundError(
            f"No objects found in bucket {s3_bucket_name} with prefix {prefix}"
        )
    s3_filepath_key = matching_objects[0]["Key"]
    # the download cannot create missing parent directories on its own
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    start = datetime.now()
    print(
        f"Started downloading processed data zip file from {s3_filepath_key} to "
        f"{dest_filepath} at {start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    s3 = boto3.resource("s3", region_name=aws_region)
    s3.meta.client.download_file(
        s3_bucket_name,
        s3_filepath_key,
        dest_filepath,
    )
    duration = (datetime.now() - start).total_seconds()
    print(f"Done downloading in {duration:.3f} seconds.")


def extract_zip_file(dest_filepath: str, data_dir: str) -> None:
    """Extract every member of a zip archive into a directory.

    Parameters
    ----------
    dest_filepath : str
        Path to the zip archive to read.
    data_dir : str
        Directory the archive members are extracted into.
    """
    start = datetime.now()
    print(
        "Started extracting filtered data parquet files from "
        f"processed data zip file to {data_dir} at "
        f"{start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
    )
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(dest_filepath) as zip_ref:
        zip_ref.extractall(data_dir)
    duration = (datetime.now() - start).total_seconds()
    print(f"Done extracting in {duration:.3f} seconds.")

## Get Data

We will start by downloading the processed and filtered `.zip` file from S3 and extracting all the contained `.parquet` files into a `.parquet.gzip` folder

In [None]:
%%time
# Local paths of the zip archive and of the folder its parquet files are extracted into
proc_zip_filepath = os.path.join(processed_data_dir, proc_text_zip_fname)
proc_parquet_dir = (
    f"{processed_data_dir}/{os.path.splitext(proc_text_zip_fname)[0]}.parquet.gzip"
)

# Download only when the archive is not already available locally
zip_was_downloaded = not os.path.exists(proc_zip_filepath)
if zip_was_downloaded:
    download_file_from_s3(
        s3_bucket_name,
        path_to_folder,
        processed_data_dir,
        proc_text_zip_fname,
        session.region_name,
        f"{path_to_folder[1:]}processed/{os.path.splitext(proc_text_zip_fname)[0]}",
    )
# Extract after a fresh download, and also when the archive exists but was
# never extracted (e.g. an earlier run was interrupted); otherwise no parquet
# folder would be found below.
if zip_was_downloaded or not os.path.exists(proc_parquet_dir):
    extract_zip_file(proc_zip_filepath, proc_parquet_dir)
proc_files = glob(f"{processed_data_dir}/*.parquet.gzip")

Use Dask to load the `.parquet.gzip` folder (consisting of multiple `.parquet` files) into a single Dask DataFrame

In [None]:
%%time
# Only cast the columns that are actually loaded: `astype` raises a KeyError
# for mapping keys that are not columns, which would happen as soon as
# `cols_to_load` restricts the selection.
dtypes_to_apply = (
    dtypes_dict
    if cols_to_load is None
    else {col: dtype for col, dtype in dtypes_dict.items() if col in cols_to_load}
)
ddf = dd.read_parquet(proc_files, columns=cols_to_load).astype(dtypes_to_apply)
# Show the first rows without truncating long text, then the resulting dtypes
with pd.option_context("display.max_colwidth", None):
    display(ddf.head())
display(ddf.dtypes.rename("dtype").to_frame())

### Create Data Splits for ML Model Development

Get starting (inclusive) and ending (exclusive) `datetime`s for training, validation, testing and inference

In [None]:
def get_split_length(split_start, split_end):
    """Count the rows of ``ddf`` whose ``created_at`` is in ``[split_start, split_end)``.

    The start bound is inclusive and the end bound is exclusive. The
    notebook-level ``ddf`` is read directly.
    """
    created_at = ddf["created_at"]
    in_split = (created_at >= split_start) & (created_at < split_end)
    return len(ddf[in_split])

In [None]:
%%time
# Column order of the schedule table built below
cols_order = [
    'model_training_num',
    'train_starts',
    'train_ends',
    'val_starts',
    'val_ends',
    'test_starts',
    'test_ends',
    'infer_num',
    'infer_starts',
    'infer_ends',
    'inference_time',
    'train_len',
    'val_len',
    'test_len',
    'infer_len',
    'total_len',
    'train_frac',
    'val_frac',
    'test_frac',
    'infer_frac',
]
# Every validation, test and inference window is 12 hours long. A Timedelta is
# used instead of the "12H" / unit="H" aliases, which newer pandas deprecates.
split_window = pd.Timedelta(hours=12)
dfs_rand_dates = []
# NOTE(review): `k` is the position of the record in the list, not the record's
# own "k" value; they coincide for the single supported record.
for k, infer_record in enumerate(inference_datetime_ranges):
    v = infer_record['infer_starts_range']
    df_rand_dates = (
        # one row per inference window, starting every `split_window`
        pd.Series(
            pd.date_range(
                v[0],  # e.g. "2022-01-07 04:00:00" or "2022-01-08 00:00:00"
                v[1],
                freq=split_window,
                name="infer_starts",
            )
        ).to_frame()
        .assign(infer_ends=lambda df: df['infer_starts'] + split_window)
        # test ends where the first inference window starts; validation and
        # training are laid out backwards in time from there
        .assign(test_ends=lambda df: df['infer_starts'].min())
        .assign(test_starts=lambda df: df['test_ends'] - split_window)
        .assign(val_ends=lambda df: df['test_starts'])
        .assign(val_starts=lambda df: df['val_ends'] - split_window)
        .assign(train_ends=lambda df: df['val_starts'])
        .assign(train_starts=datetime(2021, 12, 30, 17, 0, 0))
        .assign(inference_time=lambda df: df['infer_ends'])
        .assign(infer_num=lambda df: range(len(df)))
        .assign(model_training_num=k)
        # train/val/test bounds are identical on every row, so count once from the first row
        .assign(train_len=lambda df: get_split_length(df.iloc[0]["train_starts"], df.iloc[0]["train_ends"]))
        .assign(val_len=lambda df: get_split_length(df.iloc[0]["val_starts"], df.iloc[0]["val_ends"]))
        .assign(test_len=lambda df: get_split_length(df.iloc[0]["test_starts"], df.iloc[0]["test_ends"]))
        # Plain loop instead of np.vectorize: without `otypes`, np.vectorize
        # calls the function one extra time on the first element (an extra
        # pass over the Dask data) and fails on empty input.
        .assign(
            infer_len=lambda df: [
                get_split_length(window_start, window_end)
                for window_start, window_end in zip(df["infer_starts"], df["infer_ends"])
            ]
        )
        .assign(total_len=len(ddf))
        .assign(train_frac=lambda df: df['train_len']/df['total_len'])
        .assign(val_frac=lambda df: df['val_len']/df['total_len'])
        .assign(test_frac=lambda df: df['test_len']/df['total_len'])
        .assign(infer_frac=lambda df: df['infer_len']/df['total_len'])
        [cols_order]
    )
    dfs_rand_dates.append(df_rand_dates)
df_rand_dates = pd.concat(dfs_rand_dates)
df_rand_dates

## Split Data and Create Sample for Training NLP Labeling (Transformer) Model

### Create Data Splits for ML Model Development

In [None]:
%%time
# Read the training/validation/test boundaries from the first row of the schedule table
(
    train_starts,
    train_ends,
    val_starts,
    val_ends,
    test_starts,
    test_ends,
) = (
    df_rand_dates.iloc[0][bound_name]
    for bound_name in (
        "train_starts",
        "train_ends",
        "val_starts",
        "val_ends",
        "test_starts",
        "test_ends",
    )
)

In [None]:
# Select each split by its [start, end) window on `created_at`, then shuffle
# the rows. The shuffle is seeded (same seed as the other random steps in
# this notebook) so that the splits, and the sample later drawn from the
# training split, are reproducible across runs.
df_train, df_val, df_test = [
    ddf[(ddf["created_at"] >= split_starts) & (ddf["created_at"] < split_ends)].sample(
        frac=1.0, random_state=88
    )
    for split_starts, split_ends in [
        (train_starts, train_ends),
        (val_starts, val_ends),
        (test_starts, test_ends),
    ]
]

Select the data for each inference period

In [None]:
%%time
# Map each row label of the schedule table to the (lazy) frame of tweets
# created inside that row's [infer_starts, infer_ends) window
df_inferences = {}
for infer_key, infer_window in df_rand_dates.iterrows():
    created_at = ddf["created_at"]
    in_window = (created_at >= infer_window["infer_starts"]) & (
        created_at < infer_window["infer_ends"]
    )
    df_inferences[infer_key] = ddf[in_window]

### Create Data Splits from Training Split, for NLP Model Development (to assign sentiment labels)

We will now draw a random sample from the training split to use in NLP (transformer) model fine-tuning in order to label the tweets with a sentiment (i.e. in order to extract the sentiment of the text in the tweets).

First, we'll define the fraction of the training split to be used in NLP model fine-tuning

In [None]:
%%time
# Fraction (not a row count) of the training split to sample. It is capped at
# 1.0 so that a training split smaller than `num_sampled_tweets` is used in
# full instead of requesting an invalid fraction above 1.
nlp_sample_size = min(1.0, num_sampled_tweets / len(df_train))

Next, we'll extract a sample of the training data corresponding to this fraction

In [None]:
%%time
# Draw the seeded random sample and bring it into memory as a pandas DataFrame
df_train_sample = df_train.sample(frac=nlp_sample_size, random_state=88).compute()
# Order the in-memory sample by tweet creation time
df_train_sample = df_train_sample.sort_values(by=["created_at"])

**Notes**
1. The sample is small enough that it fits in memory and so we don't need to use a big data framework to hold its contents. So, we call `.compute()` to bring this sample into memory and we can use in-memory tools (below) for creating data splits from this sample.
2. Before creating the data splits for ML model development, the data was sorted by the `datetime` at which the tweet was posted (i.e. sorted by the `created_at` column). In order to create the splits for NLP model development in a way that is consistent with how the data splits were created for ML model development, after drawing the random sample, we sort the sampled data by the same `created_at` column before the random splits are created next.

Tweets might be created at the same timestamp and so duplicated values are possible in this column, meaning that the number of unique values in this column will be less than the total number of tweets in the data (including in the sampled data). Tweet IDs are unique for each tweet so the number of unique values in the `id` column will match the number of tweets in the data. These are shown below

In [None]:
%%time
# Compare unique tweet IDs and unique creation datetimes with the row count of the sample
num_unique_ids = df_train_sample["id"].nunique()
num_unique_datetimes = df_train_sample["created_at"].nunique()
num_sampled_rows = len(df_train_sample)
print(
    f"Number of unique IDs in sampled data = {num_unique_ids:,}\n"
    f"Number of unique creation datetimes in sampled data = {num_unique_datetimes:,}\n"
    f"Number of rows in sampled data = {num_sampled_rows:,}"
)

We'll now create random training, validation and testing splits from the sampled data, which will be used during NLP model training and evaluation

In [None]:
# First hold out `test_split_frac` of the sample as the NLP test split
df_nlp_train_val, df_nlp_test = sk_train_test_split(
    df_train_sample, test_size=test_split_frac, random_state=88
)
# Then split the remainder with `val_split_frac`, which is expressed relative
# to the remainder, so that validation ends up the same share of the whole
# sample as the test split. Reusing `test_split_frac` here would make
# validation smaller than test.
df_nlp_train, df_nlp_val = sk_train_test_split(
    df_nlp_train_val, test_size=val_split_frac, random_state=88
)

## Export All Data Splits to S3 Bucket

Get the counter for the two-day inference period

In [None]:
# Counter ("k") of the single supported inference record; the bare name on the
# last line displays it as the cell output
infer_idx = inference_datetime_ranges[0]["k"]
infer_idx

**Notes**
1. The counter indicates which occurrence of two-day inference has been made by a deployed ML model.
2. The counter is used in file naming as a crude way to version data.

All data splits for ML model development will now each be saved to a separate `.parquet` file

In [None]:
%%time
# The end of the test split (formatted) is used to version the file names
test_end = df_rand_dates.loc[0, "test_ends"].strftime("%Y%m%d_%H%M%S")
for split_name, df_split_to_export in zip(
    ["train", "val", "test"], [df_train, df_val, df_test]
):
    fname = f"{split_name}__inference2d_{infer_idx}__{test_end}.parquet.gzip"
    split_filepath = f"s3://{s3_bucket_name}{path_to_folder}processed/splits/{fname}"
    # NOTE(review): the export itself is currently disabled, so only the
    # message below is produced. The template writes gzip-compressed parquet to
    # match the `.parquet.gzip` file name (Dask uses `write_index`, not `index`).
    # df_split_to_export.to_parquet(
    #     split_filepath,
    #     write_index=False,
    #     compression="gzip",
    #     storage_options={
    #         "key": session.get_credentials().access_key,
    #         "secret": session.get_credentials().secret_key,
    #     },
    # )
    dest_path_str = f"{path_to_folder[1:]}processed/splits/{fname}"
    # message wording matches the inference export cell ("... rows to ... on S3")
    print(f"Exported {len(df_split_to_export):,} rows to {dest_path_str} on S3")

All sampled data splits for NLP model development will now each be saved to a separate `.csv` file

In [None]:
%%time
# Latest tweet creation time in the training split (formatted) versions the file names
train_end = df_train["created_at"].max().compute().strftime("%Y%m%d_%H%M%S")
nlp_splits_to_export = {
    "train_nlp": df_nlp_train,
    "val_nlp": df_nlp_val,
    "test_nlp": df_nlp_test,
}
for split_name, df_split_to_export in nlp_splits_to_export.items():
    fname = f"{split_name}__inference2d_{infer_idx}__{train_end}.csv.zip"
    split_filepath = f"s3://{s3_bucket_name}{path_to_folder}processed/nlp_splits/{fname}"
    # NOTE(review): the export itself is currently disabled, so only the
    # message below is produced
    # df_split_to_export.to_csv(
    #     split_filepath,
    #     index=False,
    #     storage_options={
    #         "key": session.get_credentials().access_key,
    #         "secret": session.get_credentials().secret_key,
    #     },
    # )
    dest_fpath = f"{path_to_folder[1:]}processed/nlp_splits/{fname}"
    print(f"Exported {len(df_split_to_export):,} to {dest_fpath} on S3")

All the data for inference during ML model development will now be saved to a separate `.parquet` file

In [None]:
%%time
# The loop variable is named `infer_num` so that it does not overwrite the
# notebook-level `infer_idx` counter, which the split-export cells use to name
# their files (re-running them afterwards would otherwise change the names).
for infer_num, df_infer in df_inferences.items():
    infer_start, infer_end = [
        df_rand_dates.loc[infer_num, "infer_starts"].strftime("%Y%m%d_%H%M%S"),
        df_rand_dates.loc[infer_num, "infer_ends"].strftime("%Y%m%d_%H%M%S"),
    ]
    fname = f"inference_{infer_num}__{infer_start}__{infer_end}.parquet.gzip"
    infer_filepath = f"s3://{s3_bucket_name}{path_to_folder}processed/inference/{fname}"
    # NOTE(review): the export itself is currently disabled, so only the
    # message below is produced (Dask uses `write_index`, not `index`).
    # df_infer.to_parquet(
    #     infer_filepath,
    #     write_index=False,
    #     compression='gzip',
    #     storage_options={
    #         "key": session.get_credentials().access_key,
    #         "secret": session.get_credentials().secret_key,
    #     },
    # )
    dest_path_str = f"{path_to_folder[1:]}processed/inference/{fname}"
    print(f"Exported {len(df_infer):,} rows to {dest_path_str} on S3")