# NLP Model Fine-Tuning

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from calendar import day_name
from glob import glob
from datetime import datetime

import boto3
import dask.dataframe as dd
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from torch import nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)

In [3]:
# Project root is the parent of the current working directory; the helper
# modules imported by this notebook live in its `src` sub-folder.
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
# Only add the folder once, so re-running this cell does not keep growing sys.path
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
%aimport metrics_utils
from metrics_utils import calculate_metrics

%aimport model_utils
from model_utils import (
    compute_metrics,
    get_metrics,
    make_predictions,
    tokenize_function,
)

%aimport pandas_utils
from pandas_utils import save_to_parquet

%aimport s3_utils
from s3_utils import download_files_from_s3, extract_zip_file

In [5]:
# Fix the random seed (via transformers' `set_seed`) so that runs are repeatable
set_seed(42)

## About

### Objective
This notebook [fine-tunes a pre-trained](https://huggingface.co/docs/transformers/training#train-in-native-pytorch) transformers model ([1](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased), [2](https://arxiv.org/pdf/2002.10957.pdf#page=5)) using the [PyTorch](https://pytorch.org/) deep learning framework.

### Data
The data used for fine-tuning consists of the three data splits
- (training) `train_nlp_inference_starts_*.xlsx` (approximately 2,900 tweets, for initial model training)
- (validation, for model scoring per [epoch](https://deepai.org/machine-learning-glossary-and-terms/epoch)) `val_nlp_inference_starts_*.xlsx` (600 tweets)
- (testing) `test_nlp_inference_starts_*.xlsx` (600 tweets)

that were
- created in `6-split-data/notebooks/6_split_data.ipynb`
- manually labeled by reading the tweets to identify the sentiment
  - 0 - negative
  - 1 - neutral, or
  - 2 - positive

  of each tweet

As a reminder of the context outlined in the project scope, tweets labeled with a
- negative or neutral sentiment
  - need support from (must be reviewed by) the mission support team
- positive sentiment
  - do not need support from the mission support team

Model
- fine-tuning is performed using the training and validation splits
- evaluation is performed using the training and testing splits using ML and business metrics
  - for assessing the business metrics, a comparison is made between the metrics calculated using
    - the fine-tuned ML model
    - a naive (non-ML) model

  and the fine-tuned model must have
  - F2-score greater than 0.8
  - superior (higher) evaluation metrics than those using a naive (non-ML) approach

### Outputs
1. `test_nlp__inference_starts_xxxx__batch_n__with_preds.parquet.gzip`
   - predictions of the sentiment in the *testing* split will be appended to the test split and then exported to a `.parquet` file
   - this file will be used to monitor the relationship between metadata and both the fine-tuned ML and naive (non-ML) predictions for one or multiple testing splits
2. `metrics__inference_starts_xxxx__batch_n`
   - summary of ML evaluation and business metrics for the *testing* split
   - this file is used to validate the analysis

## User Inputs

In [6]:
# Folder (with a leading slash) under which the project files are stored; the
# leading slash is stripped when the S3 download prefix is built
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
processed_data_dir = "../data/processed"

# Binary class name -> integer class id used by the fine-tuned model
label_mapper = {"does_not_need_support": 0, "needs_support": 1}

# Raw sentiment labels (0 = negative, 1 = neutral) that count as "needs support"
needs_support_labels = [0, 1]

# Pre-trained checkpoint to fine-tune
checkpoint_pretrained = "microsoft/MiniLM-L12-H384-uncased"

# Folder in which the trainer writes its checkpoints
model_output_dir = "../model-fine-tuned"

# Metadata - feature engineering
# Hour-of-day bin edges (b) and the matching time-of-day labels (l)
b = [0, 4, 8, 12, 16, 20, 24]
l = ["Late Night", "Early Morning", "Morning", "Afternoon", "Evening", "Night"]
# Bin edges (width of 5 words) for the number of words in a tweet
num_words_bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
# One label per bin, derived from the edges so that labels and bins cannot drift
# apart ("0-5", "6-10", ..., "56-60"); the hand-written list had "20-25" instead
# of "21-25"
num_words_labels = [
    f"{lower + 1 if lower else lower}-{upper}"
    for lower, upper in zip(num_words_bins[:-1], num_words_bins[1:])
]

# Model evaluation
# Names of the prediction-probability statistics to report
wanted_pred_proba_stats = [
    "count",
    "min",
    "mean",
    "coeff_of_var",
    "std_error",
    "num_samples",
]
# Words per minute assumed when estimating time spent reading / replying
avg_reading_speed_wpm = 135
avg_typing_speed_wpm = 40
frac_support_tweets_needing_response = 1.0

upload_to_s3 = True

In [7]:
# Name of the S3 bucket (empty string when the environment variable is unset)
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

# Prefer the locally configured "default" AWS profile; when that profile does
# not exist, fall back to the region provided through the environment.
try:
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    if str(e) != "The config profile (default) could not be found":
        # Unexpected failure: re-raise instead of swallowing it, otherwise
        # `s3_client` and `aws_region` stay undefined and a confusing
        # NameError is raised by a later cell
        raise
    aws_region = os.getenv("AWS_REGION")
    s3_client = boto3.client("s3", region_name=aws_region)
    print("Retrieved AWS credentials from .env file")

# Column name -> pandas (nullable) dtype used when reading the annotated splits.
# Flag-like columns are read as strings; pd.BooleanDtype() is noted as the
# alternative.
dtypes_dict = dict(
    id=pd.StringDtype(),
    contributors=pd.StringDtype(),  # pd.BooleanDtype(),
    source_text=pd.StringDtype(),
    place_country=pd.StringDtype(),
    user_location=pd.StringDtype(),
    user_followers=pd.Int32Dtype(),
    user_friends=pd.Int32Dtype(),
    user_listed=pd.Int32Dtype(),
    user_favourites=pd.Int32Dtype(),
    user_statuses=pd.Int32Dtype(),
    user_protected=pd.StringDtype(),  # pd.BooleanDtype(),
    user_verified=pd.StringDtype(),  # pd.BooleanDtype(),
    is_quote_status=pd.StringDtype(),  # pd.BooleanDtype(),
    retweeted=pd.StringDtype(),  # pd.BooleanDtype(),
    retweeted_tweet=pd.StringDtype(),
    in_reply_to_screen_name=pd.StringDtype(),
    user_screen_name=pd.StringDtype(),
    num_urls_in_tweet_text=pd.Int32Dtype(),
    num_words=pd.Int32Dtype(),
    text=pd.StringDtype(),
    sentiment=pd.Int32Dtype(),
    order=pd.Int32Dtype(),
    hour=pd.Int32Dtype(),
    day=pd.Int32Dtype(),
    weekday=pd.StringDtype(),
    time_of_day=pd.StringDtype(),
    batch_num=pd.Int32Dtype(),
)
# Dtypes of the prediction and engineered-feature columns of the test split
test_feats_dtypes_test_dict = dict(
    pred=pd.Int32Dtype(),
    created_at_hour=pd.Int32Dtype(),
    created_at_day=pd.StringDtype(),
    user_joined_hour=pd.Int32Dtype(),
    user_joined_day=pd.StringDtype(),
    split=pd.StringDtype(),
    created_at_time_of_day=pd.StringDtype(),
    reading_time=pd.Float32Dtype(),
    replying_time=pd.Float32Dtype(),
    response_time=pd.Float32Dtype(),
)
# Dtypes of the integer-valued columns of the metrics summary
metrics_dtypes_dict = {
    "batch_num": pd.Int32Dtype(),
    "num_tweets_missed": pd.Int32Dtype(),
    "num_tweets_unnecessarily_read": pd.Int32Dtype(),
    "total_number_tweets": pd.Int32Dtype(),
    "num_needs_support": pd.Int32Dtype(),
}

# Metadata - Feature Engineering
# Country name -> list of sub-strings that identify that country.
# The US entry starts with the two-letter state codes (plus DC), each prefixed
# by a space, followed by a few free-text spellings.
mdict = {
    "USA": [
        f" {state_code}"
        for state_code in (
            "AL AK AZ AR CA CO CT DC DE FL GA HI ID IL IN IA KS KY LA ME MD "
            "MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD "
            "TN TX UT VT VA WA WV WI WY"
        ).split()
    ]
    + ["USA", " USA", "United States", "Los Angeles", "Los Angeles ", "Texas"],
    "UK": ["United Kingdom", " United Kingdom", " England", "London", "UK"],
    "India": [" India", "India"],
    "Australia": ["Australia"],
    "Canada": [" Ontario", "Canada"],
    "Philippines": ["Republic of the Philippines", "Philippines"],
    "Indonesia": ["Indonesia"],
    "France": ["France", " France"],
    "Germany": ["Germany", "Deutschland"],
    "Kenya": [" Kenya"],
}

# Reverse lookup of `label_mapper`: integer class id -> class name
id2label = dict(zip(label_mapper.values(), label_mapper.keys()))
id2label

Retrieved AWS credentials from .env file


{0: 'does_not_need_support', 1: 'needs_support'}

## Get Annotated Data Splits

In [8]:
%%time
# Download the .xlsx data splits from S3 into the local processed-data folder
download_files_from_s3(
    s3_client,
    s3_bucket_name,
    processed_data_dir,
    aws_region,
    f"{path_to_folder[1:]}processed/nlp_splits/",
    ".xlsx",
)
# Alphabetical order (test, train, val) is relied upon when the files are loaded
proc_files = sorted(glob(f"{processed_data_dir}/*_nlp_*.xlsx"))
# Match on the file name only, so that a "test_" sub-string elsewhere in the
# path cannot select the wrong file
test_files = [f for f in proc_files if os.path.basename(f).startswith("test_")]
if not test_files:
    raise FileNotFoundError(
        f"No test split matching 'test_*_nlp_*.xlsx' found in {processed_data_dir}"
    )
proc_file_test = test_files[0]
print(proc_file_test)
proc_files

File found at ../data/processed/test_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/train_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/val_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
../data/processed/test_nlp__inference_starts_20220110_000000.xlsx
CPU times: user 16 ms, sys: 199 µs, total: 16.2 ms
Wall time: 182 ms


['../data/processed/test_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/train_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/val_nlp__inference_starts_20220110_000000.xlsx']

## Load and Process Data

Perform the following
- rename the class labels column from `sentiment` to `labels`
- remove retweets (tweets starting with *RT*)
- map the `labels` column (sentiment) to indicate a tweet
  - needing support (neutral or negative sentiment)
  - not needing support (positive sentiment)
- text processing to
  - remove leading and trailing spaces
  - replace HTML entities (`&gt;`, `&lt;`, `&amp;`) by `>`, `<` or `&`, as appropriate
- add a column with a binned version of the number of words in the tweet
  - bin width was chosen as 5 words (eg. 0-5, 6-10, etc.)

In [9]:
%%time
# Load and clean every split.
# NOTE: the unpacking below relies on `proc_files` being sorted alphabetically,
# i.e. on the files arriving in the order test, train, val.
df_test, df_train, df_val = [
    (
        pd.read_excel(
            f,
            dtype=dtypes_dict,
            usecols=list(dtypes_dict) + ["created_at", "user_joined"],
        )
        .rename(columns={"sentiment": "labels"})
        # remove retweets
        .query("~text.str.startswith('RT')")
        .assign(split=st)
        # Binary target: 1 = needs support (sentiment in `needs_support_labels`),
        # 0 = does not need support. `isin` returns False for missing values,
        # which would silently turn unlabeled tweets into class 0; `.where`
        # restores the missing labels so they can be dropped afterwards.
        .assign(
            labels=lambda df: (
                df["labels"]
                .isin(needs_support_labels)
                .astype(pd.Int32Dtype())
                .where(df["labels"].notna())
            )
        )
        # Trim surrounding whitespace and decode the HTML entities literally
        # (regex=False: the patterns are plain text, not regular expressions)
        .assign(
            text=lambda df: (
                df["text"]
                .str.strip()
                .str.replace("&gt;", ">", regex=False)
                .str.replace("&lt;", "<", regex=False)
                .str.replace("&amp;", "&", regex=False)
            )
        )
        # Binned version of the number of words in each tweet
        .assign(
            bin_name=lambda df: pd.cut(
                df["num_words"], bins=num_words_bins, labels=num_words_labels
            ).astype(pd.StringDtype())
        )
    )
    for f, st in zip(proc_files, ["test", "train", "val"])
]

CPU times: user 2.53 s, sys: 13.3 ms, total: 2.54 s
Wall time: 2.64 s


Drop any tweets which were not manually labeled with a sentiment. Since re-training and manual labeling are only performed after every five batches of new data arrive, the test split will contain data that is missing labels. Such rows cannot be used in either of
- re-training
- model evaluation

so these rows must be dropped

In [10]:
# Discard test rows whose label is missing
df_test = df_test.dropna(subset=["labels"])

Get split sizes

In [11]:
# Number of rows per split before de-duplication, as a one-row frame tagged "raw"
split_sizes = [
    {
        split_name: len(split_frame)
        for split_name, split_frame in (
            ("train", df_train),
            ("val", df_val),
            ("test", df_test),
        )
    }
]
df_split_sizes = pd.DataFrame.from_records(split_sizes).assign(type="raw")

Get the start and end date of the raw data in each split

In [12]:
# First and last `created_at` timestamp of each split, formatted as text
split_date_records = []
for split_type, df_split in (("train", df_train), ("val", df_val), ("test", df_test)):
    created_at = df_split["created_at"]
    split_date_records.append(
        {
            "split": split_type,
            "start": created_at.min().strftime("%Y-%m-%d %H:%M:%S"),
            "end": created_at.max().strftime("%Y-%m-%d %H:%M:%S"),
        }
    )
df_split_dates = pd.DataFrame.from_records(split_date_records)
df_split_dates

Unnamed: 0,split,start,end
0,train,2021-12-30 17:39:11,2022-01-08 15:14:33
1,val,2022-01-08 15:15:45,2022-01-09 01:17:04
2,test,2022-01-09 01:18:13,2022-01-10 01:29:01


**Observations**
1. The training split precedes the validation split which, in turn, precedes the testing split. This is expected, since the splits were separated based on `datetime` in order to replicate the arrival of new (unseen) data during inference. See `README.md` for further details.

Perform sanity checks to verify the expected time-ordering of the splits

In [13]:
# Boundary timestamps of the three splits
train_end, val_start, val_end, test_start = (
    df_train["created_at"].max(),
    df_val["created_at"].min(),
    df_val["created_at"].max(),
    df_test["created_at"].min(),
)
# The splits must be strictly ordered in time: train, then val, then test
assert train_end < val_start
assert val_start < val_end
assert val_end < test_start

(If not initial training run) Get most current test split and
- append validation split to training split
- use the most recent non-current test split as the validation split
- use the most recent test split as the current test split

In [14]:
# Distinct batch numbers present in the test split, as a plain Python list
batch_nums = df_test["batch_num"].unique().tolist()
batch_nums

[1]

In [15]:
# `batch_nums` comes from `.unique()`, which keeps values in order of
# appearance; sort (ignoring missing values) so that "last" and "second last"
# really are the most recent batches.
ordered_batch_nums = sorted(n for n in batch_nums if not pd.isna(n))

if len(ordered_batch_nums) > 1:
    # all but the second last and last batch numbers from the test split
    # (to use in the training split)
    training_batch_nums = ordered_batch_nums[:-2]
    # second last batch number from the test split (to use as validation split)
    val_batch_num = ordered_batch_nums[-2]
    # last batch number from the test split (to use as current test split)
    test_current_batch_num = ordered_batch_nums[-1]

    # Slice raw data splits based on the batch numbers defined above
    df_train = pd.concat(
        [df_train, df_val, df_test.query("batch_num.isin(@training_batch_nums)")]
    )
    df_val = df_test.query(f"batch_num == {val_batch_num}")
    df_test = df_test.query(f"batch_num == {test_current_batch_num}")
else:
    # Initial training run: a single batch, which stays the test split
    test_current_batch_num = df_test["batch_num"].max()

Drop duplicates in the
- training split
  - this will improve (reduce) fine-tuning time

In [16]:
# Keep a single row per distinct tweet text in the training split
df_train = df_train.drop_duplicates(subset=["text"])

Get the new split sizes

In [17]:
# Number of rows per split after de-duplicating the training split
split_sizes_no_dups = [
    {
        split_name: len(split_frame)
        for split_name, split_frame in (
            ("train", df_train),
            ("val", df_val),
            ("test", df_test),
        )
    }
]
df_split_sizes_no_dups = pd.DataFrame.from_records(split_sizes_no_dups)
df_split_sizes_no_dups = df_split_sizes_no_dups.assign(type="without-duplicates")

Show split sizes before and after dropping duplicates

In [18]:
# Stack the "raw" and "without-duplicates" summaries for a side-by-side view
size_summaries = [df_split_sizes, df_split_sizes_no_dups]
df_split_sizes_comp = pd.concat(size_summaries, ignore_index=True)
df_split_sizes_comp

Unnamed: 0,train,val,test,type
0,2931,600,600,raw
1,2775,600,600,without-duplicates


Get the start and end date of each split after combining training, validation and test splits

In [19]:
# First and last `created_at` timestamp of each split after re-slicing
named_splits = zip(["train", "val", "test"], [df_train, df_val, df_test])
date_rows = []
for split_type, df_split in named_splits:
    first_ts = df_split["created_at"].min()
    last_ts = df_split["created_at"].max()
    date_rows.append(
        {
            "split": split_type,
            "start": first_ts.strftime("%Y-%m-%d %H:%M:%S"),
            "end": last_ts.strftime("%Y-%m-%d %H:%M:%S"),
        }
    )
df_split_dates = pd.DataFrame.from_records(date_rows)
df_split_dates

Unnamed: 0,split,start,end
0,train,2021-12-30 17:39:11,2022-01-08 15:14:33
1,val,2022-01-08 15:15:45,2022-01-09 01:17:04
2,test,2022-01-09 01:18:13,2022-01-10 01:29:01


**Notes**
1. The time-dependence seen in the raw splits should still be preserved after combining splits.

Perform sanity checks to verify the expected time-ordering of the splits

In [20]:
# Re-compute the split boundaries after the splits were combined / re-sliced
train_created, val_created, test_created = (
    df_train["created_at"],
    df_val["created_at"],
    df_test["created_at"],
)
train_end = train_created.max()
val_start = val_created.min()
val_end = val_created.max()
test_start = test_created.min()
# Time ordering must still hold: train ends before val starts, val before test
assert train_end < val_start
assert val_start < val_end
assert val_end < test_start

The feature `bin_name`, containing a binned version of the number of words in each tweet, was added here. The bin boundaries were defined based on statistics for the training data, which are shown below

In [21]:
# Summary statistics of the number of words per training tweet, with the number
# of distinct word counts appended as an extra row
df_train["num_words"].describe().to_frame().T.assign(
    nunique_num_words=df_train["num_words"].nunique()
).T

Unnamed: 0,num_words
count,2775.0
mean,22.950631
std,12.881696
min,3.0
25%,12.0
50%,20.0
75%,33.0
max,57.0
nunique_num_words,54.0


## Get Features and Labels For Each Data Split

In [22]:
%%time
# Features are the tweet text; targets are the binary `labels` column
X_train, y_train = df_train["text"], df_train["labels"]
X_val, y_val = df_val["text"], df_val["labels"]
X_test, y_test = df_test["text"], df_test["labels"]

CPU times: user 201 µs, sys: 0 ns, total: 201 µs
Wall time: 205 µs


## Create `huggingface` Dataset (Including All Splits)

In [23]:
# Plain-Python representation of every split: {"label": [...], "text": [...]}
mydict = {
    split_name: {"label": labels.tolist(), "text": texts.tolist()}
    for split_name, labels, texts in (
        ("train", y_train, X_train),
        ("val", y_val, X_val),
        ("test", y_test, X_test),
    )
}
# One `Dataset` per split, collected in a single `DatasetDict`
dataset = DatasetDict()
for split_name, split_columns in mydict.items():
    dataset[split_name] = Dataset.from_dict(split_columns)

In [24]:
# Show the first training example (label and text) as a quick sanity check
dataset["train"][0]

{'label': 0,
 'text': 'James Webb telescope could fundermentaley change are understanding of the universe. This right now is one of the most exciting times to ever be alive, If your interested in space and future tech then this should be at the top of your least xxx'}

## Exploratory Data Analysis

In [25]:
print(np.unique(y_train))
# Relative frequency and row count of every class in the training split.
# `rename_axis("label")` names the class-id index explicitly, so `reset_index()`
# yields a "label" column regardless of how the installed pandas version names
# the index returned by `value_counts()`.
class_freqs = (
    y_train.value_counts(normalize=True)
    .rename("freq")
    .sort_index()
    .rename_axis("label")
    .reset_index()
)
class_rows = (
    y_train.value_counts()
    .rename("rows")
    .sort_index()
    .rename_axis("label")
    .reset_index()
)
display(
    class_freqs.merge(class_rows).style.set_caption("Class Balance of Train Split")
)

[0 1]


Unnamed: 0,label,freq,rows
0,0,0.630631,1750
1,1,0.369369,1025


## Instantiate Pre-Trained Model and Tokenizer

In [26]:
%%time
# Tokenizer and sequence-classification model from the pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_pretrained)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_pretrained,
    # Take the number of classes from the label mapping rather than from the
    # data, so that it always agrees with `id2label` / `label2id` (even if a
    # class happens to be absent from the training split)
    num_labels=len(label_mapper),
    id2label=id2label,
    label2id=label_mapper,
)

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CPU times: user 1.83 s, sys: 298 ms, total: 2.13 s
Wall time: 3.94 s


## Perform Dynamic Batching During Tokenization

Tokenize all the data splits

In [27]:
%%time
# Tokenize every split in batches; `tokenize_function` receives the tokenizer
# through its `mytokenizer` keyword argument
tokenizer_kwargs = {"mytokenizer": tokenizer}
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    fn_kwargs=tokenizer_kwargs,
)
# Collator that pads the examples of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: user 660 ms, sys: 17.8 ms, total: 678 ms
Wall time: 248 ms


## Dealing With Class Imbalance

Create class weights

In [28]:
# Share of training rows per class (ordered by class id) ...
class_shares = y_train.value_counts().sort_index() / len(y_train)
# ... and the weight of each class: one minus its share, so the rarer class
# receives the larger weight
class_weights = (1 - class_shares).to_numpy(dtype="float32")
class_weights

array([0.36936936, 0.6306306 ], dtype=float32)

Convert class weights to `pytorch` tensor

In [29]:
# `torch.as_tensor` accepts both a NumPy array and a tensor, so this cell can be
# re-run safely (`torch.from_numpy` raises once the name already holds a tensor)
class_weights = torch.as_tensor(class_weights, dtype=torch.float32)
class_weights

tensor([0.3694, 0.6306])

Define a subclass of the `Trainer` class that implements a custom loss based on `nn.CrossEntropyLoss()`, which uses the above class weights based on the training data

In [30]:
class CustomTrainer(Trainer):
    """`Trainer` whose loss is a class-weighted cross-entropy.

    The weights are read from the notebook-level `class_weights` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Parameters
        ----------
        model : the model being trained
        inputs : dict-like batch; must contain a "labels" entry
        return_outputs : when True, also return the model outputs
        **kwargs : accepted and ignored, so the override keeps working when the
            caller passes additional keyword arguments (newer `transformers`
            releases pass e.g. `num_items_in_batch`)

        Returns
        -------
        `loss`, or `(loss, outputs)` when `return_outputs` is True
        """
        # Extract true labels
        labels = inputs.get("labels")
        # forward pass - feed inputs to model and extract logits
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Define loss function with class weights; the weights must live on the
        # same device as the logits, otherwise training on a GPU fails with a
        # device-mismatch error
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Compute loss
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## Re-Training

### Set Up `Trainer` Object (using `huggingface`'s `Trainer` API)

In [31]:
%%time
batch_size = 64
# Number of whole batches in the training split, used as the logging interval
logging_steps = len(df_train) // batch_size

training_args = TrainingArguments(
    # output / reporting
    output_dir=model_output_dir,
    push_to_hub=False,
    report_to="all",  # default='all'
    # optimization
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=logging_steps,
    # at the end of training, reload the checkpoint with the lowest loss
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

CPU times: user 4.66 ms, sys: 230 µs, total: 4.89 ms
Wall time: 14.4 ms


### Train (Performs Fine-Tuning)

Perform fine-tuning of the pre-trained transformers model using the
- training
- validation

splits from the manually labeled data

In [32]:
%%time
# Timestamps are printed with millisecond precision around the training run
started_at = datetime.now().isoformat(sep=" ", timespec="milliseconds")
print(f"Starting time = {started_at}...")
trainer.train()
finished_at = datetime.now().isoformat(sep=" ", timespec="milliseconds")
print(f"done at {finished_at}.")

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2775
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 220
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting time = 2022-11-16 21:39:59.745...


Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,Precision,Recall,F1,F05,F2
1,0.6736,0.526155,0.825,0.835979,0.842713,0.825,0.827904,0.835401,0.824671
2,0.4956,0.398444,0.87,0.859503,0.87,0.87,0.87,0.87,0.87
3,0.4237,0.388109,0.863333,0.856237,0.864669,0.863333,0.863841,0.8643,0.863498
4,0.3976,0.36885,0.868333,0.862133,0.869833,0.868333,0.86888,0.869404,0.868503
5,0.3725,0.374702,0.865,0.856561,0.865767,0.865,0.865321,0.865573,0.865113


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
Saving model checkpoint to ../model-fine-tuned/checkpoint-44
Configuration saved in ../model-fine-tuned/checkpoint-44/config.json
Model weights saved in ../model-fine-tuned/checkpoint-44/pytorch_model.bin
tokenizer config file saved in ../model-fine-tuned/checkpoint-44/tokenizer_config.json
Special tokens file saved in ../model-fine-tuned/checkpoint-44/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Nu

done at 2022-11-16 21:48:30.303.
CPU times: user 50min 23s, sys: 9.95 s, total: 50min 33s
Wall time: 8min 30s


Train the non-ML (naive) model to make random guesses at whether the tweet needs support (non-positive sentiment) or not (positive sentiment)

In [33]:
# Naive (non-ML) baseline: a dummy classifier using the "uniform" strategy,
# wrapped in a single-step pipeline
naive_clf = DummyClassifier(strategy="uniform", random_state=88)
pipe = Pipeline(steps=[("clf", naive_clf)])

In [34]:
%%time
# Fit the naive baseline; assigning to `_` keeps the fitted pipeline's repr out
# of the cell output
_ = pipe.fit(X_train, y_train)

CPU times: user 842 µs, sys: 3 µs, total: 845 µs
Wall time: 828 µs


## Model Evaluation

Use the model trained above to make predictions on the manually labeled data.

### Make Predictions with Re-Trained Model

Make predictions of the test split using the ML model

In [35]:
%%time
# Predictions for the test split with the fine-tuned model.
# NOTE(review): `make_predictions` is defined in `model_utils`; the names suggest
# it returns predicted labels and class probabilities - confirm there.
y_test_pred, y_test_proba = make_predictions(tokenized_datasets['test'], trainer)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 600
  Batch size = 64


CPU times: user 39.4 s, sys: 168 ms, total: 39.6 s
Wall time: 6.65 s


Make predictions of the test split using the naive model

In [36]:
%%time
# Predictions of the naive baseline, aligned with the index of the true labels
naive_predictions = pipe.predict(X_test)
y_test_pred_naive = pd.Series(
    naive_predictions, index=y_test.index, name="label"
).astype(pd.Int32Dtype())

CPU times: user 728 µs, sys: 3 µs, total: 731 µs
Wall time: 699 µs


Make predictions of the train split using the ML model

In [37]:
%%time
# Predictions for the training split with the fine-tuned model (same helper as
# used for the test split)
y_train_pred, y_train_proba = make_predictions(tokenized_datasets['train'], trainer)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2775
  Batch size = 64


CPU times: user 3min 1s, sys: 864 ms, total: 3min 1s
Wall time: 30.5 s


The fine-tuned model's predictions are now evaluated using the following
- evaluation metrics
  - accuracy
  - precision
  - recall
  - [F1-, F-0.5 and F2-score](https://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/scorers.html#f05-f1-and-f2)
  - confusion matrix
  - classification report

### Evaluation Metrics

Model evaluation is performed on the predictions of the test split using the ML model

In [38]:
%%time
# True labels of the test split, read back from the tokenized dataset
y_test_numpy = tokenized_datasets["test"].data.to_pandas()["label"].to_numpy()
# Metrics, confusion matrix and classification report for the ML predictions
metrics_dict, df_cm, df_cr = calculate_metrics(
    y_test_numpy,
    y_test_pred,
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics = pd.DataFrame.from_dict(metrics_dict, orient="index").T
# Class frequencies of the test split, indexed by class name. The index is
# mapped directly instead of going through `reset_index()`, whose resulting
# column name depends on the installed pandas version.
test_class_freq = (
    y_test.value_counts(normalize=True)
    .rename("freq")
    .rename(index=id2label)
    .to_frame()
)
df_cr = df_cr.merge(
    test_class_freq, left_index=True, right_index=True, how="left"
)
display(df_metrics)
display(df_cm)
display(df_cr)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.805,0.80725,0.812648,0.805,0.806673,0.80975,0.805141


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,290,74
1,needs_support,43,193


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.870871,0.796703,0.832138,364,0.606667
needs_support,0.722846,0.817797,0.767396,236,0.393333


CPU times: user 27.5 ms, sys: 16 µs, total: 27.5 ms
Wall time: 30.2 ms


Model evaluation is now performed on the predictions of the train split using the ML model

In [39]:
%%time
# True labels of the training split, read back from the tokenized dataset
y_train_numpy = tokenized_datasets["train"].data.to_pandas()["label"].to_numpy()
# Metrics, confusion matrix and classification report for the ML predictions
metrics_dict_train, df_cm_train, df_cr_train = calculate_metrics(
    y_train_numpy,
    y_train_pred,
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics_train = pd.DataFrame.from_dict(metrics_dict_train, orient="index").T
# Class frequencies of the training split, indexed by class name. The index is
# mapped directly instead of going through `reset_index()`, whose resulting
# column name depends on the installed pandas version.
train_class_freq = (
    y_train.value_counts(normalize=True)
    .rename("freq")
    .rename(index=id2label)
    .to_frame()
)
df_cr_train = df_cr_train.merge(
    train_class_freq, left_index=True, right_index=True, how="left"
)
display(df_metrics_train)
display(df_cm_train)
display(df_cr_train)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.861982,0.874404,0.877282,0.861982,0.864056,0.870689,0.861419


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,1447,303
1,needs_support,80,945


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.94761,0.826857,0.883125,1750,0.630631
needs_support,0.757212,0.921951,0.8315,1025,0.369369


CPU times: user 30.5 ms, sys: 10 µs, total: 30.5 ms
Wall time: 29.4 ms


Model evaluation is performed on the predictions of the test split using the naive model

In [40]:
# Naive predictions as a float64 NumPy array
y_test_pred_naive_numpy = y_test_pred_naive.astype("float64").to_numpy()
# Metrics, confusion matrix and classification report for the naive predictions
metrics_dict_naive, df_cm_naive, df_cr_naive = calculate_metrics(
    y_test_numpy,
    y_test_pred_naive_numpy,
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics_naive = pd.DataFrame.from_dict(metrics_dict_naive, orient="index").T
# Class frequencies of the test split, indexed by class name. The index is
# mapped directly instead of going through `reset_index()`, whose resulting
# column name depends on the installed pandas version.
naive_class_freq = (
    y_test.value_counts(normalize=True)
    .rename("freq")
    .rename(index=id2label)
    .to_frame()
)
df_cr_naive = df_cr_naive.merge(
    naive_class_freq, left_index=True, right_index=True, how="left"
)
display(df_metrics_naive)
display(df_cm_naive)
display(df_cr_naive)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.515,0.516833,0.538858,0.515,0.520564,0.529993,0.515681


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,185,179
1,needs_support,112,124


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.622896,0.508242,0.559758,364,0.606667
needs_support,0.409241,0.525424,0.460111,236,0.393333


Summarize the model evaluation metrics for both train and test splits

In [41]:
# One row of metrics per (split, model) combination: train/ML, test/ML, test/naive
metric_frames = [df_metrics_train, df_metrics, df_metrics_naive]
df_metrics_combo = pd.concat(metric_frames, ignore_index=True).assign(
    split_type=["train", "test", "test"],
    model_type=["ML", "ML", "naive"],
)
df_metrics_combo

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,split_type,model_type
0,0.861982,0.874404,0.877282,0.861982,0.864056,0.870689,0.861419,train,ML
1,0.805,0.80725,0.812648,0.805,0.806673,0.80975,0.805141,test,ML
2,0.515,0.516833,0.538858,0.515,0.520564,0.529993,0.515681,test,naive


Summarize the classification report for both train and test splits

In [42]:
# Tag every classification report with its split and model type, stack them and
# sort by class label
tagged_reports = [
    report.assign(split_type=split_name).assign(model_type=model_name)
    for report, split_name, model_name in (
        (df_cr_train, "train", "ML"),
        (df_cr, "test", "ML"),
        (df_cr_naive, "test", "naive"),
    )
]
df_cr_combo = pd.concat(tagged_reports).reset_index()
df_cr_combo = df_cr_combo.rename(columns={"index": "label"}).sort_values(by=["label"])
df_cr_combo

Unnamed: 0,label,precision,recall,f1-score,support,freq,split_type,model_type
0,does_not_need_support,0.94761,0.826857,0.883125,1750,0.630631,train,ML
2,does_not_need_support,0.870871,0.796703,0.832138,364,0.606667,test,ML
4,does_not_need_support,0.622896,0.508242,0.559758,364,0.606667,test,naive
1,needs_support,0.757212,0.921951,0.8315,1025,0.369369,train,ML
3,needs_support,0.722846,0.817797,0.767396,236,0.393333,test,ML
5,needs_support,0.409241,0.525424,0.460111,236,0.393333,test,naive


**Notes**
1. As mentioned in `6_split_data.ipynb`, shorter tweets were retrospectively added to the business metrics evaluation batches. So, the total number of tweets in each split will be different from those shown in `6_split_data.ipynb` - see that notebook for more details, in particular **Notes** point 2. from **Summary of Sizes of Datasets Used in This Notebook**.

### Append Predictions and Features to the Test Split

Append test split predictions to the test split

In [43]:
# Attach the ML and naive predictions as new columns, aligned on the test
# split's existing index
df_test = df_test.assign(
    pred=pd.Series(y_test_pred, index=df_test.index),
    pred_naive=pd.Series(y_test_pred_naive, index=df_test.index),
)
# Preview a reproducible random sample of labels next to both predictions
preview_cols = ["id", "created_at", "labels", "pred", "pred_naive", "split"]
df_test[preview_cols].sample(n=10, random_state=88)

Unnamed: 0,id,created_at,labels,pred,pred_naive,split
548,1480261790567182336,2022-01-09 19:34:47,1,1,1,test
437,1480033459083087873,2022-01-09 04:27:28,0,0,0,test
241,1480292181306871810,2022-01-09 21:35:33,0,1,1,test
525,1480239785574912008,2022-01-09 18:07:21,0,0,1,test
236,1480055280264220672,2022-01-09 05:54:11,1,1,0,test
68,1480337582387040259,2022-01-10 00:35:57,0,0,0,test
535,1480045552041562114,2022-01-09 05:15:32,1,1,0,test
204,1479999803761307651,2022-01-09 02:13:44,0,1,1,test
383,1480060383368986625,2022-01-09 06:14:28,0,0,1,test
252,1480008567189090307,2022-01-09 02:48:34,0,0,0,test


Engineer features from metadata for the test split, including appending a *response time* column (approximate time spent reading and responding to tweets) based on the
- number of words in each tweet
- [average reading speed of 130 words per minute](https://www.omnicalculator.com/everyday-life/words-per-minute)
- [average typing speed of 40 wpm](https://www.ratatype.com/learn/average-typing-speed/)

In [44]:
# Feature engineering for metadata
#
# Columns added, in order:
# - error: True where the true label differs from the ML prediction
# - hour and weekday name of the tweet's creation and of the user's join date
# - created_at_time_of_day: creation hour bucketed with `pd.cut`, using the bin
#   edges `b` and labels `l` defined earlier in the notebook
# - reading_time / replying_time: word count divided by the reading / typing
#   speed (words per minute) and multiplied by 60; replying_time is further
#   scaled by `frac_support_tweets_needing_response`
# - response_time: reading_time + replying_time
df_test = (
    df_test.assign(error=lambda df: df["labels"] != df["pred"])
    .assign(created_at_hour=lambda df: df["created_at"].dt.hour)
    .assign(created_at_day=lambda df: df["created_at"].dt.day_name())
    .assign(user_joined_hour=lambda df: df["user_joined"].dt.hour)
    .assign(user_joined_day=lambda df: df["user_joined"].dt.day_name())
    .assign(
        created_at_time_of_day=lambda df: pd.cut(
            df["created_at_hour"], bins=b, labels=l, include_lowest=True
        )
    )
    .assign(
        reading_time=lambda df: df["num_words"] * (1 / avg_reading_speed_wpm) * (60 / 1)
    )
    .assign(
        replying_time=lambda df: df["num_words"]
        * frac_support_tweets_needing_response
        * (1 / avg_typing_speed_wpm)
        * (60 / 1)
    )
    .assign(response_time=lambda df: df["reading_time"] + df["replying_time"])
    .astype(test_feats_dtypes_test_dict)
)
# Every tweet starts as "Other"; it is then re-labelled with country `k` when
# its user_location matches any of the patterns in `v` (joined with "|" into a
# single regex alternation). If a location matches several countries, the one
# that comes last in `mdict` wins.
df_test["country"] = "Other"
for k, v in mdict.items():
    # na=False makes a missing user_location count as "no match", so the mask
    # is strictly boolean and the row keeps its "Other" default instead of
    # breaking the boolean indexing below
    mask = df_test["user_location"].str.contains("|".join(v), na=False)
    df_test.loc[mask, "country"] = k
df_test = df_test.astype({"country": pd.StringDtype()})
# Verify that a country is present for every row of metadata
# (value_counts() skips missing values, so the totals only match when no
# country is missing)
try:
    assert df_test["country"].value_counts().sum() == len(df_test)
    print("Extracted country for every user_location")
except AssertionError as e:
    print(f"{str(e)}: Did not sucessfully extract country for every user_location")

Extracted country for every user_location


### Test Split Errors by Source Country

Summarize model errors of the test split by the country from which the tweet originated

In [45]:
# Number of tweets and share of the test split per country
country_counts = df_test["country"].value_counts().rename("counts").to_frame()
country_shares = (
    df_test["country"].value_counts(normalize=True).rename("freq").to_frame()
)
df_by_country = country_counts.merge(
    country_shares, left_index=True, right_index=True, how="left"
)

# Misclassified tweets per country, also expressed as a fraction of the whole
# test split
errors_per_country = (
    df_test.groupby("country", as_index=False)["error"]
    .sum()
    .sort_values(by=["error"], ascending=False)
    .assign(error_freq=lambda df: df["error"] / len(df_test))
    .set_index("country")
)

# Join in the per-country volumes, rank countries by volume and by error share
# (dense ranks, 1 = largest), and compute the within-country error rate
df_error_by_country = (
    errors_per_country.merge(
        df_by_country, left_index=True, right_index=True, how="left"
    )
    .reset_index()
    .assign(
        freq_rank=lambda df: df["freq"]
        .rank(ascending=False, method="dense")
        .astype(int)
        .astype(pd.Int32Dtype()),
        error_freq_rank=lambda df: df["error_freq"]
        .rank(ascending=False, method="dense")
        .astype(int)
        .astype(pd.Int32Dtype()),
        error_freq_to_freq=lambda df: df["error"] / df["counts"],
    )
    .sort_values(by=["error_freq_to_freq"], ascending=False)
)
df_error_by_country

Unnamed: 0,country,error,error_freq,counts,freq,freq_rank,error_freq_rank,error_freq_to_freq
4,Germany,2,0.003333,7,0.011667,6,5,0.285714
2,Canada,4,0.006667,15,0.025,4,3,0.266667
0,Other,85,0.141667,420,0.7,1,1,0.202381
1,USA,23,0.038333,116,0.193333,2,2,0.198276
3,UK,3,0.005,25,0.041667,3,4,0.12
5,Australia,0,0.0,4,0.006667,7,6,0.0
6,India,0,0.0,12,0.02,5,6,0.0
7,Kenya,0,0.0,1,0.001667,8,6,0.0


### Test Split Errors by Weekday and Hour of Day

Summarize model errors of the test split by
- time of day (hour of the day grouped into periods such as Morning or Late Night)
- day of the week (Monday - Sunday)

In [46]:
# Both tables share the same grouping (time-of-day bucket x weekday) and the
# same denominator (size of the test split)
time_day_keys = ["created_at_time_of_day", "created_at_day"]
num_test_tweets = len(df_test)
grouped_by_time_day = df_test.groupby(time_day_keys)

# Fraction of all test tweets posted in each (time of day, weekday) cell
tweet_share = grouped_by_time_day["id"].count() / num_test_tweets
df_by_time_of_day = (
    tweet_share.reset_index()
    .astype({"id": pd.Float32Dtype()})
    .pivot(
        index="created_at_time_of_day",
        columns=["created_at_day"],
        values="id",
    )
)

# Misclassified tweets in each cell, as a fraction of all test tweets
error_share = grouped_by_time_day["error"].sum() / num_test_tweets
df_error_by_time_of_day = (
    error_share.reset_index()
    .astype({"error": pd.Float32Dtype()})
    .pivot(
        index="created_at_time_of_day",
        columns=["created_at_day"],
        values="error",
    )
)

# Render both tables as captioned heatmaps
for heatmap_frame, heatmap_caption in (
    (df_by_time_of_day, "Tweet Frequency by Time of Day"),
    (df_error_by_time_of_day, "Error Frequency by Time of Day"),
):
    display(
        heatmap_frame.astype("float64")
        .style.set_caption(heatmap_caption)
        .background_gradient(cmap="YlOrRd")
        .set_properties(**{"font-size": "12px"})
    )

created_at_day,Monday,Sunday
created_at_time_of_day,Unnamed: 1_level_1,Unnamed: 2_level_1
Afternoon,,0.181667
Early Morning,,0.108333
Evening,,0.195
Late Night,0.038333,0.236667
Morning,,0.138333
Night,,0.101667


created_at_day,Monday,Sunday
created_at_time_of_day,Unnamed: 1_level_1,Unnamed: 2_level_1
Afternoon,,0.031667
Early Morning,,0.016667
Evening,,0.041667
Late Night,0.008333,0.05
Morning,,0.02
Night,,0.026667


### Test Split Errors by Twitter Client

Summarize model errors of the test split by the Twitter client used to post the tweet

In [47]:
# Number of tweets and share of the test split per Twitter client
client_source = df_test["source_text"]
df_by_device = (
    client_source.value_counts()
    .rename("counts")
    .to_frame()
    .merge(
        client_source.value_counts(normalize=True).rename("freq").to_frame(),
        left_index=True,
        right_index=True,
        how="left",
    )
)

# Misclassified tweets per client, also expressed as a fraction of the whole
# test split
errors_per_client = (
    df_test.groupby("source_text", as_index=False)["error"]
    .sum()
    .sort_values(by=["error"], ascending=False)
    .assign(error_freq=lambda df: df["error"] / len(df_test))
    .set_index("source_text")
)

# Join in the per-client volumes, add dense ranks (1 = largest) for volume and
# error share plus the within-client error rate, then order by error share
df_error_by_device = (
    errors_per_client.merge(
        df_by_device, left_index=True, right_index=True, how="left"
    )
    .reset_index()
    .assign(
        freq_rank=lambda df: df["freq"]
        .rank(ascending=False, method="dense")
        .astype(int)
        .astype(pd.Int32Dtype()),
        error_freq_rank=lambda df: df["error_freq"]
        .rank(ascending=False, method="dense")
        .astype(int)
        .astype(pd.Int32Dtype()),
        error_freq_to_freq=lambda df: df["error"] / df["counts"],
    )
    .sort_values(by=["error_freq"], ascending=False)
    .rename(columns={"source_text": "client"})
)
df_error_by_device

Unnamed: 0,client,error,error_freq,counts,freq,freq_rank,error_freq_rank,error_freq_to_freq
0,Twitter Web App,53,0.088333,215,0.358333,1,1,0.246512
1,Twitter for iPhone,31,0.051667,179,0.298333,2,2,0.173184
2,Twitter for Android,27,0.045,166,0.276667,3,3,0.162651
3,Twitter for iPad,3,0.005,20,0.033333,4,4,0.15
4,Alt-brain news test,1,0.001667,1,0.001667,8,5,1.0
5,TweetDeck,1,0.001667,8,0.013333,5,5,0.125
6,autonewssite.com,1,0.001667,1,0.001667,8,5,1.0
7,Hacker__News,0,0.0,1,0.001667,8,6,0.0
8,Heropost,0,0.0,1,0.001667,8,6,0.0
9,IFTTT,0,0.0,3,0.005,6,6,0.0


### Business Metrics

In [48]:
# Masks on the true label: 1 = needs support, 0 = does not need support
labelled_support = df_test["labels"] == 1
labelled_no_support = df_test["labels"] == 0

# "Missed": labelled 1 but predicted 0; "wasted": labelled 0 but predicted 1.
# Computed once for the fine-tuned model (pred) and once for the naive model
# (pred_naive).
df_missed = df_test[labelled_support & (df_test["pred"] == 0)]
df_wasted = df_test[labelled_no_support & (df_test["pred"] == 1)]
df_missed_naive = df_test[labelled_support & (df_test["pred_naive"] == 0)]
df_wasted_naive = df_test[labelled_no_support & (df_test["pred_naive"] == 1)]

Calculate the business metrics for the test split using the
- fine-tuned ML
- naive

models

In [49]:
# Build a long table of business metrics for the test split: one row per
# "<metric>_<model>" name (model is ML or naive) with its value in column 0.
# Every `response_time` sum is divided by 60 (presumably converting seconds to
# minutes - confirm against how `response_time` is defined).
df_summary = (
    # business metrics for current test split
    # time_missed_*: total response time of the rows in df_missed /
    # df_missed_naive; time_wasted_*: same for df_wasted / df_wasted_naive
    pd.concat(
        [
            pd.DataFrame.from_dict(
                dict(
                    time_missed_ML=df_missed["response_time"].sum() / 60,
                    time_missed_naive=df_missed_naive["response_time"].sum() / 60,
                ),
                orient="index",
            ),
            pd.DataFrame.from_dict(
                dict(
                    time_wasted_ML=df_wasted["response_time"].sum() / 60,
                    time_wasted_naive=df_wasted_naive["response_time"].sum() / 60,
                ),
                orient="index",
            ),
        ]
    )
    # transpose to a single row so further metrics can be added as columns
    .T.assign(num_tweets_missed_ML=len(df_missed))
    .assign(num_tweets_missed_naive=len(df_missed_naive))
    .assign(num_tweets_unnecessarily_read_ML=len(df_wasted))
    .assign(num_tweets_unnecessarily_read_naive=len(df_wasted_naive))
    # the counts above, expressed as fractions of the whole test split
    .assign(
        frac_tweets_unnecessarily_read_ML=lambda df: df[
            "num_tweets_unnecessarily_read_ML"
        ]
        / len(df_test)
    )
    .assign(
        frac_tweets_unnecessarily_read_naive=lambda df: df[
            "num_tweets_unnecessarily_read_naive"
        ]
        / len(df_test)
    )
    .assign(frac_tweets_missed_ML=lambda df: df["num_tweets_missed_ML"] / len(df_test))
    .assign(
        frac_tweets_missed_naive=lambda df: df["num_tweets_missed_naive"] / len(df_test)
    )
    # total response time of all tweets each model predicted as 1
    .assign(
        response_time_pred_ML=df_test.query("pred == 1")["response_time"].sum() / 60
    )
    .assign(
        response_time_pred_naive=df_test.query("pred_naive == 1")["response_time"].sum()
        / 60
    )
    # transpose back to one row per metric; the metric names become the
    # "index" column
    .T.reset_index()
)
# Split each metric name on its last underscore into the metric proper
# (e.g. "time_missed") and the model suffix ("ML" or "naive")
df_summary[["business_metric", "model_type"]] = df_summary["index"].str.rsplit(
    "_", n=1, expand=True
)

Get the combined ML evaluation and business metrics

In [50]:
# Combine three groups of rows (batch metadata, ML evaluation metrics and
# business metrics), each with one column per model (ML, naive), then transpose
# so the final table has one row per model type and one column per metric.
# NOTE: this overwrites `df_summary` while also reading its previous value, so
# the cell is not idempotent - re-run the cell that builds the business metrics
# before re-running this one.
df_summary = (
    pd.concat(
        [
            # metadata for current test split
            # (same batch number, tweet count and number of label-1 tweets
            # for both models)
            pd.DataFrame()
            .assign(ML=[test_current_batch_num])
            .assign(naive=[test_current_batch_num])
            .T.assign(total_number_tweets=len(df_test))
            .assign(num_needs_support=len(df_test.query("labels == 1")))
            .rename(columns={0: "batch_num"})
            .T,
            # ML evaluation metrics for current test split
            # (each metrics dict becomes a single column named after its model)
            pd.concat(
                [
                    pd.DataFrame.from_dict(metrics_dict, orient="index")
                    .squeeze()
                    .rename("ML")
                    .to_frame()
                    .astype(pd.Float32Dtype()),
                    pd.DataFrame.from_dict(metrics_dict_naive, orient="index")
                    .squeeze()
                    .rename("naive")
                    .to_frame()
                    .astype(pd.Float32Dtype()),
                ],
                axis=1,
            ),
            # business metrics for current test split (contd.)
            # Drops the first column of the previous `df_summary` (presumably
            # the combined "<metric>_<model>" name), pivots the value column 0
            # into business_metric x model_type and transposes to one row per
            # model.
            (
                df_summary.iloc[:, 1:]
                .pivot(index=["business_metric"], columns=["model_type"], values=[0])
                .T.assign(
                    response_time_true=df_test.query("labels == 1")[
                        "response_time"
                    ].sum()
                    / 60
                )
                # drop the outer row-index level left over from `values=[0]`
                .reset_index(level=0, drop=True)
                # percent reduction of ML relative to naive:
                # 100 * (naive - ML) / naive; the same scalar is written to
                # both the ML and the naive row
                .assign(
                    pct_reduction_in_time_wasted=lambda df: 100
                    * (df.loc["naive", "time_wasted"] - df.loc["ML", "time_wasted"])
                    / df.loc["naive", "time_wasted"]
                )
                .assign(
                    pct_reduction_in_time_missed=lambda df: 100
                    * (df.loc["naive", "time_missed"] - df.loc["ML", "time_missed"])
                    / df.loc["naive", "time_missed"]
                )
                .T
            ).sort_index(),
        ]
    )
    # one row per model; cast each metric column to the dtype declared in
    # `metrics_dtypes_dict`
    .T.astype(metrics_dtypes_dict)
    .reset_index()
    .rename(columns={"index": "model_type"})
)
# Display with metrics as rows and models as columns, plus a `data_type` column
# showing each metric's dtype
df_summary.set_index("model_type").T.assign(
    data_type=df_summary.set_index("model_type").dtypes.to_frame()
).style.set_caption(
    f"ML and Business Metrics for Latest Batch of Data (batch "
    f"{test_current_batch_num}) in Test Split"
)

model_type,ML,naive,data_type
batch_num,1.0,1.0,Int32
total_number_tweets,600.0,600.0,Int32
num_needs_support,236.0,236.0,Int32
accuracy,0.805,0.515,Float64
balanced_accuracy,0.80725,0.516833,Float64
precision,0.812648,0.538858,Float64
recall,0.805,0.515,Float64
f1,0.806673,0.520564,Float64
f05,0.80975,0.529993,Float64
f2,0.805141,0.515681,Float64


**Observations**
1. The time missed and time wasted are lower with the ML model approach compared to the non-ML approach. Due to this, the percent reduction in time missed and time wasted are greater than zero, which indicates that the ML approach to predicting whether tweets need support delivers value over the non-ML approach.
2. The F2-score meets the required threshold of 80% set in the project scope.

## Export Data and Model Outputs

### Test Split Evaluation Metrics

In [51]:
# Derive the metrics file name from the processed test split's file name:
# drop the extension, swap the "test_nlp" prefix for "metrics" and append the
# batch number
test_split_stem = os.path.splitext(os.path.basename(proc_file_test))[0]
metrics_stem = test_split_stem.replace("test_nlp", "metrics")
metrics_file_path = os.path.join(
    processed_data_dir,
    metrics_stem + f"__batch_{test_current_batch_num}.parquet.gzip",
)
metrics_fname = os.path.basename(metrics_file_path)
print(metrics_file_path, metrics_fname, sep="\n")

../data/processed/metrics__inference_starts_20220110_000000__batch_1.parquet.gzip
metrics__inference_starts_20220110_000000__batch_1.parquet.gzip


In [52]:
%%time
# Export the combined ML and business metrics (`df_summary`) to a parquet file,
# either on S3 (credentials read from the environment) or on the local disk
if upload_to_s3:
    storage_options = {
        "key": os.getenv("AWS_ACCESS_KEY_ID"),
        "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    prefix = f"{path_to_folder[1:]}processed/nlp_splits/{metrics_fname}"
    metrics_filepath = f"s3://{s3_bucket_name}/{prefix}"
else:
    storage_options = None
    # Local export: this branch must set `metrics_filepath` (the name passed to
    # `save_to_parquet` below); it previously set `split_preds_filepath`,
    # leaving `metrics_filepath` undefined or stale when not uploading to S3
    metrics_filepath = metrics_file_path
save_to_parquet(df_summary, metrics_filepath, storage_options)

Saving to parquet file metrics__inference_starts_20220110_000000__batch_1.parquet.gzip...
Done.
CPU times: user 95 ms, sys: 4.01 ms, total: 99 ms
Wall time: 511 ms


### Fine-Tuned Model to Disk

Export the fine-tuned model locally

In [53]:
# Save the fine-tuned model through the Trainer into `model_output_dir`
trainer.save_model(model_output_dir)

Saving model checkpoint to ../model-fine-tuned
Configuration saved in ../model-fine-tuned/config.json
Model weights saved in ../model-fine-tuned/pytorch_model.bin
tokenizer config file saved in ../model-fine-tuned/tokenizer_config.json
Special tokens file saved in ../model-fine-tuned/special_tokens_map.json


### Test Split with Fine-Tuned Model Predictions and Metadata

In [54]:
# Name the export after the processed test split's file (extension removed),
# suffixed with the batch number and a "with_preds" marker
test_file_stem = os.path.splitext(os.path.basename(proc_file_test))[0]
test_split_with_preds_path = os.path.join(
    processed_data_dir,
    test_file_stem + f"__batch_{test_current_batch_num}__with_preds.parquet.gzip",
)
test_split_with_preds_fname = os.path.basename(test_split_with_preds_path)
print(test_split_with_preds_path, test_split_with_preds_fname, sep="\n")

../data/processed/test_nlp__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip
test_nlp__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip


In [55]:
%%time
# Export the test split (with predictions and engineered metadata) to parquet:
# locally by default, or to S3 with credentials read from the environment
if not upload_to_s3:
    storage_options = None
    split_preds_filepath = test_split_with_preds_path
else:
    storage_options = {
        "key": os.getenv("AWS_ACCESS_KEY_ID"),
        "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    prefix = f"{path_to_folder[1:]}processed/nlp_splits/{test_split_with_preds_fname}"
    split_preds_filepath = f"s3://{s3_bucket_name}/{prefix}"
save_to_parquet(df_test, split_preds_filepath, storage_options)

Saving to parquet file test_nlp__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip...
Done.
CPU times: user 6.88 ms, sys: 7.98 ms, total: 14.9 ms
Wall time: 221 ms


---

<span style="float:left;">
    <a href="./6-split-data/notebooks/6_split_data.ipynb"><< 6 - Create Data Splits for Model Training and Business Metrics</a>
</span>

<span style="float:right;">
    <a href="./8-assess/notebooks/8_assess.ipynb">8 - Assess Business Metrics on Test Split >></a>
</span>