# Inference Using Fine-Tuned Transformers Model

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from glob import glob
from datetime import datetime

import boto3
import pandas as pd
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
    set_seed,
)

In [3]:
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [4]:
%aimport metrics_utils
from metrics_utils import calculate_metrics

%aimport model_utils
from model_utils import extract_sentiment_using_pretrained_model

%aimport pandas_utils
from pandas_utils import save_to_parquet

%aimport s3_utils
from s3_utils import download_files_from_s3

In [5]:
set_seed(42)

## About

### Objective
This notebook evaluates prediction probabilities made during inference by the pre-trained model that was fine-tuned in `7_train.ipynb`. Here, the test split will be treated as the inference data so the true labels will be ignored, since they are not known during inference. This covers the end-to-end workflow to evaluate the performance of a model that is deployed to production in order to monitor for drift in newly arrived Tweets (data) relative the previously-used training data.

### Data
The data used for fine-tuning consists of the two data splits
- (training) `train_nlp_inference_starts_*.xlsx` (approximately 2,900 tweets)
- (testing) `test_nlp_inference_starts_*.xlsx` (600 tweets)

that were
- created in `6-split-data/notebooks/6_split_data.ipynb`
- manually labeled by reading the tweets to identify the sentiment
  - 0 - negative sentiment
  - 1 - neutral, or
  - 2 - positive

  of each tweet

Model evaluation (calculation of prediction probability statistics) is performed using both the training and testing (inference) splits.

### Outputs
1. `pred_proba_stats_nlp__inference_starts_20220110_000000__batch_n_inference.parquet.gzip`
   - statistics for prediction probability under various groupings, including
     - [coefficient of variation](https://www.investopedia.com/terms/c/coefficientofvariation.asp) (defined as standard deviation / mean)
     - [standard error](https://www.investopedia.com/terms/s/standard-error.asp) [defined as standard deviation / sqrt(number of samples)]

     for both training and testing (inference) splits
   - this file will be used to evaluate the performance of a trained model, by comparison to the performance during previous model training runs, in production in order to monitor data drift

## User Inputs

In [6]:
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
processed_data_dir = "../data/processed"

label_mapper = {"does_not_need_support": 0, "needs_support": 1}

needs_support_labels = [0, 1]

model_output_dir = "../model-fine-tuned"

# Metadata - feature engineering
b = [0, 4, 8, 12, 16, 20, 24]
l = ["Late Night", "Early Morning", "Morning", "Afternoon", "Evening", "Night"]
num_words_bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
num_words_labels = [
    "0-5",
    "6-10",
    "11-15",
    "16-20",
    "20-25",
    "26-30",
    "31-35",
    "36-40",
    "41-45",
    "46-50",
    "51-55",
    "56-60",
]

# Model evaluation
wanted_pred_proba_stats = [
    "count",
    "min",
    "mean",
    "coeff_of_var",
    "std_error",
    "num_samples",
]

upload_to_s3 = True

In [7]:
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

try:
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    if str(e) == "The config profile (default) could not be found":
        aws_region = os.getenv("AWS_REGION")
        s3_client = boto3.client("s3", region_name=aws_region)
        print("Retrieved AWS credentials from .env file")

dtypes_dict = {
    "id": pd.StringDtype(),
    "contributors": pd.StringDtype(),  # pd.BooleanDtype(),
    "source_text": pd.StringDtype(),
    "place_country": pd.StringDtype(),
    "user_location": pd.StringDtype(),
    "user_followers": pd.Int32Dtype(),
    "user_friends": pd.Int32Dtype(),
    "user_listed": pd.Int32Dtype(),
    "user_favourites": pd.Int32Dtype(),
    "user_statuses": pd.Int32Dtype(),
    "user_protected": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_verified": pd.StringDtype(),  # pd.BooleanDtype(),
    "is_quote_status": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted_tweet": pd.StringDtype(),
    "in_reply_to_screen_name": pd.StringDtype(),
    "user_screen_name": pd.StringDtype(),
    "num_urls_in_tweet_text": pd.Int32Dtype(),
    "num_words": pd.Int32Dtype(),
    "text": pd.StringDtype(),
    "sentiment": pd.Int32Dtype(),
    "order": pd.Int32Dtype(),
    "hour": pd.Int32Dtype(),
    "day": pd.Int32Dtype(),
    "weekday": pd.StringDtype(),
    "time_of_day": pd.StringDtype(),
    "batch_num": pd.Int32Dtype(),
}
infer_dtypes_dict = {
    "label": pd.StringDtype(),
    "score": pd.Float32Dtype(),
    "pred": pd.Int32Dtype(),
}
pred_feats_dtypes_dict = {
    "created_at_hour": pd.Int32Dtype(),
    "split": pd.StringDtype(),
    "created_at_time_of_day": pd.StringDtype(),
}
stats_dtypes_dict = dict(
    query=pd.StringDtype(),
    value=pd.StringDtype(),
    count=pd.Int32Dtype(),
    std=pd.Float32Dtype(),
    min=pd.Float32Dtype(),
    mean=pd.Float32Dtype(),
    median=pd.Float32Dtype(),
    max=pd.Float32Dtype(),
    num_samples=pd.Int32Dtype(),
    coeff_of_var=pd.Float32Dtype(),
    std_error=pd.Float32Dtype(),
    frac=pd.Float32Dtype(),
    split_type=pd.StringDtype(),
    batch_num=pd.Int32Dtype(),
)

id2label = {v: k for k, v in label_mapper.items()}
id2label

Retrieved AWS credentials from .env file


{0: 'does_not_need_support', 1: 'needs_support'}

## Get Annotated Training and Inference Data

In [8]:
%%time
download_files_from_s3(
    s3_client,
    s3_bucket_name,
    processed_data_dir,
    aws_region,
    f"{path_to_folder[1:]}processed/nlp_splits/",
    ".xlsx",
)
proc_files = sorted(glob(f"{processed_data_dir}/*_nlp_*.xlsx"))
proc_file_inference = [f for f in proc_files if "test_" in f][0]
print(proc_file_inference)
proc_files

File found at ../data/processed/test_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/train_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/val_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
../data/processed/test_nlp__inference_starts_20220110_000000.xlsx
CPU times: user 15.3 ms, sys: 3.21 ms, total: 18.6 ms
Wall time: 189 ms


['../data/processed/test_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/train_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/val_nlp__inference_starts_20220110_000000.xlsx']

## Load and Process Data

Perform the following
- rename the class labels column from `sentiment` to `labels`
- remove retweets (tweets starting with *RT*)
- map the `labels` column (sentiment) to indicate a tweet
  - needing support (neutral or negative sentiment)
  - not needing support (positive sentiment)
- text processing to
  - remove leading and trailing spaces
  - replace HTML by `>`, `<` or `&`, as appropriate

In [9]:
%%time
df_infer_data, df_train = [
    (
        pd.read_excel(
            f,
            dtype=dtypes_dict,
            usecols=list(dtypes_dict)+['created_at', 'user_joined']
        ).rename(columns={"sentiment": "labels"})
        # .sort_values(by=['created_at'])
        .query("~text.str.startswith('RT')")
        .assign(split=st)
        .assign(labels=lambda df: df['labels'].isin(needs_support_labels).astype(pd.Int32Dtype()))
        .assign(
            text=lambda df: (
                df["text"]
                .str.lstrip()
                .str.rstrip()
                .str.replace("&gt;", ">")
                .str.replace("&lt;", "<")
                .str.replace("&amp;", "&")
            )
        )
        .assign(created_at_hour=lambda df: df["created_at"].dt.hour)
        .assign(
            created_at_time_of_day=lambda df: pd.cut(
                df["created_at_hour"], bins=b, labels=l, include_lowest=True
            )
        )
        .assign(
            num_words=lambda df: pd.cut(df["num_words"], bins=num_words_bins, labels=num_words_labels).astype(pd.StringDtype())
        )
    )
    for f, st in zip(proc_files[:-1], ['inference', 'train'])
]

CPU times: user 1.6 s, sys: 40.7 ms, total: 1.64 s
Wall time: 1.73 s


Drop any tweets which were not manually labeled with a sentiment. Since re-training and manual labeling are only performed after every five batches of new data arrives, the test split file (used here for inference) will contain data that is missing labels which won't be used in both of
- re-training
- inference

so these rows must be dropped from this file

In [10]:
df_infer_data = df_infer_data.dropna(subset=["labels"])

Get raw data split sizes

In [11]:
split_sizes = [{"train": len(df_train), "inference": len(df_infer_data)}]
df_split_sizes = pd.DataFrame.from_records(split_sizes).assign(type="raw")

(If not initial training run) Get most current test split and
- append non-current test split to the training split
- use the most recent test split as the current test split

In [12]:
batch_nums = df_infer_data["batch_num"].unique().tolist()
batch_nums

[1]

In [13]:
# inference_current_batch_num = df_infer_data["batch_num"].max()
# if df_infer_data["batch_num"].nunique() > 1:
#     df_train = pd.concat(
#         [df_train, df_infer_data.query(f"batch_num < {inference_current_batch_num}")]
#     )
#     df_infer_data = df_infer_data.query(f"batch_num == {inference_current_batch_num}")

if len(batch_nums) > 1:
    # get all but last batch numbers from test split (to use in training split)
    training_batch_nums = batch_nums[:-1]
    # get last batch number from test split (to use as current test split)
    inference_current_batch_num = batch_nums[-1]

    # Slice raw data splits based on batch numbers defined above
    df_train = pd.concat(
        [df_train, df_infer_data.query(f"batch_num.isin(@training_batch_nums)")]
    )
    df_test = df_infer_data.query(f"batch_num == {test_current_batch_num}")
else:
    inference_current_batch_num = df_infer_data["batch_num"].max()

Drop duplicates in the
- training split
  - this was done in ML model training in `7_train.ipynb` and so will also be done here

In [14]:
df_train = df_train.drop_duplicates(subset=["text"])

Get the new split sizes

In [15]:
split_sizes_no_dups = [{"train": len(df_train), "inference": len(df_infer_data)}]
df_split_sizes_no_dups = pd.DataFrame.from_records(split_sizes_no_dups).assign(
    type="without-duplicates"
)

Show split sizes before and after dropping duplicates

In [16]:
df_split_sizes_comp = pd.concat(
    [df_split_sizes, df_split_sizes_no_dups], ignore_index=True
)
df_split_sizes_comp

Unnamed: 0,train,inference,type
0,2931,600,raw
1,2775,600,without-duplicates


Get the start and end date of each split after combining training and non-current inference splits

In [17]:
df_split_dates = pd.DataFrame.from_records(
    [
        {
            "split": split_type,
            "start": df_nlp_spit["created_at"].min().strftime("%Y-%m-%d %H:%M:%S"),
            "end": df_nlp_spit["created_at"].max().strftime("%Y-%m-%d %H:%M:%S"),
        }
        for df_nlp_spit, split_type in zip(
            [df_train, df_infer_data], ["train", "inference"]
        )
    ]
)
df_split_dates

Unnamed: 0,split,start,end
0,train,2021-12-30 17:39:11,2022-01-08 15:14:33
1,inference,2022-01-09 01:18:13,2022-01-10 01:29:01


Perform sanity checks to verify the expected time-ordering of the splits

In [18]:
train_end = df_train["created_at"].max()
infer_start = df_infer_data["created_at"].min()
assert train_end < infer_start

## Get Features For Each Data Split

In [19]:
data_inference = df_infer_data["text"]
data_inference_train = df_train["text"]

## Load Fine-Tuned Model from Disk

In [20]:
%%time
tokenizer_fine_tuned = AutoTokenizer.from_pretrained(model_output_dir)
model_fine_tuned = AutoModelForSequenceClassification.from_pretrained(
    model_output_dir,
    num_labels=len(label_mapper),
    id2label=id2label,
    label2id=label_mapper,
)
pipe_fine_tuned = pipeline(
    "sentiment-analysis", model=model_fine_tuned, tokenizer=tokenizer_fine_tuned
)

CPU times: user 467 ms, sys: 129 ms, total: 597 ms
Wall time: 807 ms


## Make Inference Predictions Using Loaded Model

Inference on inference data

In [21]:
%%time
print(f"Starting time = {datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}...")
df_infer = extract_sentiment_using_pretrained_model(pipe_fine_tuned, data_inference)
print(f"done at {datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}.")
with pd.option_context('display.max_colwidth', None):
    display(df_infer)

Starting time = 2022-11-17 00:01:37.807...
done at 2022-11-17 00:01:44.396.


Unnamed: 0_level_0,label,score
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,does_not_need_support,0.881728
1,does_not_need_support,0.875102
2,needs_support,0.836894
3,needs_support,0.812929
4,does_not_need_support,0.815415
...,...,...
595,needs_support,0.856652
596,does_not_need_support,0.883569
597,does_not_need_support,0.883237
598,does_not_need_support,0.876942


CPU times: user 39.2 s, sys: 41.8 ms, total: 39.3 s
Wall time: 6.59 s


Inference on training data

In [22]:
%%time
print(f"Starting time = {datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}...")
df_infer_train = extract_sentiment_using_pretrained_model(pipe_fine_tuned, data_inference_train)
print(f"done at {datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}.")
with pd.option_context('display.max_colwidth', None):
    display(df_infer_train)

Starting time = 2022-11-17 00:01:48.970...
done at 2022-11-17 00:02:18.163.


Unnamed: 0_level_0,label,score
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,does_not_need_support,0.880782
1,does_not_need_support,0.874443
2,needs_support,0.859845
3,needs_support,0.865527
4,needs_support,0.861848
...,...,...
2926,needs_support,0.861677
2927,does_not_need_support,0.882740
2928,needs_support,0.851937
2929,does_not_need_support,0.852015


CPU times: user 2min 54s, sys: 251 ms, total: 2min 55s
Wall time: 29.2 s


Merge inference predictions with remaining columns in training and inference data

In [23]:
df_inference = df_infer_data.merge(
    (
        df_infer.assign(pred=df_infer["label"].map(label_mapper))
        .astype(infer_dtypes_dict)
        .add_suffix("_infer")
    ),
    left_index=True,
    right_index=True,
    how="left",
).astype(pred_feats_dtypes_dict)
df_inference_train = df_train.merge(
    (
        df_infer_train.assign(pred=df_infer_train["label"].map(label_mapper))
        .astype(infer_dtypes_dict)
        .add_suffix("_infer")
    ),
    left_index=True,
    right_index=True,
    how="left",
).astype(pred_feats_dtypes_dict)

## Model Evaluation

### (Optional) Evaluation Metrics

Model evaluation is performed on the predictions of the inference data

In [24]:
metrics_dict_inf, df_cm_inf, df_cr_inf = calculate_metrics(
    df_inference["labels"].astype("float64").to_numpy(),
    df_inference["pred_infer"].astype("float64").to_numpy(),
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics = pd.DataFrame.from_dict(metrics_dict_inf, orient="index").T
df_cr_inf = df_cr_inf.merge(
    df_inference["labels"]
    .value_counts(normalize=True)
    .rename("freq")
    .reset_index()
    .assign(index=lambda df: df["index"].map(id2label))
    .set_index("index"),
    left_index=True,
    right_index=True,
    how="left",
)
display(df_metrics)
display(df_cm_inf)
display(df_cr_inf)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.805,0.80725,0.812648,0.805,0.806673,0.80975,0.805141


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,290,74
1,needs_support,43,193


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.870871,0.796703,0.832138,364,0.606667
needs_support,0.722846,0.817797,0.767396,236,0.393333


Model evaluation is now performed on the predictions of the train split

In [25]:
metrics_dict_train_inf, df_cm_train_inf, df_cr_train_inf = calculate_metrics(
    df_inference_train["labels"].astype("float64").to_numpy(),
    df_inference_train["pred_infer"].astype("float64").to_numpy(),
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics_train = pd.DataFrame.from_dict(metrics_dict_train_inf, orient="index").T
df_cr_train_inf = df_cr_train_inf.merge(
    df_inference_train["labels"]
    .value_counts(normalize=True)
    .rename("freq")
    .reset_index()
    .assign(index=lambda df: df["index"].map(id2label))
    .set_index("index"),
    left_index=True,
    right_index=True,
    how="left",
)
display(df_metrics_train)
display(df_cm_train_inf)
display(df_cr_train_inf)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.861982,0.874404,0.877282,0.861982,0.864056,0.870689,0.861419


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,1447,303
1,needs_support,80,945


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.94761,0.826857,0.883125,1750,0.630631
needs_support,0.757212,0.921951,0.8315,1025,0.369369


Summarize the model evaluation metrics for both train and inference data

In [26]:
df_metrics_combo = pd.concat([df_metrics_train, df_metrics], ignore_index=True).assign(
    split_type=["train", "inference"]
)
df_metrics_combo

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,split_type
0,0.861982,0.874404,0.877282,0.861982,0.864056,0.870689,0.861419,train
1,0.805,0.80725,0.812648,0.805,0.806673,0.80975,0.805141,inference


Summarize the classification report for both train and inference data

In [27]:
df_cr_combo = (
    pd.concat(
        [
            df_cr_train_inf.assign(split_type="train"),
            df_cr_inf.assign(split_type="inference"),
        ]
    )
    .reset_index()
    .rename(columns={"index": "label"})
    .sort_values(by=["label"])
)
df_cr_combo

Unnamed: 0,label,precision,recall,f1-score,support,freq,split_type
0,does_not_need_support,0.94761,0.826857,0.883125,1750,0.630631,train
2,does_not_need_support,0.870871,0.796703,0.832138,364,0.606667,inference
1,needs_support,0.757212,0.921951,0.8315,1025,0.369369,train
3,needs_support,0.722846,0.817797,0.767396,236,0.393333,inference


**Observations**
1. These metrics are the same as those seen during model fine-tuning (in `7_train.ipynb`). This verifies that the data with inference predictions was correctly merged with the remaining metadata.

### Prediction Probability Statistics By Error Type, for Inference Data

In [28]:
%%time
df_tp = df_inference.query("(labels == 1) & (pred_infer == 1)")
df_tn = df_inference.query("(labels == 0) & (pred_infer == 0)")
df_fn = df_inference.query("(labels == 0) & (pred_infer == 1)")
df_fp = df_inference.query("(labels == 1) & (pred_infer == 0)")

CPU times: user 9.82 ms, sys: 3.91 ms, total: 13.7 ms
Wall time: 14.7 ms


In [29]:
%%time
df_scores_by_error_type = (
    df_tp["score_infer"].describe().rename("TP").to_frame().merge(
        df_tn["score_infer"].describe().rename("TN").to_frame(),
        left_index=True,
        right_index=True,
        how='left',
    ).merge(
        df_fn["score_infer"].describe().rename("FN").to_frame(),
        left_index=True,
        right_index=True,
        how='left',
    ).merge(
        df_fp["score_infer"].describe().rename("FP").to_frame(),
        left_index=True,
        right_index=True,
        how='left',
    ).T
    .assign(num_samples=len(df_inference)).astype({"num_samples": pd.Int32Dtype()})
    .assign(frac=lambda df: df['count']/df['num_samples'])
    .assign(coeff_of_var=lambda df: 100*(df['std']/df['mean']))
    .assign(std_error=lambda df: 100*(df['std']/(df['count']**0.5)))
    .T
)
df_scores_by_error_type.style.set_caption(
    "Prediction Probabliity Statistics by Error Type, for Inference Data"
)

CPU times: user 323 ms, sys: 51.8 ms, total: 375 ms
Wall time: 608 ms


Unnamed: 0,TP,TN,FN,FP
count,193.0,290.0,74.0,43.0
mean,0.832022,0.862697,0.795028,0.803363
std,0.063763,0.053979,0.086521,0.104785
min,0.507186,0.564547,0.505435,0.528471
25%,0.839238,0.875315,0.779029,0.771207
50%,0.855443,0.881522,0.833101,0.857252
75%,0.861003,0.883265,0.852152,0.876117
max,0.867998,0.885162,0.865188,0.883283
num_samples,600.0,600.0,600.0,600.0
frac,0.321667,0.483333,0.123333,0.071667


**Observations**
1. True Positives and True Negatives
   - account for aproximately 80% of the fine-tuned model's predictions
   - have a
     - coefficient of variation less than 10%
     - standard error less than 0.46%
2. False Positives and False Negatives
   - account for aproximately 20% of the fine-tuned model's predictions
   - have a
     - coefficient of variation between approximately 11% - 13%
     - standard error between approximately 1% - 1.6%

### Prediction Probability Statistics By Error Type and Time of Day, for Inference Data

In [30]:
df_scores_by_error_type_time_of_day = pd.concat(
    [
        df_error_type.groupby("created_at_time_of_day")["score_infer"]
        .agg(["count", "std", "min", "mean", "median", "max"])
        .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
        .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
        .assign(frac=lambda df: df["count"] / len(df_error_type))
        .T.reset_index()
        .rename(columns={"index": "statistic"})
        .assign(error_type=error_type)
        .assign(num_samples=len(df_error_type))
        for df_error_type, error_type in zip(
            [df_tp, df_tn, df_fn, df_fp], ["TP", "TN", "FN", "FP"]
        )
    ],
    ignore_index=True,
)
df_scores_by_error_type_time_of_day[
    df_scores_by_error_type_time_of_day["statistic"].isin(wanted_pred_proba_stats[:-1])
].set_index(["error_type", "statistic"]).style.set_caption(
    "Prediction Probabliity Statistics by Error Type and Time of Day, for Inference Data"
)

Unnamed: 0_level_0,created_at_time_of_day,Afternoon,Early Morning,Evening,Late Night,Morning,Night,num_samples
error_type,statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TP,count,33.0,27.0,36.0,50.0,31.0,16.0,193
TP,min,0.755577,0.709263,0.507186,0.50876,0.69393,0.509283,193
TP,mean,0.848166,0.843621,0.817706,0.830605,0.842653,0.795201,193
TP,coeff_of_var,2.513618,4.116006,10.194446,8.096391,4.698758,13.697968,193
TP,std_error,0.371127,0.668254,1.389342,0.951044,0.711133,2.723159,193
TN,count,57.0,28.0,56.0,80.0,40.0,29.0,290
TN,min,0.564547,0.803448,0.579188,0.57075,0.653722,0.592299,290
TN,mean,0.870178,0.87149,0.866144,0.858294,0.864109,0.843045,290
TN,coeff_of_var,5.439089,2.442198,6.27518,6.720901,5.352674,9.424699,290
TN,std_error,0.626898,0.402221,0.726311,0.644939,0.731323,1.475432,290


### Prediction Probability Statistics By Time of Day

Show prediction probability stats by time of day for the inference data

In [31]:
df_scores_by_time_of_day = (
    df_inference.groupby("created_at_time_of_day")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference))
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index("statistic")
    .T.reset_index()
    .assign(query="time_of_day")
    .rename(columns={"created_at_time_of_day": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="inference")
    .astype(stats_dtypes_dict)
)
df_scores_by_time_of_day[["value"] + wanted_pred_proba_stats].astype(
    {"coeff_of_var": "float64", "std_error": "float64"}
).style.set_caption(
    "Prediction Probabliity Statistics by Time of Day, for Inference Data"
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,Afternoon,109,0.528785,0.850031,7.284144,0.593062,600
1,Early Morning,65,0.584427,0.843888,7.018717,0.734659,600
2,Evening,117,0.507186,0.836289,9.783653,0.756422,600
3,Late Night,165,0.505435,0.837573,8.688473,0.566532,600
4,Morning,83,0.653722,0.845818,6.455267,0.599311,600
5,Night,61,0.509283,0.82598,10.390778,1.098886,600


Show prediction probability stats by time of day for the train split

In [32]:
df_scores_train_by_time_of_day = (
    df_inference_train.groupby("created_at_time_of_day")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference_train))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference_train))
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index("statistic")
    .T.reset_index()
    .assign(query="time_of_day")
    .rename(columns={"created_at_time_of_day": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="train")
    .astype(stats_dtypes_dict)
)
df_scores_train_by_time_of_day[["value"] + wanted_pred_proba_stats].astype(
    {"coeff_of_var": "float64", "std_error": "float64"}
).style.set_caption(
    "Prediction Probabliity Statistics by Time of Day, for Training Data"
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,Afternoon,588,0.514442,0.845633,7.506486,0.261776,2775
1,Early Morning,260,0.519999,0.843866,8.061712,0.421905,2775
2,Evening,673,0.500223,0.849518,7.289383,0.238702,2775
3,Late Night,509,0.504797,0.84468,7.806822,0.292286,2775
4,Morning,314,0.528901,0.844927,7.571234,0.361011,2775
5,Night,431,0.5051,0.83711,8.829121,0.356009,2775


Show combined prediction probability stats by time of day

In [33]:
df_scores_by_time_of_day_combo = pd.concat(
    [
        df_scores_train_by_time_of_day[["value"] + wanted_pred_proba_stats].assign(
            split_type="train"
        ),
        df_scores_by_time_of_day[["value"] + wanted_pred_proba_stats].assign(
            split_type="inference"
        ),
    ],
    ignore_index=True,
)
df_scores_by_time_of_day_combo.astype(
    {"coeff_of_var": "float64", "std_error": "float64"}
).style.set_caption("Prediction Probabliity Statistics by Time of Day")

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples,split_type
0,Afternoon,588,0.514442,0.845633,7.506486,0.261776,2775,train
1,Early Morning,260,0.519999,0.843866,8.061712,0.421905,2775,train
2,Evening,673,0.500223,0.849518,7.289383,0.238702,2775,train
3,Late Night,509,0.504797,0.84468,7.806822,0.292286,2775,train
4,Morning,314,0.528901,0.844927,7.571234,0.361011,2775,train
5,Night,431,0.5051,0.83711,8.829121,0.356009,2775,train
6,Afternoon,109,0.528785,0.850031,7.284144,0.593062,600,inference
7,Early Morning,65,0.584427,0.843888,7.018717,0.734659,600,inference
8,Evening,117,0.507186,0.836289,9.783653,0.756422,600,inference
9,Late Night,165,0.505435,0.837573,8.688473,0.566532,600,inference


**Observations**
1. For all times of the day, the coefficient of variation (COV) is less than 10.4%.
2. The range of standard error is between approximately
   - (inference data) 0.6% and 1.1%
   - (train split) 0.24% and 0.42%
2. The range of COV is between approximately
   - (inference data) 6.5% and 10.4%
   - (train split) 7.3% and 8.8%

### Prediction Probability Statistics By Twitter Client

Show prediction probability stats by Twitter client for the inference data

In [34]:
df_scores_by_client = (
    df_inference.rename(columns={"source_text": "client"})
    .groupby("client")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference))
    .sort_values(by=["count"], ascending=False)
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index(["statistic"])
    .T.reset_index(drop=False)
    .assign(query="Twitter_client")
    .rename(columns={"client": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="inference")
    .astype(stats_dtypes_dict)
)
display(
    df_scores_by_client[["value"] + wanted_pred_proba_stats]
    .astype({"coeff_of_var": "float64", "std_error": "float64"})
    .style.set_caption(
        "Prediction Probabliity Statistics by Twitter Client, for Inference Data"
    )
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,Twitter Web App,215,0.507186,0.83522,9.465076,0.539145,600
1,Twitter for iPhone,179,0.505435,0.844191,7.890692,0.497885,600
2,Twitter for Android,166,0.524087,0.840332,8.085901,0.527382,600
3,Twitter for iPad,20,0.758419,0.855259,3.432977,0.656528,600
4,TweetDeck,8,0.687165,0.832604,9.755071,2.871598,600
5,IFTTT,3,0.880182,0.881158,0.108122,0.055006,600
6,Typefully,2,0.821237,0.851092,4.960894,2.985531,600
7,Alt-brain news test,1,0.850679,0.850679,,,600
8,Hacker__News,1,0.880775,0.880775,,,600
9,Heropost,1,0.880972,0.880972,,,600


**Observations**
1. For all Twitter clients used to post more than one tweet, the
   - coefficient of variation is less than 10% and standard error is less than 3%
     - for clients used to post more than 20 tweets, the range of
       - coefficient of variation is between 3.4% and 9.5%
       - standard error is between 0.5% and 0.66%
     - for clients used to post less than 20 tweets, the range of
       - coefficient of variation is between 0% and 9.8%
       - standard error is between 0% and 3%

Show prediction probability stats by Twitter client for the train split

In [35]:
df_scores_train_by_client = (
    df_inference_train.rename(columns={"source_text": "client"})
    .groupby("client")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference_train))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference_train))
    .sort_values(by=["count"], ascending=False)
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index(["statistic"])
    .T.reset_index(drop=False)
    .assign(query="Twitter_client")
    .rename(columns={"client": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="train")
    .astype(stats_dtypes_dict)
)
display(
    df_scores_train_by_client[["value"] + wanted_pred_proba_stats]
    .astype({"coeff_of_var": "float64", "std_error": "float64"})
    .style.set_caption(
        "Prediction Probabliity Statistics by Twitter Client, for Training Data"
    )
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,Twitter Web App,994,0.500223,0.843359,8.200008,0.219348,2775
1,Twitter for Android,786,0.53209,0.846084,7.268277,0.219348,2775
2,Twitter for iPhone,750,0.5051,0.844281,7.928266,0.244419,2775
3,TweetDeck,81,0.502232,0.855227,6.697653,0.636446,2775
4,Twitter for iPad,79,0.564215,0.843394,6.871428,0.652024,2775
5,IFTTT,11,0.681094,0.841294,7.176301,1.820339,2775
6,Tweetbot for iΟS,6,0.861272,0.874194,1.126559,0.402056,2775
7,Tweetbot for Mac,6,0.85952,0.876256,1.003255,0.358894,2775
8,Microsoft Power Platform,5,0.883379,0.883824,0.057838,0.022861,2775
9,ApodApp,4,0.866924,0.876013,0.757292,0.331699,2775


**Observations**
1. For all Twitter clients used to post more than one tweet, the COV is as high as approximately 25% and the corresponding standard error is as high as approximately 11.4%. However, there are only four such clients with a COV greater than 10% and the corresponding standard error is greater than 3%, and they were used infrequently (to post two or three tweets out of nearly 2,800 tweets in the training split) - all other such clients have a COV of less than approximately 8.3% and a standard error less than approximately 1.8%
   - for clients used to post more than 10 tweets, the range of
     - COV is between 6.7% and 8.3%
     - standard error is between 0.22% and 0.65%

### Prediction Probability Statistics By Number of Words Bin

Show prediction probability stats by length of tweet (in number of words) for the inference data

In [36]:
df_scores_based_on_number_of_words = (
    df_inference.groupby("num_words")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference))
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index("statistic")
    .T.reset_index()
    .assign(query="num_words")
    .rename(columns={"num_words": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="inference")
    .astype(stats_dtypes_dict)
    .set_index("value")
    .loc[list(num_words_labels)[:-1]]
    .reset_index()
)
df_scores_based_on_number_of_words[["value"] + wanted_pred_proba_stats].astype(
    {"coeff_of_var": "float64", "std_error": "float64"}
).style.set_caption(
    "Prediction Probabliity Statistics by number of words in Tweet, for Inference Data"
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,0-5,26,0.671214,0.867007,4.887348,0.831016,600
1,6-10,75,0.542198,0.850235,7.458877,0.732288,600
2,11-15,82,0.507186,0.848979,8.348276,0.782684,600
3,16-20,84,0.528785,0.837803,8.163739,0.746262,600
4,20-25,79,0.505435,0.828855,9.599474,0.895184,600
5,26-30,64,0.509283,0.848692,6.630328,0.703389,600
6,31-35,46,0.50876,0.834084,10.943321,1.345799,600
7,36-40,43,0.597381,0.829801,8.678071,1.098153,600
8,41-45,56,0.57075,0.829834,8.838259,0.980085,600
9,46-50,37,0.564622,0.838404,8.485442,1.169572,600


**Observations**
1. For all binned tweet lengths, the relative standard error is less than 5%, which is re-assuring.
2. For tweets of
   - less than or equal to 30 words in length
     - the relative standard error is less than 1%
   - more than 30 words in length
     - the relative standard error is between 1% and 2.4%

Show prediction probability stats by length of tweet (in number of words) for the training data

In [37]:
df_scores_train_based_on_number_of_words = (
    df_inference_train.groupby("num_words")["score_infer"]
    .agg(["count", "std", "min", "mean", "median", "max"])
    .assign(num_samples=len(df_inference_train))
    .assign(coeff_of_var=lambda df: 100 * (df["std"] / df["mean"]))
    .assign(std_error=lambda df: 100 * (df["std"] / (df["count"] ** 0.5)))
    .assign(frac=lambda df: df["count"] / len(df_inference_train))
    .T.reset_index()
    .rename(columns={"index": "statistic"})
    .set_index("statistic")
    .T.reset_index()
    .assign(query="num_words")
    .rename(columns={"num_words": "value"})
    .assign(batch_num=inference_current_batch_num)
    .assign(split_type="train")
    .astype(stats_dtypes_dict)
    .set_index("value")
    .loc[list(num_words_labels)]
    .reset_index()
)
df_scores_train_based_on_number_of_words[["value"] + wanted_pred_proba_stats].astype(
    {"coeff_of_var": "float64", "std_error": "float64"}
).style.set_caption(
    "Prediction Probabliity Statistics by number of words in Tweet, for Training Data"
)

statistic,value,count,min,mean,coeff_of_var,std_error,num_samples
0,0-5,102,0.528225,0.86226,6.70899,0.57279,2775
1,6-10,400,0.50489,0.851538,7.109303,0.302692,2775
2,11-15,487,0.504797,0.852932,6.580843,0.25435,2775
3,16-20,445,0.531383,0.852304,6.423296,0.259521,2775
4,20-25,314,0.50614,0.839807,8.598775,0.407522,2775
5,26-30,261,0.502232,0.839145,8.6071,0.447068,2775
6,31-35,175,0.514442,0.840547,8.300021,0.527378,2775
7,36-40,223,0.500223,0.833895,9.085354,0.507342,2775
8,41-45,186,0.533381,0.833972,8.767193,0.536112,2775
9,46-50,144,0.55075,0.82219,9.366593,0.64176,2775


**Observations**
1. For all binned tweet lengths, the relative standard error is less than 5%. In the case of the training data, the relative standard error is less than 2%.
2. For tweets of
   - less than or equal to 55 words in length
     - the relative standard error is less than 1%
   - more than 55 words in length
     - the relative standard error is approximately 1%

## Export Combined Prediction Probability Statistics

In [38]:
df_scores_stats = pd.concat(
    [
        df_scores_train_by_time_of_day,
        df_scores_by_time_of_day,
        df_scores_train_by_client,
        df_scores_by_client,
        df_scores_based_on_number_of_words,
        df_scores_train_based_on_number_of_words,
    ],
).sort_values(
    by=["query", "value", "split_type"],
    ascending=[False, True, False],
    ignore_index=True,
)
df_scores_stats.insert(0, "query", df_scores_stats.pop("query"))
display(df_scores_stats.dtypes.rename("dtype").to_frame().T)
display(
    df_scores_stats[
        ["query", "value"] + wanted_pred_proba_stats + ["batch_num", "split_type"]
    ]
    .astype({"coeff_of_var": "float64", "std_error": "float64"})
    .style.set_caption("Prediction Probabliity Statistics")
)

statistic,query,value,count,std,min,mean,median,max,num_samples,coeff_of_var,std_error,frac,batch_num,split_type
dtype,string,string,Int32,Float32,Float32,Float32,Float32,Float32,Int32,Float32,Float32,Float32,Int32,string


statistic,query,value,count,min,mean,coeff_of_var,std_error,num_samples,batch_num,split_type
0,time_of_day,Afternoon,588,0.514442,0.845633,7.506486,0.261776,2775,1,train
1,time_of_day,Afternoon,109,0.528785,0.850031,7.284144,0.593062,600,1,inference
2,time_of_day,Early Morning,260,0.519999,0.843866,8.061712,0.421905,2775,1,train
3,time_of_day,Early Morning,65,0.584427,0.843888,7.018717,0.734659,600,1,inference
4,time_of_day,Evening,673,0.500223,0.849518,7.289383,0.238702,2775,1,train
5,time_of_day,Evening,117,0.507186,0.836289,9.783653,0.756422,600,1,inference
6,time_of_day,Late Night,509,0.504797,0.84468,7.806822,0.292286,2775,1,train
7,time_of_day,Late Night,165,0.505435,0.837573,8.688473,0.566532,600,1,inference
8,time_of_day,Morning,314,0.528901,0.844927,7.571234,0.361011,2775,1,train
9,time_of_day,Morning,83,0.653722,0.845818,6.455267,0.599311,600,1,inference


In [39]:
pred_proba_stats_path = os.path.join(
    processed_data_dir,
    os.path.splitext(os.path.basename(proc_file_inference))[0].replace(
        "test", "pred_proba_stats"
    )
    + f"__batch_{inference_current_batch_num}_inference.parquet.gzip",
)
pred_proba_fname = os.path.basename(pred_proba_stats_path)
print(pred_proba_stats_path)
print(pred_proba_fname)

../data/processed/pred_proba_stats_nlp__inference_starts_20220110_000000__batch_1_inference.parquet.gzip
pred_proba_stats_nlp__inference_starts_20220110_000000__batch_1_inference.parquet.gzip


In [40]:
%%time
if upload_to_s3:
    storage_options = {
        "key": os.getenv("AWS_ACCESS_KEY_ID"),
        "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    prefix = f"{path_to_folder[1:]}processed/nlp_splits/{pred_proba_fname}"
    pred_proba_stats_filepath = f"s3://{s3_bucket_name}/{prefix}"
else:
    storage_options = None
    pred_proba_stats_filepath = pred_proba_stats_path
save_to_parquet(df_scores_stats, pred_proba_stats_filepath, storage_options)

Saving to parquet file pred_proba_stats_nlp__inference_starts_20220110_000000__batch_1_inference.parquet.gzip...
Done.
CPU times: user 82.3 ms, sys: 27.7 ms, total: 110 ms
Wall time: 575 ms


---

<span style="float:left;">
    <a href="./6-split-data/notebooks/6_split_data.ipynb"><< 6 - Create Data Splits for Model Training and Business Metrics</a>
</span>

<span style="float:right;">
    <a href="./8-assess/notebooks/8_assess.ipynb">8 - Assess Business Metrics on Test Split >></a>
</span>