In [None]:
# %%time
# !pip3 freeze | grep -E 'boto3|s3fs|black==|jupyter-server|pandas|openpyxl|ipywidgets|IProgress|tqdm|torch|transformers'
# !conda list -n spark | grep -E 'ipykernel'

In [None]:
%load_ext lab_black

In [None]:
import os
from glob import glob
from datetime import datetime
import shutil

import boto3
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from torch import nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)

In [None]:
# Fix the random seed up front (transformers' set_seed) so that the stochastic
# steps that follow are repeatable across runs.
set_seed(42)

## User Inputs

In [None]:
# Project folder; with its leading "/" removed it is used as the S3 key prefix
path_to_folder = "/datasets/twitter/kinesis-demo/"

# Local directory that receives the processed (annotated) data splits
processed_data_dir = "data/processed/nlp_splits"

# Label name -> integer class id
label_mapper = {"negative": 0, "neutral": 1, "positive": 2}

# Identifier of the pre-trained checkpoint passed to `from_pretrained`
checkpoint_pretrained = "microsoft/MiniLM-L12-H384-uncased"

# Output directory named after the model family, e.g. "minilm-finetuned".
# The last "/"-separated component is used so that checkpoint ids without an
# organisation prefix (no "/") do not raise an IndexError.
model_output_dir = (
    f"{checkpoint_pretrained.split('/')[-1].split('-')[0].lower()}-finetuned"
)

In [None]:
# Bucket name comes from the environment; an empty string is used when unset
s3_bucket_name = os.environ.get("AWS_S3_BUCKET_NAME", "")

# boto3 session bound to the "default" profile, and an S3 client created from it
session = boto3.Session(profile_name="default")
s3_client = session.client("s3")

# pandas string dtype for the "id" and "text" columns
dtypes_dict = {column: pd.StringDtype() for column in ("id", "text")}

In [None]:
def tokenize_function(examples, mytokenizer):
    """Apply ``mytokenizer`` to the ``"text"`` entry of ``examples``.

    The tokenizer is called with ``truncation=True`` and ``max_length=512``;
    no padding option is passed here.
    """
    texts = examples["text"]
    tokenizer_options = {"truncation": True, "max_length": 512}
    return mytokenizer(texts, **tokenizer_options)


def get_metrics(
    y_true, y_pred, average="binary", zero_division="warn", use_sample_weights=False
):
    """Compute a fixed set of classification metrics via the ``skm`` functions.

    NOTE(review): ``skm`` is not imported in the visible cells. The function
    names used here match ``sklearn.metrics``; confirm that it is imported as
    ``skm`` before this function is called.

    Parameters
    ----------
    y_true : iterable
        True labels.
    y_pred : iterable
        Predicted labels.
    average : str
        Forwarded as ``average`` to precision, recall and the F-scores.
    zero_division : str or number
        Forwarded as ``zero_division`` to precision, recall and the F-scores.
    use_sample_weights : bool
        When True, each sample is weighted by the number of times its true
        label occurs in ``y_true``. NOTE(review): this gives frequent classes
        larger weights; confirm that this is intended.

    Returns
    -------
    list
        ``[metrics_dict, sample_weights]``. ``metrics_dict`` has the keys
        ``accuracy``, ``balanced_accuracy``, ``precision``, ``recall``, ``f1``,
        ``f05`` and ``f2``. ``sample_weights`` is the list of weights used, or
        ``None`` when ``use_sample_weights`` is False.
    """
    # Local import: Counter is not among the notebook's top-level imports, so
    # the weighted branch previously failed with a NameError.
    from collections import Counter

    if use_sample_weights:
        y_true_list = list(y_true)
        label_counts = Counter(y_true_list)
        sample_weights = [label_counts[label] for label in y_true_list]
    else:
        sample_weights = None

    # Options shared by every averaged metric. The two accuracy scores are
    # deliberately called without them, exactly as before.
    averaged_kwargs = dict(
        average=average,
        sample_weight=sample_weights,
        zero_division=zero_division,
    )
    metrics_dict = dict(
        accuracy=skm.accuracy_score(y_true, y_pred),
        balanced_accuracy=skm.balanced_accuracy_score(y_true, y_pred),
        precision=skm.precision_score(y_true, y_pred, **averaged_kwargs),
        recall=skm.recall_score(y_true, y_pred, **averaged_kwargs),
        f1=skm.f1_score(y_true, y_pred, **averaged_kwargs),
        f05=skm.fbeta_score(y_true, y_pred, beta=0.5, **averaged_kwargs),
        f2=skm.fbeta_score(y_true, y_pred, beta=2.0, **averaged_kwargs),
    )
    return [metrics_dict, sample_weights]


def compute_metrics(eval_pred):
    """Metric callback for ``Trainer``: score arg-max predictions against labels.

    ``eval_pred`` must expose ``label_ids`` and ``predictions``. The predicted
    class is the arg-max over the last axis of ``predictions``. The metrics come
    from ``get_metrics`` called with ``"weighted"`` averaging,
    ``zero_division=0`` and no sample weights; only the metrics dict is returned.
    """
    true_ids = eval_pred.label_ids
    predicted_ids = eval_pred.predictions.argmax(-1)
    scores, _unused_weights = get_metrics(
        true_ids, predicted_ids, "weighted", 0, False
    )
    return scores

In [None]:
def download_files_from_s3(
    s3_bucket_name: str,
    path_to_folder: str,
    data_dir: str,
    aws_region: str,
    prefix: str,
) -> None:
    """Download the objects directly under ``prefix`` in a bucket into ``data_dir``.

    The listing goes through the notebook-level ``s3_client``. Objects whose
    file name already exists in ``data_dir`` are skipped, so re-running is cheap.

    Parameters
    ----------
    s3_bucket_name : str
        Name of the bucket to list and download from.
    path_to_folder : str
        Unused; kept so that existing callers keep working.
    data_dir : str
        Existing local directory that receives the files.
    aws_region : str
        Region used for the boto3 resource that performs the downloads.
    prefix : str
        Key prefix to list. ``Delimiter="/"`` limits the listing to keys
        directly under the prefix, not nested "sub-folders".
    """
    # Page through the listing: a single list_objects_v2 response is capped, and
    # a page (or a prefix) without objects has no "Contents" entry at all.
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=s3_bucket_name, Delimiter="/", Prefix=prefix)
    s3_filepath_keys = [
        s3_object["Key"] for page in pages for s3_object in page.get("Contents", [])
    ]

    # Created on first use, and only once, so nothing is instantiated when every
    # file is already present locally.
    s3 = None
    for s3_filepath_key in s3_filepath_keys:
        filename = os.path.basename(s3_filepath_key)
        if not filename:
            # Key ends with "/" (a folder placeholder object): nothing to fetch.
            continue
        # Honour the `data_dir` argument instead of a notebook-level global.
        dest_filepath = os.path.join(data_dir, filename)
        if os.path.exists(dest_filepath):
            print(f"File found at {dest_filepath}. Did nothing.")
            continue

        start = datetime.now()
        print(
            f"Started downloading processed data zip file from {s3_filepath_key} to "
            f"{dest_filepath} at {start.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]}..."
        )
        if s3 is None:
            s3 = boto3.resource("s3", region_name=aws_region)
        s3.meta.client.download_file(
            s3_bucket_name,
            s3_filepath_key,
            dest_filepath,
        )
        duration = (datetime.now() - start).total_seconds()
        print(f"Done downloading in {duration:.3f} seconds.")

## Get Annotated Data Splits

In [None]:
%%time
# Make sure the local destination exists, then fetch the split files from S3
os.makedirs(processed_data_dir, exist_ok=True)
download_files_from_s3(
    s3_bucket_name,
    path_to_folder,
    processed_data_dir,
    session.region_name,
    # [1:] drops the leading "/" of path_to_folder to form the S3 key prefix
    f"{path_to_folder[1:]}processed/nlp_splits/",
)
# glob() returns paths in arbitrary order; sort them so that code which relies
# on the position of each split file behaves the same on every run/machine.
proc_files = sorted(glob(f"{processed_data_dir}/*_annotated.xlsx"))

In [None]:
%%time
# Read one DataFrame per annotated split file. Iterate over the paths themselves
# (not over a list wrapping the list), in sorted order, so that the positional
# unpacking below is deterministic.
# NOTE(review): assumes the three file names sort as test < train < val
# (e.g. test_annotated.xlsx, train_annotated.xlsx, val_annotated.xlsx) - confirm.
split_files = sorted(proc_files)
assert (
    len(split_files) == 3
), f"Expected 3 annotated split files, found {len(split_files)}"
df_test, df_train, df_val = [pd.read_excel(f) for f in split_files]

## Get Features and Labels

In [None]:
%%time
# Features: the "text" column of each split
X_train, X_val, X_test = (
    split_df["text"] for split_df in (df_train, df_val, df_test)
)
# Targets: the "labels" column of each split
y_train, y_val, y_test = (
    split_df["labels"] for split_df in (df_train, df_val, df_test)
)

## Create `huggingface` dataset

In [None]:
# Column data ("label" and "text" lists) for each split, keyed by split name
mydict = {
    split_name: {"label": labels.tolist(), "text": texts.tolist()}
    for split_name, labels, texts in [
        ("train", y_train, X_train),
        ("val", y_val, X_val),
        ("test", y_test, X_test),
    ]
}
# Wrap every split in a Dataset and collect them in a single DatasetDict
dataset = DatasetDict(
    {
        split_name: Dataset.from_dict(columns)
        for split_name, columns in mydict.items()
    }
)

In [None]:
# Show the first training example as a quick check of the dataset's structure
dataset["train"][0]

## Exploratory Data Analysis

In [None]:
# Distinct label values present in the training split
print(np.unique(y_train))
# Relative frequency of each label, ordered by label value.
# rename_axis() names the index before reset_index(), so the resulting column is
# called "label" on every pandas version. With pandas >= 2.0 the value_counts()
# index is named after the Series, so the former rename(columns={"index": ...})
# silently did nothing.
display(
    y_train.value_counts(normalize=True)
    .rename("freq")
    .sort_index()
    .rename_axis("label")
    .reset_index()
)

## Instantiate Pre-Trained Model and Tokenizer

In [None]:
# Reverse lookup of label_mapper: integer class id -> label name
id2label = dict(zip(label_mapper.values(), label_mapper.keys()))
id2label

In [None]:
%%time
# Tokenizer and sequence-classification model loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_pretrained)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_pretrained,
    # Derive the number of labels from the label mapping rather than from the
    # training data, so it always agrees with id2label/label2id even if a class
    # happens to be absent from the training split.
    num_labels=len(label_mapper),
    id2label=id2label,
    label2id=label_mapper,
)

## Tokenize Text and Set Up Dynamic Padding

Tokenize all the data splits

In [None]:
%%time
# Tokenize every split in batches; the tokenizer reaches tokenize_function as
# its `mytokenizer` keyword argument.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    fn_kwargs={"mytokenizer": tokenizer},
)
# Padding collator built from the same tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Dealing With Class Imbalance

Create class weights

In [None]:
# Weight of a class = 1 - (its count / number of training labels), ordered by
# label value, so less frequent classes get larger weights.
label_counts = y_train.value_counts().sort_index()
label_shares = label_counts / len(y_train)
class_weights = (1 - label_shares).values
class_weights

Convert class weights to `pytorch` tensor

In [None]:
# torch.as_tensor accepts both a NumPy array and a Tensor, so this cell can be
# re-run safely after `class_weights` has already been converted
# (torch.from_numpy raises a TypeError when handed a Tensor).
class_weights = torch.as_tensor(class_weights, dtype=torch.float)
class_weights

Define a subclass of the `Trainer` class that overrides `compute_loss()` to use `nn.CrossEntropyLoss()` with the above class weights, which are computed from the training data

In [None]:
class CustomTrainer(Trainer):
    """``Trainer`` whose training loss is a class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    tensor.
    """

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """Return the weighted cross-entropy loss for one batch.

        Parameters
        ----------
        model
            Model to run the forward pass with.
        inputs : dict
            Batch passed to the model as keyword arguments; must contain
            ``"labels"``.
        return_outputs : bool
            When True, return ``(loss, outputs)`` instead of the loss alone.
        num_items_in_batch : optional
            Unused. Accepted because newer ``transformers`` releases pass this
            argument to ``compute_loss``; without it they raise a TypeError.
        """
        # Extract true labels
        labels = inputs.get("labels")
        # Forward pass - feed inputs to model and extract logits
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights must live on the same device as the logits, otherwise the
        # loss raises a device-mismatch error when training on GPU.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Flatten to (n_samples, n_labels) vs (n_samples,) and compute the loss
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss