# Fine-tuning the BERT model for Tip Mining
Combination of Huggingface library and TensorFlow 
 --------------------------------------
@author: Erik van der Heide, EUR 2021-2022

In [None]:
# Show the GPU attached to this runtime and its current memory usage.
!nvidia-smi

Wed Jun 23 08:59:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    57W / 149W |   4265MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Hyperparameters

In [None]:
# Hyperparameters: the values that the rest of the notebook reads as configuration.
data_type = 'subset'       # dataset variant checked by the later cells: 'full', 'subset' or 'category'
model_type = 'roberta'     # model family: 'bert', 'distilbert' or 'roberta' (compared case-insensitively)
optim_type = 'adam'        # optimizer: 'adam' or 'adamw' (compared case-insensitively)
max_seq_length = 64        # maximum number of tokens per sentence passed to the tokenizer (e.g. 32, 64)
train_batch_size = 16      # batch size used for training and for prediction (e.g. 16, 32)
learning_rate = 2e-5       # learning rate handed to the optimizer (e.g. 2e-5, 3e-5, 5e-5)
num_epochs = 2             # number of fine-tuning epochs (e.g. 1, 2, 3, 4)

### Installations & packages

In [None]:
# Install the Hugging Face transformers package into the current runtime.
# NOTE(review): the version is not pinned, so a fresh runtime may install a newer
# release than the one this notebook was written against — consider pinning it.
!pip install transformers



In [None]:
# Install tensorflow model environment
#!pip install -q tf-models-official

In [None]:
# Packages
import pandas as pd
import time
import random

import torch
import torch.nn as nn
import tensorflow as tf
#from official.nlp import optimization

from transformers import BertTokenizerFast, DistilBertTokenizerFast, RobertaTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import TFBertForSequenceClassification
from transformers import TFRobertaForSequenceClassification

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix

### Loading the data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data cells below read
# their CSV files from paths inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the tab-separated train/test files that belong to the selected `data_type`.
# The directory is relative to the working directory of the runtime.
DATA_DIR = "drive/MyDrive/Thesis BA&QM 2021/Data/"


def _read_tsv(file_name):
  """Read one tab-separated file from DATA_DIR; its first row is the header."""
  return pd.read_csv(DATA_DIR + file_name, sep="\t", header=0)


if data_type in ('full', 'subset'):
  # 'subset' trains on the "5050" training file, 'full' on the complete one;
  # both variants are evaluated on the same test file.
  path_train = DATA_DIR + ("td_train.csv" if data_type == 'full' else "td_train5050.csv")
  path_test  = DATA_DIR + "td_test.csv"
  df_train = pd.read_csv(path_train, sep="\t", header=0)
  df_test  = pd.read_csv(path_test, sep="\t", header=0)

elif data_type == 'category':
  # One train/test pair per product category, reduced to the text and label columns.
  text_and_label = ['sentence', 'tip']

  df_baby_train = _read_tsv("td_baby_train5050.csv")[text_and_label]
  df_baby_test = _read_tsv("td_baby_test.csv")[text_and_label]

  df_music_train = _read_tsv("td_music_train5050.csv")[text_and_label]
  df_music_test = _read_tsv("td_music_test.csv")[text_and_label]

  df_sports_train = _read_tsv("td_sports_train5050.csv")[text_and_label]
  df_sports_test = _read_tsv("td_sports_test.csv")[text_and_label]

  df_tools_train = _read_tsv("td_tools_train5050.csv")[text_and_label]
  df_tools_test = _read_tsv("td_tools_test.csv")[text_and_label]

  df_toys_train = _read_tsv("td_toys_train5050.csv")[text_and_label]
  df_toys_test = _read_tsv("td_toys_test.csv")[text_and_label]

else:
  # Fail here with a clear message instead of a NameError several cells later.
  raise ValueError(f"Unknown data_type {data_type!r}; expected 'full', 'subset' or 'category'.")

In [None]:
# Keep only the text and label columns and store the label as an integer.
# `.astype({'tip': int})` returns a new frame, so nothing is assigned into a
# column subset of another frame (the pattern behind SettingWithCopyWarning).
if data_type=='full' or data_type=='subset': 
  df_train = df_train[['sentence', 'tip']].astype({'tip': int})
  df_test  = df_test[['sentence', 'tip']].astype({'tip': int})
  print(f"Tip distribution train:\n{df_train['tip'].value_counts(normalize=True)} \n")
  print(f"Tip distribution test:\n{df_test['tip'].value_counts(normalize=True)}")
elif data_type=='category':
  df_baby_train = df_baby_train.astype({'tip': int})
  df_baby_test = df_baby_test.astype({'tip': int})
  df_music_train = df_music_train.astype({'tip': int})
  df_music_test = df_music_test.astype({'tip': int})
  df_sports_train = df_sports_train.astype({'tip': int})
  df_sports_test = df_sports_test.astype({'tip': int})
  df_tools_train = df_tools_train.astype({'tip': int})
  df_tools_test = df_tools_test.astype({'tip': int})
  df_toys_train = df_toys_train.astype({'tip': int})
  df_toys_test = df_toys_test.astype({'tip': int})

Tip distribution train:
1    0.5
0    0.5
Name: tip, dtype: float64 

Tip distribution test:
0    0.881843
1    0.118157
Name: tip, dtype: float64


Shuffle the data and split it into features (`X`, the sentences) and labels (`y`, the tip indicator).

In [None]:
from sklearn.utils import shuffle

if data_type in ('full', 'subset'):
  # Reorder the rows reproducibly (fixed seed) before separating text from labels.
  df_train, df_test = (shuffle(frame, random_state=0) for frame in (df_train, df_test))

  # Sentences are the model input, the integer tip flag is the target.
  X_train, y_train = df_train['sentence'], df_train['tip']
  X_test, y_test = df_test['sentence'], df_test['tip']

  print("Training size: ", X_train.shape[0])
  print("Test size: ", X_test.shape[0])

elif data_type == 'category':
  # Same treatment for every product category: seeded shuffle, then X/y split.
  df_baby_train, df_baby_test = (shuffle(frame, random_state=0) for frame in (df_baby_train, df_baby_test))
  df_music_train, df_music_test = (shuffle(frame, random_state=0) for frame in (df_music_train, df_music_test))
  df_sports_train, df_sports_test = (shuffle(frame, random_state=0) for frame in (df_sports_train, df_sports_test))
  df_tools_train, df_tools_test = (shuffle(frame, random_state=0) for frame in (df_tools_train, df_tools_test))
  df_toys_train, df_toys_test = (shuffle(frame, random_state=0) for frame in (df_toys_train, df_toys_test))

  X_baby_train, y_baby_train = df_baby_train['sentence'], df_baby_train['tip']
  X_baby_test, y_baby_test = df_baby_test['sentence'], df_baby_test['tip']

  X_music_train, y_music_train = df_music_train['sentence'], df_music_train['tip']
  X_music_test, y_music_test = df_music_test['sentence'], df_music_test['tip']

  X_sports_train, y_sports_train = df_sports_train['sentence'], df_sports_train['tip']
  X_sports_test, y_sports_test = df_sports_test['sentence'], df_sports_test['tip']

  X_tools_train, y_tools_train = df_tools_train['sentence'], df_tools_train['tip']
  X_tools_test, y_tools_test = df_tools_test['sentence'], df_tools_test['tip']

  X_toys_train, y_toys_train = df_toys_train['sentence'], df_toys_train['tip']
  X_toys_test, y_toys_test = df_toys_test['sentence'], df_toys_test['tip']

Training size:  6142
Test size:  6576


### Tokenization

Choose a tokenizer corresponding to your model of choice.

In [None]:
# Fast tokenizer class and pretrained checkpoint for every supported `model_type`.
_TOKENIZER_CHOICES = {
  'distilbert': (DistilBertTokenizerFast, 'distilbert-base-uncased'),
  'bert': (BertTokenizerFast, 'bert-base-uncased'),
  'roberta': (RobertaTokenizerFast, 'roberta-base'),
}

_tokenizer_key = model_type.lower()
if _tokenizer_key not in _TOKENIZER_CHOICES:
  # Without this check an unsupported value would leave `tokenizer` undefined.
  raise ValueError(f"Unknown model_type {model_type!r}; expected one of {sorted(_TOKENIZER_CHOICES)}.")

_tokenizer_class, _tokenizer_checkpoint = _TOKENIZER_CHOICES[_tokenizer_key]
tokenizer = _tokenizer_class.from_pretrained(_tokenizer_checkpoint)
tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

Tokenize the training and test sets (this notebook does not use a separate validation set).

In [None]:
def _encode(sentences):
  """Tokenize a Series of sentences: truncate to max_seq_length and pad within the call."""
  return tokenizer(sentences.tolist(), truncation=True, padding=True, max_length=max_seq_length)


if data_type in ('full', 'subset'):
  train_encodings = _encode(X_train)
  test_encodings = _encode(X_test)
elif data_type == 'category':
  # Each category is encoded separately, train split first and test split second.
  train_encodings_baby = _encode(X_baby_train)
  test_encodings_baby = _encode(X_baby_test)

  train_encodings_music = _encode(X_music_train)
  test_encodings_music = _encode(X_music_test)

  train_encodings_sports = _encode(X_sports_train)
  test_encodings_sports = _encode(X_sports_test)

  train_encodings_tools = _encode(X_tools_train)
  test_encodings_tools = _encode(X_tools_test)

  train_encodings_toys = _encode(X_toys_train)
  test_encodings_toys = _encode(X_toys_test)

# Example
#print(f"Example        : {X_train[0]}")
#toks = ['CLS'] + tokenizer.tokenize(X_train[0]) + ['SEP']
#print(f"Tokens         : {toks}  ({len(toks)} tokens)")
#print(f"Token id's     : {train_encodings['input_ids'][0]}")
#print(f"Attention mask : {train_encodings['attention_mask'][0]}")

In [None]:
#with tf.device('/cpu:0'): # run on cpu
# Write datasets to tensor slices

def _as_dataset(encodings, labels):
  """Pair the tokenizer output (as a plain dict) with its labels, one element per sentence."""
  return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


if data_type in ('full', 'subset'):
  train_dataset = _as_dataset(train_encodings, y_train)
  test_dataset = _as_dataset(test_encodings, y_test)
elif data_type == 'category':
  train_dataset_baby = _as_dataset(train_encodings_baby, y_baby_train)
  test_dataset_baby = _as_dataset(test_encodings_baby, y_baby_test)

  train_dataset_music = _as_dataset(train_encodings_music, y_music_train)
  test_dataset_music = _as_dataset(test_encodings_music, y_music_test)

  train_dataset_sports = _as_dataset(train_encodings_sports, y_sports_train)
  test_dataset_sports = _as_dataset(test_encodings_sports, y_sports_test)

  train_dataset_tools = _as_dataset(train_encodings_tools, y_tools_train)
  test_dataset_tools = _as_dataset(test_encodings_tools, y_tools_test)

  train_dataset_toys = _as_dataset(train_encodings_toys, y_toys_train)
  test_dataset_toys = _as_dataset(test_encodings_toys, y_toys_test)

### Model definition

Set the model to your preferred model architecture.

In [None]:
# Sequence-classification class and pretrained checkpoint for every supported `model_type`.
_MODEL_CHOICES = {
  'distilbert': (TFDistilBertForSequenceClassification, 'distilbert-base-uncased'),
  'bert': (TFBertForSequenceClassification, 'bert-base-uncased'),
  'roberta': (TFRobertaForSequenceClassification, 'roberta-base'),
}

_model_key = model_type.lower()
if _model_key not in _MODEL_CHOICES:
  # Without this check an unsupported value would leave the model undefined.
  raise ValueError(f"Unknown model_type {model_type!r}; expected one of {sorted(_MODEL_CHOICES)}.")
_model_class, _model_checkpoint = _MODEL_CHOICES[_model_key]

if data_type=='full' or data_type=='subset': 
  model = _model_class.from_pretrained(_model_checkpoint)
elif data_type=='category':
  # One independent model per product category. All of them follow `model_type`,
  # the same setting that selects the tokenizer, so the token ids produced by the
  # tokenizer always belong to the vocabulary of the model that consumes them
  # (previously these were always BERT, whatever tokenizer was selected).
  model_baby = _model_class.from_pretrained(_model_checkpoint)
  model_music = _model_class.from_pretrained(_model_checkpoint)
  model_sports = _model_class.from_pretrained(_model_checkpoint)
  model_tools = _model_class.from_pretrained(_model_checkpoint)
  model_toys = _model_class.from_pretrained(_model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=657434796.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define your optimizer, loss function, and evaluation metrics, then compile the model.

In [None]:
def build_optimizer(num_train_examples):
  """Create a fresh optimizer for one model.

  Args:
    num_train_examples: number of training sentences the model will be fitted on;
      only used by 'adamw' to size the warm-up / decay schedule.

  Returns:
    A new Keras optimizer instance (never shared between models).

  Raises:
    ValueError: if `optim_type` is neither 'adam' nor 'adamw'.
  """
  optim_key = optim_type.lower()
  if optim_key == 'adam':
    return tf.keras.optimizers.Adam(learning_rate=learning_rate)
  if optim_key == 'adamw':
    # The tf-models-official import is commented out in this notebook, so use the
    # TensorFlow AdamW factory from transformers, which is already a dependency.
    from transformers import create_optimizer
    # Training batches drop the last partial batch, hence the floor division.
    steps_per_epoch = max(1, num_train_examples // train_batch_size)
    num_train_steps = steps_per_epoch * num_epochs
    num_warmup_steps = int(0.1*num_train_steps)
    # A non-zero weight decay is what makes this AdamW rather than plain Adam.
    adamw_optimizer, _lr_schedule = create_optimizer(init_lr=learning_rate,
                                                     num_train_steps=num_train_steps,
                                                     num_warmup_steps=num_warmup_steps,
                                                     weight_decay_rate=0.01)
    return adamw_optimizer
  raise ValueError(f"Unknown optim_type {optim_type!r}; expected 'adam' or 'adamw'.")


def build_metrics():
  """Return a fresh list of evaluation metrics (metric objects hold state, so one list per model)."""
  return [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]


# Loss function
# Sparse, as true labels are [0, 1, ..] while the model outputs one logit per class.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Evaluation metrics
ev_metrics = build_metrics()

# Compile model
if data_type=='full' or data_type=='subset': 
  optimizer = build_optimizer(len(y_train))
  model.compile(optimizer=optimizer, loss=loss_function, metrics=ev_metrics)
elif data_type=='category':
  # Every category model gets its own optimizer and metrics: a shared optimizer
  # would carry its step counter and slot state from one model into the next.
  for category_model, category_labels in ((model_baby, y_baby_train),
                                          (model_music, y_music_train),
                                          (model_sports, y_sports_train),
                                          (model_tools, y_tools_train),
                                          (model_toys, y_toys_train)):
    category_model.compile(optimizer=build_optimizer(len(category_labels)),
                           loss=loss_function,
                           metrics=build_metrics())

# Save weights of model with minimum validation loss
#checkpoint_filepath = '/content/drive/MyDrive/Thesis BA&QM 2021'
#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#    filepath=checkpoint_filepath,
#    save_weights_only=True,
#    monitor='val_loss',
#    mode='min',
#    save_best_only=True)

In [None]:
#model.summary()

Run the model.

In [None]:
from sklearn.utils import class_weight 
import numpy as np


def balanced_class_weights(labels):
  """Return {class label: weight} using scikit-learn's 'balanced' heuristic.

  The arguments are passed by keyword because recent scikit-learn releases no
  longer accept them positionally. Weights are keyed by the actual class label
  rather than by position, so the mapping stays correct for any label set.
  """
  classes = np.unique(labels)
  weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=labels)
  return {int(label): float(weight) for label, weight in zip(classes, weights)}


if data_type=='full' or data_type=='subset': 
  class_weights = balanced_class_weights(y_train)
  print("Class weights: ", class_weights)
elif data_type=='category':
  class_weights_baby = balanced_class_weights(y_baby_train)
  print("Class weights: ", class_weights_baby)

  class_weights_music = balanced_class_weights(y_music_train)
  print("Class weights: ", class_weights_music)

  class_weights_sports = balanced_class_weights(y_sports_train)
  print("Class weights: ", class_weights_sports)

  class_weights_tools = balanced_class_weights(y_tools_train)
  print("Class weights: ", class_weights_tools)

  class_weights_toys = balanced_class_weights(y_toys_train)
  print("Class weights: ", class_weights_toys)

Class weights:  {0: 1.0, 1: 1.0}


In [None]:
def _training_batches(dataset):
  """Shuffle with a 1000-element buffer, batch (dropping the last partial batch) and prefetch."""
  return dataset.shuffle(1000).batch(train_batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)


def _evaluation_batches(dataset):
  """Batch in the stored order, keeping the last partial batch, and prefetch."""
  return dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)


if data_type in ('full', 'subset'):
  train_data = _training_batches(train_dataset)
  test_data = _evaluation_batches(test_dataset)

  start = time.time()

  # Fine-tune the single model on the whole training set.
  history = model.fit(
      train_data,
      epochs=num_epochs,
      verbose=1,
      class_weight=class_weights,
  )

  end = time.time()
  print("Training completed")
  print("Elapsed time fine-tuning the model: ", (end-start)/60, "min\n")

elif data_type == 'category':
  train_data_baby = _training_batches(train_dataset_baby)
  test_data_baby = _evaluation_batches(test_dataset_baby)

  train_data_music = _training_batches(train_dataset_music)
  test_data_music = _evaluation_batches(test_dataset_music)

  train_data_sports = _training_batches(train_dataset_sports)
  test_data_sports = _evaluation_batches(test_dataset_sports)

  train_data_tools = _training_batches(train_dataset_tools)
  test_data_tools = _evaluation_batches(test_dataset_tools)

  train_data_toys = _training_batches(train_dataset_toys)
  test_data_toys = _evaluation_batches(test_dataset_toys)

  # The timer spans the fine-tuning of all five category models together.
  start = time.time()

  history_baby = model_baby.fit(
      train_data_baby,
      epochs=num_epochs,
      verbose=1,
      class_weight=class_weights_baby,
  )

  history_music = model_music.fit(
      train_data_music,
      epochs=num_epochs,
      verbose=1,
      class_weight=class_weights_music,
  )

  history_sports = model_sports.fit(
      train_data_sports,
      epochs=num_epochs,
      verbose=1,
      class_weight=class_weights_sports,
  )

  # NOTE(review): unlike the other four categories, the tools model is fitted
  # without class weights — confirm whether that is intentional.
  history_tools = model_tools.fit(
      train_data_tools,
      epochs=num_epochs,
      verbose=1,
  )

  history_toys = model_toys.fit(
      train_data_toys,
      epochs=num_epochs,
      verbose=1,
      class_weight=class_weights_toys,
  )

  end = time.time()
  print("Training completed")
  print("Elapsed time fine-tuning the model: ", (end-start)/60, "min\n")

Epoch 1/2
Epoch 2/2
Training completed
Elapsed time fine-tuning the model:  5.575246326128641 min



### Evaluate epochs

Plot the training loss and training accuracy per epoch (no validation data is passed to `fit`, so the validation curves are commented out).

In [None]:
if data_type in ('full', 'subset'):
  if num_epochs <= 1:
    # A single epoch gives one point per curve, which is not worth plotting.
    print("Not enough epochs to plot")
  else:
    # Per-epoch values recorded by Keras during `fit`.
    history_dict = history.history
    acc = history_dict['accuracy']
    loss = history_dict['loss']
    epochs = range(1, len(acc) + 1)

    # Two stacked panels: loss on top, accuracy below. Only training curves are
    # drawn because `fit` was called without validation data.
    fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 5))
    fig.subplots_adjust(hspace=0.4)

    loss_ax.plot(epochs, loss, 'r', label='training loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.locator_params(axis="x", nbins=len(acc))  # one x tick per epoch
    loss_ax.legend()

    acc_ax.plot(epochs, acc, 'r', label='training accuracy')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.locator_params(axis="x", nbins=len(acc))
    acc_ax.legend(loc='upper left')

### Model evaluation

Load best model weights with lowest validation loss into the model.

In [None]:
#model.load_weights(checkpoint_filepath)

Evaluate the model on 4 artificial sentences: 2 tips and 2 non-tips.

In [None]:
if data_type in ('full', 'subset'):
  # Hand-written sentences paired with their expected label (1 = tip, 0 = non-tip).
  labelled_examples = [
    ("Just make sure to check if you have enough batteries at home.", 1),
    ("I would argue that a child of two is still too young for this toy.", 1),
    ("It was way too expensive and the shipping was late.", 0),
    ("Loved it!", 0),
  ]
  sample_examples = [sentence for sentence, _ in labelled_examples]
  sample_labels = pd.Series([label for _, label in labelled_examples])

  # Tokenize the sentences and wrap them, with their labels, in a dataset.
  sample_encodings = tokenizer(sample_examples, truncation=True, padding=True, max_length=max_seq_length)
  sample_encodings = tf.data.Dataset.from_tensor_slices((dict(sample_encodings), sample_labels))
  one_by_one = sample_encodings.batch(1)

  # Loss and accuracy on the four sentences.
  sample_loss, sample_accuracy = model.evaluate(one_by_one)

  # Class probabilities: softmax over the logits returned by the model.
  sample_logits = model.predict(one_by_one).logits
  preds_percentage = tf.nn.softmax(sample_logits, axis=1).numpy()
  print(preds_percentage)
  preds_prob = preds_percentage[:,1]

  # A sentence counts as a tip when its tip probability reaches the threshold.
  threshold = 0.5
  preds = [int(tip_prob >= threshold) for tip_prob in preds_prob]

  # Report every sentence, then the aggregate scores.
  print()
  for sentence, tip_prob, predicted, actual in zip(sample_examples, preds_prob, preds, sample_labels):
    print(f"Sentence   : {sentence}")
    print(f"Prob. tip  : {tip_prob}")
    print(f"Prediction : Pred = {predicted} True = {actual}")
    print()
  print("Final  accuracy : ", sample_accuracy)
  print("Final  loss     : ", sample_loss)

[[0.06000345 0.93999654]
 [0.07078373 0.9292163 ]
 [0.22405846 0.77594155]
 [0.9964232  0.00357683]]

Sentence   : Just make sure to check if you have enough batteries at home.
Prob. tip  : 0.9399965405464172
Prediction : Pred = 1 True = 1

Sentence   : I would argue that a child of two is still too young for this toy.
Prob. tip  : 0.9292163252830505
Prediction : Pred = 1 True = 1

Sentence   : It was way too expensive and the shipping was late.
Prob. tip  : 0.7759415507316589
Prediction : Pred = 1 True = 0

Sentence   : Loved it!
Prob. tip  : 0.003576833987608552
Prediction : Pred = 0 True = 0

Final  accuracy :  0.75
Final  loss     :  0.40868109464645386


Evaluate on the test set.

In [None]:
def _class_probabilities(classifier, dataset):
  """Predict in batches and convert the returned logits into per-class probabilities."""
  logits = classifier.predict(dataset.batch(train_batch_size)).logits
  return tf.nn.softmax(logits, axis=1).numpy()


if data_type in ('full', 'subset'):
  preds_percentage = _class_probabilities(model, test_dataset)
  preds_prob = preds_percentage[:,1]
elif data_type == 'category':
  # Score every category with its own model ...
  preds_percentage_baby = _class_probabilities(model_baby, test_dataset_baby)
  preds_prob_baby = preds_percentage_baby[:,1]

  preds_percentage_music = _class_probabilities(model_music, test_dataset_music)
  preds_prob_music = preds_percentage_music[:,1]

  preds_percentage_sports = _class_probabilities(model_sports, test_dataset_sports)
  preds_prob_sports = preds_percentage_sports[:,1]

  preds_percentage_tools = _class_probabilities(model_tools, test_dataset_tools)
  preds_prob_tools = preds_percentage_tools[:,1]

  preds_percentage_toys = _class_probabilities(model_toys, test_dataset_toys)
  preds_prob_toys = preds_percentage_toys[:,1]

  # ... then pool predictions and labels, in the same category order, for one overall score.
  preds_percentage = np.concatenate([preds_percentage_baby, preds_percentage_music, preds_percentage_sports, preds_percentage_tools, preds_percentage_toys])
  preds_prob = np.concatenate([preds_prob_baby, preds_prob_music, preds_prob_sports, preds_prob_tools, preds_prob_toys])
  y_test = np.concatenate([y_baby_test, y_music_test, y_sports_test, y_tools_test, y_toys_test])

# Binary decision: tip when the probability of class 1 reaches the threshold.
threshold = 0.5
preds = [int(class_probs[1] >= threshold) for class_probs in preds_percentage]

# Thresholded metrics use the binary predictions; AUC uses the raw tip probability.
PR, RC, F1, ACC = (score(y_test, preds) for score in (precision_score, recall_score, f1_score, accuracy_score))
AUC  = roc_auc_score(y_test, preds_prob)

print(f"Precision : {PR}")
print(f"Recall    : {RC}")
print(f"F1-score  : {F1}")
print(f"Accuracy  : {ACC}")
print(f"AUC       : {AUC}")
print()

# One comma-separated line, rounded to four decimals, for pasting into a results table.
print(f"COPY:  {round(PR,4)}, {round(RC,4)}, {round(F1,4)}, {round(ACC,4)}, {round(AUC,4)}")

Precision : 0.2202808112324493
Recall    : 0.9086229086229086
F1-score  : 0.3545956805625314
Accuracy  : 0.6091849148418491
AUC       : 0.8359287526385303

COPY:  0.2203, 0.9086, 0.3546, 0.6092, 0.8359


Print the confusion matrix and the number of tips obtained.

In [None]:
# Binary predictions at a threshold that can be tuned independently of the cell above.
threshold2 = 0.5
preds2 = [int(class_probs[1] >= threshold2) for class_probs in preds_percentage]

# Rows of the matrix are the true classes, columns the predicted classes.
tip_confusion = confusion_matrix(y_test, preds2)
print("Confusion matrix:")
print(tip_confusion)
print()

# Compare how many sentences were flagged as tips with how many really are tips.
n_flagged = preds2.count(1)
n_unflagged = preds2.count(0)
print("Predicted as tip: ", n_flagged)
print("Predicted as non-tip: ", n_unflagged)
print("Real number of tips:", y_test.sum())

Confusion matrix:
[[3300 2499]
 [  71  706]]

Predicted as tip:  3205
Predicted as non-tip:  3371
Real number of tips: 777


Print the precision and recall as a function of the threshold.

In [None]:
# Sweep the decision threshold and record the metrics obtained at every step.
precisions, recalls, f1s = [], [], []
tips_pred, nontips_pred = [], []
false_pos, false_neg, true_pos = [], [], []

# 60 evenly spaced thresholds from 0.40 to 0.99. linspace fixes the number of
# points, whereas arange with a float step can come out one element longer or shorter.
thresholds = np.linspace(0.4, 0.99, 60)

# Best-so-far trackers. The max_f1s_* values are initialised too, so the summary
# below still prints when no threshold ever reaches an F1-score above zero.
max_f1s = 0
max_thresh = 0
max_f1s_prec = 0
max_f1s_rec = 0
max_f1s_tp = 0
max_f1s_fp = 0
max_prec = 0
max_rec = 0
max_tp = 0
max_fp = 0

# Probability of the tip class (column 1) for every test sentence.
tip_probs = np.asarray(preds_percentage)[:, 1]

for threshold3 in thresholds:
  preds3 = (tip_probs >= threshold3).astype(int).tolist()

  # Compute every metric once per threshold. zero_division=0 keeps the score at 0
  # without a warning when no sentence is predicted as a tip.
  precision3 = precision_score(y_test, preds3, zero_division=0)
  recall3 = recall_score(y_test, preds3, zero_division=0)
  f1_3 = f1_score(y_test, preds3, zero_division=0)
  # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs.
  matrix3 = confusion_matrix(y_test, preds3, labels=[0, 1])
  fp3, fn3, tp3 = matrix3[0, 1], matrix3[1, 0], matrix3[1, 1]

  precisions.append(precision3)
  recalls.append(recall3)
  f1s.append(f1_3)
  tips_pred.append(preds3.count(1))
  nontips_pred.append(preds3.count(0))
  false_pos.append(fp3)
  false_neg.append(fn3)
  true_pos.append(tp3)

  # Remember the operating point with the highest F1-score ...
  if f1_3 > max_f1s:
    max_f1s = f1_3
    max_thresh = threshold3
    max_f1s_prec = precision3
    max_f1s_rec = recall3
    max_f1s_tp = tp3
    max_f1s_fp = fp3
  # ... and, separately, the one with the highest precision.
  if precision3 > max_prec:
    max_prec = precision3
    max_rec = recall3
    max_tp = tp3
    max_fp = fp3

print(f"Max F1-score: {max_f1s} at threshold {max_thresh} with precision {max_f1s_prec} and recall {max_f1s_rec} and TP {max_f1s_tp} and FP {max_f1s_fp}")
print(f"Max precision: {max_prec} with recall {max_rec} and TP {max_tp} and FP {max_fp}\n")

  _warn_prf(average, modifier, msg_start, len(result))


Max F1-score: 0.4386907669760626 at threshold 0.8700000000000004 with precision 0.3535433070866142 and recall 0.5778635778635779 and TP 449 and FP 821
Max precision: 1.0 with recall 0.002574002574002574 and TP 2 and FP 0



In [None]:
# Reference values derived from the data instead of being typed in by hand.
n_real_tips = int(np.sum(y_test))                   # number of true tips in the test labels
sweep_start, sweep_end = thresholds[0], thresholds[-1]

# Precision, recall and F1 as a function of the threshold
plt.plot(thresholds, precisions, 'blue', label='precision')
plt.plot(thresholds, recalls, 'red', label='recall')
plt.plot(thresholds, f1s, 'black', label='f1')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.grid(axis='y', alpha=0.75)
plt.legend()
plt.show()

# False positives, false negatives and true positives as a function of the threshold
plt.plot(thresholds, false_pos, 'blue', label='false pos.')
plt.plot(thresholds, false_neg, 'red', label='false neg.')
plt.plot(thresholds, true_pos, 'black', label='true pos.')
plt.xlabel('Threshold')
plt.ylabel('Number of sentences')
plt.grid(axis='y', alpha=0.75)
# Dotted line at the real number of tips: the ceiling for the true-positive curve.
plt.hlines(y=n_real_tips, xmin=sweep_start, xmax=sweep_end, linestyle='dotted', linewidth=1.0)
plt.legend()
plt.show()

# Predicted tips vs. predicted non-tips as a function of the threshold
plt.plot(thresholds, tips_pred, 'blue', label='predicted tips')
plt.plot(thresholds, nontips_pred, 'red', label='predicted non-tips')
plt.xlabel('Threshold')
plt.ylabel('Number of sentences')
plt.grid(axis='y', alpha=0.75)
plt.legend()
plt.show()

In [None]:
# False positives, true positives and their sum (everything predicted as a tip)
# for every threshold of the sweep. Pairing the two lists with zip keeps this
# correct for any number of thresholds.
predicted_tips = [tp + fp for tp, fp in zip(true_pos, false_pos)]

plt.plot(thresholds, false_pos, 'blue', label='false pos.')
plt.plot(thresholds, true_pos, 'black', label='true pos.')
plt.plot(thresholds, predicted_tips, 'green', label='Predicted as tip')
plt.xlabel('Threshold')
plt.ylabel('Number of sentences')
plt.grid(axis='y', alpha=0.75)
# Horizontal line at the real number of tips, taken from the test labels.
plt.hlines(y=int(np.sum(y_test)), xmin=thresholds[0], xmax=thresholds[-1], linewidth=1.0)
plt.legend()
plt.show()

@references: 
* https://huggingface.co/transformers/custom_datasets.html