# Fine-tuning the BERT model for Tip Mining
Combination of Huggingface library and TensorFlow 
 --------------------------------------
@author: Erik van der Heide, EUR 2021-2022

In [None]:
# Show the GPU attached to this runtime (driver/CUDA versions, memory usage, utilisation).
!nvidia-smi

Wed Jun 23 11:46:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    32W /  70W |   4478MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Hyperparameters

In [None]:
# Hyperparameters: edit these to configure the run; the cells below read them.
data_type = 'full'         # Choose data: 'subset', 'subsetf', 'full' or 'fullf'
model_type = 'bert'        # Choose model: 'bert', 'distilbert' or 'roberta' (compared case-insensitively)
optim_type = 'adam'        # Choose optimizer: 'adam' or 'adamw' (compared case-insensitively)
use_weights = True         # True when using class weights for unbalanced data, False otherwise
max_seq_length = 64        # maximum tokenized sequence length; longer inputs are truncated (32, 64)
train_batch_size = 16      # batch size for training and for the prediction batches (16, 32)
learning_rate = 2e-5       # learning rate (2e-5, 3e-5, 5e-5)
num_epochs = 2             # number of training epochs (1, 2, 3, 4)

### Installations & packages

In [None]:
# Install transformers
!pip install transformers



In [None]:
# Install tensorflow model environment
#!pip install -q tf-models-official

In [None]:
# Packages
import pandas as pd
import time
import random

import torch
import torch.nn as nn
import tensorflow as tf
#from official.nlp import optimization

from transformers import BertTokenizerFast, DistilBertTokenizerFast, RobertaTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import TFBertForSequenceClassification
from transformers import TFRobertaForSequenceClassification

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

### Loading the data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used below
# point into drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Training data

In [None]:
# Training-data file for every supported value of `data_type`.
train_paths = {
    'subset' : "drive/MyDrive/Thesis BA&QM 2021/Data/td_clean_subset.csv",
    'subsetf': "drive/MyDrive/Thesis BA&QM 2021/Data/td_clean_subset_filtered.csv",
    'full'   : "drive/MyDrive/Thesis BA&QM 2021/Data/td_clean.csv",
    'fullf'  : "drive/MyDrive/Thesis BA&QM 2021/Data/td_clean_full_filtered.csv",
}

# Fail early with a clear message; previously an unknown value left `path`
# undefined and surfaced as a NameError in read_csv.
if data_type not in train_paths:
  raise ValueError(f"Unknown data_type {data_type!r}; choose one of {sorted(train_paths)}")
path = train_paths[data_type]

# Read in data (tab-separated, first row holds the column names)
df = pd.read_csv(path, sep="\t", header=0)
print(path)

drive/MyDrive/Thesis BA&QM 2021/Data/td_clean.csv


Evaluation data

In [None]:
# Evaluation sets: one tab-separated file per product category, all stored in the same Drive folder.
ev_dir = "drive/MyDrive/Thesis BA&QM 2021/Data"

def load_ev(category):
  """Read the evaluation file `ev10_<category>_5.csv` (tab-separated, with header) into a DataFrame."""
  return pd.read_csv(f"{ev_dir}/ev10_{category}_5.csv", sep="\t", header=0)

ev_baby = load_ev("baby")
ev_cloth = load_ev("cloth")
ev_food = load_ev("food")
ev_health = load_ev("health")
ev_music = load_ev("music")
ev_phone = load_ev("phone")
ev_sports = load_ev("sports")
ev_tools = load_ev("tools")
ev_toys = load_ev("toys")
ev_video = load_ev("video")

In [None]:
# Keep only the sentence text and its tip label, with a fresh 0..n-1 index and integer labels.
# Built as one chain that returns a new frame, so no column is assigned on a
# slice of the original (which can trigger pandas' SettingWithCopyWarning).
df = (
    df[['sentence', 'tip']]
    .reset_index(drop=True)
    .astype({'tip': int})
)
print(f"Tip distribution:\n{df['tip'].value_counts(normalize=True)}")
df.dtypes

Tip distribution:
0    0.954229
1    0.045771
Name: tip, dtype: float64


sentence    object
tip          int64
dtype: object

We shuffle the data and use all of it for training; this notebook does not hold out a validation or test split (the code below reports a train size of 100%).

In [None]:
# Shuffle the rows with a fixed seed so the order is reproducible
df = shuffle(df, random_state=0)

# The complete shuffled set is used for training (no hold-out split here)
X_train, y_train = df['sentence'], df['tip']

# Size, share of the full data and class ratio of the training set
train_share = 100*round(X_train.size/df.shape[0], 2)
class_ratio = y_train.value_counts(normalize=True)
print(f"Train size : {X_train.size} - {train_share}% - ratio {round(class_ratio[0],2)}-{round(class_ratio[1],2)}")

Train size : 84071 - 100.0% - ratio 0.95-0.05


### Tokenization

Choose a tokenizer corresponding to your model of choice.

In [None]:
# Tokenizer class and pretrained checkpoint for every supported `model_type`.
tokenizer_choices = {
    'distilbert': (DistilBertTokenizerFast, 'distilbert-base-uncased'),
    'bert': (BertTokenizerFast, 'bert-base-uncased'),
    'roberta': (RobertaTokenizerFast, 'roberta-base'),
}

# Fail with a clear message; previously an unknown model_type left `tokenizer`
# undefined and surfaced as a NameError on the last line of this cell.
if model_type.lower() not in tokenizer_choices:
  raise ValueError(f"Unknown model_type {model_type!r}; choose one of {sorted(tokenizer_choices)}")

tokenizer_class, tokenizer_checkpoint = tokenizer_choices[model_type.lower()]
tokenizer = tokenizer_class.from_pretrained(tokenizer_checkpoint)
tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

Tokenize the training set.

In [None]:
# Encode every training sentence: truncate to at most max_seq_length tokens and pad the shorter ones
train_encodings = tokenizer(
    X_train.tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)

# Example (uses .iloc: after shuffling, index label 0 is no longer the first row)
#print(f"Example        : {X_train.iloc[0]}")
#toks = ['CLS'] + tokenizer.tokenize(X_train.iloc[0]) + ['SEP']
#print(f"Tokens         : {toks}  ({len(toks)} tokens)")
#print(f"Token id's     : {train_encodings['input_ids'][0]}")
#print(f"Attention mask : {train_encodings['attention_mask'][0]}")

In [None]:
# Pair the encoded inputs (as a plain dict of features) with the labels, one dataset element per sentence.
# (Optionally wrap this in `with tf.device('/cpu:0'):` to build the dataset on the CPU.)
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))

### Model definition

Set the model to your preferred model architecture.

In [None]:
# Sequence-classification class and pretrained checkpoint for every supported `model_type`.
model_choices = {
    'distilbert': (TFDistilBertForSequenceClassification, 'distilbert-base-uncased'),
    'bert': (TFBertForSequenceClassification, 'bert-base-uncased'),
    'roberta': (TFRobertaForSequenceClassification, 'roberta-base'),
}

# Fail with a clear message; previously an unknown model_type left `model`
# undefined and surfaced as a NameError on the last line of this cell.
if model_type.lower() not in model_choices:
  raise ValueError(f"Unknown model_type {model_type!r}; choose one of {sorted(model_choices)}")

model_class, model_checkpoint = model_choices[model_type.lower()]
model = model_class.from_pretrained(model_checkpoint)
model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification at 0x7f6064a69e10>

Define your optimizer, loss function, and evaluation metrics, then compile the model.

In [None]:
# Optimizer
optim_choice = optim_type.lower()
if optim_choice == 'adam':
  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
elif optim_choice == 'adamw':  # not improving results
  # Imported here because the `official.nlp.optimization` import is commented out
  # at the top of the notebook; transformers ships an equivalent factory, so no
  # extra package is required.
  from transformers import create_optimizer
  # Number of full batches per epoch (training batches use drop_remainder=True),
  # derived from the data instead of a hard-coded constant.
  steps_per_epoch = len(X_train) // train_batch_size
  num_train_steps = steps_per_epoch * num_epochs
  num_warmup_steps = int(0.1*num_train_steps)  # warm up during the first 10% of the steps
  # create_optimizer returns (optimizer, lr_schedule); a positive weight_decay_rate
  # selects the weight-decay (AdamW) variant.
  # NOTE(review): 0.01 is the value commonly used for BERT fine-tuning - confirm it
  # matches the intended setup.
  optimizer, lr_schedule = create_optimizer(init_lr=learning_rate,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            weight_decay_rate=0.01)
else:
  # Previously an unknown value left `optimizer` undefined (NameError at compile time).
  raise ValueError(f"Unknown optim_type {optim_type!r}; choose 'adam' or 'adamw'")

# Loss function
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Sparse, as the true labels are class ids [0, 1, ..] while the model outputs one logit per class

# Evaluation metrics
ev_metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

# Compile model
model.compile(optimizer=optimizer, loss=loss_function, metrics=ev_metrics)

In [None]:
# Print the layers of the compiled model together with their parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_97 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


Run the model.

In [None]:
from sklearn.utils import class_weight
import numpy as np

# 'balanced' gives every class a weight inversely proportional to its frequency in y_train.
# `classes` and `y` are passed by keyword: recent scikit-learn releases accept them
# only as keyword arguments, and older releases accept keywords as well.
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Turn the weight array into the {class index: weight} dict that model.fit expects;
# np.unique returns the labels sorted, so position i is label i for the 0/1 labels used here.
class_weights = dict(enumerate(class_weights))
print("Class weights: ", class_weights)

Class weights:  {0: 0.5239831469777994, 1: 10.923986486486486}


In [None]:
# Input pipeline: shuffle within a 1000-element buffer, keep full batches only, prefetch the next batch
train_data = (
    train_dataset
    .shuffle(1000)
    .batch(train_batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Class weights are handed to Keras only when requested in the hyperparameters
fit_options = {'epochs': num_epochs, 'verbose': 1}
if use_weights == True:
  fit_options['class_weight'] = class_weights

# Fit model and time the run
start = time.time()
history = model.fit(train_data, **fit_options)
end = time.time()

print("Training completed")
print("Elapsed time fine-tuning the model: ", (end-start)/60, "min\n")

Epoch 1/2
Epoch 2/2
Training completed
Elapsed time fine-tuning the model:  38.05428458849589 min



### Model evaluation

Test if the model works reasonably.

In [None]:
# Example data
sample_examples = ["Just make sure to check if you have enough batteries at home.",      # TIP
                  "I would argue that a child of two is still too young for this toy.",  # TIP
                  "It was way too expensive and the shipping was late.",                 # NON-TIP
                  "Loved it!"]                                                           # NON-TIP
sample_labels = pd.Series([1, 1, 0, 0])

# Prepare examples
sample_encodings = tokenizer(sample_examples, truncation=True, padding=True, max_length=max_seq_length)
sample_encodings = tf.data.Dataset.from_tensor_slices((dict(sample_encodings), sample_labels))

# Calculate accuracy
sample_loss, sample_accuracy = model.evaluate(sample_encodings.batch(1))

# Make predictions
preds_percentage = model.predict(sample_encodings.batch(1)).logits
preds_percentage = tf.nn.softmax(preds_percentage, axis=1).numpy()
print(preds_percentage)    
preds_prob = preds_percentage[:,1]
threshold = 0.5
preds = [1 if pred[1] >=threshold else 0 for pred in preds_percentage]

# Print results
print()
for i in range(len(sample_examples)):
  print(f"Sentence   : {sample_examples[i]}")
  print(f"Prob. tip  : {preds_percentage[i][1]}")
  print(f"Prediction : Pred = {preds[i]} True = {sample_labels[i]}")
  print()
print("Final  accuracy : ", sample_accuracy)
print("Final  loss     : ", sample_loss)

[[0.04498824 0.95501184]
 [0.13212228 0.8678778 ]
 [0.8991388  0.10086124]
 [0.99316347 0.00683656]]

Sentence   : Just make sure to check if you have enough batteries at home.
Prob. tip  : 0.9550118446350098
Prediction : Pred = 1 True = 1

Sentence   : I would argue that a child of two is still too young for this toy.
Prob. tip  : 0.8678777813911438
Prediction : Pred = 1 True = 1

Sentence   : It was way too expensive and the shipping was late.
Prob. tip  : 0.10086123645305634
Prediction : Pred = 0 True = 0

Sentence   : Loved it!
Prob. tip  : 0.006836556829512119
Prediction : Pred = 0 True = 0

Final  accuracy :  1.0
Final  loss     :  0.07522846758365631


Evaluate the model on unseen data and save the predictions.

In [None]:
# BABY
# Encode the sentences and batch them in file order (inputs only, no labels)
baby_encodings = tokenizer(
    ev_baby['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
baby_dataset = tf.data.Dataset.from_tensor_slices(dict(baby_encodings)) # test dataset
baby_data = baby_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_baby = tf.nn.softmax(model.predict(baby_data).logits, axis=1).numpy()
preds_prob_baby = preds_percentage_baby[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips baby at threshold = {cutoff}:  {(preds_prob_baby > cutoff).sum()} / {len(preds_prob_baby)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_baby['prediction'] = preds_prob_baby
ev_baby.head(5)

Num. tips baby at threshold = 0.5:  52 / 363  
Num. tips baby at threshold = 0.7:  32 / 363  
Num. tips baby at threshold = 0.9:  4 / 363  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B000DZTS2I,A3R0RVO90YCOSR,Baby,75,This is the only carrier that my daughter woul...,1,1,17,0.059832
1,1,B000DZTS2I,A3R0RVO90YCOSR,Baby,75,It is super easy to sling over your shoulder a...,1,2,22,0.283269
2,2,B000DZTS2I,A3R0RVO90YCOSR,Baby,75,It felt very natural and intuitive unlike othe...,1,3,11,0.021588
3,3,B000DZTS2I,A3R0RVO90YCOSR,Baby,75,"Also, if the baby falls asleep while you're we...",1,4,29,0.493169
4,4,B000DZTS2I,A3R0RVO90YCOSR,Baby,75,"(It takes a bit of practice, but it is very do...",1,5,15,0.015891


In [None]:
# CLOTH
# Encode the sentences and batch them in file order (inputs only, no labels)
cloth_encodings = tokenizer(
    ev_cloth['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
cloth_dataset = tf.data.Dataset.from_tensor_slices(dict(cloth_encodings)) # test dataset
cloth_data = cloth_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_cloth = tf.nn.softmax(model.predict(cloth_data).logits, axis=1).numpy()
preds_prob_cloth = preds_percentage_cloth[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips cloth at threshold = {cutoff}:  {(preds_prob_cloth > cutoff).sum()} / {len(preds_prob_cloth)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_cloth['prediction'] = preds_prob_cloth
ev_cloth.head(5)

Num. tips cloth at threshold = 0.5:  48 / 225  
Num. tips cloth at threshold = 0.7:  31 / 225  
Num. tips cloth at threshold = 0.9:  4 / 225  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B000J3H31W,A1P64ZMQZDLL1W,"Clothing, Shoes & Jewelry",12,These are easy shoes to slip on and off so I c...,1,1,14,0.232792
1,1,B000J3H31W,A1P64ZMQZDLL1W,"Clothing, Shoes & Jewelry",12,"They are slightly bigger than I expected, but ...",1,2,15,0.785652
2,2,B000J3H31W,A1P64ZMQZDLL1W,"Clothing, Shoes & Jewelry",12,The points of the shoes kind of start to go up...,1,3,19,0.690341
3,3,B000J3H31W,A36NGMIQTDR5YT,"Clothing, Shoes & Jewelry",12,I wear between a 6 1/2 and 7.,2,1,5,0.415958
4,4,B000J3H31W,A36NGMIQTDR5YT,"Clothing, Shoes & Jewelry",12,Usually I get a 7 to be on the safe side.,2,2,10,0.02323


In [None]:
# FOOD
# Encode the sentences and batch them in file order (inputs only, no labels)
food_encodings = tokenizer(
    ev_food['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
food_dataset = tf.data.Dataset.from_tensor_slices(dict(food_encodings)) # test dataset
food_data = food_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_food = tf.nn.softmax(model.predict(food_data).logits, axis=1).numpy()
preds_prob_food = preds_percentage_food[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips food at threshold = {cutoff}:  {(preds_prob_food > cutoff).sum()} / {len(preds_prob_food)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_food['prediction'] = preds_prob_food
ev_food.head(5)

Num. tips food at threshold = 0.5:  33 / 191  
Num. tips food at threshold = 0.7:  23 / 191  
Num. tips food at threshold = 0.9:  6 / 191  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B000E1BL5S,A3LMS2UO0ESHX5,Grocery & Gourmet Food,89,May 26th: New update I just received a new shi...,1,1,22,0.118403
1,1,B000E1BL5S,A3LMS2UO0ESHX5,Grocery & Gourmet Food,89,"Fortunately, Amazon is making it right but I a...",1,2,61,0.070481
2,2,B000E1BL5S,A3LMS2UO0ESHX5,Grocery & Gourmet Food,89,"Anyway, last week I received my latest Subscri...",1,3,53,0.008666
3,3,B000E1BL5S,A3LMS2UO0ESHX5,Grocery & Gourmet Food,89,This particular variety of planters nuts are m...,1,4,34,0.006456
4,4,B000E1BL5S,A3LMS2UO0ESHX5,Grocery & Gourmet Food,89,Chipotle has an aftertaste I don't enjoy and t...,1,5,18,0.130728


In [None]:
# HEALTH
# Encode the sentences and batch them in file order (inputs only, no labels)
health_encodings = tokenizer(
    ev_health['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
health_dataset = tf.data.Dataset.from_tensor_slices(dict(health_encodings)) # test dataset
health_data = health_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_health = tf.nn.softmax(model.predict(health_data).logits, axis=1).numpy()
preds_prob_health = preds_percentage_health[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips health at threshold = {cutoff}:  {(preds_prob_health > cutoff).sum()} / {len(preds_prob_health)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_health['prediction'] = preds_prob_health
ev_health.head(5)

Num. tips health at threshold = 0.5:  36 / 245  
Num. tips health at threshold = 0.7:  24 / 245  
Num. tips health at threshold = 0.9:  3 / 245  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B0002DSVTC,A34W2MFO0M3MNN,Health & Personal Care,51,They're only a dollar each for name brand batt...,1,1,11,0.023339
1,1,B0002DSVTC,A34W2MFO0M3MNN,Health & Personal Care,51,Less than a dollar even.,1,2,5,0.062069
2,2,B0002DSVTC,A34W2MFO0M3MNN,Health & Personal Care,51,Can't beat that.,1,3,3,0.010087
3,3,B0002DSVTC,A34W2MFO0M3MNN,Health & Personal Care,51,Came in a week; lasted 2 months.,1,4,6,0.515929
4,4,B0002DSVTC,A34W2MFO0M3MNN,Health & Personal Care,51,I paid $10 each for Radio Shack brand batterie...,1,5,15,0.038508


In [None]:
# MUSIC
# Encode the sentences and batch them in file order (inputs only, no labels)
music_encodings = tokenizer(
    ev_music['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
music_dataset = tf.data.Dataset.from_tensor_slices(dict(music_encodings)) # test dataset
music_data = music_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_music = tf.nn.softmax(model.predict(music_data).logits, axis=1).numpy()
preds_prob_music = preds_percentage_music[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips music at threshold = {cutoff}:  {(preds_prob_music > cutoff).sum()} / {len(preds_prob_music)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_music['prediction'] = preds_prob_music
ev_music.head(5)

Num. tips music at threshold = 0.5:  37 / 234  
Num. tips music at threshold = 0.7:  23 / 234  
Num. tips music at threshold = 0.9:  1 / 234  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B00074B67A,A8ANJNZAZQ6IQ,Music & Instruments,5,I ordered three of these ...so far!Very afford...,1,1,12,0.005491
1,1,B00074B67A,A8ANJNZAZQ6IQ,Music & Instruments,5,Made in Canada.Keeps wireless unit very snug a...,1,2,14,0.18656
2,2,B00074B67A,A8ANJNZAZQ6IQ,Music & Instruments,5,(all leather.,1,3,2,0.017652
3,3,B00074B67A,A8ANJNZAZQ6IQ,Music & Instruments,5,)I use the little plastic flat ring on the bot...,1,4,33,0.40314
4,4,B00074B67A,A8ANJNZAZQ6IQ,Music & Instruments,5,The leather will bend a little to allow a slig...,1,5,28,0.527002


In [None]:
# PHONE
# Encode the sentences and batch them in file order (inputs only, no labels)
phone_encodings = tokenizer(
    ev_phone['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
phone_dataset = tf.data.Dataset.from_tensor_slices(dict(phone_encodings)) # test dataset
phone_data = phone_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_phone = tf.nn.softmax(model.predict(phone_data).logits, axis=1).numpy()
preds_prob_phone = preds_percentage_phone[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips phone at threshold = {cutoff}:  {(preds_prob_phone > cutoff).sum()} / {len(preds_prob_phone)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_phone['prediction'] = preds_prob_phone
ev_phone.head(5)

Num. tips phone at threshold = 0.5:  40 / 231  
Num. tips phone at threshold = 0.7:  25 / 231  
Num. tips phone at threshold = 0.9:  8 / 231  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B0036255ZE,A1LWHQY6JOWHBG,Cell Phones & Accessories,5,I must say in a word Zagg SmartBuds are awesome!,1,2,10,0.005881
1,1,B0036255ZE,A1LWHQY6JOWHBG,Cell Phones & Accessories,5,I have the Bose version.,1,3,5,0.013463
2,2,B0036255ZE,A1LWHQY6JOWHBG,Cell Phones & Accessories,5,Both in the ear and over the head.,1,4,8,0.232189
3,3,B0036255ZE,A1LWHQY6JOWHBG,Cell Phones & Accessories,5,"Not real wild about either, but they work.",1,5,8,0.014831
4,4,B0036255ZE,A1LWHQY6JOWHBG,Cell Phones & Accessories,5,Then I found something that I though nobody wo...,1,6,10,0.011886


In [None]:
# SPORTS
# Encode the sentences and batch them in file order (inputs only, no labels)
sports_encodings = tokenizer(
    ev_sports['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
sports_dataset = tf.data.Dataset.from_tensor_slices(dict(sports_encodings)) # test dataset
sports_data = sports_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_sports = tf.nn.softmax(model.predict(sports_data).logits, axis=1).numpy()
preds_prob_sports = preds_percentage_sports[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips sports at threshold = {cutoff}:  {(preds_prob_sports > cutoff).sum()} / {len(preds_prob_sports)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_sports['prediction'] = preds_prob_sports
ev_sports.head(5)

Num. tips sports at threshold = 0.5:  39 / 218  
Num. tips sports at threshold = 0.7:  20 / 218  
Num. tips sports at threshold = 0.9:  5 / 218  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B000AR2N76,A30CS2AL07WGU5,Sports & Outdoors,29,This spork is great.,1,1,4,0.005707
1,1,B000AR2N76,A30CS2AL07WGU5,Sports & Outdoors,29,Really lightweight.,1,2,2,0.031443
2,2,B000AR2N76,A30CS2AL07WGU5,Sports & Outdoors,29,Can pack it anywhere.,1,3,4,0.078991
3,3,B000AR2N76,A30CS2AL07WGU5,Sports & Outdoors,29,I have used it just twice but I love it.,1,4,10,0.006734
4,4,B000AR2N76,A30CS2AL07WGU5,Sports & Outdoors,29,Easy to clean.,1,5,3,0.125893


In [None]:
# TOOLS
# Encode the sentences and batch them in file order (inputs only, no labels)
tools_encodings = tokenizer(
    ev_tools['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
tools_dataset = tf.data.Dataset.from_tensor_slices(dict(tools_encodings)) # test dataset
tools_data = tools_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_tools = tf.nn.softmax(model.predict(tools_data).logits, axis=1).numpy()
preds_prob_tools = preds_percentage_tools[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips tools at threshold = {cutoff}:  {(preds_prob_tools > cutoff).sum()} / {len(preds_prob_tools)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_tools['prediction'] = preds_prob_tools
ev_tools.head(5)

Num. tips tools at threshold = 0.5:  70 / 304  
Num. tips tools at threshold = 0.7:  42 / 304  
Num. tips tools at threshold = 0.9:  10 / 304  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B00004Z0YC,AASH83SNPTUXG,Tools & Home Improvement,7,These are extremely weak if you need to swing ...,1,1,12,0.922304
1,1,B00004Z0YC,AASH83SNPTUXG,Tools & Home Improvement,7,I have a solid wood door going into my basemen...,1,2,28,0.035821
2,2,B00004Z0YC,AASH83SNPTUXG,Tools & Home Improvement,7,If you are using it for a hollow core door the...,1,3,13,0.928025
3,3,B00004Z0YC,AASH83SNPTUXG,Tools & Home Improvement,7,"Anything other than that, these are useless.",1,4,7,0.181776
4,4,B00004Z0YC,AASH83SNPTUXG,Tools & Home Improvement,7,Use the Everbuilt spring hinges which work gre...,1,5,27,0.888856


In [None]:
# TOYS
# Encode the sentences and batch them in file order (inputs only, no labels)
toys_encodings = tokenizer(
    ev_toys['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
toys_dataset = tf.data.Dataset.from_tensor_slices(dict(toys_encodings)) # test dataset
toys_data = toys_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_toys = tf.nn.softmax(model.predict(toys_data).logits, axis=1).numpy()
preds_prob_toys = preds_percentage_toys[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips toys at threshold = {cutoff}:  {(preds_prob_toys > cutoff).sum()} / {len(preds_prob_toys)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_toys['prediction'] = preds_prob_toys
ev_toys.head(5)

Num. tips toys at threshold = 0.5:  33 / 259  
Num. tips toys at threshold = 0.7:  25 / 259  
Num. tips toys at threshold = 0.9:  5 / 259  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B0000C9WI2,A1B37KSGD8XUJ8,Toys & Games,18,The goal works great.,1,1,4,0.005858
1,1,B0000C9WI2,A1B37KSGD8XUJ8,Toys & Games,18,Make sure you have the correct batteries ( 6 A...,1,2,16,0.938989
2,2,B0000C9WI2,A1B37KSGD8XUJ8,Toys & Games,18,You need a pump to pump up the miniature ball ...,1,3,13,0.85677
3,3,B0000C9WI2,A1B37KSGD8XUJ8,Toys & Games,18,"I bought a better ball to go with this, howeve...",1,4,18,0.885033
4,4,B0000C9WI2,A1B37KSGD8XUJ8,Toys & Games,18,Great buy overall; instructions were easy to f...,1,5,8,0.005868


In [None]:
# VIDEO
# Encode the sentences and batch them in file order (inputs only, no labels)
video_encodings = tokenizer(
    ev_video['sentence'].tolist(),
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)
video_dataset = tf.data.Dataset.from_tensor_slices(dict(video_encodings)) # test dataset
video_data = video_dataset.batch(train_batch_size, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

# Softmax over the logits; column 1 is taken as the tip probability
preds_percentage_video = tf.nn.softmax(model.predict(video_data).logits, axis=1).numpy()
preds_prob_video = preds_percentage_video[:,1]

# Number of sentences scoring strictly above each cut-off
for cutoff in (0.5, 0.7, 0.9):
  print(f"Num. tips video at threshold = {cutoff}:  {(preds_prob_video > cutoff).sum()} / {len(preds_prob_video)}  ")
print()

# Store the probabilities next to the sentences and preview the first rows
ev_video['prediction'] = preds_prob_video
ev_video.head(5)

Num. tips video at threshold = 0.5:  38 / 458  
Num. tips video at threshold = 0.7:  15 / 458  
Num. tips video at threshold = 0.9:  2 / 458  



Unnamed: 0.1,Unnamed: 0,asin,reviewer_id,category,freq,sentence,num_review,num_sentence,sen_length,prediction
0,0,B00004WHW7,ADJKMC7AO7YN3,Video Games,21,Loopy Landscapes is the second expansion pack ...,1,1,21,0.011506
1,1,B00004WHW7,ADJKMC7AO7YN3,Video Games,21,"Loopy Landscapes includes 30 new scenarios, 3 ...",1,2,21,0.137283
2,2,B00004WHW7,ADJKMC7AO7YN3,Video Games,21,It should be said upfront that this expansion ...,1,3,49,0.440835
3,3,B00004WHW7,ADJKMC7AO7YN3,Video Games,21,You must complete a scenario here in order to ...,1,4,11,0.324584
4,4,B00004WHW7,ADJKMC7AO7YN3,Video Games,21,"However, I think this is where Loopy Landscape...",1,5,25,0.059905


Write results matrices to csv

In [None]:
from google.colab import files

# Category name -> evaluation frame (with its 'prediction' column), in export order
results_by_category = {
    "baby": ev_baby,
    "cloth": ev_cloth,
    "food": ev_food,
    "health": ev_health,
    "music": ev_music,
    "phone": ev_phone,
    "sports": ev_sports,
    "tools": ev_tools,
    "toys": ev_toys,
    "video": ev_video,
}

# First write every frame to a tab-separated results_<category>.csv ...
for category, frame in results_by_category.items():
  frame.to_csv(f"results_{category}.csv", sep = "\t")

# ... then trigger a browser download for each written file
for category in results_by_category:
  files.download(f"results_{category}.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Extra export of the baby results only, under a generic file name.
# NOTE(review): unlike the tab-separated export earlier in the notebook, no `sep` is passed here,
# so pandas' default separator is used; this looks like a leftover scratch cell -
# confirm whether it is still needed.
from google.colab import files
ev_baby.to_csv('filename.csv') 
files.download('filename.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

@references: 
* https://huggingface.co/transformers/custom_datasets.html