In [None]:
DATA_PATH="./data"

In [1]:
import json
import gzip
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datasets import Dataset

In [2]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [3]:
def preprocess_function(data):
    return tokenizer(data["text"], truncation=True, padding=True)

In [4]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [5]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(model_name, classifier_dropout=0.1)

2022-11-24 21:42:57.544449: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-24 21:42:58.227699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38419 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
2022-11-24 21:43:00.115230: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: [

In [7]:
class BalancedSparseCategoricalAccuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    def __init__(self, name='balanced_sparse_categorical_accuracy', dtype=None):
        super().__init__(name, dtype=dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_flat = y_true
        if y_true.shape.ndims == y_pred.shape.ndims:
            y_flat = tf.squeeze(y_flat, axis=[-1])
        y_true_int = tf.cast(y_flat, tf.int32)

        cls_counts = tf.math.bincount(y_true_int)
        cls_counts = tf.math.reciprocal_no_nan(tf.cast(cls_counts, self.dtype))
        weight = tf.gather(cls_counts, y_true_int)
        return super().update_state(y_true, y_pred, sample_weight=weight)

In [8]:

def get_all_sentences(sentences):
    all_sentences = []
    for sentence in sentences:
        all_sentences.append(sentence[1])
    return " ".join(all_sentences)
def get_reviews_from_tropes(filename):
    with open(filename, 'rb') as f:
        reviews = []
        for line in f:
            book = json.loads(line)
            reviews.append({'text': get_all_sentences(book['sentences']), 'label': book['has_spoiler']})
        return reviews

def tropes_to_tf(tropes_list, model_name):
    tokenized_tropes = Dataset.from_list(tropes_list).map(preprocess_function, batched=True)
    return model_name.prepare_tf_dataset(
        tokenized_tropes, shuffle=True, batch_size=32, collate_fn=data_collator
    )

In [9]:
tropes_train = get_reviews_from_tropes(f'{DATA_PATH}/tvtropes_books/tvtropes_books-train.json')
tropes_test = get_reviews_from_tropes(f'{DATA_PATH}/tvtropes_books/tvtropes_books-test.json')
tropes_val = get_reviews_from_tropes(f'{DATA_PATH}/tvtropes_books/tvtropes_books-val.json')


In [10]:
tf_tropes_train = tropes_to_tf(tropes_train, model)
tf_tropes_val = tropes_to_tf(tropes_test, model)
tf_tropes_test = tropes_to_tf(tropes_val, model)

  0%|          | 0/274 [00:00<?, ?ba/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/35 [00:00<?, ?ba/s]

  0%|          | 0/35 [00:00<?, ?ba/s]

In [11]:
from transformers import create_optimizer

batch_size = 32

num_epochs = 3

batches_per_epoch = len(tropes_train) // batch_size

total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, 
    num_train_steps=total_train_steps)

In [85]:
neg = 213915
pos = 59347
total = neg + pos
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = [weight_for_1, weight_for_0]


In [12]:
model.compile(optimizer=optimizer,
             metrics=['accuracy', BalancedSparseCategoricalAccuracy()])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [13]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="./checkpoints/best_val_model",
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [14]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x=tf_tropes_train, validation_data=tf_tropes_val, epochs=3, callbacks=[model_checkpoint_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3
 752/8539 [=>............................] - ETA: 56:46 - loss: 0.1807 - accuracy: 0.9285 - balanced_sparse_categorical_accuracy: 0.8796

In [None]:
model.evaluate(tf_tropes_test)

In [43]:
model.predict(tf_tropes_test.take(1))



TFSequenceClassifierOutput(loss=None, logits=array([[ 0.29023138,  0.34916064],
       [ 0.34931722,  0.30988342],
       [ 0.29810795,  0.20926128],
       [ 0.16940089,  0.10431242],
       [ 0.30276096,  0.23780829],
       [ 0.22682428,  0.25773048],
       [ 0.42555362,  0.23762904],
       [ 0.30690712,  0.2631874 ],
       [ 0.20597264,  0.12430929],
       [ 0.0904084 , -0.05847306],
       [ 0.2188375 ,  0.10585496],
       [ 0.40851647,  0.3299694 ],
       [ 0.32778746,  0.2464583 ],
       [ 0.33404464,  0.32148886],
       [ 0.35756263,  0.14559889],
       [ 0.26798752,  0.2514991 ],
       [ 0.4475265 ,  0.37847757],
       [ 0.29502618,  0.20877525],
       [ 0.40949923,  0.2736877 ],
       [ 0.278615  ,  0.46980965],
       [ 0.17033055,  0.16642803],
       [ 0.31911743,  0.157756  ],
       [ 0.32622588,  0.16656825],
       [ 0.30639672,  0.16873321],
       [ 0.11122294,  0.0373375 ],
       [ 0.38310233,  0.31911883],
       [ 0.30635056,  0.290002  ],
       [ 0

In [None]:
model.save_weights('./checkpoints/bert-base-uncased-3-epoch-dropout-01-tropes')

In [None]:
auc = tf.keras.metrics.AUC()
acc = tf.keras.metrics.Accuracy()
for batch_data, batch_labels in tqdm(tf_tropes_test):
    preds = tf.nn.softmax(model(batch_data)[0])[:,1]
    auc.update_state(batch_labels, preds)
    acc.update_state(batch_labels, preds>=0.5)

In [27]:
acc.result().numpy()

0.84700096

In [28]:
auc.result().numpy()

0.86697996