In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import urllib.request
import os
import json
import gzip
from transformers import create_optimizer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from datasets import Dataset

DATA_PATH =  "./data"

urls = [
    "https://spoiler-datasets.s3.eu-central-1.amazonaws.com/goodreads_balanced-train.json.gz",
    "https://spoiler-datasets.s3.eu-central-1.amazonaws.com/goodreads_balanced-val.json.gz",
    "https://spoiler-datasets.s3.eu-central-1.amazonaws.com/goodreads_balanced-test.json.gz"
]

FORCE_REDOWNLOAD = False

goodreads_train = []
goodreads_val = []
goodreads_test = []

for goodreads_list, url in zip(
        [goodreads_train, goodreads_val, goodreads_test], urls):
    file = f"{DATA_PATH}/goodreads/{url.rsplit('/', 1)[-1]}"
    if not os.path.exists(file) or FORCE_REDOWNLOAD:
        urllib.request.urlretrieve(url, file)

    with gzip.open(file, "rb") as f:
        for line in tqdm(f):
            goodreads_list.append(json.loads(line))

143402it [00:07, 20024.11it/s]
17926it [00:00, 37434.80it/s]
17926it [00:01, 13430.38it/s]


In [2]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, )



def preprocess_function(data):
    return tokenizer(data["text"], truncation=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, dropout=0.2)

2022-11-24 19:07:12.817799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-24 19:07:13.437893: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38419 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:07:00.0, compute capability: 8.0
2022-11-24 19:07:14.524064: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if

In [3]:
def create_tf_dataset(goodreads_list):
    tokenized_goodreads = Dataset.from_list([
        {'text': ''.join(y[1] for y in x['review_sentences']), 'label': x['has_spoiler']}
        for x in goodreads_list
        ]).map(preprocess_function, batched=True)
    return model.prepare_tf_dataset(
        tokenized_goodreads, shuffle=True, batch_size=32, collate_fn=data_collator
    )

tf_goodreads_train = create_tf_dataset(goodreads_train)
tf_goodreads_val = create_tf_dataset(goodreads_val)
tf_goodreads_test = create_tf_dataset(goodreads_test)

  0%|          | 0/144 [00:00<?, ?ba/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

In [4]:
batch_size = 32

num_epochs = 3

batches_per_epoch = len(goodreads_train) // batch_size

total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0,
    num_train_steps=total_train_steps)

In [5]:
# for w in model.get_layer('distilbert').weights:
#     w._trainable = False

In [6]:
class BalancedSparseCategoricalAccuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    def __init__(self, name='balanced_sparse_categorical_accuracy', dtype=None):
        super().__init__(name, dtype=dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_flat = y_true
        if y_true.shape.ndims == y_pred.shape.ndims:
            y_flat = tf.squeeze(y_flat, axis=[-1])
        y_true_int = tf.cast(y_flat, tf.int32)

        cls_counts = tf.math.bincount(y_true_int)
        cls_counts = tf.math.reciprocal_no_nan(tf.cast(cls_counts, self.dtype))
        weight = tf.gather(cls_counts, y_true_int)
        return super().update_state(y_true, y_pred, sample_weight=weight)

In [7]:
model.compile(optimizer=optimizer,
             metrics=['accuracy', BalancedSparseCategoricalAccuracy()])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [8]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"./checkpoints/best_val_model_{model_name}",
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [9]:
csv_logger = tf.keras.callbacks.CSVLogger(f'goodreads_fit_log_{model_name}.csv', append=True, separator=';')

In [10]:
model.fit(x=tf_goodreads_train, validation_data=tf_goodreads_val, epochs=3, callbacks=[model_checkpoint_callback, csv_logger])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fecb062f7c0>

In [13]:
model.load_weights(f"./checkpoints/best_val_model_{model_name}")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fecb0802b60>

In [14]:
model.evaluate(tf_goodreads_test)



[0.40434446930885315, 0.8156808018684387, 0.8161695003509521]

In [15]:
auc = tf.keras.metrics.AUC()
acc = tf.keras.metrics.Accuracy()

fp = tf.keras.metrics.FalsePositives()
fn = tf.keras.metrics.FalseNegatives()
tp = tf.keras.metrics.TruePositives()
tn = tf.keras.metrics.TrueNegatives()

for batch_data, batch_labels in tqdm(tf_goodreads_test):
    preds = tf.nn.softmax(model(batch_data)[0])[:,1]
    auc.update_state(batch_labels, preds)
    acc.update_state(batch_labels, preds>=0.5)
    
    fp.update_state(batch_labels, preds >= 0.5)
    fn.update_state(batch_labels, preds >= 0.5)
    tp.update_state(batch_labels, preds >= 0.5)
    tn.update_state(batch_labels, preds >= 0.5)

100%|██████████| 560/560 [01:21<00:00,  6.84it/s]


In [16]:
auc.result().numpy(), acc.result().numpy()

(0.8993575, 0.815625)

In [17]:
fp.result().numpy(), tp.result().numpy()

(1961.0, 7617.0)

In [18]:
fn.result().numpy(), tn.result().numpy()

(1343.0, 6999.0)

# Whole Goodreads test

In [19]:
from itertools import chain
exclude_reviews = set(x["review_id"] for x in chain(goodreads_train, goodreads_test))
len(exclude_reviews)
whole_goodreads_test = []
file = f"{DATA_PATH}/goodreads/goodreads_review_spoiler.json"

with open(file, "r") as f:
    for line in tqdm(f):
        as_json = json.loads(line)
        if as_json["review_id"] not in exclude_reviews:
            whole_goodreads_test.append(as_json)
tf_whole_goodreads_test = create_tf_dataset(whole_goodreads_test)
del whole_goodreads_test

1378033it [00:35, 39212.67it/s]


  0%|          | 0/1217 [00:00<?, ?ba/s]

In [20]:
auc = tf.keras.metrics.AUC()
acc = tf.keras.metrics.Accuracy()

fp = tf.keras.metrics.FalsePositives()
fn = tf.keras.metrics.FalseNegatives()
tp = tf.keras.metrics.TruePositives()
tn = tf.keras.metrics.TrueNegatives()

for batch_data, batch_labels in tqdm(tf_whole_goodreads_test):
    preds = tf.nn.softmax(model(batch_data)[0])[:,1]
    auc.update_state(batch_labels, preds)
    acc.update_state(batch_labels, preds>=0.5)
    
    fp.update_state(batch_labels, preds >= 0.5)
    fn.update_state(batch_labels, preds >= 0.5)
    tp.update_state(batch_labels, preds >= 0.5)
    tn.update_state(batch_labels, preds >= 0.5)

100%|██████████| 38022/38022 [52:21<00:00, 12.10it/s]


In [21]:
auc.result().numpy(), acc.result().numpy()

(0.89521176, 0.77776927)

In [22]:
fp.result().numpy(), tp.result().numpy()

(268962.0, 7536.0)

In [23]:
fn.result().numpy(), tn.result().numpy()

(1427.0, 938779.0)