## Running Instructions

This notebook contains code for Fine-Tuning BERT, including dataset pre-processing and tokenization, hyperparameter searching and final model training and evaluation.

Please note that because of the time constraints of fine-tuning, training code calls have been commented out. Instead, this notebook loads our final fine-tuned models from saved checkpoints on disk.

This notebook also requires our balanced csv datasets (provided).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from tqdm import tqdm
from collections import Counter
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.utils import shuffle
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features, ClassLabel, Value
import evaluate
from evaluate import evaluator

from sklearn.model_selection import StratifiedKFold
from datasets import DatasetDict
import wandb
from ast import literal_eval

import torch
import gc
gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Util to check GPU memory 

def check_dev(n):
    t = torch.cuda.get_device_properties(n).total_memory
    r = torch.cuda.memory_reserved(n)
    a = torch.cuda.memory_allocated(n)
    f = r-a  # free
    print(f"{a} / {t} used for dev {n}, reserved {r}")
    
def check_devs():
    for i in range(torch.cuda.device_count()):
        check_dev(i)

check_devs()

0 / 85051572224 used for dev 0, reserved 0


# Data Preprocessing

## Load pre-balanced data


In [3]:
movie_review_df = pd.read_csv('balanced_movie_review.csv')
book_review_df = pd.read_csv('balanced_book_review.csv')

In [4]:
movie_review_df.info()
book_review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300924 entries, 0 to 300923
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      300924 non-null  int64 
 1   review_date     300924 non-null  object
 2   movie_id        300924 non-null  object
 3   user_id         300924 non-null  object
 4   is_spoiler      300924 non-null  bool  
 5   review_text     300924 non-null  object
 6   rating          300924 non-null  int64 
 7   review_summary  300923 non-null  object
dtypes: bool(1), int64(2), object(5)
memory usage: 16.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179254 entries, 0 to 179253
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Unnamed: 0        179254 non-null  int64 
 1   user_id           179254 non-null  object
 2   timestamp         179254 non-null  object
 3   review_sentences  179254 non-n

## Preprocess for BERT Classification

This section defines utility methods for constructing HF Datasets from our data in DataFrame format.

In [5]:
id2label = {0: False, 1: True}
label2id = {False: 0, True: 1}
spoiler_features = Features({'text': Value('string'), 'label': ClassLabel(names=[True, False])})

In [6]:
def join_sents(entry):
    '''
    Entry: a list of lists
    Method for casting the UCSD review sentences to plain string format 
    (removes sentence-level labels).
    '''
    review = literal_eval(entry)
    return ' '.join([sent[-1] for sent in review])


def bert_rename(df, review_type):
    '''
    Renames DataFrame columns to be consistent in all datasets: ["label","text"]
    '''
    if review_type == "movie":
        df.rename(inplace=True, columns={"is_spoiler": "label", "review_text":"text"})
    elif review_type == "book":
        df.rename(inplace=True, columns={"has_spoiler": "label", "review_sentences":"text"})
    else:
        pass


def bert_preproc(df, max_n, review_type):
    '''
    Receives a DataFrame dataset and clips it to MAX_N number of
    label=0 examples and label=1 examples (balanced).
    Requires a review_type=["movie","book"] flag.
    Returns the preprocessed DataFrame.
    '''
    df = shuffle(df)
    df.reset_index(inplace=True, drop=True) 
    
    bert_rename(df, review_type)
    if review_type == "book":
        df["text"] = df["text"].apply(join_sents).values.tolist()
        
    df = df.loc[:,["label", "text"]]
    
    no_spoilers_indices = df.index[df['label'] == False][:max_n]
    spoilers_indices = df.index[df['label'] == True][:max_n]

    mini_spoilers = df.iloc[spoilers_indices]
    mini_no_spoilers = df.iloc[no_spoilers_indices]
    print(len(mini_spoilers), len(mini_no_spoilers))

    preproc_df = mini_spoilers.append(mini_no_spoilers, ignore_index=True)
    return preproc_df

### Example run for IMDB BERT. 
For Goodreads BERT, change `movie_review_df` to `book_review_df`

In [7]:
REVIEW_TYPE = "movie"
MAX = 3000

preproc_df = bert_preproc(movie_review_df, MAX, REVIEW_TYPE)
preproc_df.info()
preproc_df

3000 3000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   6000 non-null   bool  
 1   text    6000 non-null   object
dtypes: bool(1), object(1)
memory usage: 52.9+ KB


Unnamed: 0,label,text
0,True,At wits end - this is exactly how i felt after...
1,True,Advanced beings who have mastered Space Travel...
2,True,This movie started out promising...a great per...
3,True,******WARNING SPOILERS*********Some major plot...
4,True,This is a non-linear narrative of a relationsh...
...,...,...
5995,False,"THE GOOD: performances, technical values, seve..."
5996,False,"As so many reviews attest to, I think that man..."
5997,False,Oliver Stone reveals some truths about the war...
5998,False,This is a true story of two great rivals of Fo...


## DistilBERT Finetune Setup

Section to define functions used for the Fine-tuning. The following functions will be used by HF Trainer.

In [8]:
# Tokenizing functions

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    '''
    Computes accuracy for training examples and predictions
    '''
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

def model_init():
    '''
    Returns a pretrained (not finetuned) DISTIL_BERT model
    '''
    checkpoint="distilbert-base-uncased"
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )
    return model

In [9]:
# tokenize full dataset (to be split into k_folds)
data = Dataset.from_pandas(preproc_df, split="train", preserve_index=False, features=spoiler_features)
data_tokenized = data.map(tokenize, batched=True)
model = model_init()
data.features

 83%|████████▎ | 5/6 [00:02<00:00,  2.18ba/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_clas

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[True, False], id=None)}

## Hyperparameter search with WandB

This section performs hyperparameter searches using WandB and k-fold cross-validation sets.

In [10]:
import os
PATH = '/home/lucchetti.f/6120/spoiler_detection_bert.ipynb'
os.environ['WANDB_NOTEBOOK_NAME'] = PATH

def make_folds(k, dataset):
    folds = StratifiedKFold(n_splits=k)
    kfolds = folds.split(np.zeros(dataset.num_rows), dataset["label"])
    return kfolds

def train(
        name,
        train,
        val,
        hyperparam):
    '''
    Trains a model according to hyperparam dict and saves checkpoints.
    '''
    
    with wandb.init(group='imdb', project='spoiler_detection_cls'):
        training_args = TrainingArguments(
            output_dir=name,
            learning_rate=hyperparam["learning_rate"],
            per_device_train_batch_size=hyperparam["per_device_train_batch_size"],
            per_device_eval_batch_size=hyperparam["per_device_eval_batch_size"],
            num_train_epochs=hyperparam["num_train_epochs"],
            weight_decay=hyperparam["weight_decay"],
            report_to='wandb',  
            evaluation_strategy="epoch",
            save_strategy="epoch"
        )

        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=train,
            eval_dataset=val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()

    
# BERT authors reccomend: 
{
  "per_gpu_batch_size": [16, 32],
  "learning_rate": [2e-5, 3e-5, 5e-5],
  "num_epochs": [2, 3, 4]
}


lr = [2e-5, 3e-5, 5e-5]
n_train_batch = [16,4,8]
n_eval_batch = [16,4,8]
n_epochs = [2,3,4]
wd = [0.1,0.2,0.2]


def search(k, datasets, name):
    kfolds = make_folds(k, datasets)
    for i, idx in enumerate(kfolds):
        train_idxs = idx[0]
        val_idxs = idx[1]
        assert(len(set(train_idxs).intersection(set(val_idxs))) == 0), list(set(train_idxs).intersection(set(val_idxs)))

        fold_dataset = DatasetDict({
            "train":datasets.select(train_idxs),
            "validation":datasets.select(val_idxs)
        })
        hyperparam = {
            "learning_rate":lr[i],
            "per_device_train_batch_size":n_train_batch[i],
            "per_device_eval_batch_size":n_eval_batch[i],
            "num_train_epochs":n_epochs[i],
            "weight_decay":wd[i],
        }
        train(name, fold_dataset["train"], fold_dataset["validation"], hyperparam)
    

In [11]:
# search(3, data_tokenized, "hyperparam_search")

## Final BERT Fine-tuning with Optimal Hyperparams

Fine-tuning model with hyperparameters from above. To fine-tune a Goodreads model instead,change `data_tokenized` above

In [13]:
NAME = "final_imdb"

train_test = make_folds(10, data_tokenized)

def train_final():
    for train_idxs, val_idxs in train_test:
        assert(len(set(train_idxs).intersection(set(val_idxs))) == 0), list(set(train_idxs).intersection(set(val_idxs)))
        fold_dataset = DatasetDict({
                "train":data_tokenized.select(train_idxs),
                "test":data_tokenized.select(val_idxs)
        })
        hyperparam = {
            "learning_rate":5e-5,
            "per_device_train_batch_size":8,
            "per_device_eval_batch_size":8,
            "num_train_epochs":4,
            "weight_decay":0.2,
        }
        print(len(train_idxs), len(val_idxs))
        train(NAME, fold_dataset["train"], fold_dataset["test"], hyperparam)
        break
        
# train_final()

## Evaluation: OOD test set + inference

Evaluating fine-tuned BERT models with preciison, recall, f-1

In [61]:
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", truncation=True)

def eval(model, data):

    p = task_evaluator.compute(
        model_or_pipeline=model,
        data=data,
        metric="precision",
        tokenizer=bert_tokenizer,
        label_mapping=label2id,
        strategy="bootstrap",
        n_resamples=10,
        random_state=0
    )

    a = task_evaluator.compute(
        model_or_pipeline=model,
        data=data,
        metric="accuracy",
        tokenizer=bert_tokenizer,
        label_mapping=label2id,
        strategy="bootstrap",
        n_resamples=10,
        random_state=0
    )

    r = task_evaluator.compute(
        model_or_pipeline=model,
        data=data,
        metric="recall",
        tokenizer=bert_tokenizer,
        label_mapping=label2id,
        strategy="bootstrap",
        n_resamples=10,
        random_state=0
    )
    
    f = task_evaluator.compute(
        model_or_pipeline=model,
        data=data,
        metric="f1",
        tokenizer=bert_tokenizer,
        label_mapping=label2id,
        strategy="bootstrap",
        n_resamples=10,
        random_state=0
    )

    return p,a,r,f

def predict(text, model):
    '''
    Given a text input and model, return model's class
    prediction for the given text.
    '''
    inputs = tokenizer(text, return_tensors="pt")
    inputs = inputs.to(0)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]


def get_model(checkpoint):
    '''
    Returns a model loaded from chekcpoint path
    '''
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )
    return model

loading configuration file config.json from cache at /home/lucchetti.f/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/lucchetti.f/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/lucchetti.f/.cache/huggingface/hub/mo

In [62]:
## OOD evaluation

task_evaluator = evaluator("text-classification")

MAX = 500
eval_movie_data = Dataset.from_pandas(bert_preproc(movie_review_df, MAX, "movie"), 
                                     split="test", preserve_index=False, features=spoiler_features)
eval_book_data = Dataset.from_pandas(bert_preproc(book_review_df, MAX, "book"), 
                                     split="test", preserve_index=False, features=spoiler_features)

goodreads_model = get_model("my_checkpoints/goodreads_checkpoint")
imdb_model = get_model("my_checkpoints/imdb_checkpoint")

500 500


loading configuration file my_checkpoints/goodreads_checkpoint/config.json
Model config DistilBertConfig {
  "_name_or_path": "my_checkpoints/goodreads_checkpoint",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": false,
    "1": true
  },
  "initializer_range": 0.02,
  "label2id": {
    "false": 0,
    "true": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file my_checkpoints/goodreads_checkpoint/pytorch_model.bin


500 500


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at my_checkpoints/goodreads_checkpoint.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
loading configuration file my_checkpoints/imdb_checkpoint/config.json
Model config DistilBertConfig {
  "_name_or_path": "my_checkpoints/imdb_checkpoint",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": false,
    "1": true
  },
  "initializer_range": 0.02,
  "label2id": {
    "false": 0,
    "true": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem

In [63]:
EVAL_MODEL = imdb_model
EVAL_DATA = eval_book_data
eval(EVAL_MODEL, EVAL_DATA)

({'precision': {'confidence_interval': (0.5926576324484764,
    0.6440405420897869),
   'standard_error': 0.018796319412844882,
   'score': 0.6203288490284006},
  'total_time_in_seconds': 4.68152535893023,
  'samples_per_second': 213.60559290626352,
  'latency_in_seconds': 0.00468152535893023},
 {'accuracy': {'confidence_interval': (0.6350219304265113, 0.6660577191643197),
   'standard_error': 0.01642119226961171,
   'score': 0.661},
  'total_time_in_seconds': 4.6791293658316135,
  'samples_per_second': 213.71497170014058,
  'latency_in_seconds': 0.004679129365831613},
 {'recall': {'confidence_interval': (0.7797601340662207, 0.8438372892941408),
   'standard_error': 0.020247616308683436,
   'score': 0.83},
  'total_time_in_seconds': 4.650393579155207,
  'samples_per_second': 215.03556268492454,
  'latency_in_seconds': 0.0046503935791552065},
 {'f1': {'confidence_interval': (0.6823962825750187, 0.7193803444241105),
   'standard_error': 0.01787941234477903,
   'score': 0.7100085543199316

In [64]:
predict("Omg can't believe Harry actually died at the end.", EVAL_MODEL)

True

In [65]:
predict("Omg this killed me.", EVAL_MODEL)

False

In [66]:
predict("!!!", EVAL_MODEL)

False

In [90]:
k = 10
predict(EVAL_DATA[k]["text"], EVAL_MODEL), EVAL_DATA[k]["label"], len(EVAL_DATA[k]["text"].split()), EVAL_DATA[k]["text"]

(False,
 1,
 192,
 'I like reading books based upon a concept that is very new to me and this is one of them. Funnily enough, though, as I was reading this book I mentioned it to a friend of mine and she said she might have read it but she wasn\'t sure so she asked me for details. Not wanting to give any spoilers I told her I think that if she had read it she would have remembered it as it is so different to which she replied that she actually reads a lot of books like that :D So I guess the concept is not as unique as I had thought but to me it was the first book of this kind so I found it interesting :) One of the strongest points of the book to me was that it didn\'t finish with the captives regaining their freedom as the way they\'re adjusting is equally interesting. Unfortunately, I felt like it was sometimes just a vehicle for writing maudlin observations about everyday life through the eyes of a child that is naive and "unspoilt by social norms" which caused me some eye-rolling.

## Test hypothesis: short sentences score better

In [68]:
# book reviews preprocess by sent

def int_to_bool(num):
    return num == 1

data = book_review_df["review_sentences"].apply(literal_eval).values.tolist()
data = [sent for review in data for sent in review]

# to df
book_review_sent_df = pd.DataFrame(data=data, columns=["label","text"])
book_review_sent_df["label"] = book_review_sent_df["label"].apply(int_to_bool).values.tolist()

book_review_sent_df.info()
book_review_sent_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3183227 entries, 0 to 3183226
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   label   bool  
 1   text    object
dtypes: bool(1), object(1)
memory usage: 27.3+ MB


Unnamed: 0,label,text
0,False,This is a special book.
1,False,"It started slow for about the first third, the..."
2,False,This is what I love about good science fiction...
3,False,"It is a 2015 Hugo winner, and translated from ..."
4,False,For instance the intermixing of Chinese revolu...
...,...,...
3183222,False,"I got the ISBN number that's listed as the ""au..."
3183223,False,And there's some talk about the American versi...
3183224,False,I did notice a few lines here and there which ...
3183225,False,"So, yeah."


In [69]:
sent_preproc = bert_preproc(book_review_sent_df, 100, "other")
eval_book_sent_data = Dataset.from_pandas(sent_preproc, 
                                     split="test", preserve_index=False, features=spoiler_features)

eval(imdb_model, eval_book_sent_data) 

100 100


({'precision': {'confidence_interval': (0.6211466856724185,
    0.7755666161661056),
   'standard_error': 0.05658150435830794,
   'score': 0.7205882352941176},
  'total_time_in_seconds': 0.690104128792882,
  'samples_per_second': 289.8113366599277,
  'latency_in_seconds': 0.0034505206439644095},
 {'accuracy': {'confidence_interval': (0.6201011511041631, 0.6916698874652987),
   'standard_error': 0.03366501646120691,
   'score': 0.65},
  'total_time_in_seconds': 0.68892377987504,
  'samples_per_second': 290.30787707208606,
  'latency_in_seconds': 0.0034446188993752},
 {'recall': {'confidence_interval': (0.44086175909444086, 0.5046917520702033),
   'standard_error': 0.04475901819976398,
   'score': 0.49},
  'total_time_in_seconds': 0.6830449383705854,
  'samples_per_second': 292.80650329845525,
  'latency_in_seconds': 0.0034152246918529275},
 {'f1': {'confidence_interval': (0.5260077005526892, 0.6377408232719738),
   'standard_error': 0.047597525834103475,
   'score': 0.5833333333333334},

## Train on sentence dataset

In [70]:
NAME = "final_sent_goodreads"

sent_tokenized = Dataset.from_pandas(bert_preproc(book_review_sent_df, 3000, "other"), 
                                     split="train", 
                                     preserve_index=False, 
                                     features=spoiler_features).map(tokenize, batched=True)
train_test = make_folds(10, sent_tokenized)
print(sent_tokenized)

def train_sent():
    for train_idxs, val_idxs in train_test:
        assert(len(set(train_idxs).intersection(set(val_idxs))) == 0), list(set(train_idxs).intersection(set(val_idxs)))
        fold_dataset = DatasetDict({
                "train":sent_tokenized.select(train_idxs),
                "test":sent_tokenized.select(val_idxs)
        })
        hyperparam = {
            "learning_rate":5e-5,
            "per_device_train_batch_size":8,
            "per_device_eval_batch_size":8,
            "num_train_epochs":4,
            "weight_decay":0.2,
        }
        print(len(train_idxs), len(val_idxs))
        train(NAME, fold_dataset["train"], fold_dataset["test"], hyperparam)
        break
        
# train_sent()

3000 3000


 83%|████████▎ | 5/6 [00:00<00:00, 15.68ba/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 6000
})





In [71]:
sent_model = get_model("my_checkpoints/sent_goodreads")
eval(sent_model, eval_movie_data)

loading configuration file my_checkpoints/sent_goodreads/config.json
Model config DistilBertConfig {
  "_name_or_path": "my_checkpoints/sent_goodreads",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": false,
    "1": true
  },
  "initializer_range": 0.02,
  "label2id": {
    "false": 0,
    "true": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file my_checkpoints/sent_goodreads/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weigh

({'precision': {'confidence_interval': (0.6267949204298924,
    0.6677331133349331),
   'standard_error': 0.017127604456405814,
   'score': 0.6415094339622641},
  'total_time_in_seconds': 4.911277273669839,
  'samples_per_second': 203.6130204582754,
  'latency_in_seconds': 0.004911277273669839},
 {'accuracy': {'confidence_interval': (0.6154096537376601, 0.6573069871424213),
   'standard_error': 0.015319196381591895,
   'score': 0.635},
  'total_time_in_seconds': 4.902495447546244,
  'samples_per_second': 203.9777518815467,
  'latency_in_seconds': 0.004902495447546244},
 {'recall': {'confidence_interval': (0.561926035781331, 0.6317377375187047),
   'standard_error': 0.023397106559815143,
   'score': 0.612},
  'total_time_in_seconds': 4.926526803523302,
  'samples_per_second': 202.98275841812745,
  'latency_in_seconds': 0.004926526803523302},
 {'f1': {'confidence_interval': (0.5923806774802787, 0.643556876527997),
   'standard_error': 0.01834349033995444,
   'score': 0.6264073694984647},

In [72]:
eval(goodreads_model, eval_movie_data)

({'precision': {'confidence_interval': (0.6236965406373279,
    0.6884792730416581),
   'standard_error': 0.02213215680527061,
   'score': 0.6480446927374302},
  'total_time_in_seconds': 4.895009087398648,
  'samples_per_second': 204.28971267373672,
  'latency_in_seconds': 0.004895009087398648},
 {'accuracy': {'confidence_interval': (0.5816493599492131, 0.6268100769834994),
   'standard_error': 0.01627847931745743,
   'score': 0.606},
  'total_time_in_seconds': 4.907166248187423,
  'samples_per_second': 203.78359921459224,
  'latency_in_seconds': 0.004907166248187422},
 {'recall': {'confidence_interval': (0.43628508689901213, 0.4980803674931558),
   'standard_error': 0.022566200502435,
   'score': 0.464},
  'total_time_in_seconds': 4.919306471943855,
  'samples_per_second': 203.28068716663057,
  'latency_in_seconds': 0.0049193064719438555},
 {'f1': {'confidence_interval': (0.511715839150287, 0.574288045228997),
   'standard_error': 0.02128639443387322,
   'score': 0.5407925407925408},


## Final evaluation

Extract new unseen data

In [None]:
n=150
m = 4000
test_goodreads = Dataset.from_pandas(bert_preproc(book_review_df, m+n, "book", min_n=m), 
                                     split="test", preserve_index=False, features=spoiler_features)
test_imdb = Dataset.from_pandas(bert_preproc(movie_review_df, m+n, "movie", min_n=m), 
                                     split="test", preserve_index=False, features=spoiler_features)

In [None]:
eval(goodreads_model, test_goodreads)

In [None]:
eval(goodreads_model,test_imdb)

In [None]:
eval(sent_model, test_goodreads)

In [None]:
eval(sent_model, test_imdb)

In [None]:
eval(imdb_model, test_imdb)

In [None]:
eval(imdb_model, test_goodreads)