In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import re 
from tqdm import tqdm
import wandb
import json
import os
from model import OrdinalRegressionModel
from sklearn.metrics import classification_report, f1_score, mean_squared_error
import collections
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_model(model_class, labels_filename, model_filename, device):
    # load label mappings
    with open(labels_filename) as f:
        maps = json.load(f)

    label2id = {k: int(v) for k, v in maps["label2id"].items()}
    id2label = {int(k): int(v) for k, v in maps["id2label"].items()}

    # recreate model with correct num_classes
    num_classes = len(label2id)
    model = model_class(model_name="distilbert-base-uncased", num_classes=num_classes)
    model.load_state_dict(torch.load(model_filename, map_location=device))
    model.to(device)
    model.eval()

    print("Model loaded with", num_classes, "classes.")
    return model, label2id, id2label


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model, label2id, id2label = load_model(OrdinalRegressionModel, "labels-20000.json", "model-20000.pt", device)

Model loaded with 25 classes.


In [5]:
class EvaluationTextByYearTestDataset(Dataset):
    def __init__(self, df, tokenizer, decade_to_idx, max_len=128):
        self.samples = []
        self.has_labels = "year" in df.columns and decade_to_idx is not None
        self.has_type = "type" in df.columns
        
        for _, row in df.iterrows():
            text = str(row['line'])

            # removing chunking for this bc then it explodes the test dataset size
            enc = tokenizer(
                text,
                truncation=True,
                max_length=max_len,
                padding="max_length",
                return_attention_mask=True
            )

            sample = {
                "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            }

            if self.has_labels:
                decade = (int(row["year"]) // 10) * 10
                sample["label"] = torch.tensor(decade_to_idx[decade], dtype=torch.long)
            if self.has_type:
                sample["book"] = torch.tensor(row["type"] == "book", dtype=torch.bool)

            self.samples.append(sample)

    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        return self.samples[idx]

In [6]:

def preprocess_and_label(df, tokenizer, max_len=128, stride=64):
    # Clean text
    df["line"] = df["line"].astype(str).apply(lambda s: re.sub(r"\s+", " ", s.lower().strip()))
    df = df[df["line"].str.len() > 0].reset_index(drop=True)

    # years to decades
    df["year"] = (df["year"].astype(int) // 10) * 10
    all_decades = sorted(df["year"].unique())
    decade_to_idx = {decade: i for i, decade in enumerate(all_decades)}
    idx_to_decade = {i: decade for decade, i in decade_to_idx.items()}
    df["label"] = df["year"].map(decade_to_idx)

    # Create dataset
    dataset = EvaluationTextByYearTestDataset(df, tokenizer, decade_to_idx, max_len=max_len)
    return dataset, len(all_decades), decade_to_idx, idx_to_decade

In [7]:
# code to load test csv as dataloader and do predictions on it 
test_df = pd.read_csv("test.csv")
# randomly sampling because there are 516,506 entries in test.csv and thats wayy too many to manageably test on cpu 
test_df = test_df.sample(n=20000, random_state=12).reset_index(drop=True)
test_dataset, num_decades, decade_to_idx, idx_to_decade = preprocess_and_label(test_df, AutoTokenizer.from_pretrained("distilbert-base-uncased"))
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=False)

In [8]:
print(test_dataset[:5])

[{'input_ids': tensor([  101,  6535,  7249,  8384,  2024,  3098,  5628,  2005,  1996,  4425,
         1997,  2367, 21955,  1012,  2012,  1996, 14575,  2057,  2156,  1037,
         2554,  2551,  1037, 18112,  1010,  1037,  3573,  1998,  1037,  6501,
         2449,  1012,  2178,  2554,  2081,  1037,  3112,  1997,  2551,  1037,
        18112,  1010,  1037,  3573,  2007,  2195,  5628,  1010,  1037,  2740,
         5427,  2533,  2007,  3174,  7435,  1998,  1037, 21459, 26572, 20464,
         5498,  2278,  1010,  5819,  1996,  1000,  1041, 29206, 15876, 14277,
         1000,  5324, 21255,  2037, 10336,  1997,  3747, 14678,  1012,  1999,
         1996,  4731,  1998,  3760,  4865,  1010,  1996, 26512,  1997,  2522,
         1011,  3169,  2024,  2172,  2062, 10975, 12672,  3372,  1012,  1037,
         2210, 18112,  2007,  1037,  2200,  2235,  3573,  2003,  1996,  6623,
         2927,  1010,  1998,  2130,  2023,  2003,  2025,  2467,  3144,  1012,
         2059,  2045,  2024,  1996,  2210,  3182,

In [9]:
def corn_decode(logits, threshold=0.5):
    probs = torch.sigmoid(logits)
    return torch.sum(probs > threshold, dim=1)

def predict_on_test(model, dataloader, id2label, device):
    model.eval()
    preds = []

    torch.set_grad_enabled(False)

    for batch in tqdm(dataloader, desc="Predicting", total=len(dataloader)):
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        pred_ids = corn_decode(logits).cpu().tolist()

        pred_decades = [id2label[i] for i in pred_ids]
        preds.extend(pred_decades)

    return preds

In [10]:
test_predictions = predict_on_test(model, test_loader, idx_to_decade, device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting: 100%|██████████| 19842/19842 [15:37<00:00, 21.17it/s]


In [11]:
true_labels = [int(item["label"].int()) for item in test_dataset]
predicted_label_indices = [decade_to_idx[pred] for pred in test_predictions]

# MOST FREQUENT ELEMENT BASELINE
true_c = collections.Counter(true_labels)
mode = true_c.most_common(1)[0][0]
most_freq_baseline = [mode for _ in range(len(true_labels))]

macro_f1_mode = f1_score(true_labels, most_freq_baseline, average='macro')
print(f"Most frequent element ({mode})'s macro F1:", macro_f1_mode)

# RANDOM CHOICE BASELINE
random.seed(12)
choices = list(idx_to_decade.keys())
random_baseline = random.choices(choices, k=len(true_labels))

macro_f1_random = f1_score(true_labels, random_baseline, average='macro')
print(f"Random choice's macro F1:", macro_f1_random)

# MODEL F1
macro_f1_predictions = f1_score(true_labels, predicted_label_indices, average='macro')
print("Model's macro F1:", macro_f1_predictions)
# print("\nFull classification report:")
# print(classification_report(true_labels, predicted_label_indices))
print(dict(sorted(collections.Counter(predicted_label_indices).items())))
print(dict(sorted(true_c.items())))

Most frequent element (24)'s macro F1: 0.01674092376359941
Random choice's macro F1: 0.02882188329460911
Model's macro F1: 0.3562043453200865
{0: 22, 1: 1, 2: 26, 3: 82, 4: 334, 5: 790, 6: 552, 7: 526, 8: 655, 9: 640, 10: 1114, 11: 399, 12: 246, 13: 386, 14: 7534, 15: 92, 16: 25, 17: 29, 18: 102, 19: 104, 20: 383, 21: 261, 22: 104, 23: 73, 24: 5362}
{0: 21, 1: 51, 2: 58, 3: 101, 4: 357, 5: 1003, 6: 684, 7: 491, 8: 1064, 9: 871, 10: 939, 11: 643, 12: 965, 13: 1256, 14: 3216, 15: 1739, 16: 19, 17: 15, 18: 259, 19: 158, 20: 352, 21: 208, 22: 115, 23: 6, 24: 5251}


In [12]:
print(mean_squared_error(true_labels, predicted_label_indices))

10.50201592581393


In [33]:
true_century_labels = [(idx_to_decade[x] // 100) * 100 for x in true_labels]
predicted_century_labels = [(idx_to_decade[x] // 100) * 100 for x in predicted_label_indices]
century_f1_predictions = f1_score(true_century_labels, predicted_century_labels, average='macro')
# print(collections.Counter(true_century_labels))
# print(collections.Counter(predicted_century_labels))
print("Model's macro F1 by century:", century_f1_predictions)

most_freq_century_baseline = [(idx_to_decade[x] // 100) * 100 for x in most_freq_baseline]
random_century_baseline = [(idx_to_decade[x] // 100) * 100 for x in random_baseline]
print("Most freq baseline by century:", f1_score(true_century_labels, most_freq_century_baseline, average='macro'))
print("Random baseline by century:", f1_score(true_century_labels, random_century_baseline, average='macro'))

Model's macro F1 by century: 0.6191372111423143
Most freq baseline by century: 0.08370461881799705
Random baseline by century: 0.15942687213082418


In [14]:
# split test dataset into books and movies and predictions to see if f1 score differs there
book_indices = []
movie_indices = []
for i, sample in enumerate(test_dataset.samples):
    if "book" in sample:
        if sample["book"].item():
            book_indices.append(i)
        else:
            movie_indices.append(i)

true_book_labels = [true_labels[x] for x in book_indices]
true_movie_labels = [true_labels[x] for x in movie_indices]

predicted_book_labels = [predicted_label_indices[x] for x in book_indices]
predicted_movie_labels = [predicted_label_indices[x] for x in movie_indices]

book_f1_predictions = f1_score(true_book_labels, predicted_book_labels, average='macro')
print("Model's book F1:", book_f1_predictions)

movie_f1_predictions = f1_score(true_movie_labels, predicted_movie_labels, average='macro')
print("Model's movie F1:", movie_f1_predictions)

Model's book F1: 0.3519307857084939
Model's movie F1: 0.04335903357313307


In [15]:
def predict_decade(model, tokenizer, id2label, text, device="cpu", max_len=128):
    model.eval()

    # preprocess text
    text = str(text).lower().strip()

    # tokenize
    enc = tokenizer(
        text,
        truncation=True,
        max_length=max_len,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

    # CORN decode
    probs = torch.sigmoid(logits)
    pred_id = int(torch.sum(probs > 0.5, dim=1).item())

    # map to decade
    decade = id2label[pred_id]
    return decade


In [23]:
text = "“Hateful day when I received life! Accursed creator! Why did you form a monster so hideous that even you turned from me in disgust? God, in pity, made man beautiful and alluring, after his own image; but my form is a filthy type of yours, more horrid even from the very resemlance. Satan had his companions, fellow-devils, to admire and encourage him; but I am solitary and abhorred.'"
predicted_decade = predict_decade(model, AutoTokenizer.from_pretrained("distilbert-base-uncased"), idx_to_decade, text, device)
print("Predicted decade:", predicted_decade)


Predicted decade: 1910


In [27]:
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife"
predicted_decade = predict_decade(model, AutoTokenizer.from_pretrained("distilbert-base-uncased"), idx_to_decade, text, device)
print("Predicted decade:", predicted_decade)


Predicted decade: 1900


In [25]:
text = "Look, I didn’t want to be a half-blood. If you’re reading this because you think you might be one, my advice is: close this book right now. Believe whatever lie your mom or dad told you about your birth, and try to lead a normal life."
predicted_decade = predict_decade(model, AutoTokenizer.from_pretrained("distilbert-base-uncased"), idx_to_decade, text, device)
print("Predicted decade:", predicted_decade)

Predicted decade: 1990
