In [1]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [2]:
# Read in the data and add columns to the dataframe
book_summs = pd.read_csv('data/booksummaries.txt', header=None, sep='\t')
book_summs.columns = ['wikipedia_article_id', 'freebase_id', 'title', 'author', 'pub_date', 'genre', 'summary']

# Remove extraneous features - freebase_id, author, pub_date
book_summs = book_summs.drop(labels=['freebase_id', 'author', 'pub_date'], axis=1)
print(book_summs.head())

   wikipedia_article_id                                      title  \
0                   620                                Animal Farm   
1                   843                         A Clockwork Orange   
2                   986                                 The Plague   
3                  1756  An Enquiry Concerning Human Understanding   
4                  2080                       A Fire Upon the Deep   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
3                                                NaN   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   

                                             summary  
0   Old Major, the old boar on the Manor Farm, ca...  
1   Alex, a teenager living in near-future Englan...  
2   The text of The Plague is divided into five p...  
3   The argument of th

In [3]:
print('Size of dataset before preprocessing:', len(book_summs))

# Remove any books which don't have genres
book_summs.dropna(subset=['genre'], inplace=True)

print('Size of dataset after removing missing genres:', len(book_summs))

# Remove any books which don't have titles
book_summs.dropna(subset=['title'], inplace=True)

print('Size of dataset after removing missing titles:', len(book_summs))

# Remove any books which don't have summaries
book_summs.dropna(subset=['summary'], inplace=True)

print('Size of dataset after removing missing summaries:', len(book_summs))
print('Size of dataset after preprocessing:', len(book_summs))

Size of dataset before preprocessing: 16559
Size of dataset after removing missing genres: 12841
Size of dataset after removing missing titles: 12841
Size of dataset after removing missing summaries: 12841
Size of dataset after preprocessing: 12841


In [4]:
# Format the genre field
formatted_genres = []
genre_dict = dict()
for g in book_summs['genre']:
    subg = []
    genre_dict = eval(g)
    for k in genre_dict.keys():
        subg.append(genre_dict[k])
    formatted_genres.append(subg)
book_summs['formatted_genre'] = formatted_genres
print(book_summs.head())

   wikipedia_article_id                           title  \
0                   620                     Animal Farm   
1                   843              A Clockwork Orange   
2                   986                      The Plague   
4                  2080            A Fire Upon the Deep   
5                  2152  All Quiet on the Western Front   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                             summary  \
0   Old Major, the old boar on the Manor Farm, ca...   
1   Alex, a teenager living in near-future Englan...   
2   The text of The Plague is divided into five p...   
4   The novel posits that space around the Milky ...   
5   The book tells the story

In [5]:
genres = set()
for x in book_summs['formatted_genre']:
    genres.update(x)

new_columns = list(genres)
default_value = 0
book_summs = pd.concat([book_summs, pd.DataFrame({col: default_value for col in new_columns}, index=book_summs.index)], axis=1)
book_summs = book_summs.drop(labels=['genre'], axis=1)

In [6]:
class_df = book_summs.drop(labels=['formatted_genre'], axis=1)
print(class_df.columns)

Index(['wikipedia_article_id', 'title', 'summary', 'Soft science fiction',
       'Adventure novel', 'Ergodic literature', 'Collage', 'Ghost story',
       'Robinsonade', 'Post-holocaust',
       ...
       'Absurdist fiction', 'Non-fiction novel', 'War novel', 'Biography',
       'Science Fiction', 'Transhumanism', 'Science fantasy',
       'Autobiographical comics', 'Biopunk', 'Role-playing game'],
      dtype='object', length=230)


In [7]:
train_df, test_df = train_test_split(class_df,test_size=.2)
print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 10272
Number of rows in test set: 2569


In [8]:
feature_columns = ['wikipedia_article_id', 'title', 'summary']
label_columns = [col for col in class_df.columns if col not in feature_columns]

df_labels_train = train_df[label_columns]
df_labels_test = test_df[label_columns]

labels_list_train = df_labels_train.values.tolist()
labels_list_test = df_labels_test.values.tolist()

In [9]:
train_texts = train_df['summary'].tolist()
eval_texts = test_df['summary'].tolist()

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
eval_encodings = tokenizer(eval_texts, padding="max_length", truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [11]:
train_dataset = TextClassifierDataset(train_encodings, labels_list_train)
eval_dataset = TextClassifierDataset(eval_encodings, labels_list_test)

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=93
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_arguments = TrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

ValueError: Target size (torch.Size([16, 227])) must be the same as input size (torch.Size([16, 93]))