In [2]:
from pathlib import Path
import torch

import polars as pl

from strip_headers import strip_headers
import re

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader


In [3]:
class GutenbergDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

data_dir = Path.cwd() / "data"
metadata = pl.read_csv(data_dir / "metadata.csv")



authors = ["Goethe, Johann Wolfgang von", "Schiller, Friedrich", "Raimund, Ferdinand"]
author_mapping = {"Goethe, Johann Wolfgang von": torch.tensor([1,0,0]),
                  "Schiller, Friedrich": torch.tensor([0,1,0]),
                  "Raimund, Ferdinand": torch.tensor([0,0,1]),}
metadata = metadata.filter(pl.col("language") == "['de']")


tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-german-cased", num_labels=3)

model.to(device)
model.train()


Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
#train test split, preprocessing, tokenization and splitting into blocks in no particular order

train_encodings = {'input_ids'      : torch.tensor([], requires_grad=False, dtype=torch.long),
                   'token_type_ids' : torch.tensor([], requires_grad=False, dtype=torch.long),
                   'attention_mask' : torch.tensor([], requires_grad=False, dtype=torch.long),
                   }
train_labels = []

test_encodings = {'input_ids'      : torch.tensor([], requires_grad=False, dtype=torch.long),
                  'token_type_ids' : torch.tensor([], requires_grad=False, dtype=torch.long),
                  'attention_mask' : torch.tensor([], requires_grad=False, dtype=torch.long),
                  }
test_labels = []

val_encodings = {'input_ids'      : torch.tensor([], requires_grad=False, dtype=torch.long),
                 'token_type_ids' : torch.tensor([], requires_grad=False, dtype=torch.long),
                 'attention_mask' : torch.tensor([], requires_grad=False, dtype=torch.long),
                 }
val_labels = []


train_ids = []
test_ids = []
val_ids = []

for author in authors:
    
    #select texts by the authors from the whole gutenberg corpus
    author_ids = metadata.filter((pl.col("author") == author) & (pl.col("subjects").str.contains("Drama")))["id"].to_list()
    
    #splitting in such a way that the model has never seen any parts of the play before
    #otherwise it would probably just learn to recognize the names of the characters
    #it will still ... 
    #I will just have very bad test loss
    
    #TODO: implement name removal
    #https://stackoverflow.com/questions/53534376/removing-names-from-noun-chunks-in-spacy
    
    train_ids += author_ids[:-2]
    test_ids += [author_ids[-2]]
    val_ids += [author_ids[-1]] 
    
    
    for doc_id in author_ids:
        
        file_path = data_dir / "raw" / (doc_id + "_raw.txt")
        
        try:
            with open(file_path, "r") as in_f:
                raw_text = in_f.read()    
        except FileNotFoundError:
            print(f"Warning file not found{file_path}")
            continue
            
        #script for removing the gutenberg project headers 
        text = strip_headers(raw_text)
        
        #more custom header and footer stripping
        
        #this book has a longer appendix
        if doc_id =='PG47804':
            
            a = re.search(r'\b(Fußnoten)\b', text)
            text = text[1000:-a.start()]
            
        # for the rest we just strip another 1000 chars
        else: 
            text = text[1000:-1000]
            
        #encoding the data first 
        #since I probably want to use the full 512 tokens without doing any truncating or padding
        encoding = tokenizer(text, return_tensors="pt")
        
        # might be inefficient but fine for now
        # input ids here are token ids not doc ids ...

        encodings = {'input_ids'      : torch.split(encoding['input_ids'], 512, dim=1)[:-1],
                     'token_type_ids' : torch.split(encoding['token_type_ids'], 512, dim=1)[:-1],
                     'attention_mask' : torch.split(encoding['attention_mask'], 512, dim=1)[:-1],
                    }
                     
        
        #not dry it's pretty damp
        if doc_id in train_ids:
            train_encodings = {key : torch.cat((encodings[key][0], train_encodings[key])).clone().detach() for key in encodings}
            train_labels += [author_mapping[author].clone().detach() * len(encodings['input_ids'])]
        elif doc_id in test_ids:
            test_encodings = {key : torch.cat((encodings[key][0], test_encodings[key])).clone().detach() for key in encodings}
            test_labels += [author_mapping[author].clone().detach() * len(encodings['input_ids'])]
        elif doc_id in val_ids:
            val_encodings = {key : torch.cat((encodings[key][0], val_encodings[key])).clone().detach() for key in encodings}
            val_labels += [author_mapping[author].clone().detach() * len(encodings['input_ids'])]

In [5]:
train_dataset = GutenbergDataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = GutenbergDataset(test_encodings, test_labels)
val_dataset = GutenbergDataset(val_encodings, val_labels)

In [None]:
from transformers import  Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)



trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 18
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 109083651
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
