## Imports and Installations

In [7]:
import pandas as pd
import numpy as np

import torch
from torch import nn

from torch.optim import Adam
from tqdm import tqdm

from transformers import GPT2Tokenizer
from transformers import GPT2ForSequenceClassification, GPT2Config

In [9]:
import platform
platform.platform()

torch.has_mps

True

## Load Data for Training

In [10]:
train_data_path = 'Data/BF_Subject_Clean_Call_Number.csv'
df = pd.read_csv(train_data_path, sep='\t')
df = df.drop("Unnamed: 0", axis=1)

df.head()

Unnamed: 0,Title,Call_Number
0,jung c g carl gustav 18751961 psychoanalysis,174.0
1,witchcraft germany braunschweig region demon...,1583.0
2,psychology qualitative research ethnology me...,76.5
3,selling psychology applied,636.0
4,associations institutions etc stress psycholog...,175.5


In [11]:
vals = pd.unique(df['Call_Number'])
print(len(vals))
labels = {value: key for (key, value) in enumerate(vals)}
print(labels)

658
{'174': 0, '1583': 1, '76.5': 2, '636': 3, '175.5': 4, '121': 5, '723': 6, '1598': 7, '210': 8, '870': 9, '711': 10, '698.5': 11, '201': 12, '323': 13, '311': 14, '713': 15, '319': 16, '431': 17, '176': 18, '637': 19, '18.02': 20, '175': 21, '633': 22, '632.5': 23, '1576': 24, '173': 25, '710': 26, '698': 27, '378': 28, '697': 29, '1325': 30, '408': 31, '432': 32, '113': 33, '325': 34, '448': 35, '639': 36, '295': 37, '371': 38, '207': 39, '721': 40, '1593': 41, '692.2': 42, '1407': 43, '724': 44, '575': 45, '318': 46, '1779': 47, '341': 48, '1999': 49, '692.5': 50, '162': 51, '39': 52, '321': 53, '441': 54, '38': 55, '714': 56, '1622': 57, '447': 58, '109': 59, '76.4': 60, '1791': 61, '241': 62, '1611': 63, '698.3': 64, '316.6': 65, '1561': 66, '105': 67, '315': 68, '701': 69, '335': 70, '698.95': 71, '455': 72, '1793': 73, '201.3': 74, '38.5': 75, '724.3': 76, '1434': 77, '233': 78, '774': 79, '720': 80, '717': 81, '458': 82, '531': 83, '81': 84, '31': 85, '1591': 86, '39.9': 87,

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

class dataset(torch.utils.data.Dataset):
  def __init__(self, df):

    

    self.labels = [labels[label] for label in df['Call_Number']]
    self.texts = [tokenizer(text,
                            padding='max_length',max_length=512, truncation=True,
                            return_tensors="pt") for text in df['Title']]

  def __len__(self):
      return len(self.labels)

  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)

    return batch_texts, batch_y
  def classes(self):
    return self.labels

  def get_batch_labels(self, idx):
    return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
    return self.texts[idx]

    

In [13]:
np.random.seed(112)
rand = np.random.randint(1,high=100)
print(rand)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=rand),
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train), len(df_val), len(df_test))

44
15074 1884 1885


In [14]:
df_train

Unnamed: 0,Title,Call_Number
18255,attitude psychology dogmatism,378
16478,psychology history,81
18594,goal psychology motivation psychology affect p...,504
17029,psychological tests psychometrics,38.5
15840,worth patience spirit parapsychology,1301
...,...,...
7980,psychological tests,431
17370,thought and thinking,455
10496,identity psychology identity psychology in lit...,697
8155,conduct of life early works to 1800 emotions ...,551


## Training Loop

In [22]:
def train(model, train_data, val_data, learning_rate, epoch):
  training_dataset, val_dataset = dataset(train_data), dataset(val_data)

  train_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=2, shuffle=True)
  val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

  use_cuda = torch.cuda.is_available()

  device = torch.device("cuda" if use_cuda else "cpu")

  optimizer = Adam(model.parameters(), lr=learning_rate)

  prev_val_acc = 0
  

  if use_cuda:
    model = model.cuda()

  
  for epoch_num in range(epoch):

    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      

      train_label = train_label.to(device)
      mask = train_input['attention_mask'].to(device)
      input_id = train_input['input_ids'].squeeze(1).to(device)

      model.zero_grad()

      outputs = model(input_ids=input_id, attention_mask=mask, labels=train_label)
      loss = outputs.loss
      logits = outputs.logits

      total_loss_train += loss.item()
      acc = (logits.argmax(axis=-1) == train_label).sum().item()
      total_acc_train += acc

      loss.backward()
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()

    
    total_acc_val = 0
    total_loss_val = 0

    with torch.no_grad():
      print("validating")
      for val_input, val_label in val_dataloader:
        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)
        
        outputs = model(input_ids=input_id, attention_mask=mask, labels=val_label)
        loss = outputs.loss
        logits = outputs.logits

        total_loss_val += loss.item()
        acc = (logits.argmax(axis=-1) == val_label).sum().item()
        total_acc_val += acc

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
        | Train Accuracy: {total_acc_train / len(train_data): .3f} \
        | Val Loss: {total_loss_val / len(val_data): .3f} \
        | Val Accuracy: {total_acc_val / len(val_data): .3f}')
    
    if prev_val_acc > total_acc_val / len(val_data):
      print("Breaking early because validation accuracy has decreased")
      break
    else:
      prev_val_Acc = total_acc_val / len(val_data)

In [16]:
def evaluate(model, test_data):
  test = dataset(test_data)

  test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()

  total_acc_test = 0

  with torch.no_grad():
    for test_input, test_label in test_dataloader:
      test_label = test_label.to(device)
      mask = test_input['attention_mask'].to(device)
      input_id = test_input['input_ids'].squeeze(1).to(device)
      
      outputs = model(input_ids=input_id, attention_mask=mask, labels=test_label)
      loss = outputs.loss
      logits = outputs.logits
      acc = (logits.argmax(axis=-1) == test_label).sum().item()
      total_acc_test += acc
  
  print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [17]:
if 'model' in locals():
  del model

if 'configuration' in locals():
  del configuration

configuration = GPT2Config()
configuration.num_labels = len(vals)

model = GPT2ForSequenceClassification(configuration)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id



In [23]:
LR = 1e-6
EPOCHS = 5
train(model, df_train, df_val, LR, EPOCHS)

  0%|          | 0/7537 [00:00<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
evaluate(model, df_test)