### Imports and configs

In [20]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
import torch

In [24]:
from torch import cuda

# Run-wide settings, gathered in one place so later cells read them by key.
# `learning_rates` has five entries, the same count as `epochs` -- presumably
# one learning rate per epoch (TODO confirm against the training loop).
# `device` selects the GPU when CUDA is available and the CPU otherwise.
config = dict(
    model_name='google/bigbird-roberta-base',
    max_length=1024,
    train_batch_size=4,
    valid_batch_size=4,
    epochs=5,
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
    device='cuda' if cuda.is_available() else 'cpu',
)

In [15]:
# Read the whole labelled dataset from the working directory, then show the
# first five rows as a quick sanity check of the columns.
full_df = pd.read_csv(filepath_or_buffer='./dataset.csv')
full_df.head(n=5)

Unnamed: 0,id,content,labels
0,73DC1D49FAD5,eletoral college can be a very good thing caus...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...
1,D840AC3957E5,"STUDENT_NAME\n\nADDRESS_NAME\n\nFebruary 22, 2...",[ 0 0 0 0 0 0 0 3 4 4 4 4 4 4 4 ...
2,753E320B186B,In my opinion as a student: I don't agree at t...,[1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2...
3,C2ABDAC2BC2C,When it comes to at home learning and attendin...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...
4,B2DDBAAC084C,Y\n\nou can ask many different people for advi...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...


In [26]:
# Tag names listed in class-id order: position in this list == integer class id.
# Apart from the first entry, the names come in B-/I- pairs, one pair per class.
id2label = dict(enumerate([
    'Unnanotated',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Evidence', 'I-Evidence',
    'B-Claim', 'I-Claim',
    'B-Concluding_Statement', 'I-Concluding_Statement',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
]))

# Inverse lookup (tag name -> class id), derived so the two tables cannot drift apart.
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [27]:
class dataset(Dataset):
    """Per-essay token-classification dataset.

    Each item tokenizes one row's text (split on whitespace) to a fixed length.
    In training mode the item also carries a ``labels`` tensor; in validation
    mode it carries a ``wids`` tensor mapping every token back to its word.

    Args:
        df: DataFrame whose index supports ``df[col][i]`` for
            ``i in range(len(df))``. The text is read from the ``text`` column
            if present, otherwise from ``content``; labels are read from
            ``entities`` if present, otherwise from ``labels``.
        tokenizer: callable invoked as
            ``tokenizer(words, is_split_into_words=True, ...)``; its result must
            provide ``word_ids()``, ``items()`` and item assignment.
        max_len: length every encoding is padded / truncated to.
        get_word_ids: ``False`` builds training items (with ``labels``);
            ``True`` builds validation items (with ``wids``, no labels).

    Raises:
        KeyError: at construction time if no usable text (or, in training
            mode, label) column is found in ``df``.
    """

    # Accepted column names, in order of preference. The first name of each
    # pair is the historical one; the second is what this notebook's CSV uses.
    _TEXT_COLUMNS = ('text', 'content')
    _LABEL_COLUMNS = ('entities', 'labels')

    def __init__(self, df, tokenizer, max_len, get_word_ids):
        self.len = len(df)
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_word_ids = get_word_ids  # for validation
        self.text_col = self._pick_column(df, self._TEXT_COLUMNS)
        # Validation frames do not need labels, so only resolve them for training.
        self.label_col = None if get_word_ids else self._pick_column(df, self._LABEL_COLUMNS)

    @staticmethod
    def _pick_column(df, candidates):
        """Return the first name in ``candidates`` that is a column of ``df``."""
        for name in candidates:
            if name in df.columns:
                return name
        raise KeyError(
            f'expected one of the columns {candidates} in the dataframe, '
            f'found {list(df.columns)}'
        )

    @staticmethod
    def _split_word_labels(raw):
        """Turn one label cell into an indexable sequence of per-word labels.

        A cell read back from CSV is the printed form of an array, e.g.
        ``"[ 3 4 4 ]"``; it is split into its whitespace-separated entries.
        Anything that is not a string is returned unchanged.
        """
        if not isinstance(raw, str):
            return raw
        if '...' in raw:
            # An elided array repr has lost labels and cannot be recovered.
            raise ValueError('label cell was saved in truncated form ("..."); re-export the labels in full')
        return raw.strip().strip('[]').split()

    @staticmethod
    def _label_to_id(label):
        """Map one word label -- a tag name or an integer id -- to its integer id."""
        if isinstance(label, str):
            if label.strip().isdigit():
                return int(label)
            return label2id[label]
        return int(label)

    def __getitem__(self, index):
        text = self.df[self.text_col][index]

        encoding = self.tokenizer(text.split(),
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        # word_ids[i] is the index of the word token i came from, or None.
        word_ids = encoding.word_ids()

        if not self.get_word_ids:
            # NOTE(review): assumes one label per whitespace-separated word of
            # the text -- confirm against how the labels column was produced.
            word_labels = self._split_word_labels(self.df[self.label_col][index])
            # Every sub-token inherits its word's label; tokens that belong to
            # no word get -100 (presumably so the loss skips them).
            encoding['labels'] = [
                -100 if word_idx is None else self._label_to_id(word_labels[word_idx])
                for word_idx in word_ids
            ]

        item = {k: torch.as_tensor(v) for k, v in encoding.items()}
        if self.get_word_ids:
            # None cannot be stored in a tensor, so "no word" is encoded as -1.
            item['wids'] = torch.as_tensor([-1 if w is None else w for w in word_ids])

        return item

    def __len__(self):
        return self.len

### Create training and validation datasets + DataLoaders

In [16]:
# Fraction of rows held out for validation, and the seed that makes the
# train/validation split repeatable between runs.
validation_split_size, dataset_split_seed = 0.1, 1

#### Split dataset into training and validation

In [17]:
from sklearn.model_selection import train_test_split

# Seeded split so the same rows land in validation on every run.
train_df, val_df = train_test_split(
    full_df,
    test_size=validation_split_size,
    random_state=dataset_split_seed,
)

# Training only needs the text and its labels (this drops the id column);
# validation keeps every column. Both frames get a fresh 0..n-1 index because
# the dataset class looks rows up by that index.
train_df = train_df[['content', 'labels']].reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print("train_df shape: ", train_df.shape)
print("val_df shape: ", val_df.shape)

train_df shape:  (14034, 2)
val_df shape:  (1560, 3)


#### Create DataLoaders

In [29]:
# Create the folder that will hold the pretrained model files.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError on every run after the first.
os.makedirs('model', exist_ok=True)
# Upload pretrained model here. For this project, we are using HuggingFace's BigBird.

In [30]:
# Load the tokenizer from the files placed in the local ./model folder.
tokenizer = AutoTokenizer.from_pretrained('./model')

# Training items carry labels (get_word_ids=False); validation items carry
# word ids instead (get_word_ids=True).
training_set = dataset(
    df=train_df,
    tokenizer=tokenizer,
    max_len=config['max_length'],
    get_word_ids=False,
)
testing_set = dataset(
    df=val_df,
    tokenizer=tokenizer,
    max_len=config['max_length'],
    get_word_ids=True,
)

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [31]:
# Keyword arguments for the training DataLoader: batches are reshuffled.
train_params = dict(
    batch_size=config['train_batch_size'],
    shuffle=True,
    num_workers=2,
)

# Keyword arguments for the validation DataLoader: original row order is kept.
validation_params = dict(
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=2,
)

In [32]:
# Wrap each dataset in a DataLoader using the parameter sets defined above.
training_loader = DataLoader(dataset=training_set, **train_params)
validation_loader = DataLoader(dataset=testing_set, **validation_params)

Save DataLoaders to be able to quickly load them later.

In [33]:
# Write both loaders to the working directory.
# NOTE(review): this serializes the loader objects themselves (dataset,
# dataframe and tokenizer included), so reading them back presumably needs the
# `dataset` class to be defined first -- confirm in the notebook that loads them.
torch.save(obj=training_loader, f='training_loader.pth')
torch.save(obj=validation_loader, f='validation_loader.pth')