In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Adafactor, get_linear_schedule_with_warmup, AdamW
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import random
from tqdm.notebook import tqdm

In [2]:
# Training configuration (tunable knobs for this notebook)
epochs = 2            # number of full passes made by the training loop
batch_size = 32       # examples per batch for both DataLoaders
learning_rate = 1e-4  # only referenced by the commented-out AdamW alternative

# Helper functions

In [3]:
def set_seed(seed_val=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices), so that repeated runs draw the same
    random numbers.

    Args:
        seed_val: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    # Seed all GPUs rather than only the current one; this call is safe to make
    # on machines without CUDA.
    torch.cuda.manual_seed_all(seed_val)

In [4]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Data

In [5]:
# Read the raw CSV, discard every row containing a missing value, and
# lower-case the label strings.
df = (
    pd.read_csv('data/filtered.csv')
    .dropna()
    .assign(label=lambda frame: frame['label'].apply(lambda value: value.lower()))
)

In [6]:
# Split the data in two stages: 60% goes to training, then the held-out 40%
# is halved into an evaluation set and a test set.
train_df, eval_df = train_test_split(df, test_size=0.4, random_state=2021)
# The second split must partition the held-out rows only; splitting `df`
# again would place training rows in the evaluation and test sets.
# NOTE(review): random_state=20201 looks like a typo for 2021 - kept as-is; confirm.
eval_df, test_df = train_test_split(eval_df, test_size=0.5, random_state=20201)

# Sanity check: the three subsets together account for every row exactly once.
assert len(train_df) + len(eval_df) + len(test_df) == len(df)

In [7]:
# Tokenizer for the 't5-small' checkpoint. It is handed to T5Dataset for
# encoding and used by the evaluation loop to decode generated ids.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [8]:
class T5Dataset(Dataset):
    """Map-style dataset that tokenizes DataFrame rows for a T5 model.

    Each row must provide a ``'text'`` column (model input) and a ``'label'``
    column (target string).

    Args:
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        df: DataFrame holding the ``'text'`` and ``'label'`` columns.
        max_length: Length every sequence is padded or truncated to.
        set_type: ``'train'`` yields tokenized targets; any other value yields
            the raw label string instead.
    """

    def __init__(self, tokenizer, df, max_length=100, set_type='train'):
        super().__init__()

        self.tokenizer = tokenizer
        self.df = df
        self.max_length = max_length
        self.set_type = set_type

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.df.shape[0]

    def _encode(self, text):
        """Tokenize ``text`` and return ``(ids, mask)`` as 1-D long tensors."""
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        ids = encoded['input_ids'].squeeze(0).long()
        mask = encoded['attention_mask'].squeeze(0).long()
        return ids, mask

    def __getitem__(self, index):
        """Return the encoded example stored at positional ``index``.

        Returns:
            For ``set_type == 'train'``: a dict with ``'input_ids'``,
            ``'src_mask'``, ``'target_ids'`` and ``'target_mask'`` tensors.
            Otherwise: a dict with ``'input_ids'``, ``'src_mask'`` and the
            raw ``'label'`` string.
        """
        input_ids, src_mask = self._encode(self.df['text'].iloc[index])

        # Look the label up before branching: both branches need it, and the
        # evaluation branch previously referenced it without assigning it.
        labels = self.df['label'].iloc[index]

        if self.set_type == 'train':
            target_ids, target_mask = self._encode(labels)
            return {
                'input_ids': input_ids,
                'src_mask': src_mask,
                'target_ids': target_ids,
                'target_mask': target_mask
            }

        return {
            'input_ids': input_ids,
            'src_mask': src_mask,
            'label': labels
        }

In [9]:
# Datasets: the training set uses the default set_type ('train'); the
# evaluation set is built with set_type='test'.
train_dataset = T5Dataset(tokenizer, train_df)
eval_dataset = T5Dataset(tokenizer, eval_df, set_type='test')

# Samplers: random order for training, sequential order for evaluation.
train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)

# Loaders: both use the notebook-wide batch_size.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    sampler=eval_sampler,
)

# Prepare T5 model

In [10]:
# Load the pretrained 't5-small' checkpoint as a conditional-generation model.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Move the model to the selected device. As the cell's last expression, this
# also makes the notebook display the module's repr.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [11]:
# Adafactor with no explicit learning rate (lr=None): relative_step and
# warmup_init are both enabled, so no external scheduler is created here.
optimizer = Adafactor(
    model.parameters(),
    relative_step=True,
    warmup_init=True,
    lr=None,
)

# Alternative setup (disabled): AdamW with a linear warmup/decay schedule.
# optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
# total_steps = len(train_dataloader) * epochs
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=500, num_training_steps=total_steps
# )

In [12]:
def one_hot(batch_text):
    """Binarize a batch of label strings.

    Args:
        batch_text: Iterable of strings.

    Returns:
        A list with 1 for every entry equal to 'positive' and 0 for anything else.
    """
    return [int(text == 'positive') for text in batch_text]

In [None]:
# Seed all RNGs so batch shuffling and dropout are repeatable when this cell
# is re-run.
set_seed()

for epoch in tqdm(range(epochs)):
    # ---- Training ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader):
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attn_mask = batch['src_mask'].to(device)

        # Replace padding ids in the targets with -100 so those positions are
        # ignored by the model's loss.
        lm_labels = batch['target_ids'].to(device)
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()

        outputs = model(
            input_ids=b_src_input_ids,
            attention_mask=b_src_attn_mask,
            labels=lm_labels
        )

        # With labels supplied, the first element of the output is the loss.
        loss = outputs[0]
        total_loss += loss.item()

        # Back prop
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print('Training loss:', avg_train_loss)

    # ---- Validation ----
    model.eval()

    y_true = []
    y_pred = []

    for batch in tqdm(eval_dataloader):
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attn_mask = batch['src_mask'].to(device)

        # Pass the attention mask so padded positions are not attended to
        # during generation.
        outputs = model.generate(
            input_ids=b_src_input_ids,
            attention_mask=b_src_attn_mask
        )
        decoded_output = tokenizer.batch_decode(outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True)

        # The evaluation dataset emits the raw strings under 'label'. Extend
        # (not append) keeps both lists flat, one entry per example, which is
        # what accuracy_score expects.
        y_true.extend(one_hot(batch['label']))
        y_pred.extend(one_hot(decoded_output))

    print('Acc: {}'.format(accuracy_score(y_true, y_pred)))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=29948.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))
