In [1]:
# Notebook-wide training configuration.

# Token length that each encoded sequence pair is padded/truncated to.
# NOTE(review): confirm the DataLoader cell uses this constant rather than its own literal.
MAX_LEN = 64
# Batch size passed to the train/val/test DataLoaders.
BATCH_SIZE = 1024
# Upper bound on training epochs (Trainer max_epochs); also used to size the LR schedule.
MAX_EPCH = 20

In [None]:
import sys

# Record the interpreter build this notebook was executed with.
print(sys.version)

3.9.1 (default, Dec 11 2020, 09:29:25) [MSC v.1916 64 bit (AMD64)]


In [3]:
import os
import torch
import torch.utils.data
import torch.nn.functional as F
import pytorch_lightning as pl

import numpy as np
import pandas as pd
import json
import logging

In [6]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaModel, AdamW, get_linear_schedule_with_warmup

In [7]:
class SeqDataset(Dataset):
    """Map-style dataset of sequence pairs, tokenized on access for a RoBERTa model.

    Args:
        data: Anything ``pd.DataFrame`` accepts that yields the three columns
            ``Seq1``, ``Seq2`` and ``Yield`` (in that order for row-like input).
        max_len: Length every encoded pair is padded/truncated to; also handed
            to the tokenizer when it is loaded.
        with_yield: When True, ``__getitem__`` also returns the target value.

    Items are ``(token_ids, attn_masks, yld)`` when ``with_yield`` is True and
    ``(token_ids, attn_masks)`` otherwise.
    """

    def __init__(self, data, max_len, with_yield=True):
        frame = pd.DataFrame(data, columns=['Seq1', 'Seq2', 'Yield'])
        # Samplers address items by position 0..len-1, so force a positional
        # index; a DataFrame input may arrive with arbitrary labels.
        frame = frame.reset_index(drop=True)
        # Targets are scaled by 100 (presumably fraction -> percent; confirm
        # against how the splits were written).
        frame['Yield'] = frame['Yield'] * 100
        self.data = frame

        # Initialize the tokenizer with the requested length instead of a
        # hard-coded 64, so it agrees with the max_length used in __getitem__.
        self.tokenizer = RobertaTokenizerFast.from_pretrained("tokenizer/", max_len=max_len)

        self.max_len = max_len
        self.with_yield = with_yield

    def __len__(self):
        """Number of sequence pairs in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the pair at position ``index`` and return its tensors."""
        # Selecting sequence1 and sequence2 at the specified position
        seq1 = str(self.data.at[index, 'Seq1'])
        seq2 = str(self.data.at[index, 'Seq2'])

        # Encode both sequences as a single pair input
        encoded_pair = self.tokenizer(seq1, seq2,
                                      padding='max_length',         # Pad to max_length
                                      truncation=True,              # Truncate to max_length
                                      max_length=self.max_len,
                                      return_tensors='pt')          # Return torch.Tensor objects

        # The tokenizer returns a leading batch dimension of 1; drop it so the
        # DataLoader can stack individual items itself.
        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)

        if not self.with_yield:
            return token_ids, attn_masks

        yld = self.data.at[index, 'Yield']
        return token_ids, attn_masks, yld

In [8]:
class RoBERTaFineTuner(pl.LightningModule):
    """RoBERTa encoder with a single linear regression head, trained with MSE loss.

    Args:
        roberta_model_path: Path or name passed to ``RobertaModel.from_pretrained``.
        freeze_roberta: When True, the encoder weights are frozen and only the
            regression head receives gradients.
        hidden_size: Input width of the regression head; must equal the
            encoder's ``config.hidden_size``.
        lr: Peak learning rate for AdamW.

    Raises:
        ValueError: If ``hidden_size`` does not match the loaded encoder.
    """

    # Learning-rate schedule constants (counted in optimizer steps).
    WARMUP_STEPS = 500
    # NOTE(review): hard-coded estimate of optimizer steps per epoch — confirm it
    # matches len(train_loader) (and any gradient accumulation) for this dataset.
    STEPS_PER_EPOCH = 250

    def __init__(self, roberta_model_path, freeze_roberta=False, hidden_size=256, lr=2e-5):
        super().__init__()
        self.roberta_layer = RobertaModel.from_pretrained(roberta_model_path)

        # Fail early with a clear message rather than with a shape error deep
        # inside forward().
        encoder_dim = self.roberta_layer.config.hidden_size
        if hidden_size != encoder_dim:
            raise ValueError(
                f"hidden_size={hidden_size} does not match the encoder's "
                f"hidden size ({encoder_dim})"
            )

        self.hidden_size = hidden_size
        self.lr = lr
        # Raw test-time predictions, one tensor per test batch. Only ever
        # appended to, so clear it between separate test runs.
        self.out_predictions = []

        # Freeze RoBERTa layers and only train the regression head weights
        if freeze_roberta:
            for p in self.roberta_layer.parameters():
                p.requires_grad = False

        # Regression head
        self.dense = nn.Linear(self.hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids, attn_masks):
        """Return one regression value per input row, shape ``(batch, 1)``."""
        roberta_out = self.roberta_layer(input_ids=input_ids, attention_mask=attn_masks)
        # Only the pooled representation feeds the head; per-layer hidden states
        # are not read, so the encoder need not be configured to return them.
        pooled = roberta_out['pooler_output']
        return self.dense(self.dropout(pooled))

    def _predict_and_loss(self, batch):
        """Run the model on a ``(token_ids, attn_masks, yields)`` batch and compute MSE."""
        token_ids, attn_masks, yields = batch
        preds = self(token_ids, attn_masks)
        # squeeze(-1) keeps the batch dimension even for a batch of one; the
        # cast keeps the targets in the same dtype as the predictions.
        loss = F.mse_loss(preds.squeeze(-1), yields.to(preds.dtype))
        return preds, loss

    def training_step(self, batch, batch_idx):
        _, loss = self._predict_and_loss(batch)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """AdamW with linear warmup followed by linear decay, advanced every optimizer step."""
        optimizer = AdamW(self.parameters(), lr=self.lr, weight_decay=1e-2)
        t_total = self.STEPS_PER_EPOCH * MAX_EPCH
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=self.WARMUP_STEPS,
            num_training_steps=t_total,
        )
        # The schedule is defined in optimizer steps. Lightning advances a bare
        # scheduler once per epoch, which would leave the learning rate at 0 for
        # the whole first epoch, so request per-step updates explicitly.
        return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}]

    def validation_step(self, batch, batch_idx):
        _, val_loss = self._predict_and_loss(batch)
        self.log('val_loss', val_loss)
        return val_loss

    def test_step(self, batch, batch_idx):
        test_out, test_loss = self._predict_and_loss(batch)
        # Keep the raw (batch, 1) predictions for later inspection.
        self.out_predictions.append(test_out.detach())
        self.log('test_loss', test_loss)
        return test_loss

In [9]:
# TODO: Replace json with dd like in the other files
def _read_json(path):
    """Load one JSON file, closing the file handle as soon as it is parsed."""
    with open(path) as fh:
        return json.load(fh)


# Sequence-pair records for each split.
train = _read_json('splits/train.json')
val = _read_json('splits/val.json')
test = _read_json('splits/test.json')

# Target arrays stored alongside the splits.
y_train = np.load('splits/y_train.npy')
y_val = np.load('splits/y_val.npy')
y_test = np.load('splits/y_test.npy')

In [10]:
# Make float64 the default floating-point dtype for tensors and modules created
# from here on. `torch.set_default_tensor_type` is deprecated and, for
# DoubleTensor, only sets this same default, so one call is enough.
torch.set_default_dtype(torch.double)

In [11]:
# Worker processes per DataLoader.
# NOTE(review): the kernel banner above reports a Windows build; confirm that
# num_workers > 0 works there with a Dataset class defined inside the notebook.
NUM_WORKERS = 8


def _make_loader(records, shuffle):
    """Wrap one split in a SeqDataset and a DataLoader using the notebook-wide settings."""
    dataset = SeqDataset(records, max_len=MAX_LEN, with_yield=True)
    return DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=shuffle)


# Only the training split is shuffled; validation and test keep file order.
train_loader = _make_loader(train, shuffle=True)
val_loader = _make_loader(val, shuffle=False)
test_loader = _make_loader(test, shuffle=False)

In [None]:
# Checkpointing and early stopping both watch the metric logged as 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    dirpath='lightning_checkpoints_roberta_finetune/',
    filename='roberta-finetune-{epoch:02d}-{val_loss:.2f}',
    monitor='val_loss',
    mode='min',
    save_top_k=3,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Metrics are written as CSV files under logs_roberta_finetune/roberta_finetune.
logger = CSVLogger('logs_roberta_finetune', name='roberta_finetune')

# Fine-tuning model built on the locally stored pretrained encoder.
model = RoBERTaFineTuner('existing_roberta')
# Ask the encoder to return per-layer hidden states alongside its regular outputs.
model.roberta_layer.config.output_hidden_states = True

In [None]:
# Train on the first GPU for at most MAX_EPCH epochs, with early stopping and checkpointing.
# NOTE(review): newer Lightning releases replace `gpus=` with accelerator/devices —
# confirm against the installed version before upgrading.
trainer = pl.Trainer(
    max_epochs=MAX_EPCH,
    gpus=[0],
    logger=logger,
    callbacks=[early_stopping, checkpoint_callback],
)
trainer.fit(model, train_loader, val_loader)