In [1]:
MAX_LEN = 64
BATCH_SIZE = 256

In [2]:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl

import json
import numpy as np
import pandas as pd
import logging

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import RobertaTokenizerFast, RobertaModel, AdamW, get_linear_schedule_with_warmup

In [3]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score, roc_curve, matthews_corrcoef, plot_confusion_matrix, average_precision_score, auc, roc_auc_score
from collections import defaultdict
import seaborn as sns

In [5]:
class SeqDataset(Dataset):
    def __init__(self, data, max_len, with_yield=True):
        self.data = pd.DataFrame(data, columns=['Seq1', 'Seq2', 'Yield'])  # pandas dataframe
        self.data['Yield'] = self.data['Yield'] * 100
        #Initialize the tokenizer
        self.tokenizer = RobertaTokenizerFast.from_pretrained("tokenizer", max_len=64)

        self.max_len = max_len
        self.with_yield = with_yield 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sequence1 and sequence2 at the specified index in the data frame
        seq1 = str(self.data.loc[index, 'Seq1'])
        seq2 = str(self.data.loc[index, 'Seq2'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(seq1, seq2, 
                                      padding='max_length',         # Pad to max_length
                                      truncation=True,              # Truncate to max_length
                                      max_length=self.max_len,  
                                      return_tensors='pt')          # Return torch.Tensor objects

        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # binary tensor with "0" for padded values and "1" for the other values
#         token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_yield:  # True if the dataset has yields
            yld = self.data.loc[index, 'Yield']
            return token_ids, attn_masks, yld  
        else:
            return token_ids, attn_masks

In [6]:
class RoBERTaFineTuner(pl.LightningModule):
    def __init__(self, roberta_model_path='.', freeze_roberta=False, hidden_size=256, lr = 1e-5):
        super(RoBERTaFineTuner, self).__init__()
        self.roberta_layer = RobertaModel.from_pretrained(roberta_model_path)
        self.hidden_size = hidden_size
        self.lr = lr
        self.out_predictions = []
    
        # Freeze bert layers and only train the classification layer weights
        if freeze_roberta:
            for p in self.roberta_layer.parameters():
                p.requires_grad = False
                
        # Regression layer
        self.hidden_layer = nn.Linear(self.hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids, attn_masks):
        # Feeding the inputs to the RoBERTa-based model to obtain contextualized representations
        roberta_out = self.roberta_layer(input_ids, attn_masks)
        last_hidden_state, pooler_output = roberta_out['last_hidden_state'], roberta_out['pooler_output']
        return self.hidden_layer(self.dropout(pooler_output))
    
    def training_step(self, batch, batch_idx):
        token_ids, attn_masks, yields = batch
        out = self(token_ids, attn_masks)
        loss = F.mse_loss(torch.squeeze(out), yields)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.lr, weight_decay=1e-2)
        return optimizer
            
    def validation_step(self, batch, batch_idx):
        token_ids, attn_masks, yields = batch
        val_out = self(token_ids, attn_masks)
        val_loss = F.mse_loss(torch.squeeze(val_out), yields)
        self.log('val_loss', val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss
            
    def test_step(self, batch, batch_idx):
        token_ids, attn_masks, yields = batch
        test_out = self(token_ids, attn_masks)
        self.out_predictions.append(test_out)
        test_loss = F.mse_loss(torch.squeeze(test_out), yields)
        self.log('test_loss', test_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return test_loss

In [7]:
def read_json(path):
    with open(path) as json_file:
        return json.load(json_file)

In [8]:
train = read_json('splits/train.json')
val = read_json('splits/val.json')
test = read_json('splits/test.json')

In [9]:
y_test = np.load('splits/y_test.npy')
y_test = y_test * 100

In [10]:
torch.set_default_tensor_type(torch.DoubleTensor)
torch.set_default_dtype(torch.double)

In [11]:
train_loader = DataLoader(SeqDataset(train, max_len=64, with_yield=True), batch_size=BATCH_SIZE, num_workers=0, shuffle=True)
val_loader = DataLoader(SeqDataset(val, max_len=64, with_yield=True), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)
test_loader = DataLoader(SeqDataset(test, max_len=64, with_yield=True), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)

In [12]:
model = RoBERTaFineTuner.load_from_checkpoint('...', roberta_model_path='existing_roberta', freeze_roberta=False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta_h256_attn8_drop03 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
trainer = pl.Trainer(gpus=[0])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [15]:
trainer.test(model, test_dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(305.0496, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


[{'test_loss': 305.049560546875}]

In [16]:
predictions = [[pred.item() for pred in preds] for preds in model.out_predictions]
preds_flat = [j for sub in predictions for j in sub]
predictions_np = np.array(preds_flat)

In [24]:
predictions_np.shape

(255701,)

### Performance on other temperatures

In [53]:
class SeqDatasetTemp(Dataset):
    def __init__(self, data, max_len, with_yield=True, col='Yield_37C'):
        self.data = pd.DataFrame(data, columns=['Seq1', 'Seq2', col])  # pandas dataframe
        self.data[col] = self.data[col] * 100
        self.col = col
        #Initialize the tokenizer
        self.tokenizer = RobertaTokenizerFast.from_pretrained("tokenizer", max_len=64)

        self.max_len = max_len
        self.with_yield = with_yield 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sequence1 and sequence2 at the specified index in the data frame
        seq1 = str(self.data.loc[index, 'Seq1'])
        seq2 = str(self.data.loc[index, 'Seq2'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(seq1, seq2, 
                                      padding='max_length',         # Pad to max_length
                                      truncation=True,              # Truncate to max_length
                                      max_length=self.max_len,  
                                      return_tensors='pt')          # Return torch.Tensor objects

        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # binary tensor with "0" for padded values and "1" for the other values
#         token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_yield:  # True if the dataset has yields
            yld = self.data.loc[index, self.col]
            return token_ids.to(device=dev), attn_masks.to(device=dev), yld  
        else:
            return token_ids.to(device=dev), attn_masks.to(device=dev)

In [54]:
all_tempds_df = pd.read_csv('test_set_other_temperatures.csv')

test_dataloader_37 = DataLoader(SeqDatasetTemp(all_tempds_df, max_len=64, with_yield=True, col='Yield_37C'), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)
test_dataloader_42 = DataLoader(SeqDatasetTemp(all_tempds_df, max_len=64, with_yield=True, col='Yield_42C'), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)
test_dataloader_47 = DataLoader(SeqDatasetTemp(all_tempds_df, max_len=64, with_yield=True, col='Yield_47C'), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)
test_dataloader_52 = DataLoader(SeqDatasetTemp(all_tempds_df, max_len=64, with_yield=True, col='Yield_52C'), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)
test_dataloader_62 = DataLoader(SeqDatasetTemp(all_tempds_df, max_len=64, with_yield=True, col='Yield_62C'), batch_size=BATCH_SIZE, num_workers=0, shuffle=False)

In [55]:
len(model.out_predictions)

999

In [56]:
for test_loader_temp in [test_dataloader_37, test_dataloader_42, test_dataloader_47, test_dataloader_52, test_dataloader_62]:
    trainer_chkp = pl.Trainer(gpus=[0])
    trainer_chkp.test(model, test_dataloaders=test_loader_temp)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(546.0577, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(452.8763, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(336.5995, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(249.7258, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(674.3285, device='cuda:0', dtype=torch.float32)}
--------------------------------------------------------------------------------


In [57]:
len(model.out_predictions)

5994

In [68]:
preds_pt = torch.cat(model.out_predictions)

In [69]:
preds_pt.shape

torch.Size([1534206, 1])

In [71]:
chunks = torch.chunk(preds_pt, 6)

In [74]:
temp_chunk = chunks[1]

In [76]:
y_test_37 = all_tempds_df['Yield_37C'].values
y_test_42 = all_tempds_df['Yield_42C'].values
y_test_47 = all_tempds_df['Yield_47C'].values
y_test_52 = all_tempds_df['Yield_52C'].values
y_test_62 = all_tempds_df['Yield_62C'].values

y_test_37 = y_test_37 * 100
y_test_42 = y_test_42 * 100
y_test_47 = y_test_47 * 100
y_test_52 = y_test_52 * 100
y_test_62 = y_test_62 * 100

In [77]:
y_temp_all = [y_test_37, y_test_42, y_test_47, y_test_52, y_test_62]

In [78]:
for i in range(len(chunks[1:])):
    predictions_labels = [1 if x > 20 else 0 for x in chunks[1:][i]]
    true_labels = [1 if x > 20 else 0 for x in y_temp_all[i]]
    print('MCC: ', matthews_corrcoef(true_labels, predictions_labels))
    fpr, tpr, thresholds = roc_curve(true_labels, predictions_labels)
    print('AUROC: ', auc(fpr, tpr))
    print('Avg. prec: ', average_precision_score(true_labels, predictions_labels))
    print(precision_recall_fscore_support(true_labels, predictions_labels, average=None))
    print()

MCC:  0.9253407679688482
AUROC:  0.9705506599596415
Avg. prec:  0.9776205923831391
(array([0.91501096, 0.9948332 ]), array([0.99087445, 0.95022687]), array([0.95143284, 0.97201855]), array([ 89748, 165953], dtype=int64))

MCC:  0.929655290644327
AUROC:  0.9708993635540727
Avg. prec:  0.976416713669797
(array([0.92598957, 0.99167886]), array([0.98555549, 0.95624323]), array([0.95484446, 0.97363873]), array([ 91315, 164386], dtype=int64))

MCC:  0.9302274897836948
AUROC:  0.9680316033740517
Avg. prec:  0.97055319800303
(array([0.94150573, 0.98292243]), array([0.97126662, 0.96479658]), array([0.95615465, 0.97377516]), array([ 94211, 161490], dtype=int64))

MCC:  0.9137365425628989
AUROC:  0.9541943687945057
Avg. prec:  0.9487468727176351
(array([0.96208419, 0.95703164]), array([0.93210387, 0.97628486]), array([0.94685677, 0.96656239]), array([100315, 155386], dtype=int64))

MCC:  0.7619331687484813
AUROC:  0.8699215552757904
Avg. prec:  0.7874826430058636
(array([0.99635761, 0.78832517]),

In [80]:
chunks[1:][0]

tensor([[-2.4475],
        [-2.5762],
        [-2.1789],
        ...,
        [99.6723],
        [97.8562],
        [99.8846]], device='cuda:0')