# File-Mask Testing

In this notebook we'll setup a fill-mask pipeline so we can test our new model qualitatively.

In [1]:
# !pip install numpy
# !pip install torch
# !pip install transformers
# !pip install tensorflowÍ
# !pip install wandb

In [None]:
import torch
from tqdm.auto import tqdm
import wandb
from importlib import reload
import pandas as pd
import os

In [None]:
!python3 -m wandb login $WANDB_TOKEN

In [4]:
samples_count = '1m'
model_name = f'd-bert_{samples_count}'

dataset_name = 'bbbp'
target = 'p_np'

In [5]:
tqdm.pandas()

def read_data(part: str):
    data = pd.read_csv(f'{dataset_name}/{dataset_name}_{part}.csv')
    import ast

    def string_to_array(input_string):
        try:
            # Use ast.literal_eval to safely evaluate the string as a Python literal
            result = ast.literal_eval(input_string)
            return result
        except (SyntaxError, ValueError) as e:
            print(f"Error parsing the string: {e}")
            return None
    data['descriptors'] = data['descriptors'].progress_apply(lambda x: string_to_array(x))

    data = data.dropna(subset=[target]).reset_index(drop=True)
    return data

In [None]:
data_train = read_data("train")
data_eval = read_data("valid")
data_test = read_data("test")
data_train['descriptors'][0]

In [7]:
import shifter as sh
reload(sh)

shifter = sh.Shifter()

In [8]:
for data in [data_train, data_eval, data_test]:
    for descriptors_of_substructures in data['descriptors']:
        shifter.shift(descriptors_of_substructures)

In [9]:
data_train['descriptors'][0]

[[[10, 114, 207, 306, 406, 506, 606, 706, 806, 906, 1006, 1106],
  [1211],
  [1259, 1557, 1856, 2156],
  [2468],
  [2776],
  [7760],
  [7766],
  [7773],
  [7823]],
 [[6, 108, 207, 306, 406, 506, 606, 706, 806, 906, 1006, 1106],
  [1207],
  [1256, 1556, 1856, 2156],
  [2458],
  [2756],
  [7759],
  [7766],
  [7773],
  [7823]],
 [[10, 116, 206, 306, 406, 506, 606, 706, 806, 906, 1006, 1106],
  [1210],
  [1259, 1556, 1856, 2156],
  [2469],
  [2765],
  [7760],
  [7766],
  [7773],
  [7823]],
 [[12, 112, 206, 306, 406, 506, 606, 706, 806, 906, 1006, 1106],
  [1212],
  [1256, 1556, 1856, 2162],
  [2468],
  [2783],
  [7760],
  [7766],
  [7779],
  [7824]],
 [[6, 109, 206, 307, 406, 506, 606, 706, 806, 906, 1006, 1106],
  [1207],
  [1256, 1556, 1856, 2156],
  [2459],
  [2756],
  [7759],
  [7766],
  [7773],
  [7823]],
 [[8, 111, 206, 306, 406, 506, 606, 707, 806, 906, 1006, 1106],
  [1209],
  [1258, 1556, 1856, 2156],
  [2463],
  [2760],
  [7760],
  [7766],
  [7773],
  [7823]],
 [[8, 111, 206, 306

In [None]:
import tokenizer as tokenizer
reload(tokenizer)

In [11]:
def tokenize_descriptors(data, start = 0, end = -1):
    sample = tokenizer.tokenize(data['descriptors'][start:end], max_length=512)
    return torch.tensor(sample['input_ids']), torch.tensor(sample['attention_mask']), torch.tensor(data[target][start:end])

In [12]:
train_input_ids, train_mask, train_targets                = tokenize_descriptors(data_train)
validation_input_ids, validation_mask, validation_targets = tokenize_descriptors(data_eval)
test_input_ids, test_mask, test_targets                   = tokenize_descriptors(data_test)

In [13]:
print(train_input_ids.shape)
print(validation_input_ids.shape)
print(validation_input_ids.shape)

torch.Size([1549, 512])
torch.Size([193, 512])
torch.Size([193, 512])


In [14]:
train_input_ids[0]

tensor([   0,   10,  114,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1211, 1259, 1557, 1856, 2156, 2468, 2776, 7760, 7766, 7773, 7823,
           3,    6,  108,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1207, 1256, 1556, 1856, 2156, 2458, 2756, 7759, 7766, 7773, 7823,
           3,   10,  116,  206,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1210, 1259, 1556, 1856, 2156, 2469, 2765, 7760, 7766, 7773, 7823,
           3,   12,  112,  206,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1212, 1256, 1556, 1856, 2162, 2468, 2783, 7760, 7766, 7779, 7824,
           3,    6,  109,  206,  307,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1207, 1256, 1556, 1856, 2156, 2459, 2756, 7759, 7766, 7773, 7823,
           3,    8,  111,  206,  306,  406,  506,  606,  707,  806,  906, 1006,
        1106, 1209, 1258, 1556, 1856, 2156, 2463, 2760, 7760, 7766, 7773, 7823,
           3,    8,  111,  206,  306,  4

In [15]:
train_targets[0]

tensor(1)

In [None]:
# torch.save(input_ids, 'preprocessed_tensors/input_ids.pt')
# torch.save(mask, 'preprocessed_tensors/attention_mask.pt')
# torch.save(labels, 'preprocessed_tensors/labels.pt')

# del input_ids, mask, labels

In [None]:
# input_ids = torch.load('preprocessed_tensors/input_ids.pt')
# mask = torch.load('preprocessed_tensors/attention_mask.pt')
# labels = torch.load('preprocessed_tensors/labels.pt')

### dataset and dataloader

In [18]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [19]:
train_dataset = Dataset({'input_ids': train_input_ids, 'attention_mask': train_mask, target: train_targets})
validation_dataset = Dataset({'input_ids': validation_input_ids, 'attention_mask': validation_mask, target: validation_targets})
test_dataset = Dataset({'input_ids': test_input_ids, 'attention_mask': test_mask, target: test_targets})

In [20]:
batch_size = 32

In [21]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### RoBERTa model 

In [22]:
from transformers import AutoModel, AutoConfig, PreTrainedModel

class MolecularPropertiesClassification(PreTrainedModel):
    def __init__(self, model_name_):
        configuration = AutoConfig.from_pretrained(model_name_)
        super(MolecularPropertiesClassification, self).__init__(configuration)
        
        self.transformer1 = AutoModel.from_pretrained(model_name_, config=configuration)
        # removing last layer of transformer
        self.transformer1.pooler = torch.nn.Identity()
        # freezing transformer weights
        for param in self.transformer1.parameters():
            param.requires_grad = False

        self.linear1 = torch.nn.Linear(768, 768, bias=True)
        self.linear2 = torch.nn.Linear(768, 2, bias=True)

    def forward(self, input_ids = None, attention_mask=None):
        outputs1 = self.transformer1(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state1 = outputs1[0]
        
        first_linear_out = self.linear1(last_hidden_state1[:, 0, : ].view(-1, 768))
        logits = self.linear2(torch.nn.functional.sigmoid(first_linear_out))

        return logits

In [None]:
model = MolecularPropertiesClassification(model_name)

And now we move onto training. First we setup GPU/CPU usage.

In [24]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.cuda.is_available()

True

In [25]:
device = torch.device('cuda', index=2) if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

MolecularPropertiesClassification(
  (transformer1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(7835, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
          

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [None]:
from transformers import AdamW

model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_func = torch.nn.CrossEntropyLoss()

### Do Training

In [None]:
wandb.init(
    project="bert_transformer",
    name=f"{dataset_name} {model_name} training"
)

In [28]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

def wandb_log(epoch_loss_train,
              total_train_true_labels, 
              total_train_pred_labels,

              epoch_loss_eval,
              total_eval_true_labels, 
              total_eval_pred_labels):
    
    train_labels = [total_train_true_labels, total_train_pred_labels]
    eval_labels = [total_eval_true_labels, total_eval_pred_labels]

    wandb.log({"loss/train":                epoch_loss_train / len(train_dataloader),
               "accuracy/train":            accuracy_score( *train_labels),
               "f1/train":                  f1_score(       *train_labels, average='micro'),
               "precision/train":           precision_score(*train_labels, average='micro'),
               "recall/train":              recall_score(   *train_labels, average='micro'),
               "roc_auc_score/train":       roc_auc_score(  *train_labels),
               
               "loss/validation":           epoch_loss_eval / len(eval_dataloader),
               "accuracy/validation":       accuracy_score( *eval_labels),
               "f1/validation":             f1_score(       *eval_labels, average='micro'),
               "precision/validation":      precision_score(*eval_labels, average='micro'),
               "recall/validation":         recall_score(   *eval_labels, average='micro'),
               "roc_auc_score/validation":  roc_auc_score(  *eval_labels)})

In [None]:
from tqdm import tqdm

epochs = 10
step = 0

progress_bar_train = tqdm(range(epochs * len(train_dataloader)))
progress_bar_eval  = tqdm(range(epochs * len(eval_dataloader)))

for epoch in range(epochs):
    progress_bar_train.set_description(f"Epoch: {epoch}")
    progress_bar_eval.set_description(f"Epoch: {epoch}")

    total_train_pred_labels = []
    total_train_true_labels = []
    epoch_loss_train = 0

    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        logits = model(**input_batch)
        loss = loss_func(logits.view(-1, 2), batch[target].view(-1).type(torch.cuda.LongTensor))
        loss.backward()
        epoch_loss_train += loss.item()
        
        optimizer.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

        pred_labels = torch.argmax(logits, dim=-1)
        true_labels = batch[target]
        total_train_pred_labels.append(pred_labels)
        total_train_true_labels.append(true_labels)

    total_train_pred_labels = torch.cat(total_train_pred_labels).cpu().detach().numpy()
    total_train_true_labels = torch.cat(total_train_true_labels).cpu().detach().numpy()
    

    model.eval()
    
    total_eval_pred_labels = []
    total_eval_true_labels = []
    epoch_loss_eval = 0

    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        with torch.no_grad():
            logits = model(**input_batch)
            loss = loss_func(logits.view(-1, 2), batch[target].view(-1).type(torch.cuda.LongTensor))
            epoch_loss_eval += loss.item()

            pred_labels = torch.argmax(logits, dim=-1)
            true_labels = batch[target]
            total_eval_pred_labels.append(pred_labels)
            total_eval_true_labels.append(true_labels)
        
        progress_bar_eval.update(1)

    total_eval_pred_labels = torch.cat(total_eval_pred_labels).cpu().detach().numpy()
    total_eval_true_labels = torch.cat(total_eval_true_labels).cpu().detach().numpy()
    
    wandb_log(epoch_loss_train,
              total_train_true_labels, 
              total_train_pred_labels,

              epoch_loss_eval,
              total_eval_true_labels, 
              total_eval_pred_labels)


Epoch: 0:   0%|          | 0/490 [00:00<?, ?it/s]

Epoch: 5:  52%|█████▏    | 256/490 [06:41<04:56,  1.27s/it]

In [None]:
wandb.finish()

### Do Testing

In [None]:
wandb.init(
    project="bert_transformer",
    name=f"{dataset_name} {model_name} testing"
)

In [None]:
def test_loop():
    model.eval()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in test_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        with torch.no_grad():
            logits = model(**input_batch)
            loss = loss_func(logits.view(-1, 2), batch[target].view(-1).type(torch.cuda.LongTensor))
            epoch_loss += loss.item()

            pred_labels = torch.argmax(logits, dim=-1)
            true_labels = batch[target]
            total_pred_labels.append(pred_labels)
            total_true_labels.append(true_labels)
        
        progress_bar_eval.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/test": epoch_loss / len(test_dataloader),
               "accuracy/test": accuracy_score(total_true_labels, total_pred_labels),
               "f1/test": f1_score(total_true_labels, total_pred_labels, average='micro'),
               "precision/test": precision_score(total_true_labels, total_pred_labels, average='micro'),
               "recall/test": recall_score(total_true_labels, total_pred_labels, average='micro'),
               "roc_auc_score/test": roc_auc_score(total_true_labels, total_pred_labels)})

In [None]:
test_loop()

In [None]:
wandb.finish()

In [None]:
model.save_pretrained(model_name + "-" + dataset_name)

In [None]:
torch.cuda.empty_cache()