In [1]:
# !pip install numpy
# !pip install torch
# !pip install transformers
# !pip install tensorflowÍ
# !pip install wandb

In [2]:
import torch
from tqdm.auto import tqdm
import wandb
from importlib import reload
import pandas as pd

In [None]:
!python3 -m wandb login $WANDB_TOKEN

In [4]:
samples_count = '1m'
model_name = f'd-bert_{samples_count}'

In [5]:
tqdm.pandas()

def read_data():
    # data = pd.read_csv(f'out-{samples_count}.csv')
    data1 = pd.read_csv(f'preprocessed_1_500k_with_descriptors.csv')
    data2 = pd.read_csv(f'preprocessed_2_500k_with_descriptors.csv')
    data = pd.concat([data1, data2], ignore_index=True)
    import ast

    def string_to_array(input_string):
        try:
            # Use ast.literal_eval to safely evaluate the string as a Python literal
            result = ast.literal_eval(input_string)
            return result
        except (SyntaxError, ValueError) as e:
            print(f"Error parsing the string: {e}")
            return None
    data['descriptors'] = data['descriptors'].progress_apply(lambda x: string_to_array(x))
    return data

In [None]:
data = read_data()
len(data)

In [7]:
data['descriptors'][0]

[[[1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [2],
  [1, 0, 0, 0],
  [5],
  [1],
  [3],
  [2],
  [0],
  [0]],
 [[1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [2],
  [0, 1, 0, 0],
  [3],
  [1],
  [3],
  [2],
  [0],
  [0]],
 [[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1],
  [0, 0, 0, 0],
  [2],
  [0],
  [3],
  [2],
  [0],
  [0]],
 [[17, 20, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0],
  [21],
  [18, 1, 0, 6],
  [45],
  [691],
  [3],
  [3],
  [18],
  [5]],
 '$',
 [[2, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [4],
  [2, 1, 0, 0],
  [7],
  [10],
  [3],
  [2],
  [0],
  [0]],
 [[1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [3],
  [1, 1, 0, 0],
  [4],
  [4],
  [3],
  [2],
  [0],
  [0]],
 [[17, 20, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0],
  [22],
  [19, 1, 0, 6],
  [46],
  [788],
  [3],
  [3],
  [18],
  [5]]]

In [8]:
import shifter as sh
reload(sh)

shifter = sh.Shifter()

In [9]:
for descriptors_of_substructures in data['descriptors']:
    shifter.shift(descriptors_of_substructures)

In [10]:
maximum = 0
for mol in data['descriptors']:
    for substr in mol:
        if substr == '$':
            continue
        for descriptor in substr:
            for i in descriptor:
                maximum = max(maximum, i)
maximum # vocab_size

7834

In [None]:
import tokenizer as tokenizer
reload(tokenizer)
# tokenized_descriptors = tokenizer.tokenize(data['descriptors'], max_length=512)

In [12]:
def mlm(tensor):
    print(tensor)
    # create random array of floats with equal dims to tensor
    rand = torch.rand(tensor.shape)
    # mask random 15% where token is not 0 <s>, 1 <pad>, or 2 <s/>
    mask_arr = (rand < .15) * (tensor != 0) * (tensor != 1) * (tensor != 2)
    # loop through each row in tensor (cannot do in parallel)
    for i in range(tensor.shape[0]):
        # get indices of mask positions from mask array
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        # mask tensor
        tensor[i, selection] = 4
    return tensor

In [13]:
def tokenize_descriptors(data, start, end = -1):
    input_ids = []
    mask = []
    labels = []
    sample = tokenizer.tokenize(data['descriptors'][start:end], max_length=512)
    
    labels.append(torch.tensor(sample['input_ids']))
    mask.append(torch.tensor(sample['attention_mask']))
    input_ids.append(mlm(labels[-1].detach().clone())) # mask ~15% of tokens to create inputs
    
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    
    labels = torch.cat(labels)
    return input_ids, mask, labels

In [14]:
input_ids, mask, labels = tokenize_descriptors(data, 0, 1)

100%|██████████| 1/1 [00:00<00:00, 4993.22it/s]

tensor([[   0,    7,  110,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
         1106, 1208, 1257, 1556, 1856, 2156, 2461, 2757, 7759, 7766, 7773, 7823,
            3,    7,  108,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
         1106, 1208, 1256, 1557, 1856, 2156, 2459, 2757, 7759, 7766, 7773, 7823,
            3,    6,  108,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
         1106, 1207, 1256, 1556, 1856, 2156, 2458, 2756, 7759, 7766, 7773, 7823,
            3,   23,  126,  209,  307,  406,  506,  606,  706,  806,  906, 1006,
         1106, 1227, 1274, 1557, 1856, 2162, 2501, 3447, 7759, 7767, 7791, 7828,
            3,    5,    8,  110,  208,  306,  406,  506,  606,  706,  806,  906,
         1006, 1106, 1210, 1258, 1557, 1856, 2156, 2463, 2766, 7759, 7766, 7773,
         7823,    3,    7,  108,  208,  306,  406,  506,  606,  706,  806,  906,
         1006, 1106, 1209, 1257, 1557, 1856, 2156, 2460, 2760, 7759, 7766, 7773,
         7823,    3,   23,  




In [15]:
train_input_ids, train_mask, train_labels = tokenize_descriptors(data, 0, int(0.8 * len(data)))
validation_input_ids, validation_mask, validation_labels = tokenize_descriptors(data, int(0.8 * len(data)), int(0.9 * len(data)))
test_input_ids, test_mask, test_labels = tokenize_descriptors(data, int(0.9 * len(data)))

100%|██████████| 763016/763016 [00:47<00:00, 15918.06it/s]


tensor([[  0,   7, 110,  ...,   1,   1,   1],
        [  0,  11, 113,  ...,   1,   1,   1],
        [  0,  14, 116,  ...,   1,   1,   1],
        ...,
        [  0,   7, 108,  ...,   1,   1,   1],
        [  0,  14, 115,  ...,   1,   1,   1],
        [  0,   9, 114,  ...,   1,   1,   1]])


100%|██████████| 95377/95377 [00:05<00:00, 17279.57it/s]


tensor([[  0,  16, 124,  ...,   1,   1,   1],
        [  0,   7, 110,  ...,   1,   1,   1],
        [  0,   7, 110,  ...,   1,   1,   1],
        ...,
        [  0,   7, 107,  ...,   1,   1,   1],
        [  0,  11, 113,  ...,   1,   1,   1],
        [  0,  13, 121,  ...,   1,   1,   1]])


100%|██████████| 95377/95377 [00:05<00:00, 17352.77it/s]


tensor([[  0,  14, 114,  ...,   1,   1,   1],
        [  0,   8, 110,  ...,   1,   1,   1],
        [  0,  13, 113,  ...,   1,   1,   1],
        ...,
        [  0,  14, 114,  ...,   1,   1,   1],
        [  0,  19, 119,  ...,   1,   1,   1],
        [  0,   8, 112,  ...,   1,   1,   1]])


In [16]:
print(train_input_ids.shape)
print(validation_input_ids.shape)
print(validation_input_ids.shape)

torch.Size([763016, 512])
torch.Size([95377, 512])
torch.Size([95377, 512])


In [17]:
train_labels[0]

tensor([   0,    7,  110,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1208, 1257, 1556, 1856, 2156, 2461, 2757, 7759, 7766, 7773, 7823,
           3,    7,  108,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1208, 1256, 1557, 1856, 2156, 2459, 2757, 7759, 7766, 7773, 7823,
           3,    6,  108,  207,  306,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1207, 1256, 1556, 1856, 2156, 2458, 2756, 7759, 7766, 7773, 7823,
           3,   23,  126,  209,  307,  406,  506,  606,  706,  806,  906, 1006,
        1106, 1227, 1274, 1557, 1856, 2162, 2501, 3447, 7759, 7767, 7791, 7828,
           3,    5,    8,  110,  208,  306,  406,  506,  606,  706,  806,  906,
        1006, 1106, 1210, 1258, 1557, 1856, 2156, 2463, 2766, 7759, 7766, 7773,
        7823,    3,    7,  108,  208,  306,  406,  506,  606,  706,  806,  906,
        1006, 1106, 1209, 1257, 1557, 1856, 2156, 2460, 2760, 7759, 7766, 7773,
        7823,    3,   23,  126,  210,  3

In [None]:
# torch.save(input_ids, 'preprocessed_tensors/input_ids.pt')
# torch.save(mask, 'preprocessed_tensors/attention_mask.pt')
# torch.save(labels, 'preprocessed_tensors/labels.pt')

# del input_ids, mask, labels

In [None]:
# input_ids = torch.load('preprocessed_tensors/input_ids.pt')
# mask = torch.load('preprocessed_tensors/attention_mask.pt')
# labels = torch.load('preprocessed_tensors/labels.pt')

### dataset and dataloader

In [20]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [21]:
train_dataset = Dataset({'input_ids': train_input_ids, 'attention_mask': train_mask, 'labels': train_labels})
validation_dataset = Dataset({'input_ids': validation_input_ids, 'attention_mask': validation_mask, 'labels': validation_labels})
test_dataset = Dataset({'input_ids': test_input_ids, 'attention_mask': test_mask, 'labels': test_labels})

In [22]:
batch_size = 32

In [23]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

And move onto building our model, we first need to create a RoBERTa config object, which will describe which features we want to initialize our RoBERTa model with.

In [24]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=maximum + 1,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

Then we import and initialize a RoBERTa model with a language modeling head.

In [25]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

And now we move onto training. First we setup GPU/CPU usage.

In [26]:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.cuda.is_available()

True

In [27]:
device = torch.device('cuda', index=5) if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(7835, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [None]:
from transformers import AdamW

model.train()
optim = AdamW(model.parameters(), lr=1e-5)

In [None]:
wandb.init(
    project="bert_transformer",
    name=f"RobertaForMLM on molecular descriptors training ({samples_count})",
    config=config
)

Now we move onto the training loop.

In [30]:
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [None]:
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

epochs = 2
step = 0

validation_iterator = iter(validation_loader)
for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        true_labels = batch['labels'].numpy().flatten()
        pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()

        # write down loss and metrics
        wandb.log({"loss/train": loss}, step=step)
        wandb.log({"accuracy/train": accuracy_score(true_labels, pred_labels)}, step=step)
        wandb.log({"f1/train": f1_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"precision/train": precision_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"recall/train": recall_score(true_labels, pred_labels, average='micro')}, step=step)
        
        loss.backward()
        optim.step()
        optim.zero_grad()

        with torch.no_grad():
            try:
                validation_batch = next(validation_iterator)
            except StopIteration:
                validation_dataset = Dataset({'input_ids': validation_input_ids, 'attention_mask': validation_mask, 'labels': validation_labels})
                validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
                validation_iterator = iter(validation_loader)
                
                validation_batch = next(validation_iterator)
            
            input_ids = validation_batch['input_ids'].to(device)
            attention_mask = validation_batch['attention_mask'].to(device)
            labels = validation_batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            logits = outputs.logits
            true_labels = validation_batch['labels'].numpy().flatten()
            pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()
    
            # write down loss and metrics
            wandb.log({"loss/validation": loss}, step=step)
            wandb.log({"accuracy/validation": accuracy_score(true_labels, pred_labels)}, step=step)
            wandb.log({"f1/validation": f1_score(true_labels, pred_labels, average='micro')}, step=step)
            wandb.log({"precision/validation": precision_score(true_labels, pred_labels, average='micro')}, step=step)
            wandb.log({"recall/validation": recall_score(true_labels, pred_labels, average='micro')}, step=step)
            
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += len(batch['input_ids'])


In [None]:
wandb.finish()

In [None]:
step

In [None]:
wandb.init(
    project="bert_transformer",
    name=f"RobertaForMLM on molecular descriptors testing ({samples_count})",
    config=config
)

In [None]:
step = 0

with torch.no_grad():
    loop = tqdm(test_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        true_labels = batch['labels'].numpy().flatten()
        pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()

        # write down loss and metrics
        wandb.log({"loss/test": loss}, step=step)
        wandb.log({"accuracy/test": accuracy_score(true_labels, pred_labels)}, step=step)
        wandb.log({"f1/test": f1_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"precision/test": precision_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"recall/test": recall_score(true_labels, pred_labels, average='micro')}, step=step)
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += len(batch)

wandb.finish()

In [None]:
wandb.finish()

In [None]:
model.save_pretrained(model_name)

In [None]:
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.device_count())
print(torch.cuda.current_device())

In [None]:
torch.device('cuda', index=1)