In [32]:
import os
import torch
import random
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, DebertaV2ForMaskedLM, DebertaV2Tokenizer
from sklearn.metrics import f1_score

In [2]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00

In [2]:
os.chdir("/scratch/hm62/hl4138/polyBERT/polyBERT/")

In [3]:
pretrain_sizes = ['1M','5M','10M','20M','50M','90M']
size = '1M'

In [36]:
def create_masked_test_set(tokenizer, sentences, mask_prob=0.15):
    masked_sentences = []
    ground_truth = []

    for sentence in sentences:
        tokenized_input = tokenizer.tokenize(sentence)
        masked_sentence = tokenized_input.copy()  # Copy of tokenized sentence
        ground_truth_sentence = []

        for i, token in enumerate(tokenized_input):
            if random.random() < mask_prob:  # Mask with a certain probability (e.g., 15%)
                ground_truth.append(token)  # Store original token
                masked_sentence[i] = tokenizer.mask_token  # Replace with [MASK]

        masked_sentences.append(tokenizer.convert_tokens_to_string(masked_sentence))
        # ground_truth.append(ground_truth_sentence)

    return masked_sentences, ground_truth

In [37]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available() #checking if CUDA + Colab GPU works

# Load test dataset
file_path = 'data/generated_polymer_smiles_dev.txt'

with open(file_path, 'r') as file:
    psmiles_strings = [line.strip() for line in file]

psmiles_strings = psmiles_strings[:20]

In [43]:
psmiles_strings[0]

'[*]c1cccc(P(=O)(c2ccccc2)c2cccc(-c3nc4ccc(-c5ccccc5)cc4nc3N3C(=O)c4ccc(-c5ccc6c(c5)C(=O)N([*])C6=O)cc4C3=O)c2)c1'

In [44]:
masked_psmiles

['[MASK][MASK] 1cccc[MASK] p(=o)([MASK] 2[MASK] cccc[MASK] )c2ccc[MASK] (-c3nc4[MASK] cc(-c5ccccc5)cc4nc3n[MASK] c([MASK] o[MASK] c4ccc(-c5c[MASK] c6[MASK] (c5)c(=[MASK] )[MASK][MASK][*])c6=o)cc4c3=o)c2)c1',
 '[*]oc1ccc[MASK] -c2ccc(c)c(-c3[MASK] c(-c4cc(n5c([MASK] o)c[MASK] (=[MASK] )n([*])[MASK] 5=s)ccc4c)[MASK] cc[MASK] c)c2[MASK] cc[MASK]',
 '[*]oc1ccc(-c2ccc3c(c2)c[MASK][MASK] cc(oc([*])=o)cc[MASK][MASK] -3)[MASK]1',
 '[*][MASK] (=o)c1(cc)[MASK] cc(oc[MASK] cc[MASK] c(n3c(=o)[MASK] 4ccc([*])c[MASK] 4c3=o)[MASK]2)cc1',
 '[MASK] os[MASK][MASK] o)(=o[MASK] c1[MASK] c[MASK] (-c[MASK] cc[MASK] (-c3ccc[MASK] os([MASK] o)[MASK] =o)c4[MASK] cc(-c5ccc(-c6cc(br)c([*])c(br)c6)cc5)cc4)cc[MASK] )c([MASK])c2)cc1',
 '[MASK] oc1c[MASK] c[MASK] nc(-c[MASK] ccc4nc(n5c(=[MASK] )[MASK] 6cc[MASK][MASK] sc7cccc([*])c7[MASK] cc6c[MASK] =o)o[MASK] 4c3[MASK]oc2c1',
 '[MASK] c1sc(-c2[MASK] ([MASK][MASK] 3c(c)[MASK] c[MASK] -c[MASK] sc([MASK] c5[MASK] cc[MASK] c[MASK] )c[MASK] c[MASK] o[MASK] co5)cc3c([*])[

In [45]:
ground_truth

['[*]',
 'c',
 '(',
 'c',
 'c',
 '2',
 'c',
 'c',
 '3',
 '=',
 ')',
 'c',
 'c',
 'o',
 'n',
 '(',
 '(',
 'c',
 '=',
 'c',
 'o',
 'c',
 'c',
 '3',
 ')',
 '1',
 'c',
 '2',
 'c',
 '2',
 's',
 'c',
 'c',
 '2',
 'c',
 'c',
 'c',
 'c',
 '[*]',
 '(',
 '=',
 ')',
 'c',
 'c',
 '2',
 'c',
 '(',
 '=',
 '(',
 'c',
 '3',
 'cl',
 '[*]',
 'c',
 '2',
 '3',
 'o',
 'c',
 'c',
 '(',
 ')',
 '5',
 'c',
 ')',
 '[*]',
 'sc',
 '-',
 'c',
 'c',
 '(',
 '4',
 '-',
 'c',
 'c',
 '5',
 '5',
 '4',
 'c',
 '(',
 'f',
 ')',
 ')',
 'o',
 '3',
 '1',
 'c',
 'c',
 '(',
 'c',
 'n',
 '\\',
 '5',
 '(',
 'c',
 '1',
 'c',
 ')',
 '(',
 '(',
 'c',
 'c',
 '=',
 'n',
 '=',
 'o',
 '(',
 'o',
 'br',
 'c',
 'n',
 'c',
 'n',
 '(',
 '=',
 'o',
 'o',
 'br',
 'c',
 'c',
 ')',
 'c',
 '2',
 'c',
 'c',
 '=',
 'n',
 'c',
 'c',
 'c',
 '5',
 'c',
 '6',
 ')',
 '▁',
 'c',
 'c',
 'c',
 '2',
 'c',
 'n',
 'n',
 '(',
 'c',
 '=',
 '4',
 'c',
 '5',
 ')',
 'n',
 '(',
 '=',
 'n',
 'c',
 '9',
 'c',
 '%',
 'c',
 '%',
 'c',
 'c',
 'c',
 ')',
 '(',
 '=',
 'o

In [50]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('kuelumbus/polyBERT')
model = AutoModelForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)
# tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
# model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)
print(predicted_token_ids)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained F1 Score: {f1}")

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at kuelumbus/polyBERT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([153, 153,  62,  62, 111,  10,  35,  41, 195,  27,  42,  35, 152,  62,
        229, 166,  21,  42, 200, 166, 166,  36,  27, 229,  35, 264, 166, 229,
        166,  22,  35, 229, 229, 166,  27,  27, 229,  62,  62,  25,  27, 195,
         18,  34,  80, 153,  64,  20,  80, 164, 195, 166,  16,  21, 166,  21,
        153, 200, 229, 153, 161,  62, 172, 229,  21, 229,  42,  42, 229, 229,
        166,  32, 153, 240,  35, 229, 229, 229, 209, 161, 166, 166,  21, 166,
         10,  32, 111, 195,  34, 161,  34,  43, 111, 111,  62, 111, 111,  27,
        229,  42, 195,  62, 229,   8,   7,  41,  10,  25, 229,  27, 151, 152,
        200,   7,  10,  10,  36,  11, 229,  23,  34, 153,  34, 161, 153,  42,
        161, 195,  27,  11, 153,  42,  42,  42, 166, 229, 195,  20,  25, 195,
         80,   7, 111, 153, 229, 111,  42,  11,  42, 195, 229,  80, 153,  34,
         48, 152, 166,  42,  27, 166, 161,  19,  27, 161, 195, 195,  62,  34,
        161,  28, 111,  43, 161,  42,  48,  48,  22, 153,  42,  

In [49]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[ 1.6763,  0.0287,  0.4598,  ..., -1.2926,  0.6870, -0.7143],
         [ 1.6320,  0.0815,  0.3851,  ..., -1.3084,  0.6598, -0.6348],
         [ 1.7983, -0.8584,  1.2366,  ..., -1.5453,  0.6853, -0.3230],
         ...,
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560],
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560],
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560]],

        [[ 1.1026,  0.1546, -0.7523,  ..., -1.7872,  1.5069,  0.1879],
         [ 1.1024,  0.1513, -0.7469,  ..., -1.8025,  1.5083,  0.1821],
         [ 1.6554, -0.7777,  0.3114,  ..., -2.1281,  1.0336,  0.2962],
         ...,
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560],
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560],
         [ 0.2695,  0.6533,  0.5087,  ..., -0.1991, -0.1496,  0.5560]],

        [[ 0.6098,  0.5521,  0.7161,  ..., -0.1110,  0.4773,  0.2431],
         [ 

In [47]:
true_token_ids

[151,
 151,
 151,
 7,
 13,
 151,
 151,
 6,
 151,
 151,
 15,
 14,
 6,
 151,
 17,
 7,
 8,
 35,
 5,
 33,
 7,
 8,
 35,
 7,
 7,
 6,
 34,
 8,
 7,
 151,
 33,
 33,
 151,
 264,
 151,
 161,
 33,
 15,
 151,
 15,
 33,
 13,
 7,
 33,
 5,
 151,
 6,
 151,
 151,
 7,
 7,
 15,
 151,
 151,
 6,
 21,
 151,
 6,
 7,
 6,
 7,
 151,
 151,
 151,
 14,
 151,
 7,
 151,
 13,
 151,
 151,
 14,
 152,
 16,
 8,
 151,
 151,
 48,
 18,
 151,
 151,
 151,
 7,
 151,
 13,
 12,
 151,
 6,
 151,
 6,
 16,
 15,
 16,
 7,
 7,
 7,
 12,
 151,
 21,
 33,
 8,
 33,
 151,
 15,
 152,
 7,
 151,
 14,
 264,
 151,
 6,
 35,
 34,
 8,
 151,
 151,
 12,
 151,
 151,
 6,
 151,
 151,
 6,
 26,
 14,
 35,
 7,
 13,
 5,
 7,
 62,
 6,
 151,
 62,
 6,
 6,
 7,
 21,
 6,
 16,
 7,
 34,
 151,
 33,
 151,
 151,
 7,
 151,
 17,
 7,
 62,
 62,
 151,
 264,
 151,
 151,
 151,
 151,
 151,
 151,
 151,
 151,
 6,
 16,
 17,
 7,
 13,
 151,
 151,
 33,
 33,
 6,
 33,
 151,
 151,
 12,
 21,
 12,
 151,
 6,
 7,
 151,
 6,
 151,
 33,
 33,
 151,
 151,
 20,
 151,
 8,
 33,
 264,
 6,
 151,
 43,
 

In [39]:
# Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('kuelumbus/polyBERT')
# model = AutoModelForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)
tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)
print(masked_indices)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained1M F1 Score: {f1}")



(tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
         3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,
         7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,
         9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
        13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
        14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18,