In [1]:
import os
import torch
import random
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, DebertaV2ForMaskedLM, DebertaV2Tokenizer
from sklearn.metrics import f1_score
os.chdir("/scratch/hm62/hl4138/polyBERT/polyBERT/")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_masked_test_set(tokenizer, sentences, mask_prob=0.15):
    masked_sentences = []
    ground_truth = []

    for sentence in sentences:
        tokenized_input = tokenizer.tokenize(sentence)
        masked_sentence = tokenized_input.copy()  # Copy of tokenized sentence
        ground_truth_sentence = []

        for i, token in enumerate(tokenized_input):
            if random.random() < mask_prob:  # Mask with a certain probability (e.g., 15%)
                ground_truth.append(token)  # Store original token
                masked_sentence[i] = tokenizer.mask_token  # Replace with [MASK]

        masked_sentences.append(tokenizer.convert_tokens_to_string(masked_sentence))
        # ground_truth.append(ground_truth_sentence)

    return masked_sentences, ground_truth

In [3]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load test dataset
file_path = 'data/generated_polymer_smiles_dev.txt'

with open(file_path, 'r') as file:
    psmiles_strings = [line.strip() for line in file]

psmiles_strings = psmiles_strings[:100] #try 100 strings

In [4]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('kuelumbus/polyBERT')
model = AutoModelForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)

# size = '1M'
# tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
# model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained F1 Score: {f1}")

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at kuelumbus/polyBERT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pretrained F1 Score: 0.012944983818770227


In [5]:
# Load tokenizer and model
model = DebertaV2ForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)
tokenizer = DebertaV2Tokenizer.from_pretrained('kuelumbus/polyBERT')
# tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
# model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained1M F1 Score: {f1}")

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at kuelumbus/polyBERT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pretrained1M F1 Score: 0.005742411812961444




In [6]:
# Load tokenizer and model
# model = DebertaV2ForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)
# tokenizer = DebertaV2Tokenizer.from_pretrained('kuelumbus/polyBERT')
size='1M'
tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained1M F1 Score: {f1}")



pretrained1M F1 Score: 0.5572263993316625


In [7]:
# Load tokenizer and model
# model = DebertaV2ForMaskedLM.from_pretrained('kuelumbus/polyBERT').to(device)
# tokenizer = DebertaV2Tokenizer.from_pretrained('kuelumbus/polyBERT')
size='5M'
tokenizer = DebertaV2Tokenizer(f"spm_{size}.model",f"spm_{size}.vocab")
model = DebertaV2ForMaskedLM.from_pretrained(f'model_{size}_final/').to(device)

# Set the model to evaluation mode
model.eval()

# Mask 15% of tokens of each string in test data
masked_psmiles, ground_truth = create_masked_test_set(tokenizer,psmiles_strings)

# Tokenize the sentences
inputs = tokenizer(masked_psmiles, return_tensors='pt', padding=True)
inputs = inputs.to(device)

# Run inference to get predictions for masked tokens
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token IDs for the masked positions
masked_indices = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_ids = predictions[masked_indices].argmax(dim=-1)

# Convert predicted token IDs back to words
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Convert true tokens to token IDs
true_token_ids = tokenizer.convert_tokens_to_ids(ground_truth)

# Compute F1 score (using token IDs for comparison)
f1 = f1_score(true_token_ids, predicted_token_ids.cpu().numpy(), average='micro')

print(f"pretrained1M F1 Score: {f1}")



pretrained1M F1 Score: 0.509976057462091
