In [1]:
import copy

import numpy as np
import pandas as pd
import torch
import yaml
from peft import PeftModel
from transformers import RobertaForSequenceClassification, RobertaTokenizer

from PolymerSmilesTokenization import PolymerSmilesTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths to the best model checkpoints, plus loading of the fine-tuning config,
# the test CSV file, the base model, and the tokenizer.

# --- Checkpoint, Test File, and Config Paths ---
# One checkpoint directory per target property.
Tg_checkpoint_path = "ckpt/neurips.pt/Tg"
FFV_checkpoint_path = "ckpt/neurips.pt/FFV"
Tc_checkpoint_path = "ckpt/neurips.pt/Tc"
density_checkpoint_path = "ckpt/neurips.pt/density"
Rg_checkpoint_path = "ckpt/neurips.pt/Rg"

# The inference cell iterates this list in order, so its order fixes the order
# of the per-checkpoint prediction columns.
checkpoints = [Tg_checkpoint_path, FFV_checkpoint_path, Tc_checkpoint_path,
               density_checkpoint_path, Rg_checkpoint_path]
test_csv = "data/neurips-open-polymer-prediction-2025/test.csv"

# Read the config inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): FullLoader is kept for compatibility with the existing config;
# if the file is plain YAML, yaml.safe_load is the stricter choice.
with open("config_finetune.yaml", "r") as config_file:
    finetune_config = yaml.load(config_file, Loader=yaml.FullLoader)

# --- General Configuration ---
num_properties = 1  # Set to number of regression targets
blocksize = 411     # Set to match training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Load Test File  ---
df = pd.read_csv(test_csv)
smiles_list = df["SMILES"].tolist()
ids = df["id"].tolist()
print(f"Shape of ids is {np.shape(ids)}")

# --- Load Model & Tokenizer ---
# Base model with a single-output regression head; the per-property
# checkpoints are applied on top of it in the inference cell.
base_model = RobertaForSequenceClassification.from_pretrained(
    finetune_config['model_path'],
    num_labels=num_properties,
    problem_type="regression"
)

# Loads the "roberta-base" tokenizer files through the polymer-specific
# tokenizer class (transformers warns about the class mismatch when it does).
tokenizer = PolymerSmilesTokenizer.from_pretrained("roberta-base", max_len=blocksize)

Shape of ids is (3,)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ckpt/pretrain.pt and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PolymerSmilesTokenizer'.


In [3]:
# Loop over the per-property checkpoints, run inference with each one, and
# collect the predictions that the save cell writes to the submission file.

# Sample ids as an int column vector of shape (num_samples, 1).
ID = np.array(ids).astype(int).reshape(-1, 1)
print(ID)

# The Tg head's raw output is multiplied by TG_MAX_TEMP and then shifted by
# -KELVIN_OFFSET.
# NOTE(review): presumably this undoes a max-normalisation (in Kelvin) applied
# during training and converts to degrees Celsius -- confirm against the
# training code. An alternative mean/std de-standardisation
# (mean = 375.7287743413897, std = 111.55579067640443) is left disabled.
TG_MAX_TEMP = 745.4
KELVIN_OFFSET = 273.15

results = []

for item in checkpoints:

    # Loop-invariant: only the Tg checkpoint's outputs get rescaled.
    is_tg_checkpoint = item == Tg_checkpoint_path
    loopresults = []

    # --- Load Checkpoint Models ---
    # PEFT modifies the model it is handed in place, so wrapping the same
    # `base_model` for every checkpoint can let state from a previous adapter
    # carry over. Give each checkpoint its own copy of the pristine base model.
    model = PeftModel.from_pretrained(copy.deepcopy(base_model), item)
    model.to(device)
    model.eval()

    # --- Inference ---
    with torch.no_grad():
        for smiles in smiles_list:
            encoding = tokenizer(
                str(smiles),
                add_special_tokens=True,
                max_length=blocksize,
                return_token_type_ids=False,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            input_ids = encoding["input_ids"].to(device)
            attention_mask = encoding["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Single sample, single regression target -> squeeze to a scalar.
            preds = outputs.logits.squeeze().cpu().numpy()

            if is_tg_checkpoint:
                preds = preds * TG_MAX_TEMP - KELVIN_OFFSET

            loopresults.append(preds)

    results.append(loopresults)

# (num_checkpoints, num_samples) -> (num_samples, num_checkpoints), so each
# row is one sample and each column one property, in `checkpoints` order.
results = np.transpose(results)
print(results)

[[1109053969]
 [1422188626]
 [2032016830]]




[[162.57144      0.3814352    0.22103797   1.1560313   23.261528  ]
 [191.25104      0.38276163   0.26801115   0.98656523  21.147026  ]
 [ 88.973175     0.36296433   0.27237323   1.1232697   18.875105  ]]


In [5]:
# --- Save Results ---
# Assemble the submission table: the id column first, followed by one column
# per predicted property, then write it to disk without the index.

property_columns = ["Tg", "FFV", "Tc", "Density", "Rg"]

ids_df = pd.DataFrame(ID, columns=["id"])
results_df = pd.DataFrame(results, columns=property_columns)

# Column-wise concatenation; both frames share the default RangeIndex.
final_results_df = pd.concat((ids_df, results_df), axis="columns")
final_results_df.to_csv("submission1.csv", index=False)

print("Inference complete. Results saved to submission1.csv.")

Inference complete. Results saved to submission1.csv.
