In [1]:
import polaris as po
import numpy as np

# Load the competition from the Hub.
# `competition` is reused at the end of the notebook to submit predictions.
competition = po.load_competition("asap-discovery/antiviral-potency-2025")

# Get the train and test data-loaders.
# Later cells index each `train` item as (input, targets) and iterate `test` directly.
train, test = competition.get_train_test_split()

In [2]:
def _drop_missing(inputs, values):
    """Return (inputs, values) restricted to positions whose value is not NaN."""
    kept = [(item, val) for item, val in zip(inputs, values) if not np.isnan(val)]
    return [item for item, _ in kept], [val for _, val in kept]


# Test items are kept exactly as the loader yields them.
test_smis = list(test)

# Materialise the training loader once; element 0 is the input, element 1
# is a mapping from target name to value.
_train_records = list(train)
smis = [record[0] for record in _train_records]
_sars_all = [record[1]['pIC50 (SARS-CoV-2 Mpro)'] for record in _train_records]
_mers_all = [record[1]['pIC50 (MERS-CoV Mpro)'] for record in _train_records]

# Each target gets its own input/label pair with missing (NaN) labels removed.
smis1, y1 = _drop_missing(smis, _sars_all)
smis2, y2 = _drop_missing(smis, _mers_all)


In [3]:

from rdkit import Chem
from rdkit.Chem import Descriptors,rdFingerprintGenerator
# Shared Morgan fingerprint generator: radius 3, 2048-bit vectors.
MFPGEN = rdFingerprintGenerator.GetMorganGenerator(3,fpSize=2048)
def calculateDescriptors(mol: Chem.Mol, missingVal: float | None = 0.0) -> list:
    """Build the feature vector for one molecule.

    The vector is every descriptor in ``Descriptors._descList`` (in list
    order) followed by the bits of the Morgan fingerprint from ``MFPGEN``.

    adapted from
    https://github.com/jonswain/tabpfn-tdc/blob/main/submission.py#L12

    Args:
        mol: Molecule to featurise.
        missingVal: Value stored for any descriptor whose calculation raises.

    Returns:
        A flat list: descriptor values first, fingerprint bits after.
    """
    descriptor_values = []
    for _name, descriptor_fn in Descriptors._descList:
        try:
            value = descriptor_fn(mol)
        except Exception:
            # Best effort: one failing descriptor must not drop the molecule.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            value = missingVal
        descriptor_values.append(value)
    return descriptor_values + list(MFPGEN.GetFingerprint(mol))

# Feature matrices (one row per molecule) for the SARS-CoV-2 and MERS-CoV training subsets.
X1 = list(map(calculateDescriptors, map(Chem.MolFromSmiles, smis1)))
X2 = list(map(calculateDescriptors, map(Chem.MolFromSmiles, smis2)))

In [53]:
import pandas as pd
# Semicolon-separated augmentation table (presumably a ChEMBL export — confirm provenance).
df = pd.read_csv("sarscov2_augment_ic50.csv",sep=";")

# Keep only exact measurements (relation "'='") that actually carry a pChEMBL value.
_usable = df[(df["Standard Relation"] == "'='") & df["pChEMBL Value"].notna()]

x1_aug = []
y1_aug = []
for smi, pchembl in zip(_usable["Smiles"], _usable["pChEMBL Value"]):
    # A missing SMILES cell is read as NaN (a float); RDKit returns None for
    # strings it cannot parse. Skip both instead of crashing the whole loop.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        continue
    # calculateDescriptors already appends the Morgan fingerprint, so it must
    # not be concatenated again here: rows need the same width as X1.
    x1_aug.append(calculateDescriptors(mol))
    y1_aug.append(pchembl)

In [4]:
import os
# Expose only device index "1" through CUDA_VISIBLE_DEVICES.
# NOTE(review): hard-coded, machine-specific GPU index; presumably this has to run
# before any CUDA-using library initialises — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [None]:
#check if it works
from tabpfn import TabPFNRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Hold out half of the SARS-CoV-2 data (fixed seed) for a quick sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.5,
    random_state=42,
)

# Fit TabPFN on the training half.
# NOTE(review): ignore_pretraining_limits presumably lifts TabPFN's input-size limits — confirm.
regressor = TabPFNRegressor(ignore_pretraining_limits=True)
regressor.fit(X_train, y_train)

# Score the held-out half.
predictions = regressor.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Report the three metrics, one per line.
for _label, _score in (
    ("Mean Squared Error (MSE):", mse),
    ("Mean Absolute Error (MAE):", mae),
    ("R² Score:", r2),
):
    print(_label, _score)



In [6]:
# compare to random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Train-test split: half of the MERS-CoV data held out, fixed seed.
# NOTE(review): this baseline is evaluated on X2/y2, whereas the TabPFN check
# cell splits X1/y1, so the two sets of metrics are not computed on the same
# data — confirm which target the comparison is meant to use.
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.5, random_state=42)

# Initialize the regressor. The forest is seeded so the baseline metrics are
# reproducible across runs (same seed as the split).
regressor = RandomForestRegressor(n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

# Predict on the held-out half
predictions = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R² Score:", r2)



Mean Squared Error (MSE): 0.7822980848248262
Mean Absolute Error (MAE): 0.519799525305318
R² Score: 0.13309033981326723


In [None]:
# Fit on the complete training data and predict the competition test set.
X_test = [calculateDescriptors(mol) for mol in map(Chem.MolFromSmiles, test_smis)]

# One fresh TabPFN model per target: (X1, y1) is SARS-CoV-2, (X2, y2) is MERS-CoV.
_per_target_predictions = []
for _features, _labels in ((X1, y1), (X2, y2)):
    regressor = TabPFNRegressor(ignore_pretraining_limits=True)
    regressor.fit(_features, _labels)
    _per_target_predictions.append(regressor.predict(X_test))
sars_predictions, mers_predictions = _per_target_predictions


In [10]:
# Predictions keyed by the competition's target column names.
y_pred = {
    'pIC50 (SARS-CoV-2 Mpro)': sars_predictions,
    'pIC50 (MERS-CoV Mpro)': mers_predictions,
}

# Upload the predictions to the competition.
competition.submit_predictions(
    predictions=y_pred,
    prediction_name="potency-tabPFN",
    prediction_owner="wim0",
    report_url="https://molecular.beauty/blog/2025/03/14/polaris.html",
    # Everything from here on is optional metadata; supplying it is recommended.
    github_url="https://github.com/dehaenw/polaris-baseline",
    description="Source code and report will be made public once challenge is finished",
    user_attributes={
        "Framework": "RDKit + TabPFN",
        "Method": "TabPFN on logtransformed data with ECFP and rdkit descriptors",
    },
)


Output()