In [None]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# Load model & tokenizer
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [26]:
# Load data
data = pd.read_csv('HIV.csv')
data.drop('activity', axis=1, inplace=True)
data.rename(columns={'smiles': 'input', 'HIV_active': 'label'}, inplace=True)

# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

label_0_sample = test_data[test_data['label'] == 0].sample(100, random_state=42)
label_1_sample = test_data[test_data['label'] == 1].sample(100, random_state=42)

test_data = pd.concat([label_0_sample, label_1_sample], ignore_index=True)

print(f'Train data size: {len(train_data)}')
print(f'Test data size: {len(test_data)}')

print(f'\nTest data values: {test_data["label"].value_counts()}')

Train data size: 32901
Test data size: 200

Test data values: label
0    100
1    100
Name: count, dtype: int64


In [27]:
# Prompt template
def make_prompt_zero_shot(smiles):
    return f"""Is the following compound active against HIV? Answer with "yes" or "no".

SMILES: {smiles}
Answer:"""

def make_prompt_few_shot(smiles):
    return f"""You are a chemistry assistant that predicts whether molecules are active against HIV.

Here are some examples:

SMILES: CC1=CC=CC=C1
Answer: no

SMILES: CN1CCCC1C2=CC=CC=C2
Answer: yes

Now evaluate this molecule:

SMILES: {smiles}
Answer:"""


In [None]:
# Prediction function
def predict_hiv_activity(smiles: str, method: str = 'zero-shot') -> int:
    if method == 'zero-shot':
        prompt = make_prompt_zero_shot(smiles)
    elif method == 'few-shot':
        prompt = make_prompt_few_shot(smiles)
    else:
        raise ValueError("Method must be 'zero-shot' or 'few-shot'.")
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=4,
        do_sample=True,
        temperature=0.7
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    if response.startswith("yes"):
        return 1
    elif response.startswith("no"):
        return 0
    else:
        return None

In [32]:
# Apply to all rows with progress bar
tqdm.pandas()
test_data['predicted_label'] = test_data['input'].progress_apply(predict_hiv_activity)

100%|██████████| 200/200 [00:27<00:00,  7.32it/s]


In [33]:
print(f'Accuracy: {accuracy_score(test_data["label"], test_data["predicted_label"]):.4f}')
print(f'F1 Score: {f1_score(test_data["label"], test_data["predicted_label"]):.4f}')
print(f'Precision: {precision_score(test_data["label"], test_data["predicted_label"]):.4f}')
print(f'Recall: {recall_score(test_data["label"], test_data["predicted_label"]):.4f}')
print(f'Matthews Correlation Coefficient: {matthews_corrcoef(test_data["label"], test_data["predicted_label"]):.4f}')

Accuracy: 0.5150
F1 Score: 0.3742
Precision: 0.5273
Recall: 0.2900
Matthews Correlation Coefficient: 0.0336
