In [1]:
import os
import shutil

import pandas as pd
import torch
import wandb
from IPython.display import FileLink
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_api_key = user_secrets.get_secret("wandb-key")

In [3]:
# wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mankittriescoding[0m ([33mankittriescoding-indian-institute-of-technology-kharagpur[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Hyperparameters recorded with the W&B run for this experiment.
run_hyperparameters = dict(
    learning_rate=5e-5,
    epochs=4,
    batch_size=16,
    max_len=512,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)
wandb.init(project="lora-bert-finetune", config=run_hyperparameters)

# Handle to the active run's config object.
# NOTE: a later cell assigns the name `config` again.
config = wandb.config

In [5]:
class SMILESDataset(Dataset):
    """Map-style dataset that tokenizes one SMILES string per item.

    Reads the ``"smiles"`` and ``"HIV_active"`` columns of ``dataframe`` and
    tokenizes lazily, one row per ``__getitem__`` call.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the tokenizer, the padded length and the two column arrays."""
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.smiles = dataframe["smiles"].values
        self.labels = dataframe["HIV_active"].values

    def __len__(self):
        """Number of rows available."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``."""
        tokenized = self.tokenizer(
            self.smiles[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer is asked for "pt" tensors; squeeze(0) drops their
        # leading dimension so each sample is returned without it.
        sample = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [6]:
def get_model(model_name="bert-base-uncased", num_labels=2):
    """Load a pretrained BERT sequence-classification model.

    Args:
        model_name: Checkpoint identifier or path handed to ``from_pretrained``.
            Defaults to ``"bert-base-uncased"``.
        num_labels: Number of output classes for the classification head.
            Defaults to 2.

    Returns:
        The ``BertForSequenceClassification`` instance returned by
        ``from_pretrained``.
    """
    return BertForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

In [7]:
# LoRA adapter settings for sequence classification.
# Rank, alpha and dropout are read from the W&B run config (populated by the
# wandb.init cell) instead of being repeated as literals, so the values that
# are logged and the values that are used cannot drift apart.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=wandb.config.lora_r,
    lora_alpha=wandb.config.lora_alpha,
    lora_dropout=wandb.config.lora_dropout,
)

# Backward-compatible alias: the cell that calls get_peft_model reads `config`.
config = lora_config

In [8]:
def train_epoch(device, model, train_loader, optimizer, epoch, checkpoint_dir):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    Args:
        device: Device the batch tensors are moved to before the forward pass.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose a ``.loss`` tensor.
        train_loader: Iterable of dict batches with keys ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``. It does not need ``len()``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index, used in the checkpoint file name and payload.
        checkpoint_dir: Directory for mid-epoch checkpoints (created if missing).

    Returns:
        The mean of the per-batch losses, or ``nan`` when the loader yields
        no batches.
    """
    model.train()

    # exist_ok avoids the check-then-create race and keeps re-runs harmless.
    os.makedirs(checkpoint_dir, exist_ok=True)

    progress = tqdm(train_loader, desc="Training", leave=True)
    total_loss = 0.0
    num_batches = 0
    for batch_idx, batch in enumerate(progress):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it; each .item() call copies the
        # value back to the host.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        progress.set_postfix(loss=batch_loss)
        wandb.log({"batch_loss": batch_loss})

        # Every 500 batches, save a resumable snapshot. The file name only
        # contains the epoch, so later snapshots in the same epoch overwrite
        # earlier ones.
        if (batch_idx + 1) % 500 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch{epoch}.pt")
            torch.save({
                "epoch": epoch,
                "batch_idx": batch_idx,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
            }, checkpoint_path)
            print(f"***Checkpoint saved at {checkpoint_path}***")

    # Divide by the batches actually seen so an empty loader does not raise
    # ZeroDivisionError.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


def train_model(device, model, epochs, train_loader, optimizer, checkpoint_path, train_ckpt=False):
    """Train ``model`` for ``epochs`` epochs, saving a checkpoint after each one.

    Args:
        device: Device passed through to ``train_epoch``.
        model: Model to train; its ``state_dict`` is saved after every epoch.
        epochs: Total number of epochs (exclusive upper bound of the loop).
        train_loader: Batches forwarded to ``train_epoch``.
        optimizer: Optimizer forwarded to ``train_epoch`` and checkpointed.
        checkpoint_path: With ``train_ckpt=False``, the directory checkpoints
            are written to. With ``train_ckpt=True``, the checkpoint *file* to
            resume from; new checkpoints are written next to it.
        train_ckpt: When true, restore model/optimizer state from
            ``checkpoint_path`` before training.
    """
    start_epoch = 0
    checkpoint_dir = checkpoint_path

    if train_ckpt:
        # map_location lets a checkpoint saved on one device be restored on
        # whatever device this session uses.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

        # Mid-epoch snapshots carry "batch_idx": that epoch was not finished,
        # so run it again instead of skipping its remaining batches.
        if "batch_idx" in checkpoint:
            start_epoch = checkpoint["epoch"]
        else:
            start_epoch = checkpoint["epoch"] + 1
        print(f"***Resumed training from epoch {start_epoch}***")

        # checkpoint_path is a file here; joining file names onto it would
        # fail, so save into its parent directory instead.
        checkpoint_dir = os.path.dirname(checkpoint_path) or "."

    for epoch in range(start_epoch, epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        train_loss = train_epoch(device, model, train_loader, optimizer, epoch, checkpoint_dir)
        wandb.log({"epoch_train_loss": train_loss})
        print(f"Training Loss: {train_loss:.4f}")

        epoch_checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch{epoch}_complete.pt")
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }, epoch_checkpoint_path)
        print(f"Checkpoint saved at {epoch_checkpoint_path}")

In [24]:
def eval_model(device, model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and log summary metrics to W&B.

    Predictions are the argmax over the last dimension of the model's logits.
    Returns the dictionary produced by
    ``classification_report(..., output_dict=True)``.
    """
    model.eval()
    predicted, expected = [], []

    progress = tqdm(test_loader, desc="Evaluating", leave=True)
    with torch.no_grad():
        for batch in progress:
            inputs = {
                name: batch[name].to(device)
                for name in ("input_ids", "attention_mask")
            }
            # Labels are not moved to `device`; they are only collected.
            targets = batch["label"]

            logits = model(**inputs).logits
            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            expected.extend(targets.numpy())

    report = classification_report(expected, predicted, output_dict=True)

    macro = report["macro avg"]
    weighted = report["weighted avg"]
    summary = {
        "eval_accuracy": report["accuracy"],
        "eval_precision_macro": macro["precision"],
        "eval_recall_macro": macro["recall"],
        "eval_f1_macro": macro["f1-score"],
        "eval_precision_weighted": weighted["precision"],
        "eval_recall_weighted": weighted["recall"],
        "eval_f1_weighted": weighted["f1-score"],
    }
    wandb.log(summary)

    return report

In [10]:
# Locations of the train and test CSV splits, relative to the working directory.
TRAIN, TEST = (
    "../input/moleculenet-hiv-split/train.csv",
    "../input/moleculenet-hiv-split/test.csv",
)

In [11]:
# Make sure the output directory exists. exist_ok=True makes the cell safe to
# re-run and removes the gap between checking for and creating the directory.
os.makedirs("/kaggle/working/training/", exist_ok=True)

In [12]:
# Read the train split first, then the test split, into DataFrames.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

In [13]:
# Length every tokenized sequence is padded/truncated to by SMILESDataset.
max_len = 512

# Tokenizer loaded from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Classification model that the LoRA adapters are attached to in a later cell.
base_model = get_model()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Wrap the base model with LoRA adapters and report the trainable parameter count.
lora_model = get_peft_model(model=base_model, peft_config=config)
lora_model.print_trainable_parameters()

trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


In [15]:
# Build the loaders directly from freshly constructed datasets; only the
# training loader shuffles.
train_loader = DataLoader(
    SMILESDataset(train_data, tokenizer, max_len),
    batch_size=16,
    shuffle=True,
)
test_loader = DataLoader(
    SMILESDataset(test_data, tokenizer, max_len),
    batch_size=16,
)

# Drop the notebook-level DataFrame names; the loaders are what later cells use.
del train_data, test_data

In [16]:
# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

lora_model.to(device)
wandb.watch(lora_model)
print(f"***Using {device}***")

***Using cuda***


In [17]:
# AdamW over the parameters of the LoRA-wrapped model.
optimizer = torch.optim.AdamW(params=lora_model.parameters(), lr=5e-5)

# NOTE: the training loop defined in this notebook takes its loss from the
# model output and does not reference `criterion`.
criterion = torch.nn.CrossEntropyLoss()

In [18]:
# Total number of training epochs passed to train_model.
epochs = 4

In [20]:
# Train, then write the adapter and tokenizer files into the same directory
# that receives the training checkpoints.
train_model(
    device,
    lora_model,
    epochs,
    train_loader,
    optimizer,
    checkpoint_path="/kaggle/working/training/",
)
lora_model.save_pretrained(save_directory="/kaggle/working/training/")
tokenizer.save_pretrained(save_directory="/kaggle/working/training/")

('/kaggle/working/training/tokenizer_config.json',
 '/kaggle/working/training/special_tokens_map.json',
 '/kaggle/working/training/vocab.txt',
 '/kaggle/working/training/added_tokens.json')

In [21]:
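# Run only if you want to resume training from a saved checkpoint.
# Replace <N> with the epoch number; the file is either "checkpoint_epoch<N>.pt"
# (mid-epoch snapshot) or "checkpoint_epoch<N>_complete.pt" (end of epoch).
# train_model(device, lora_model, epochs, train_loader, optimizer, "/kaggle/working/training/checkpoint_epoch<N>_complete.pt", train_ckpt=True)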

In [22]:
# Hand every file in the training output directory to W&B.
artifact_glob = os.path.join("/kaggle/working/training/", "*")
wandb.save(artifact_glob)



['/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/README.md',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch2_complete.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch3.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch0_complete.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/adapter_config.json',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch2.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/tokenizer_config.json',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch1_complete.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/special_tokens_map.json',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/training/checkpoint_epoch3_complete.pt',
 '/kaggle/working/wandb/run-20250124_155009-rnld8plc/files/tra

In [25]:
# Evaluate on the test loader and print the resulting report dictionary.
print("Test Results:")
test_report = eval_model(device, lora_model, test_loader)
print(test_report)

Test Results:


Evaluating: 100%|██████████| 415/415 [01:52<00:00,  3.69it/s]

{'0': {'precision': 0.9773036768043577, 'recall': 0.9987629503633834, 'f1-score': 0.9879167941266441, 'support': 6467}, '1': {'precision': 0.7142857142857143, 'recall': 0.11764705882352941, 'f1-score': 0.20202020202020202, 'support': 170}, 'accuracy': 0.9761940635829441, 'macro avg': {'precision': 0.845794695545036, 'recall': 0.5582050045934563, 'f1-score': 0.5949684980734231, 'support': 6637}, 'weighted avg': {'precision': 0.9705667393886324, 'recall': 0.9761940635829441, 'f1-score': 0.9677868527889772, 'support': 6637}}





In [26]:
# End the W&B run that was started by wandb.init earlier in the notebook.
wandb.finish()

0,1
batch_loss,▇▅▁██▁▅▃▁▃▁▁▄▃▁▂▁▁▄▇▂▃▆▂▆▁▃▂▁▁▁▂█▁▁▁▁▁▆▇
epoch_train_loss,█▃▂▁
eval_accuracy,▁
eval_f1_macro,▁
eval_f1_weighted,▁
eval_precision_macro,▁
eval_precision_weighted,▁
eval_recall_macro,▁
eval_recall_weighted,▁

0,1
batch_loss,0.02401
epoch_train_loss,0.13778
eval_accuracy,0.97619
eval_f1_macro,0.59497
eval_f1_weighted,0.96779
eval_precision_macro,0.84579
eval_precision_weighted,0.97057
eval_recall_macro,0.55821
eval_recall_weighted,0.97619


In [32]:
# Zip the training output directory into all_set.zip in the working directory.
# (`shutil` is imported in the notebook's import cell.)
shutil.make_archive('all_set', 'zip', '/kaggle/working/training')

'/kaggle/working/all_set.zip'

In [33]:
# Show a clickable link to the archive in the notebook output.
# (`FileLink` is imported in the notebook's import cell.)
FileLink(r'all_set.zip')