# Initialize

In [2]:
# clone the repo
!git clone https://fas38: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@github.com/fas38/nnti-project-25.git

Cloning into 'nnti-project-25'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 29 (delta 7), reused 17 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (29/29), 1.63 MiB | 19.19 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [3]:
# set path
import os
%cd /content/nnti-project-25/
print(os.getcwd())

/content/nnti-project-25
/content/nnti-project-25


In [4]:
# install required packages
!pip install -r requirements.txt

Collecting jupyter (from -r requirements.txt (line 2))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting datasets (from -r requirements.txt (line 7))
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting jupyterlab (from jupyter->-r requirements.txt (line 2))
  Downloading jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.

In [3]:
# import dependencies
import os
import torch
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random

In [None]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Setting Up Model and Data

In [1]:
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  #MoLFormer model

In [4]:
# load pre-trained model from HuggingFace
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# model with regression head
class MoLFormerWithRegressionHead(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        hidden_size = base_model.config.hidden_size
        self.regression_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        return self.regression_head(pooled_output)



In [8]:
# Instantiate model
regression_model = MoLFormerWithRegressionHead(model).to(device)

## Loading the Dataset

In [6]:
class SMILESDataset(Dataset):
    def __init__(self, smiles_list, targets, tokenizer, max_length=128):
        self.smiles_list = smiles_list
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        target = torch.tensor(self.targets[idx], dtype=torch.float)

        # Tokenize the SMILES string
        encoding = self.tokenizer(
            smiles,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "target": target
        }

# loading dataset
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset = load_dataset(DATASET_PATH)
df = pd.DataFrame(dataset["train"])
smiles_list = df["SMILES"].tolist()
targets = df["label"].tolist()
smiles_dataset = SMILESDataset(smiles_list, targets, tokenizer)

# creating train-test split
train_size = int(0.7 * len(smiles_dataset))
test_size = len(smiles_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(smiles_dataset, [train_size, test_size])

# creating dataloaders
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training the Regression Model

## Training

In [10]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(regression_model.parameters(), lr=5e-5)

num_epochs = 5
regression_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        optimizer.zero_grad()
        predictions = regression_model(input_ids, attention_mask).squeeze()
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/5, Loss: 1.2897
Epoch 2/5, Loss: 0.6066
Epoch 3/5, Loss: 0.4248
Epoch 4/5, Loss: 0.3362
Epoch 5/5, Loss: 0.2761


## Evaluating

In [11]:
regression_model.eval()
total_loss = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        predictions = regression_model(input_ids, attention_mask).squeeze()
        loss = criterion(predictions, targets)
        total_loss += loss.item()

    avg_loss = total_loss / len(test_dataloader)
    print(f"Test Loss: {avg_loss:.4f}")


Test Loss: 0.4409


## Save and Load

In [26]:
# save model to google drive
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
torch.save(regression_model.state_dict(), "regression_model_dummy.pth")

# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

In [7]:
# loading saved model
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regression_model = MoLFormerWithRegressionHead(model).to(device)
regression_model.load_state_dict(torch.load("regression_model_dummy.pth"))
regression_model.eval()

# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  regression_model.load_state_dict(torch.load("regression_model_dummy.pth"))


/content/nnti-project-25


# Implementation of Influence Function

## resource

In [None]:
# place to start - https://github.com/nimarb/pytorch_influence_functions
# https://github.com/xbeat/Machine-Learning/blob/main/Second-Order%20Equations%20in%20Machine%20Learning%20Algorithms%20Using%20Python.md

## loading external dataset

In [8]:
# loading external dataset
ext_data = pd.read_csv("./tasks/External-Dataset_for_Task2.csv")
ext_data = ext_data.iloc[0:5]
ext_smiles_list = ext_data["SMILES"].tolist()
ext_targets = ext_data["Label"].tolist()
ext_dataset = SMILESDataset(ext_smiles_list, ext_targets, tokenizer)

# creating dataloaders
batch_size = 1
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
ext_dataloader = DataLoader(ext_dataset, batch_size=batch_size, shuffle=False)

## computation functions

In [9]:
def compute_hvp(loss, model, v, max_norm=10.0):
    """
    Computes Hessian-vector product (HVP) with gradient clipping

    Parameters:
    - loss: Computed scalar loss function
    - model: Pre-trained model
    - v: Gradient vector
    - max_norm: Maximum allowed norm for HVP to prevent explosions

    Returns:
    - Hessian-vector product (HVP)
    """

    # first order gradient
    grads = torch.autograd.grad(loss, model.parameters(), create_graph=True, retain_graph=True)
    flat_grads = torch.cat([g.view(-1) for g in grads])

    # hessian-gradient product
    hvp = torch.autograd.grad(flat_grads @ v, model.parameters(), retain_graph=True)
    hvp_flat = torch.cat([h.view(-1) for h in hvp])

    # norm
    hvp_norm = torch.norm(hvp_flat)

    # clipping
    if hvp_norm > max_norm:
        hvp_flat = (hvp_flat / hvp_norm) * max_norm

    # Debugging: check for explosion
    if torch.norm(hvp_flat) > 1e6:
        print(f"\nExploding values detected in HVP after clipping! Norm: {torch.norm(hvp_flat)}")

    return hvp_flat


def lissa_approximation(loss, model, v, damping=0.1, num_samples=5, num_iter=100):
    """
    Approximates Hessian-inverse-vector product (iHVP) using LiSSA

    Parameters:
    - loss: Computed scalar loss function
    - model: pre-trained model
    - v: Gradient vector
    - damping: Damping factor for LiSSA
    - num_samples: Number of independent samples - S1
    - num_iter: Number of recursive iterations - S2

    Returns:
    - Approximate inverse Hessian-vector product (iHVP)
    """
    ihvp_estimates = []

    for i in range(num_samples):  # S1
        z = v.clone()
        for j in range(num_iter):  # S2
            hvp = compute_hvp(loss, model, z)
            # Debugging: check for nan during hessian-gradient product
            if torch.isnan(hvp).any() or torch.isinf(hvp).any():
              print(f"NaN detected in HVP at iteration {j}!")
            z = v + (1 - damping) * (z - hvp)

        ihvp_estimates.append(z)

    ihvp = torch.stack(ihvp_estimates).mean(dim=0)
    return ihvp

def influence_on_test_set(train_point, train_label, grad_test_vector, model, criterion, num_iter=100):
    """
    Computes the influence of a training point on the test set using precomputed test gradient

    Parameters:
    - train_point: Dictionary containing {'input_ids': tensor, 'attention_mask': tensor}
    - train_label: targets
    - grad_test_vector: Precomputed gradient of test loss w.r.t. model parameters
    - model: Pre-trained model
    - criterion: Loss function
    - num_iter: Number of LiSSA iterations for S2

    Returns:
    - Influence of the training point on the test loss
    """

    model.zero_grad()

    # Compute gradient of training loss w.r.t. model parameters
    print("on train")
    # Debugging: check for shape in train
    print(model(train_point['input_ids'], train_point['attention_mask']).view(-1).shape, train_label.shape)
    train_loss = criterion(model(train_point['input_ids'], train_point['attention_mask']).view(-1), train_label)
    grad_train = torch.autograd.grad(train_loss, model.parameters(), retain_graph=True)
    grad_train_vector = torch.cat([g.view(-1) for g in grad_train])
    # Debugging: check for nan in train loss or gradient
    if torch.isnan(train_loss) or torch.isnan(grad_train_vector).any():
        print("NaN detected in training loss or gradients!")
        exit()

    # Compute Hessian-inverse-vector product using LiSSA
    ihvp = lissa_approximation(train_loss, model, grad_train_vector, num_iter=num_iter)

    # Compute influence using pre-computed test gradient
    print("on test")
    influence = torch.dot(grad_test_vector, ihvp)

    return -influence / len(grad_test_vector)


## driver code - computing influence

In [10]:
influences = []
criterion = nn.MSELoss()
test_input_ids = []
test_attention_masks = []
test_labels = []

# compute overall test loss
print("Computing test loss gradient...")
grad_test_accum = None
num_test_samples = 0

for test_batch in test_dataloader:
    batch_input_ids = test_batch['input_ids'].to(device)
    batch_attention_mask = test_batch['attention_mask'].to(device)
    batch_labels = test_batch['target'].to(device)

    test_loss = criterion(regression_model(batch_input_ids, batch_attention_mask).squeeze(), batch_labels)
    grad_test = torch.autograd.grad(test_loss, regression_model.parameters(), retain_graph=True)
    grad_test_vector = torch.cat([g.view(-1) for g in grad_test])

    if grad_test_accum is None:
        grad_test_accum = grad_test_vector.clone()
    else:
        grad_test_accum += grad_test_vector

    num_test_samples += batch_labels.shape[0]

grad_test_vector = grad_test_accum / num_test_samples


# computing influence for each data point in external dataset
for train_batch in ext_dataloader:
    print(f"External data sample {len(influences) + 1}/{len(ext_dataloader)}")


    train_input_ids = train_batch['input_ids'].to(device)
    train_attention_mask = train_batch['attention_mask'].to(device)
    train_label = train_batch['target'].to(device)

    print("Computing influence...")
    influence = influence_on_test_set(
        {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}, train_label,
        grad_test_vector,
        regression_model, criterion
    )
    influences.append(influence.item())
    print(f"Influence for current training batch: {influence.item()}")

# Rank training points by influence
ranked_indices = sorted(range(len(influences)), key=lambda i: influences[i], reverse=True)
print("Most influential training points:", ranked_indices)


Computing test loss gradient...
External data sample 1/5
Computing influence...
on train
torch.Size([1]) torch.Size([1])
on test
Influence for current training batch: -1.5946781672937504e-07
External data sample 2/5
Computing influence...
on train
torch.Size([1]) torch.Size([1])
on test
Influence for current training batch: -5.369622613216052e-08
External data sample 3/5
Computing influence...
on train
torch.Size([1]) torch.Size([1])
on test
Influence for current training batch: -9.149073889602732e-07
External data sample 4/5
Computing influence...
on train
torch.Size([1]) torch.Size([1])
on test
Influence for current training batch: 5.023048288421705e-07
External data sample 5/5
Computing influence...
on train
torch.Size([1]) torch.Size([1])
on test
Influence for current training batch: 4.1415549389967055e-07
Most influential training points: [3, 4, 1, 0, 2]


# Garbage Cleaning

In [11]:
del regression_model
del train_dataset
del test_dataset
del train_dataloader
del test_dataloader
del ext_dataset
del ext_dataloader
torch.cuda.empty_cache()