# Initialize

In [25]:
# clone the repo
!git clone https://fas38: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@github.com/fas38/nnti-project-25.git

Cloning into 'nnti-project-25'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 41 (delta 15), reused 17 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (41/41), 1.64 MiB | 28.55 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [26]:
# set path
import os
%cd /content/nnti-project-25/
print(os.getcwd())

/content/nnti-project-25
/content/nnti-project-25


In [27]:
# install required packages
!pip install -r requirements.txt



In [28]:
# import dependencies
import os
import torch
from datasets import load_dataset, concatenate_datasets
from datasets import Dataset as HF_Dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random
from google.colab import drive

In [29]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class and Methods

In [30]:
# model class with regression head
class MoLFormerWithRegressionHead(nn.Module):
    # TODO: your code goes here
  def __init__(self,model):
    super(MoLFormerWithRegressionHead, self).__init__()
    self.pretrained = model
    hidden_size = self.pretrained.config.hidden_size

    #dropout?

    self.regression = nn.Linear(hidden_size, 1)


  def forward(self, ids, mask):
    # pass input to the pre-trained model
    output = self.pretrained(ids, attention_mask=mask)
    # extracts the last hidden state
    hidden_states = output.last_hidden_state
    # selects the cls token, represents the summary of the entire sequence
    cls_representation = hidden_states[:, 0, :]

    output = self.regression(cls_representation)
    return output.squeeze(-1) # to remove the last dimension

# dataset class
class SMILESDataset(Dataset):

  def __init__(self, data, tokenizer, max_length):
      self.data = data
      self.tokenizer = tokenizer
      self.max_len = max_length

  def __len__(self):
      return len(self.data)

  def __getitem__(self, idx):
      row = self.data[idx]
      # row = self.data.iloc[idx]
      # row = self.data if isinstance(self.data, dict) else self.data[idx]
      SMILES = row['SMILES']
      label = row['label']

      inputs = self.tokenizer.encode_plus(
      SMILES,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      return_token_type_ids=False,
      truncation=True
  )

      return {
    'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
    'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
    'target': torch.tensor(label, dtype=torch.float)  # Directly convert the target to float
}

## Methods for Computing Influence

In [31]:
def compute_hvp(model, loss, v, max_norm=1.0):
    """
    Computes the Hessian-vector product (HVP)

    Parameters:
    - model: Pre-trained model
    - loss: MSE Output
    - v: Gradient vector
    - max_norm: Maximum allowed norm for HVP.

    Returns:
    - The Hessian-vector product (HVP)
    """
    grads = torch.autograd.grad(loss, model.parameters(), create_graph=True, retain_graph=True)
    flat_grads = torch.cat([g.view(-1) for g in grads])

    hvp = torch.autograd.grad(flat_grads @ v, model.parameters(), retain_graph=True)
    hvp_flat = torch.cat([h.view(-1) for h in hvp])

    # clipping
    hvp_norm = torch.norm(hvp_flat, p=2) # L2 norm
    if hvp_norm > max_norm:
        hvp_flat = hvp_flat * (max_norm / hvp_norm)

    # Debugging: check for explosion
    if torch.norm(hvp_flat) > 1e6:
        print(f"\nExploding values detected in HVP after clipping! Norm: {torch.norm(hvp_flat)}")

    return hvp_flat


def lissa_approximation(model, train_dataloader, v, damping=0.1, num_samples=5, num_iter=100, num_repeats=5, criterion=None):
    """
    Approximates Hessian-inverse-vector product (iHVP) using the stochastic estimation method
    explained in https://arxiv.org/pdf/1703.04730 and https://arxiv.org/pdf/1602.03943

    Parameters:
    - model: Pre-trained model
    - train_dataloader: Dataloader for training data
    - v: Gradient vector
    - damping: Damping factor for stabilization
    - num_samples: Number of training points (t) to sample per iteration
    - num_iter: Number of Taylor approximation iterations
    - num_repeats: Number of times to repeat estimation to reduce variance (r)

    Returns:
    - Approximate inverse Hessian-vector product (iHVP)
    """
    ihvp_estimates = []

    for i in range(num_repeats):
        # H^{-1}_0 v = v
        print(f"Repeat {i+1}/{num_repeats}")
        z = v.clone()

        # sampling training points
        print(f"Dataset length: {len(train_dataloader.dataset)}")
        print(f"Dataset type: {type(train_dataloader.dataset)}")
        indices = torch.randint(len(train_dataloader.dataset), (num_samples,)).tolist()
        sampled_train_data = [train_dataloader.dataset[i] for i in indices]

        # taylor approximation
        for j in range(num_iter):
            print(f"Iteration {j+1}/{num_iter}")
            train_batch = sampled_train_data[j % num_samples] # Filtering the sampled train instances
            train_input_ids = torch.stack([item['input_ids'] for item in sampled_train_data]).to(device)
            train_attention_mask = torch.stack([item['attention_mask'] for item in sampled_train_data]).to(device)
            train_label = torch.stack([item['target'] for item in sampled_train_data]).to(device)

            # Compute Hessian-gradient product using the sampled loss
            train_loss = criterion(
                model(train_input_ids, train_attention_mask).view(-1), train_label
            )
            hvp = compute_hvp(model, train_loss, z)

            # update: H_j^{-1} v = v + (I - H) H_{j-1}^{-1} v
            z = v + (z - hvp)

        ihvp_estimates.append(z)

    return torch.stack(ihvp_estimates).mean(dim=0)

def influence_function(train_point, train_label, grad_test_vector, model, criterion, train_dataloader, num_samples=5, num_iter=5, num_repeats=5):
    """
    Computes the influence of a training point over full test set using stochastic method

    Parameters:
    - train_point: Dictionary containing {'input_ids': tensor, 'attention_mask': tensor}
    - train_label: Target label for the training point
    - grad_test_vector: Precomputed gradient of test loss w.r.t. model parameters
    - model: Pre-trained model
    - criterion: Loss function (MSELoss)
    - train_dataloader: Dataloader for training data
    - num_samples: Number of training points (t) to sample per iteration
    - num_iter: Number of Taylor approximation iterations
    - num_repeats: Number of times to repeat estimation (r)

    Returns:
    - Influence of the training point on the test set
    """

    model.zero_grad()

    # Compute gradient of training loss w.r.t. model parameters
    train_loss = criterion(model(train_point['input_ids'], train_point['attention_mask']).view(-1), train_label)
    grad_train = torch.autograd.grad(train_loss, model.parameters(), retain_graph=True)
    grad_train_vector = torch.cat([g.view(-1) for g in grad_train])

    # Compute Hessian-inverse-vector product using LiSSA
    print("computing ihvp")
    ihvp = lissa_approximation(model, train_dataloader, grad_train_vector, num_samples=num_samples, num_iter=num_iter, num_repeats=num_repeats, criterion=criterion)

    # Compute influence using dot product
    influence = torch.dot(grad_test_vector, ihvp)

    return -influence

# Setting Up Model and Data

In [32]:
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  #MoLFormer model
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"

# load pre-trained model from HuggingFace
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

# load the fine-tuned masked model from task-1
# for mouting drive in google colab
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
mlm_finetuned_model = AutoModel.from_pretrained("./mlm_finetuned_model", local_files_only=True, trust_remote_code=True).to(device) # fine tuned model
mlm_regression_model = MoLFormerWithRegressionHead(mlm_finetuned_model).to(device) # initialize with regression head
# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

# load dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset = load_dataset(DATASET_PATH)

# loading external dataset
ext_data = pd.read_csv("./tasks/External-Dataset_for_Task2.csv")
ext_data = ext_data.iloc[0:5] # select a subset from external dataset
ext_data = ext_data.rename(columns={"Label": "label"}) # making column names consistent
ext_dataset = HF_Dataset.from_pandas(ext_data)
ext_dataset = ext_dataset.remove_columns(["__index__"]) if "__index__" in ext_dataset.column_names else ext_dataset

# train-test-val split
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80:20
train_valid_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
split_train_valid = train_valid_dataset.train_test_split(test_size=0.1, seed=42) # 90:10
train_dataset = split_train_valid["train"]
valid_dataset = split_train_valid["test"]
combined_train = concatenate_datasets([train_dataset, ext_dataset])

# create dataset and dataloader
train_dataset = SMILESDataset(train_dataset, tokenizer, max_length=128)
valid_dataset = SMILESDataset(valid_dataset, tokenizer, max_length=128)
test_dataset  = SMILESDataset(test_dataset, tokenizer, max_length=128)
ext_dataset = SMILESDataset(ext_dataset, tokenizer, max_length=128)
combined_train = SMILESDataset(combined_train, tokenizer, max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_dataloader  = DataLoader(test_dataset, batch_size=16, shuffle=False)
combined_train_dataloader = DataLoader(combined_train, batch_size=16, shuffle=True)
# ext_train_dataloader = DataLoader(ext_dataset, batch_size=16, shuffle=False) # for training the model - batch size 16
ext_influence_dataloader = DataLoader(ext_dataset, batch_size=1, shuffle=False) # for determing influence of each train points - batch size 1

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/nnti-project-25


# Training

In [33]:
EPOCHS_reg = 4
LEARNING_RATE_reg = 1e-7
optimizer_reg = torch.optim.Adam(mlm_regression_model.parameters(), lr=LEARNING_RATE_reg)
loss_fn = nn.MSELoss()

for epoch in range(EPOCHS_reg):
    mlm_regression_model.train()
    total_train_loss = 0.0

    # training with combined set
    for batch in combined_train_dataloader:
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer_reg.zero_grad()
        outputs = mlm_regression_model(input_ids, mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer_reg.step()

        total_train_loss += loss.item() * input_ids.size(0)

    avg_train_loss = total_train_loss / len(train_dataset)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}")

    # Evaluation phase inside the epoch loop using validation set from original dataset
    mlm_regression_model.eval()
    total_valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)
            outputs = mlm_regression_model(input_ids, mask)
            loss = loss_fn(outputs, targets)
            total_valid_loss += loss.item() * input_ids.size(0)

    avg_valid_loss = total_valid_loss / len(valid_dataset)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_valid_loss:.4f}")


Epoch 1 - Train Loss: 5.0597
Epoch 1 - Validation Loss: 4.9108
Epoch 2 - Train Loss: 4.3090
Epoch 2 - Validation Loss: 4.0798
Epoch 3 - Train Loss: 3.6379
Epoch 3 - Validation Loss: 3.4412
Epoch 4 - Train Loss: 3.0849
Epoch 4 - Validation Loss: 2.8947


# Influence Checking

In [34]:
influences = []
criterion = nn.MSELoss()
regression_model = mlm_regression_model

# compute loss over full test set
test_input_ids = []
test_attention_masks = []
test_labels = []
print("Computing test loss gradient...")
grad_test_accum = None
num_test_samples = 0
for test_batch in test_dataloader:
    batch_input_ids = test_batch['input_ids'].to(device)
    batch_attention_mask = test_batch['attention_mask'].to(device)
    batch_labels = test_batch['target'].to(device)

    test_loss = criterion(regression_model(batch_input_ids, batch_attention_mask).squeeze(), batch_labels)
    grad_test = torch.autograd.grad(test_loss, regression_model.parameters(), retain_graph=True)
    grad_test_vector = torch.cat([g.view(-1) for g in grad_test])

    if grad_test_accum is None:
        grad_test_accum = grad_test_vector.clone()
    else:
        grad_test_accum += grad_test_vector

    num_test_samples += batch_labels.shape[0]
grad_test_vector = grad_test_accum / num_test_samples

# Compute influence for each sample in external dataset
for train_batch in ext_influence_dataloader:
    print(f"External data sample {len(influences) + 1}/{len(ext_influence_dataloader)}")
    train_input_ids = train_batch['input_ids'].to(device)
    train_attention_mask = train_batch['attention_mask'].to(device)
    train_label = train_batch['target'].to(device)

    train_point = {
        'input_ids': train_input_ids,
        'attention_mask': train_attention_mask
    }

    # Compute influence
    influence = influence_function(
        train_point, train_label, grad_test_vector,
        regression_model, criterion, train_dataloader
    )

    influences.append(influence.item())
    print(f"Influence for current training batch: {influence.item()}")

# Rank external data points by influence
# ranked_indices = sorted(range(len(influences)), key=lambda i: influences[i], reverse=True)
ranked_indices = sorted(enumerate(influences), key=lambda x: x[1], reverse=True) # sorted by influence score
print("Most influential training points:", ranked_indices)


Computing test loss gradient...
External data sample 1/5
computing ihvp
Repeat 1/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Repeat 2/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Repeat 3/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Repeat 4/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Repeat 5/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5
Influence for current training batch: -945.623291015625
External data sample 2/5
computing ihvp
Repeat 1/5
Dataset length: 3024
Dataset type: <class '__main__.SMILESDataset'>
Iteratio

# Garbage Cleaning

In [None]:
del regression_model
del train_dataset
del test_dataset
del train_dataloader
del test_dataloader
del ext_dataset
del ext_dataloader
torch.cuda.empty_cache()