# Initialize

In [1]:
# clone the repo
!git clone https://fas38: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@github.com/fas38/nnti-project-25.git

Cloning into 'nnti-project-25'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 50 (delta 20), reused 17 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (50/50), 1.67 MiB | 3.93 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [2]:
import os
from google.colab import drive
# for mouting drive in google colab
drive.mount('/content/drive')
# set path
%cd /content/nnti-project-25/
print(os.getcwd())

Mounted at /content/drive
/content/nnti-project-25
/content/nnti-project-25


In [3]:
# install required packages
!pip install -r requirements.txt

Collecting jupyter (from -r requirements.txt (line 2))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting datasets (from -r requirements.txt (line 7))
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting jupyterlab (from jupyter->-r requirements.txt (line 2))
  Downloading jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.

In [4]:
# import dependencies
import os
import torch
from datasets import load_dataset, concatenate_datasets
from datasets import Dataset as HF_Dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from itertools import islice
import random

In [5]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class and Methods

In [6]:
# model class with regression head
class MoLFormerWithRegressionHead(nn.Module):
    # TODO: your code goes here
  def __init__(self,model):
    super(MoLFormerWithRegressionHead, self).__init__()
    self.pretrained = model
    hidden_size = self.pretrained.config.hidden_size

    self.regression = nn.Linear(hidden_size, 1)


  def forward(self, ids, mask):
    # pass input to the pre-trained model
    output = self.pretrained(ids, attention_mask=mask)
    # extracts the last hidden state
    hidden_states = output.last_hidden_state
    # selects the cls token, represents the summary of the entire sequence
    cls_representation = hidden_states[:, 0, :]

    output = self.regression(cls_representation)
    return output.squeeze(-1) # to remove the last dimension

# dataset class
class SMILESDataset(Dataset):

  def __init__(self, data, tokenizer, max_length):
      self.data = data
      self.tokenizer = tokenizer
      self.max_len = max_length

  def __len__(self):
      return len(self.data)

  def __getitem__(self, idx):
      row = self.data[idx]
      SMILES = row['SMILES']
      label = row['label']

      inputs = self.tokenizer.encode_plus(
      SMILES,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      return_token_type_ids=False,
      truncation=True
  )

      return {
    'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
    'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
    'target': torch.tensor(label, dtype=torch.float)  # Directly convert the target to float
}

## Methods for Computing Influence

In [10]:
def compute_hvp(model, loss, v, max_norm=1.0):
    """
    Computes the Hessian-vector product (HVP) using Fast Exact Multiplication by the Hessian

    Parameters:
    - model: Pre-trained model
    - loss: MSE Output
    - v: Gradient vector
    - max_norm: Maximum allowed norm for HVP.

    Returns:
    - The Hessian-vector product (HVP)
    """
    grads = torch.autograd.grad(loss, model.parameters(), create_graph=True, retain_graph=True)
    flat_grads = torch.cat([g.view(-1) for g in grads])

    hvp = torch.autograd.grad(v @ flat_grads, model.parameters(), retain_graph=True)
    hvp_flat = torch.cat([h.view(-1) for h in hvp])

    # clipping
    hvp_norm = torch.norm(hvp_flat, p=2) # L2 norm
    if hvp_norm > max_norm:
        hvp_flat = hvp_flat * (max_norm / hvp_norm)

    # Debugging: check for explosion
    if torch.norm(hvp_flat) > 1e6:
        print(f"\nExploding values detected in HVP after clipping! Norm: {torch.norm(hvp_flat)}")

    return hvp_flat


def lissa_approximation(model, train_dataloader, v, num_samples=5, num_repeats=5, criterion=None):
    """
    Approximates Hessian-inverse-vector product (iHVP) using the stochastic estimation method
    explained in https://arxiv.org/pdf/1703.04730 and https://arxiv.org/pdf/1602.03943

    Parameters:
    - model: Pre-trained model
    - train_dataloader: Dataloader for training data
    - v: Gradient vector
    - damping: Damping factor for stabilization
    - num_samples: Number of training points (t) to sample per iteration
    - num_iter: Number of Taylor approximation iterations
    - num_repeats: Number of times to repeat estimation to reduce variance (r)

    Returns:
    - Approximate inverse Hessian-vector product (iHVP)
    """
    ihvp_estimates = []

    for i in range(num_repeats):
        # H^{-1}_0 v = v
        print(f"Repeat {i+1}/{num_repeats}")
        z = v.clone()

        # sampling training points for unbiased estimator
        indices = torch.randint(len(train_dataloader.dataset), (num_samples,)).tolist()
        sampled_train_data = [train_dataloader.dataset[i] for i in indices]

        # taylor approximation
        for j in range(num_samples):
            print(f"Iteration {j+1}/{num_samples}")
            train_batch = sampled_train_data[j] # Filtering the sampled train instance
            train_input_ids = train_batch['input_ids'].unsqueeze(0).to(device)
            train_attention_mask = train_batch['attention_mask'].unsqueeze(0).to(device)
            train_label = train_batch['target'].unsqueeze(0).to(device)

            # Compute Hessian-gradient product using the sampled loss
            train_loss = criterion(
                model(train_input_ids, train_attention_mask).view(-1), train_label
            )
            hvp = compute_hvp(model, train_loss, z)

            # update: H_j^{-1} v = v + (I - H) H_{j-1}^{-1} v
            z = (v + (z - hvp)).detach()

        ihvp_estimates.append(z)

    return torch.stack(ihvp_estimates).mean(dim=0)


def compute_test_ihvp(model, test_point, test_label, train_dataloader, num_samples=5, num_repeats=5, criterion=None):
    """
    Precomputes the Hessian-inverse-vector product (iHVP) for a test point

    Parameters:
    - model: Pre-trained model
    - test_point: Dictionary containing {'input_ids': tensor, 'attention_mask': tensor}
    - test_label: Target label for the test point
    - train_dataloader: Dataloader for training data
    - num_samples: Number of steps for Taylor approximation (t)
    - num_repeats: Number of times to repeat estimation to reduce variance (r)
    - criterion: Loss function

    Returns:
    - Precomputed iHVP for the test point.
    """

    model.zero_grad()

    # Compute gradient of test loss w.r.t. model parameters
    test_loss = criterion(model(test_point['input_ids'], test_point['attention_mask']).view(-1), test_label)
    grad_test = torch.autograd.grad(test_loss, model.parameters(), retain_graph=True)
    grad_test_vector = torch.cat([g.view(-1) for g in grad_test])

    # Compute Hessian-inverse-vector product using LiSSA
    print("Computing iHVP...")
    ihvp = lissa_approximation(model, train_dataloader, grad_test_vector, num_samples=num_samples, num_repeats=num_repeats, criterion=criterion)

    return ihvp

def compute_test_ihvp_all(model, test_dataloader, train_dataloader, num_samples=5, num_repeats=5, criterion=None):
    """
    Precomputes the Hessian-inverse-vector product (iHVP) for a test set

    Parameters:
    - model: Pre-trained model
    - test_loader: Dataloader for test data
    - train_dataloader: Dataloader for training data
    - num_samples: Number of steps for Taylor approximation (t)
    - num_repeats: Number of times to repeat estimation to reduce variance (r)
    - criterion: Loss function

    Returns:
    - Precomputed iHVP for the test point.
    """

    model.zero_grad()

    # compute gradient of test loss w.r.t. model parameters
    grad_test_accum = None
    num_test_samples = 0
    for test_batch in test_dataloader:
        test_input_ids = test_batch['input_ids'].to(device)
        test_attention_mask = test_batch['attention_mask'].to(device)
        test_label = test_batch['target'].to(device)

        test_loss = criterion(model(test_input_ids, test_attention_mask).view(-1), test_label)
        grad_test = torch.autograd.grad(test_loss, model.parameters(), retain_graph=True)
        grad_test_vector = torch.cat([g.view(-1) for g in grad_test])

        if grad_test_accum is None:
            grad_test_accum = grad_test_vector.clone()
        else:
            grad_test_accum += grad_test_vector

        num_test_samples += test_label.shape[0]
    grad_test_vector = grad_test_accum / num_test_samples

    # Compute Hessian-inverse-vector product using LiSSA
    print("Computing iHVP...")
    ihvp = lissa_approximation(model, train_dataloader, grad_test_vector, num_samples=num_samples, num_repeats=num_repeats, criterion=criterion)

    return ihvp

def influence_by_train_point(train_point, train_label, ihvp, model, criterion):
    """
    Computes the influence of a training point using the precomputed iHVP for test

    Parameters:
    - train_point: Dictionary containing {'input_ids': tensor, 'attention_mask': tensor}
    - train_label: Target label for the training point
    - ihvp: Precomputed Hessian-inverse-vector product (iHVP) for test
    - model: Pre-trained model
    - criterion: Loss function (MSELoss)

    Returns:
    - Influence of the training point
    """

    model.zero_grad()

    # Compute gradient of training loss w.r.t. model parameters
    train_loss = criterion(model(train_point['input_ids'], train_point['attention_mask']).view(-1), train_label)
    grad_train = torch.autograd.grad(train_loss, model.parameters(), retain_graph=True)
    grad_train_vector = torch.cat([g.view(-1) for g in grad_train])

    # Compute influence using dot product
    influence = - (torch.dot(ihvp, grad_train_vector))

    return influence

# Setting Up Model and Data

In [73]:
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  #MoLFormer model
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"

# load pre-trained model from HuggingFace
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

# load the fine-tuned masked model from task-1
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
mlm_finetuned_model = AutoModel.from_pretrained("./mlm_finetuned_model", local_files_only=True, trust_remote_code=True).to(device) # fine tuned model
mlm_regression_model = MoLFormerWithRegressionHead(mlm_finetuned_model).to(device) # initialize with regression head
# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

# load dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset = load_dataset(DATASET_PATH)

# loading external dataset
ext_data = pd.read_csv("./tasks/External-Dataset_for_Task2.csv")
ext_data = ext_data.rename(columns={"Label": "label"}) # making column names consistent
ext_dataset = HF_Dataset.from_pandas(ext_data)
ext_dataset = ext_dataset.remove_columns(["__index__"]) if "__index__" in ext_dataset.column_names else ext_dataset

# train-test-val split
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80:20
train_valid_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
split_train_valid = train_valid_dataset.train_test_split(test_size=0.1, seed=42) # 90:10
train_dataset = split_train_valid["train"]
valid_dataset = split_train_valid["test"]
combined_train = concatenate_datasets([train_dataset, ext_dataset])

# create dataset and dataloader
train_dataset = SMILESDataset(train_dataset, tokenizer, max_length=128)
valid_dataset = SMILESDataset(valid_dataset, tokenizer, max_length=128)
test_dataset  = SMILESDataset(test_dataset, tokenizer, max_length=128)
ext_dataset = SMILESDataset(ext_dataset, tokenizer, max_length=128)
combined_train = SMILESDataset(combined_train, tokenizer, max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_dataloader  = DataLoader(test_dataset, batch_size=16, shuffle=False)
test_single_dataloader  = DataLoader(test_dataset, batch_size=1, shuffle=False)
combined_train_dataloader = DataLoader(combined_train, batch_size=16, shuffle=True)
# ext_train_dataloader = DataLoader(ext_dataset, batch_size=16, shuffle=False) # for training the model - batch size 16
ext_influence_dataloader = DataLoader(ext_dataset, batch_size=1, shuffle=False) # for determing influence of each train points - batch size 1

/content/nnti-project-25


# Training

In [None]:
EPOCHS_reg = 200
LEARNING_RATE_reg = 1e-7
patience = 5
epochs_no_improve = 0
best_valid_loss = float("inf")
optimizer_reg = torch.optim.Adam(mlm_regression_model.parameters(), lr=LEARNING_RATE_reg)
loss_fn = nn.MSELoss()
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)

for epoch in range(EPOCHS_reg):
    mlm_regression_model.train()
    total_train_loss = 0.0

    # training with combined set
    for batch in combined_train_dataloader:
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer_reg.zero_grad()
        outputs = mlm_regression_model(input_ids, mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer_reg.step()

        total_train_loss += loss.item() * input_ids.size(0)

    avg_train_loss = total_train_loss / len(train_dataset)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}")

    # validation
    mlm_regression_model.eval()
    total_valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)
            outputs = mlm_regression_model(input_ids, mask)
            loss = loss_fn(outputs, targets)
            total_valid_loss += loss.item() * input_ids.size(0)

    avg_valid_loss = total_valid_loss / len(valid_dataset)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_valid_loss:.4f}")

    # early stop check
    if avg_valid_loss < best_valid_loss:
        best_valid_loss = avg_valid_loss
        epochs_no_improve = 0
        torch.save(mlm_regression_model.state_dict(), "best_mlm_regression_model.pth")
        print("Validation loss improved, model saved.")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} consecutive epochs.")
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

    # saving model after each 5 epoch
    if (epoch+1) % 10 == 0:
      save_path = f"mlm_regression_model_ckp_{epoch+1}.pth"
      torch.save(mlm_regression_model.state_dict(), save_path)
      print(f"Model saved at {save_path}")

# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

Epoch 1 - Train Loss: 7.3085
Epoch 1 - Validation Loss: 6.3140
Validation loss improved, model saved.
Epoch 2 - Train Loss: 6.0983
Epoch 2 - Validation Loss: 5.3792
Validation loss improved, model saved.
Epoch 3 - Train Loss: 5.0745
Epoch 3 - Validation Loss: 4.3145
Validation loss improved, model saved.
Epoch 4 - Train Loss: 4.1745
Epoch 4 - Validation Loss: 3.4567
Validation loss improved, model saved.
Epoch 5 - Train Loss: 3.4033
Epoch 5 - Validation Loss: 2.8227
Validation loss improved, model saved.
Epoch 6 - Train Loss: 2.8089
Epoch 6 - Validation Loss: 2.2587
Validation loss improved, model saved.
Epoch 7 - Train Loss: 2.3321
Epoch 7 - Validation Loss: 1.8646
Validation loss improved, model saved.
Epoch 8 - Train Loss: 2.0165
Epoch 8 - Validation Loss: 1.6263
Validation loss improved, model saved.
Epoch 9 - Train Loss: 1.8016
Epoch 9 - Validation Loss: 1.4502
Validation loss improved, model saved.
Epoch 10 - Train Loss: 1.6947
Epoch 10 - Validation Loss: 1.4168
Validation loss i

# Setup Regression Model

In [9]:
influences = []
criterion = nn.MSELoss()

# loading pre-trained mlm regression model
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
mlm_regression_model = MoLFormerWithRegressionHead(mlm_finetuned_model).to(device)
mlm_regression_model.load_state_dict(torch.load("best_mlm_regression_model.pth"))
# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())
regression_model = mlm_regression_model.eval()

  mlm_regression_model.load_state_dict(torch.load("best_mlm_regression_model.pth"))


/content/nnti-project-25


# Computing IHVP for Test

In [None]:
# Computing iHVP for each test point
test_ihvps = []
start_idx = 10
end_idx = 12
sliced_dataloader = islice(test_single_dataloader, start_idx, end_idx)
# flag_count = 0
for _, test_batch in enumerate(sliced_dataloader, start=start_idx):
    print(f"Processing test sample {len(test_ihvps) + 1}/{(end_idx - start_idx)}")
    test_input_ids = test_batch['input_ids'].to(device)
    test_attention_mask = test_batch['attention_mask'].to(device)
    test_label = test_batch['target'].to(device)
    test_point = {'input_ids': test_input_ids, 'attention_mask': test_attention_mask}

    ihvp = compute_test_ihvp(regression_model, test_point, test_label, combined_train_dataloader,
                             num_samples=300, num_repeats=5, criterion=criterion)
    test_ihvps.append(ihvp)
    # flag_count += 1
    # if flag_count == 2:
    #   break

# Save the iHVP
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
torch.save(test_ihvps, f"test_ihvps_{start_idx}_{end_idx}.pt")
os.chdir("/content/nnti-project-25/")
print(os.getcwd())


Processing test sample 1/2
Computing iHVP...
Repeat 1/5
Iteration 1/300
Iteration 2/300
Iteration 3/300
Iteration 4/300
Iteration 5/300
Iteration 6/300
Iteration 7/300
Iteration 8/300
Iteration 9/300
Iteration 10/300
Iteration 11/300
Iteration 12/300
Iteration 13/300
Iteration 14/300
Iteration 15/300
Iteration 16/300
Iteration 17/300
Iteration 18/300
Iteration 19/300
Iteration 20/300
Iteration 21/300
Iteration 22/300
Iteration 23/300
Iteration 24/300
Iteration 25/300
Iteration 26/300
Iteration 27/300
Iteration 28/300
Iteration 29/300
Iteration 30/300
Iteration 31/300
Iteration 32/300
Iteration 33/300
Iteration 34/300
Iteration 35/300
Iteration 36/300
Iteration 37/300
Iteration 38/300
Iteration 39/300
Iteration 40/300
Iteration 41/300
Iteration 42/300
Iteration 43/300
Iteration 44/300
Iteration 45/300
Iteration 46/300
Iteration 47/300
Iteration 48/300
Iteration 49/300
Iteration 50/300
Iteration 51/300
Iteration 52/300
Iteration 53/300
Iteration 54/300
Iteration 55/300
Iteration 56/300
I

In [11]:
# Computing iHVP for full test set
test_ihvps = []
ihvp = compute_test_ihvp_all(regression_model, test_dataloader, combined_train_dataloader,
                          num_samples=300, num_repeats=5, criterion=criterion)
test_ihvps.append(ihvp)

# Save the iHVP
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
torch.save(test_ihvps, f"test_ihvps_full_set.pt")
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

Computing iHVP...
Repeat 1/5
Iteration 1/300
Iteration 2/300
Iteration 3/300
Iteration 4/300
Iteration 5/300
Iteration 6/300
Iteration 7/300
Iteration 8/300
Iteration 9/300
Iteration 10/300
Iteration 11/300
Iteration 12/300
Iteration 13/300
Iteration 14/300
Iteration 15/300
Iteration 16/300
Iteration 17/300
Iteration 18/300
Iteration 19/300
Iteration 20/300
Iteration 21/300
Iteration 22/300
Iteration 23/300
Iteration 24/300
Iteration 25/300
Iteration 26/300
Iteration 27/300
Iteration 28/300
Iteration 29/300
Iteration 30/300
Iteration 31/300
Iteration 32/300
Iteration 33/300
Iteration 34/300
Iteration 35/300
Iteration 36/300
Iteration 37/300
Iteration 38/300
Iteration 39/300
Iteration 40/300
Iteration 41/300
Iteration 42/300
Iteration 43/300
Iteration 44/300
Iteration 45/300
Iteration 46/300
Iteration 47/300
Iteration 48/300
Iteration 49/300
Iteration 50/300
Iteration 51/300
Iteration 52/300
Iteration 53/300
Iteration 54/300
Iteration 55/300
Iteration 56/300
Iteration 57/300
Iteration 5

# Influence for Each External Data Point

In [None]:
# computing influence of each external data point on individual test samples
# load pre-computed iHVP
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
index_ranges = [(0, 10), (10, 12)] # test sample values as stored after computing iHVP
test_ihvps = []
for start_idx, end_idx in index_ranges:
    test_ihvps.extend(torch.load(f"test_ihvps_{start_idx}_{end_idx}.pt"))
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

# Computing influence for each sample in external dataset
influences = []
influences_per_test_sample = []
for train_batch in ext_influence_dataloader:
    print(f"External data sample {len(influences) + 1}/{len(ext_influence_dataloader)}")
    train_input_ids = train_batch['input_ids'].to(device)
    train_attention_mask = train_batch['attention_mask'].to(device)
    train_label = train_batch['target'].to(device)

    train_point = {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}

    # for all test points
    influence_scores = []
    for ihvp in test_ihvps:
        influence = influence_by_train_point(train_point, train_label, ihvp, regression_model, criterion)
        influence_scores.append(influence.item())

    # storing all test scores separately
    influences_per_test_sample.append({
        "train_index": len(influences),
        "influences": influence_scores.copy()
    })
    influences.append(sum(influence_scores) / len(influence_scores)) # mean influence over all test samples
    print(f"Influence for current training sample: {influences[-1]}")

# Rank external data points by influence
ranked_indices = sorted(enumerate(influences), key=lambda x: x[1], reverse=True)  # sorted by influence score
print("Most influential training points:", ranked_indices)

# Save the influence score
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
influence_scores = pd.DataFrame(ranked_indices, columns=["Index", "Influence Score"])
influence_scores.to_csv("ranked_indices.csv", index=False)
# saving the influence for for each individual test samples
influence_scores_separate = pd.DataFrame.from_records(influences_per_test_sample)
influence_scores_separate = influence_scores_separate.explode("influences").reset_index(drop=True)
influence_scores_separate["train_index"] = influence_scores_separate["train_index"].astype(int)
influence_scores_separate["test_index"] = influence_scores_separate.groupby("train_index").cumcount()
influence_scores_separate = influence_scores_separate.pivot(index="train_index", columns="test_index", values="influences")
influence_scores_separate.columns = [f"Test Sample {i+1}" for i in range(len(influence_scores_separate.columns))]
influence_scores_separate.to_csv("influence_scores_separate.csv", index=True)

os.chdir("/content/nnti-project-25/")
print(os.getcwd())

  test_ihvps.extend(torch.load(f"test_ihvps_{start_idx}_{end_idx}.pt"))


/content/nnti-project-25
External data sample 1/300
Influence for current training sample: -184609.02180989584
External data sample 2/300
Influence for current training sample: 103597.876953125
External data sample 3/300
Influence for current training sample: -316927.1953125
External data sample 4/300
Influence for current training sample: 289419.7955729167
External data sample 5/300
Influence for current training sample: -164343.64518229166
External data sample 6/300
Influence for current training sample: -185660.736328125
External data sample 7/300
Influence for current training sample: -96637.15494791667
External data sample 8/300
Influence for current training sample: -166085.224609375
External data sample 9/300
Influence for current training sample: 6098.886393229167
External data sample 10/300
Influence for current training sample: -179477.61783854166
External data sample 11/300
Influence for current training sample: -65699.0517578125
External data sample 12/300
Influence for cur

In [39]:
# computing influence of each external data point on test set
# load pre-computed iHVP
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
test_ihvps.extend(torch.load(f"test_ihvps_full_set.pt"))
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

# Computing influence for each sample in external dataset
influences = []
for train_batch in ext_influence_dataloader:
    print(f"External data sample {len(influences) + 1}/{len(ext_influence_dataloader)}")
    train_input_ids = train_batch['input_ids'].to(device)
    train_attention_mask = train_batch['attention_mask'].to(device)
    train_label = train_batch['target'].to(device)

    train_point = {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}

    ihvp = test_ihvps[0] # pre-computed iHVP over full test set
    influence_scores = influence_by_train_point(train_point, train_label, ihvp, regression_model, criterion)

    influences.append(influence_scores.item())
    print(f"Influence for current training sample: {influence_scores}")

# Rank external data points by influence
ranked_indices = sorted(enumerate(influences), key=lambda x: x[1], reverse=True)  # sorted by influence score
print("Most influential training points:", ranked_indices)

# Save the influence score
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
influence_scores = pd.DataFrame(ranked_indices, columns=["Index", "Influence Score"])
influence_scores.to_csv("ranked_indices_full_set.csv", index=False)
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

  test_ihvps.extend(torch.load(f"test_ihvps_full_set.pt"))


/content/nnti-project-25
External data sample 1/300
Influence for current training sample: -180.46450805664062
External data sample 2/300
Influence for current training sample: 95.83807373046875
External data sample 3/300
Influence for current training sample: -339.0596618652344
External data sample 4/300
Influence for current training sample: 648.889892578125
External data sample 5/300
Influence for current training sample: -157.52743530273438
External data sample 6/300
Influence for current training sample: -267.3226623535156
External data sample 7/300
Influence for current training sample: -128.4178009033203
External data sample 8/300
Influence for current training sample: -132.17430114746094
External data sample 9/300
Influence for current training sample: 37.25246047973633
External data sample 10/300
Influence for current training sample: -290.90106201171875
External data sample 11/300
Influence for current training sample: -105.2376708984375
External data sample 12/300
Influence 

# Fine Tuning Model with Influencial Points

In [74]:
# load the influence score
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
influence_scores = pd.read_csv("ranked_indices_full_set.csv")
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

positive_count = influence_scores[influence_scores['Influence Score'] > 0].shape[0]
print(positive_count)
print(influence_scores.iloc[100])

/content/nnti-project-25
161
Index               8.00000
Influence Score    37.25246
Name: 100, dtype: float64


In [75]:
# setting new train data with top 100 influential ext_data points
ext_data = pd.read_csv("./tasks/External-Dataset_for_Task2.csv")
ext_data = ext_data.iloc[influence_scores.iloc[:100]['Index']] # selecting top 100 points
ext_data = ext_data.rename(columns={"Label": "label"}) # making column names consistent
ext_dataset = HF_Dataset.from_pandas(ext_data)
ext_dataset = ext_dataset.remove_columns(["__index__"]) if "__index__" in ext_dataset.column_names else ext_dataset

# train-test-val split
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80:20
train_valid_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
split_train_valid = train_valid_dataset.train_test_split(test_size=0.1, seed=42) # 90:10
train_dataset = split_train_valid["train"]
valid_dataset = split_train_valid["test"]
combined_train = concatenate_datasets([train_dataset, ext_dataset])

# create dataset and dataloader
combined_train = SMILESDataset(combined_train, tokenizer, max_length=128)
valid_dataset = SMILESDataset(valid_dataset, tokenizer, max_length=128)
test_dataset  = SMILESDataset(test_dataset, tokenizer, max_length=128)
combined_train_dataloader = DataLoader(combined_train, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_dataloader  = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [76]:
EPOCHS_reg = 200
LEARNING_RATE_reg = 1e-7
patience = 5
epochs_no_improve = 0
best_valid_loss = float("inf")
optimizer_reg = torch.optim.Adam(mlm_regression_model.parameters(), lr=LEARNING_RATE_reg)
loss_fn = nn.MSELoss()
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)

for epoch in range(EPOCHS_reg):
    mlm_regression_model.train()
    total_train_loss = 0.0

    # training with combined set
    for batch in combined_train_dataloader:
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer_reg.zero_grad()
        outputs = mlm_regression_model(input_ids, mask)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer_reg.step()

        total_train_loss += loss.item() * input_ids.size(0)

    avg_train_loss = total_train_loss / len(train_dataset)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}")

    # validation
    mlm_regression_model.eval()
    total_valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)
            outputs = mlm_regression_model(input_ids, mask)
            loss = loss_fn(outputs, targets)
            total_valid_loss += loss.item() * input_ids.size(0)

    avg_valid_loss = total_valid_loss / len(valid_dataset)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_valid_loss:.4f}")

    # early stop check
    if avg_valid_loss < best_valid_loss:
        best_valid_loss = avg_valid_loss
        epochs_no_improve = 0
        torch.save(mlm_regression_model.state_dict(), "best_mlm_regression_model_task2.pth")
        print("Validation loss improved, model saved.")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} consecutive epochs.")
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())

Epoch 1 - Train Loss: 6.5601
Epoch 1 - Validation Loss: 6.0653
Validation loss improved, model saved.
Epoch 2 - Train Loss: 5.5992
Epoch 2 - Validation Loss: 5.1098
Validation loss improved, model saved.
Epoch 3 - Train Loss: 4.7369
Epoch 3 - Validation Loss: 4.2993
Validation loss improved, model saved.
Epoch 4 - Train Loss: 3.9592
Epoch 4 - Validation Loss: 3.5940
Validation loss improved, model saved.
Epoch 5 - Train Loss: 3.3479
Epoch 5 - Validation Loss: 2.9269
Validation loss improved, model saved.
Epoch 6 - Train Loss: 2.8080
Epoch 6 - Validation Loss: 2.4272
Validation loss improved, model saved.
Epoch 7 - Train Loss: 2.3320
Epoch 7 - Validation Loss: 2.0372
Validation loss improved, model saved.
Epoch 8 - Train Loss: 2.0253
Epoch 8 - Validation Loss: 1.7211
Validation loss improved, model saved.
Epoch 9 - Train Loss: 1.7779
Epoch 9 - Validation Loss: 1.5551
Validation loss improved, model saved.
Epoch 10 - Train Loss: 1.6379
Epoch 10 - Validation Loss: 1.4419
Validation loss i

In [79]:
# test evaluation
influences = []
criterion = nn.MSELoss()

# loading pre-trained mlm regression model
path = '/content/drive/My Drive/Colab Notebooks/nnti/'
os.chdir(path)
mlm_regression_model = MoLFormerWithRegressionHead(mlm_finetuned_model).to(device)
mlm_regression_model.load_state_dict(torch.load("best_mlm_regression_model_task2.pth"))
# reset the path to git repo
os.chdir("/content/nnti-project-25/")
print(os.getcwd())
# regression_model = mlm_regression_model.eval()

mlm_regression_model.eval()
total_test_loss = 0.0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)
        outputs = mlm_regression_model(input_ids, mask)
        loss = loss_fn(outputs, targets)
        total_test_loss += loss.item() * input_ids.size(0)

avg_test_loss = total_test_loss / len(test_dataset)
print()
print(f"Fine Tuned Model with Influential Ext Sample Test Loss: {avg_test_loss:.4f}")

  mlm_regression_model.load_state_dict(torch.load("best_mlm_regression_model_task2.pth"))


/content/nnti-project-25

Fine Tuned Model with Influential Ext Sample Test Loss: 0.9426


# Garbage Cleaning

In [None]:
del regression_model
del train_dataset
del test_dataset
del train_dataloader
del test_dataloader
del ext_dataset
del ext_dataloader
torch.cuda.empty_cache()