# Initialize

In [1]:
# clone the repo
!git clone https://fas38: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@github.com/fas38/nnti-project-25.git

Cloning into 'nnti-project-25'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 22 (delta 4), reused 17 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 1.47 MiB | 10.86 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [2]:
# set path
# Change the working directory to the cloned repository so that the relative paths
# used later in the notebook (requirements.txt, ./tasks/...) resolve against the repo root.
import os
%cd /content/nnti-project-25/
print(os.getcwd())  # echo the working directory to confirm the change took effect

/content/nnti-project-25
/content/nnti-project-25


In [3]:
# install required packages
!pip install -r requirements.txt

Collecting jupyter (from -r requirements.txt (line 2))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting datasets (from -r requirements.txt (line 7))
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting jupyterlab (from jupyter->-r requirements.txt (line 2))
  Downloading jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 6))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.

In [4]:
# import dependencies
import torch
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random

# Setting Up Model and Data

In [5]:
# Hugging Face Hub model id; passed to from_pretrained() for both the model and its tokenizer.
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  # MoLFormer model

In [6]:
# load pre-trained model from HuggingFace
# NOTE(review): trust_remote_code=True runs the modeling code downloaded from the model
# repository, and no revision is pinned here, so a re-run may fetch newer code files.
# Consider passing revision=<commit hash> to from_pretrained -- TODO confirm which revision.
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

In [7]:
# model with regression head
class MoLFormerWithRegressionHead(nn.Module):
    """Encoder wrapper that maps the first-position hidden state to one scalar.

    The wrapped ``base_model`` must expose ``config.hidden_size`` and, when called
    with ``input_ids`` and ``attention_mask`` keyword arguments, return an object
    with a ``last_hidden_state`` attribute.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # Single output unit: one regression value per input sequence.
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Run the encoder and apply the linear head to position 0 of each sequence."""
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the pooled representation.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.regression_head(first_token_state)



In [8]:
# Instantiate model
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the pre-trained encoder with the regression head and move the module to `device`.
regression_model = MoLFormerWithRegressionHead(model).to(device)

## Loading the Dataset

In [9]:
class SMILESDataset(Dataset):
    """Dataset pairing SMILES strings with float targets; tokenizes on access."""

    def __init__(self, smiles_list, targets, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.smiles_list = smiles_list
        self.targets = targets

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        """Return the tokenized idx-th SMILES string together with its target tensor."""
        smiles = self.smiles_list[idx]
        label = torch.tensor(self.targets[idx], dtype=torch.float)

        # Pad/truncate every item to max_length so items can be stacked into batches.
        tokens = self.tokenizer(
            smiles,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )

        # return_tensors="pt" yields a leading axis of size 1; drop it per item.
        item = {key: tokens[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["target"] = label
        return item

# loading dataset
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset = load_dataset(DATASET_PATH)
df = pd.DataFrame(dataset["train"])
smiles_list = df["SMILES"].tolist()
targets = df["label"].tolist()
smiles_dataset = SMILESDataset(smiles_list, targets, tokenizer)

# creating train-test split
# The split and the training shuffle are seeded so that Restart & Run All reproduces
# the same train/test partition (and therefore comparable losses and influence
# scores); without a generator, random_split draws a different partition on every run.
SPLIT_SEED = 42
train_size = int(0.7 * len(smiles_dataset))
test_size = len(smiles_dataset) - train_size  # remainder, so the two sizes always sum to the total
train_dataset, test_dataset = torch.utils.data.random_split(
    smiles_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# creating dataloaders
batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),  # reproducible shuffling order
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenization_molformer_fast.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

tokenization_molformer.py:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer_fast.py
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

lipophilicity.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

# Training the Regression Model

## Training

In [10]:
# Fine-tune the whole model (encoder + regression head) with mean-squared-error loss.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(regression_model.parameters(), lr=5e-5)

num_epochs = 5
regression_model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_targets so the module-level `targets` list built while loading
        # the dataset is not silently rebound to a tensor by this loop.
        batch_targets = batch["target"].to(device)

        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension of the head's output.
        # A bare squeeze() would also remove the batch dimension for a one-sample batch,
        # producing a 0-d prediction that no longer matches the target shape.
        predictions = regression_model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(predictions, batch_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the mean of the per-batch mean losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/5, Loss: 1.1544
Epoch 2/5, Loss: 0.5790
Epoch 3/5, Loss: 0.4255
Epoch 4/5, Loss: 0.3190
Epoch 5/5, Loss: 0.2398


## Evaluating

In [18]:
# Evaluate on the held-out split without building autograd graphs.
regression_model.eval()
total_loss = 0.0
num_samples = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_targets so the module-level `targets` list is not rebound here.
        batch_targets = batch["target"].to(device)

        # squeeze(-1) drops only the trailing output dimension; a bare squeeze() would
        # also drop the batch dimension when a batch holds a single sample.
        predictions = regression_model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(predictions, batch_targets)

        # `criterion` is nn.MSELoss() with its default mean reduction, so weight each
        # batch mean by its size: the last batch is usually smaller, and a plain mean
        # of batch means would over-weight its samples.
        batch_count = batch_targets.size(0)
        total_loss += loss.item() * batch_count
        num_samples += batch_count

    # max(..., 1) avoids a ZeroDivisionError if the test loader is empty.
    avg_loss = total_loss / max(num_samples, 1)
    print(f"Test Loss: {avg_loss:.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 4.12 MiB is free. Process 28800 has 14.73 GiB memory in use. Of the allocated memory 14.44 GiB is allocated by PyTorch, and 133.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Implementation of Influence Function

In [None]:
# place to start - https://github.com/nimarb/pytorch_influence_functions

In [13]:
def compute_hvp(loss, model, v):
    """Compute the Hessian-vector product H @ v of `loss` w.r.t. the model parameters.

    Args:
        loss: scalar tensor whose autograd graph is still alive. The graph is
            retained, so this function can be called repeatedly with the same loss.
        model: module whose parameters define H. Only parameters with
            ``requires_grad=True`` take part, so frozen parameters are skipped
            instead of making ``autograd.grad`` raise.
        v: 1-D tensor with one entry per participating parameter element, in
            ``model.parameters()`` order.

    Returns:
        1-D tensor with the same layout as `v` holding H @ v (not attached to a graph).
    """
    params = [p for p in model.parameters() if p.requires_grad]
    # v is a constant of the product; detaching guarantees no gradient flows through it.
    v = v.detach()

    # create_graph=True keeps the first-order gradients differentiable (and implies
    # that the graph of `loss` is retained).
    grads = torch.autograd.grad(loss, params, create_graph=True)
    # reshape (not view) also handles gradients that are not contiguous in memory.
    flat_grads = torch.cat([g.reshape(-1) for g in grads])

    # d/dtheta (grad . v) = H @ v. retain_graph=True is required here: this backward
    # pass walks through the graph of `loss`, which callers reuse on the next call.
    hvp = torch.autograd.grad(flat_grads @ v, params, retain_graph=True)
    return torch.cat([h.reshape(-1) for h in hvp])


def lissa_approximation(loss, model, v, damping=0.01, num_iter=100, scale=1.0):
    """Approximate the inverse-Hessian-vector product (iHVP) with the LiSSA recursion.

    Iterates ``h <- v + (1 - damping) * h - (H @ h) / scale`` starting from ``h = v``
    and returns ``h / scale``. At its fixed point this equals
    ``(H + scale * damping * I)^-1 @ v``, where H is the Hessian of `loss` as
    computed by ``compute_hvp``.

    The recursion only converges when every eigenvalue of ``H / scale + damping * I``
    lies in (0, 2). With the default ``scale=1.0`` (the original behaviour) that
    fails once the largest Hessian eigenvalue reaches about 2, and the estimate then
    grows without bound; pass a larger `scale` in that case.

    Args:
        loss: scalar loss tensor forwarded to ``compute_hvp``.
        model: model forwarded to ``compute_hvp``.
        v: 1-D tensor to multiply by the (damped) inverse Hessian.
        damping: per-iteration shrinkage; the effective damping added to H is
            ``scale * damping``.
        num_iter: number of recursion steps.
        scale: positive factor dividing each Hessian-vector product.

    Returns:
        1-D tensor with the same shape as `v`.

    Raises:
        ValueError: if `scale` is not positive.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale}")

    # v is a constant of the recursion; make sure no autograd graph is dragged along.
    v = v.detach()
    ihvp = v.clone()
    for _ in range(num_iter):
        hvp = compute_hvp(loss, model, ihvp)
        ihvp = v + (1 - damping) * ihvp - hvp / scale
    # Undo the scaling so the result approximates the inverse of H itself.
    return ihvp / scale

def influence_on_test_set(train_point, train_label, test_points, test_labels, model, criterion, num_iter=100):
    """Compute the average influence of one training batch on the loss of a test batch.

    The score is ``-grad(test_loss) . iHVP`` where ``iHVP`` is the LiSSA estimate of
    the inverse Hessian applied to ``grad(train_loss)``.

    NOTE(review): the loss handed to ``lissa_approximation`` is the loss of this
    training batch only, so the Hessian is that of the single batch rather than of
    the full training objective -- confirm this is intended.

    Args:
        train_point: dict with 'input_ids' and 'attention_mask' for the training batch.
        train_label: 1-D target tensor for the training batch.
        test_points: dict with 'input_ids' and 'attention_mask' for the test batch.
        test_labels: 1-D target tensor for the test batch.
        model: callable as ``model(input_ids, attention_mask)``; its trainable
            parameters (``requires_grad=True``) are the ones differentiated.
        criterion: loss callable ``criterion(predictions, targets)`` returning a scalar.
        num_iter: number of LiSSA iterations.

    Returns:
        0-d tensor: the influence averaged over the test samples.
    """
    model.zero_grad()
    params = [p for p in model.parameters() if p.requires_grad]

    # Compute gradient of training loss w.r.t. model parameters.
    # A single forward pass feeds both the shape printout and the loss; running the
    # model again just to print a shape would build a second autograd graph.
    print("on train")
    train_pred = model(train_point['input_ids'], train_point['attention_mask']).view(-1)
    print(train_pred.shape, train_label.shape)
    train_loss = criterion(train_pred, train_label)
    # retain_graph=True: the same graph is differentiated again inside LiSSA.
    grad_train = torch.autograd.grad(train_loss, params, retain_graph=True)
    grad_train_vector = torch.cat([g.reshape(-1) for g in grad_train])

    # Compute Hessian-inverse-vector product using LiSSA
    ihvp = lissa_approximation(train_loss, model, grad_train_vector, num_iter=num_iter)

    # The training graph is no longer needed; release it before the (larger) test pass.
    del train_loss, train_pred, grad_train

    # Compute the gradient of the test loss over the whole test batch.
    # view(-1) matches the train branch; squeeze() would turn a single-sample
    # batch into a 0-d tensor that no longer matches the label shape.
    print("on test")
    test_pred = model(test_points['input_ids'], test_points['attention_mask']).view(-1)
    print(test_pred.shape, test_labels.shape)
    num_test = test_pred.numel()
    test_loss = criterion(test_pred, test_labels)
    grad_test = torch.autograd.grad(test_loss, params)
    grad_test_vector = torch.cat([g.reshape(-1) for g in grad_test])
    total_influence = torch.dot(grad_test_vector, ihvp)

    # A mean-reduced criterion already averages over the test samples, so the dot
    # product is the average influence as-is. Only a sum-reduced criterion needs the
    # division. (Dividing by len(test_points) was wrong either way: test_points is a
    # dict, so its length is the number of keys, not the number of test samples.)
    if getattr(criterion, "reduction", "mean") == "sum":
        total_influence = total_influence / max(num_test, 1)

    return -total_influence

In [14]:
# loading external dataset
# The CSV is read for its "SMILES" and "Label" columns; both are turned into plain
# Python lists and wrapped in the same SMILESDataset / tokenizer used for the main data.
ext_data = pd.read_csv("./tasks/External-Dataset_for_Task2.csv")
ext_smiles_list = ext_data["SMILES"].tolist()
ext_targets = ext_data["Label"].tolist()
ext_dataset = SMILESDataset(ext_smiles_list, ext_targets, tokenizer)

# creating dataloaders
# batch_size = 1 so that every batch holds exactly one external sample.
# NOTE: this rebinds the module-level `batch_size` (32 when the train/test loaders were
# created); the existing train/test dataloaders are not rebuilt here.
batch_size = 1
ext_dataloader = DataLoader(ext_dataset, batch_size=batch_size, shuffle=False)

In [17]:
# computing influences for each point

# Put the model in eval mode explicitly so the scores do not depend on whether the
# training cell or the evaluation cell happened to run last (train-time randomness
# such as dropout would otherwise make the gradients noisy).
regression_model.eval()

# The test set is identical for every external point, so collect it and move it to
# the device once instead of re-iterating test_dataloader inside the loop below.
test_input_ids = []
test_attention_masks = []
test_labels = []
for test_batch in test_dataloader:
    test_input_ids.append(test_batch['input_ids'].to(device))
    test_attention_masks.append(test_batch['attention_mask'].to(device))
    test_labels.append(test_batch['target'].to(device))

# Concatenate the per-batch tensors into one test batch.
# NOTE(review): the whole test set then goes through a single forward/backward pass
# inside influence_on_test_set; if that exhausts GPU memory, use a subset of it.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
test_labels = torch.cat(test_labels, dim=0)
test_points = {'input_ids': test_input_ids, 'attention_mask': test_attention_masks}

influences = []
for train_batch in ext_dataloader:
    print(f"Processing training batch {len(influences) + 1}/{len(ext_dataloader)}")
    # Extract tensors from batch
    train_input_ids = train_batch['input_ids'].to(device)
    train_attention_mask = train_batch['attention_mask'].to(device)
    train_label = train_batch['target'].to(device)

    print("Computing influence...")
    influence = influence_on_test_set(
        {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}, train_label,
        test_points, test_labels,
        regression_model, criterion
    )
    # Convert to a Python float once (each .item() call forces a device sync).
    influence_value = influence.item()
    influences.append(influence_value)
    print(f"Influence for current training batch: {influence_value}")

# Rank training points by influence (largest score first)
ranked_indices = sorted(range(len(influences)), key=lambda i: influences[i], reverse=True)
print("Most influential training points:", ranked_indices)




Processing training batch 1/300
Computing influence...
on train
torch.Size([1]) torch.Size([1])


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 4.12 MiB is free. Process 28800 has 14.73 GiB memory in use. Of the allocated memory 14.49 GiB is allocated by PyTorch, and 78.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)