## Installation

In [None]:
#%pip install 

# Download this repository
#!git clone https://github.com/pluskal-lab/DreaMS.git
#!cd DreaMS

# Create conda environment
#!conda update -n base -c defaults conda
#!conda create -n dreams python==3.11.0 --yes
#!conda init
#!conda activate dreams

# Install DreaMS
#%pip install -e ./DreaMS

#%pip install pytorch-lightning
#!git clone https://github.com/colorfulcereal/MassSpecGym
#%pip install wandb


In [None]:
#pip3 install -e MassSpecGym

## How to enable GPU support for TensorFlow or PyTorch on MacOS
## https://medium.com/bluetuple-ai/how-to-enable-gpu-support-for-tensorflow-or-pytorch-on-macos-4aaaad057e74

## Loading the MassSpecGym dataset

In [None]:
# Load the MassSpecGym table; `df` is used as a pandas DataFrame by the EDA cells below
# (they call .iloc / .loc / .iterrows on it).
from massspecgym.utils import load_massspecgym
df = load_massspecgym()
# Preview a single row to see which columns are available
df.head(1)


In [None]:
# Number of peaks stored for one specific spectrum (row label + column in a single .loc lookup)
df.loc['MassSpecGymID0013583', 'mzs'].size

## MS/MS EDA

In [None]:
## Plot spectra

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Set plot style
sns.set_style('whitegrid')

# Function to plot histogram of mzs and intensities
def plot_spectrum(mzs, intensities, ax=None, title='Mass Spectrum'):
    """Draw one spectrum as a stem plot (m/z on x, intensity on y).

    Args:
        mzs: Peak m/z values (stem positions).
        intensities: Peak intensities, one per entry of ``mzs``.
        ax: Optional matplotlib Axes to draw on. When omitted, a new 8x6
            figure is created and shown immediately (the original behaviour).
        title: Title placed above the axes.
    """
    standalone = ax is None
    if standalone:
        _, ax = plt.subplots(figsize=(8, 6))
    ax.stem(mzs, intensities, basefmt='-')
    ax.set_title(title)
    ax.set_xlabel('m/z')
    ax.set_ylabel('Intensity')
    if standalone:
        # Only show when we own the figure; callers that pass `ax` decide when to render.
        plt.show()

# Plot a random spectrum
random_index = np.random.randint(0, len(df))
print(random_index)
mzs = df.iloc[random_index]['mzs']
intensities = df.iloc[random_index]['intensities']
plot_spectrum(mzs, intensities)

# Plot multiple spectra
num_spectra = 5
random_indices = np.random.randint(0, len(df), num_spectra)
# squeeze=False keeps `axs` an array even when num_spectra == 1 (otherwise a bare
# Axes is returned and indexing it fails).
fig, axs = plt.subplots(nrows=num_spectra, ncols=1, figsize=(8, 6*num_spectra), squeeze=False)
axs = axs.ravel()
for ax, idx in zip(axs, random_indices):
    mzs = df.iloc[idx]['mzs']
    intensities = df.iloc[idx]['intensities']
    plot_spectrum(mzs, intensities, ax=ax, title=f'Spectrum {idx}')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import re

# Halogen symbols
halogens = ['F', 'Cl', 'Br', 'I', 'At', 'Ts']

# An element symbol is one uppercase letter optionally followed by one lowercase
# letter. Tokenising the formula avoids the false positives of a plain substring
# test, where 'F' also matches 'Fe' and 'I' also matches 'In' / 'Ir'.
_ELEMENT_SYMBOL_RE = re.compile(r'[A-Z][a-z]?')


def _elements_in(formula):
    """Return the set of element symbols appearing in a molecular formula string."""
    return set(_ELEMENT_SYMBOL_RE.findall(formula))


# Initialize halogen count dictionary
halogen_counts = {halogen: 0 for halogen in halogens}

# Row labels of spectra whose molecule contains fluorine (kept for inspection
# instead of printing every single label into the cell output).
fluorine_spectrum_ids = []

# Count mass spectra containing each halogen
for index, precursor_formula, formula in zip(df.index, df['precursor_formula'], df['formula']):
    elements_present = _elements_in(precursor_formula) | _elements_in(formula)
    for halogen in halogens:
        if halogen in elements_present:
            halogen_counts[halogen] += 1
    if 'F' in elements_present:
        fluorine_spectrum_ids.append(index)

# Calculate percentages (guard against an empty table)
total_spectra = len(df)
halogen_percentages = {
    halogen: (count/total_spectra)*100 if total_spectra else 0.0
    for halogen, count in halogen_counts.items()
}

# Print results
print("Halogen Percentages:")
for halogen, percentage in halogen_percentages.items():
    print(f"{halogen}: {percentage:.2f}%")

for halogen, count in halogen_counts.items():
    print(f"{halogen}: {count}")

# Plot (%)
# plt.bar(halogen_percentages.keys(), halogen_percentages.values())
# plt.xlabel('Halogen')
# plt.ylabel('Percentage (%)')
# plt.title('Distribution of Halogens in Mass Spectra')
# plt.ylim(0, 50)  # Set y-axis limit to 100%
# plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: f"{x:.0f}%"))  # Format y-axis ticks as percentages
# plt.show()

# Bar chart: number of spectra per halogen
fig, ax = plt.subplots()
ax.bar(list(halogen_counts.keys()), list(halogen_counts.values()))
ax.set_xlabel('Halogen')
ax.set_ylabel('Count')
ax.set_title('Distribution of Halogens in Mass Spectra By Count')
plt.show()

## Testing an existing MassSpecGymModel

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import numpy, sys

from massspecgym.data import RetrievalDataset, MassSpecDataModule
from massspecgym.data.transforms import SpecTokenizer, MolFingerprinter
from massspecgym.models.base import Stage
from massspecgym.models.retrieval.base import RetrievalMassSpecGymModel
numpy.set_printoptions(threshold=sys.maxsize)


In [None]:
class MyDeepSetsRetrievalModel(RetrievalMassSpecGymModel):
    """Deep Sets style retrieval baseline: spectrum peaks -> predicted fingerprint.

    Every peak is a 2-feature vector (presumably (m/z, intensity) -- confirm
    against ``SpecTokenizer``). ``phi`` embeds each peak, the embeddings are
    summed over the peak axis, and ``rho`` maps the pooled vector to
    ``out_channels`` values in (0, 1).
    """

    def __init__(
        self,
        hidden_channels: int = 128,
        out_channels: int = 4096,  # fingerprint size
        *args,
        **kwargs
    ):
        """Build the per-peak encoder ``phi`` and the set-level decoder ``rho``.

        Args:
            hidden_channels: Width of the hidden layers in both networks.
            out_channels: Size of the predicted fingerprint.
            *args, **kwargs: Forwarded to ``RetrievalMassSpecGymModel``.
        """
        super().__init__(*args, **kwargs)

        # Per-peak embedding network (applied independently to every peak)
        self.phi = nn.Sequential(
            nn.Linear(2, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
        )
        # Set-level network mapping the pooled embedding to fingerprint probabilities
        self.rho = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, out_channels),
            nn.Sigmoid()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict a fingerprint from a tensor whose last two axes are (peaks, 2)."""
        x = self.phi(x)
        x = x.sum(dim=-2)  # sum over peaks -> permutation-invariant pooling
        x = self.rho(x)
        return x

    def step(
        self, batch: dict, stage: Stage
    ) -> dict[str, torch.Tensor]:
        """Compute the training loss and the retrieval scores for one batch.

        Args:
            batch: Dict with keys ``"spec"`` (input spectra), ``"mol"`` (true
                fingerprints), ``"candidates"`` (candidate fingerprints
                concatenated over the batch) and ``"batch_ptr"`` (number of
                candidates per sample).
            stage: Current stage (unused here).

        Returns:
            Dict with ``loss`` (MSE between predicted and true fingerprints) and
            ``scores`` (cosine similarity of each candidate to the prediction
            of the sample it belongs to).
        """
        # Unpack inputs
        x = batch["spec"]  # input spectra
        fp_true = batch["mol"]  # true fingerprints
        cands = batch["candidates"]  # candidate fingerprints concatenated for a batch
        batch_ptr = batch["batch_ptr"]  # number of candidates per sample in a batch

        # Predict fingerprint
        fp_pred = self.forward(x)

        # Calculate loss; PyTorch's convention is (input, target)
        loss = nn.functional.mse_loss(fp_pred, fp_true)

        # Repeat each prediction once per candidate so it lines up row-by-row with `cands`
        fp_pred_repeated = fp_pred.repeat_interleave(batch_ptr, dim=0)
        scores = nn.functional.cosine_similarity(fp_pred_repeated, cands)

        return dict(loss=loss, scores=scores)

In [None]:
# Init hyperparameters
# n_peaks is handed to SpecTokenizer, fp_size to MolFingerprinter; fp_size is also
# reused as the model's out_channels in the training cell, so the two must stay in sync.
n_peaks = 10
fp_size = 4096
batch_size = 2

# Load dataset
# Spectra and molecules are converted by the two transforms configured here.
dataset = RetrievalDataset(
    spec_transform=SpecTokenizer(n_peaks=n_peaks),
    mol_transform=MolFingerprinter(fp_size=fp_size),
)

# Init data module
# Wraps the dataset for the Lightning Trainer (passed as `datamodule=` when fitting).
data_module = MassSpecDataModule(
    dataset=dataset,
    batch_size=batch_size,
    num_workers=4
)




In [None]:
# Take the logger from the same package as `Trainer` (pytorch_lightning).
# Classes from `lightning.pytorch` and `pytorch_lightning` are distinct types and
# should not be mixed within one training setup.
from pytorch_lightning import loggers as pl_loggers

# Init model
model = MyDeepSetsRetrievalModel(out_channels=fp_size)

# Init trainer
tb_logger = pl_loggers.TensorBoardLogger(save_dir="logs/")
trainer = Trainer(accelerator="auto", devices="auto", max_epochs=1, logger=tb_logger)

# Train
trainer.fit(model, datamodule=data_module)

## Fluorine Detection

In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import numpy, sys
import wandb
from pathlib import Path

from massspecgym.data import MassSpecDataset, MassSpecDataModule
from massspecgym.data.transforms import SpecTokenizer, MolFingerprinter
from massspecgym.models.base import Stage
from massspecgym.models.retrieval.base import MassSpecGymModel
from sklearn.metrics import precision_score, recall_score
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.utilities import grad_norm


from torch import nn
import torch.nn.functional as F
from massspecgym.models.base import Stage
from dreams.api import PreTrainedModel
from dreams.models.dreams.dreams import DreaMS as DreaMSModel

numpy.set_printoptions(threshold=sys.maxsize)


  from .autonotebook import tqdm as notebook_tqdm
Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [2]:
import numpy as np
from rdkit import Chem
from massspecgym.data.transforms import MolToHalogensVector


# Example usage: convert two SMILES strings (a trifluoromethyl compound and a
# bromo compound) into their halogen vectors and print each one.
checker = MolToHalogensVector()
for smiles_string in ("CC(F)(F)F", "CCBr"):
    halogen_vector = checker.from_smiles(smiles_string)
    print(halogen_vector)

[1 0 0 0]
[0 0 1 0]


In [3]:
# Fix all random seeds for this run
pl.seed_everything(0)

# When True, the tiny 5-spectrum debug files are used instead of the default dataset.
DEBUG = False

# None makes the dataset / data module fall back to their defaults.
mgf_pth = Path("/teamspace/studios/this_studio/MassSpecGym/data/debug/example_5_spectra.mgf") if DEBUG else None
split_pth = Path("/teamspace/studios/this_studio/MassSpecGym/data/debug/example_5_spectra_split.tsv") if DEBUG else None

# Apple-silicon GPU handle, or None when the MPS backend is unavailable
mps_device = torch.device("mps") if torch.backends.mps.is_available() else None

Seed set to 0


In [4]:
import string
from pathlib import Path

# base model contains definitions for step and on_batch_end
class HalogenPredMassSpecGymModel(MassSpecGymModel):
    """Base model for binary fluorine detection trained with focal loss.

    Subclasses provide ``forward``, which must return one probability per
    spectrum with shape ``[batch_size, 1]``. This class implements the loss
    (``step``), per-batch confusion counting (``on_batch_end``) and epoch-level
    precision / recall / F1 logging for the train and validation stages.

    Args:
        alpha: Focal-loss weight of the positive class (negatives get ``1 - alpha``).
        gamma: Focal-loss focusing exponent.
        batch_size: Nominal batch size; stored as an attribute for callers.
            Loss logging uses the actual number of samples in each batch.
        threshold: Probability at or above which a spectrum counts as predicted positive.
        pos_weight: Stored for the (disabled) weighted-BCE variant; not used by the focal loss.
        *args, **kwargs: Forwarded to ``MassSpecGymModel``.
    """

    # Counter name suffixes; the full attribute is "<stage prefix><suffix>",
    # e.g. ``train_num_true_positives``.
    _METRIC_NAMES = ("num_actual_positives", "num_predicted_positives", "num_true_positives")
    _TRACKED_PREFIXES = ("train_", "val_")

    def __init__(
        self,
        alpha: float=0.8,
        gamma: float=0.5,
        batch_size: int=64,
        threshold: float=0.5,
        pos_weight: float=1.0,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        # Registered as a buffer so it follows the module to whatever device the
        # trainer selects (CPU, CUDA or MPS) instead of hard-coding one here.
        # Non-persistent: it is a hyperparameter, so checkpoints keep their old keys.
        self.register_buffer("alpha", torch.tensor([1 - alpha, alpha]), persistent=False)
        self.gamma = gamma
        self.batch_size = batch_size
        self.threshold = threshold
        self.pos_weight = pos_weight
        # training / validation confusion counters
        self._reset_metrics_train()
        self._reset_metrics_val()

    def step(
        self, batch: dict, stage: Stage
    ) -> dict[str, torch.Tensor]:
        """Compute the focal loss for one batch.

        Focal loss (https://amaarora.github.io/posts/2020-06-29-FocalLoss.html)
        down-weights well-classified examples and, through ``alpha``, weights the
        rare positive class more heavily. The MassSpecGym training data is skewed:
        only about 5% of the molecules contain fluorine.

        Args:
            batch: Dict with ``"spec"`` (input spectra, ``[batch_size, num_peaks + 1, 2]``)
                and ``"mol"`` (halogen vectors; column 0 is the fluorine label).
            stage: Current stage (unused here).

        Returns:
            Dict with ``loss`` (mean focal loss) and ``pred_probs`` (detached
            per-sample probabilities, reused by ``on_batch_end``).
        """
        x = batch["spec"]
        halogen_vector_true = batch["mol"]

        # squeeze(-1) drops only the trailing singleton axis, so a batch containing a
        # single spectrum keeps shape [1] and still matches the target shape.
        predicted_probs = self.forward(x).squeeze(-1)  # shape [batch_size]
        # Extract the 1st column --> fluorine labels; BCELoss needs float targets
        true_values = halogen_vector_true[:, 0].to(predicted_probs.dtype)  # shape [batch_size]

        bce = nn.BCELoss(reduction='none')(predicted_probs, true_values)
        # Per-sample class weight: alpha[1] for positives, alpha[0] for negatives
        alpha_t = self.alpha.to(bce.device).gather(0, true_values.long().view(-1))
        pt = torch.exp(-bce)  # probability assigned to the true class
        # Clamp before the power: for gamma < 1 the derivative of (1 - pt)**gamma is
        # infinite at pt == 1, which turns into NaN gradients once a prediction saturates.
        modulating = (1 - pt).clamp_min(1e-8) ** self.gamma
        focal_loss = alpha_t * modulating * bce
        return { 'loss': focal_loss.mean(), 'pred_probs': predicted_probs.detach() }

    def on_batch_end(
        self, outputs: dict, batch: dict, batch_idx: int, stage: Stage
    ) -> None:
        """Update the confusion counters of the current stage and log the batch loss."""
        true_values = batch["mol"][:, 0]  # fluorine labels, shape [batch_size]

        # Reuse the probabilities computed in `step`; only fall back to a forward
        # pass (without autograd) if they were not provided.
        pred_probs = outputs.get("pred_probs") if isinstance(outputs, dict) else None
        if pred_probs is None:
            with torch.no_grad():
                pred_probs = self.forward(batch["spec"]).squeeze(-1)

        predicted_positive = pred_probs >= self.threshold
        actual_positive = true_values.to(predicted_positive.device) != 0
        # One host transfer for all three counts
        counts = torch.stack([
            actual_positive.sum(),
            predicted_positive.sum(),
            (actual_positive & predicted_positive).sum(),
        ]).tolist()

        prefix = stage.to_pref()
        if prefix in self._TRACKED_PREFIXES:
            for name, value in zip(self._METRIC_NAMES, counts):
                attr = prefix + name
                setattr(self, attr, getattr(self, attr) + value)

        self.log_dict({ f"{prefix}/loss": outputs['loss'] },
                prog_bar=True,
                on_epoch=True,
                # actual batch size, so the epoch average is weighted correctly
                # even for a smaller final batch
                batch_size=true_values.shape[0]
        )

    def _reset_metrics(self, prefix: str) -> None:
        """Zero the three confusion counters belonging to one stage prefix."""
        for name in self._METRIC_NAMES:
            setattr(self, prefix + name, 0)

    def _reset_metrics_train(self):
        """Zero the training counters."""
        self._reset_metrics("train_")

    def _reset_metrics_val(self):
        """Zero the validation counters."""
        self._reset_metrics("val_")

    def on_train_epoch_start(self) -> None:
        self._reset_metrics_train()

    def on_validation_epoch_start(self) -> None:
        self._reset_metrics_val()

    def _log_epoch_metrics(self, prefix: str) -> None:
        """Log counts plus precision / recall / F1 accumulated for one stage prefix."""
        actual, predicted, true_pos = (getattr(self, prefix + name) for name in self._METRIC_NAMES)
        # Each ratio is defined as 0 when its denominator is 0
        precision = true_pos/predicted if predicted != 0 else 0
        recall = true_pos/actual if actual != 0 else 0
        f1_score = (2*precision*recall)/(precision + recall) if (precision + recall) != 0 else 0
        self.log_dict({
                f"{prefix}/num_actual_positives": actual,
                f"{prefix}/num_predicted_positives": predicted,
                f"{prefix}/num_true_positives": true_pos,
                f"{prefix}/precision": precision,
                f"{prefix}/recall": recall,
                f"{prefix}/f1_score": f1_score
            },
            prog_bar=True,
            on_epoch=True,
            on_step=False
        )

    def on_train_epoch_end(self) -> None:
        self._log_epoch_metrics("train_")

    def on_validation_epoch_end(self) -> None:
        self._log_epoch_metrics("val_")

# final model containing the network definition
class HalogenDetectorDreams(HalogenPredMassSpecGymModel):
    """Fluorine detector: pre-trained DreaMS spectrum encoder plus a linear head.

    Args:
        *args: Forwarded to ``HalogenPredMassSpecGymModel``.
        ckpt_path: Location of the DreaMS ``ssl_model.ckpt`` checkpoint. Defaults
            to the Zenodo download URL; pass the path of a local copy
            (https://zenodo.org/records/10997887) to avoid fetching it on every
            instantiation.
        n_highest_peaks: Forwarded to ``PreTrainedModel.from_ckpt``; should agree
            with the ``n_peaks`` used by the spectrum tokenizer.
        **kwargs: Forwarded to ``HalogenPredMassSpecGymModel``.
    """

    def __init__(
        self,
        *args,
        ckpt_path: str = "https://zenodo.org/records/10997887/files/ssl_model.ckpt?download=1",
        n_highest_peaks: int = 60,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        # .train() leaves the encoder in training mode so it is fine-tuned with the head
        self.spec_encoder = PreTrainedModel.from_ckpt(
            ckpt_path=ckpt_path, ckpt_cls=DreaMSModel, n_highest_peaks=n_highest_peaks
        ).model.train()
        #self.lin_out = nn.Linear(1024, 4) # for the 4 halogens (F, Cl, Br, I)
        self.lin_out = nn.Linear(1024, 1) # for F

    def forward(self, x):
        """Return fluorine probabilities of shape ``[batch_size, 1]`` for a batch of spectra."""
        # Token 0 of the encoder output is the precursor peak token embedding
        x = self.spec_encoder(x)[:, 0, :]
        return torch.sigmoid(self.lin_out(x))

In [None]:
from pytorch_lightning.loggers import WandbLogger
from notebooks.fluorine_balanced_dataset1 import FluorineBalancedDataset


# Init hyperparameters
n_peaks = 60
threshold = 0.50
pos_weight = 10
alpha = 0.8
gammas = [0.5, 1, 2, 5]
# Fraction of a training epoch between validation runs.
# NOTE(review): 0.01 triggers a full validation pass every ~1% of an epoch, which
# dominates the run time -- confirm this is intended.
val_check_interval = 0.01

batch_size = 1 if DEBUG else 64

# Load dataset once: it does not depend on gamma, so every run of the sweep can share it
dataset = FluorineBalancedDataset(
    spec_transform=SpecTokenizer(n_peaks=n_peaks),
    mol_transform = MolToHalogensVector(),
    pth=mgf_pth,
)

for gamma in gammas:
    # Init data module
    data_module = MassSpecDataModule(
        dataset=dataset,
        batch_size=batch_size,
        split_pth=split_pth,
        num_workers=4
    )

    # Init model (batch_size is passed so the model knows the real value in DEBUG mode too)
    model = HalogenDetectorDreams(
        threshold=threshold,
        pos_weight=pos_weight,
        alpha=alpha,
        gamma=gamma,
        batch_size=batch_size
    )

    # initialise the wandb logger and name your wandb project
    wandb_logger = WandbLogger(project='M4-MassSpecGym-DreaMS-HalogenDetection-FocalLoss')

    try:
        # Record the hyperparameters of this run as plain Python values
        run_config = {
            "batch_size": batch_size,
            "n_peaks": n_peaks,
            "threshold": threshold,
            "pos_weight": pos_weight,
            "alpha": alpha,
            "gamma": gamma,
        }
        for key, value in run_config.items():
            wandb_logger.experiment.config[key] = value

        # Init trainer
        use_mps = mps_device is not None
        if use_mps:
            print("Using MPS device for training")
        trainer = Trainer(
            accelerator="mps" if use_mps else "auto",
            devices=1 if use_mps else "auto",
            max_epochs=1,
            logger=wandb_logger,
            val_check_interval=val_check_interval,
        )

        # Validate before training
        data_module.prepare_data()  # Explicit call needed for validate before fit
        data_module.setup()  # Explicit call needed for validate before fit
        trainer.validate(model, datamodule=data_module)

        # Train
        trainer.fit(model, datamodule=data_module)
    finally:
        # Always close the wandb run (necessary in notebooks); otherwise a failed
        # run stays open and the next gamma would log into it.
        wandb.finish()

---train 17436
---val 19429


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: hamsini-m-ramanathan (hamsini-m-ramanathan-iocb-prague). Use `wandb login --relogin` to force relogin


Using MPS device for training


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Validation DataLoader 0: 100%|██████████| 304/304 [02:12<00:00,  2.30it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      Validate metric               DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       val_/f1_score             0.2336229532957077
         val_/loss              0.16408193111419678
 val_/num_actual_positives             2662.0
val_/num_predicted_positives          11840.0
  val_/num_true_positives              1694.0
       val_/precision           0.14307431876659393
        val_/recall              0.6363636255264282
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────



  | Name         | Type   | Params
----------------------------------------
0 | spec_encoder | DreaMS | 95.5 M
1 | lin_out      | Linear | 1.0 K 
----------------------------------------
95.5 M    Trainable params
6.0 K     Non-trainable params
95.6 M    Total params
382.202   Total estimated model params size (MB)


Epoch 0:   1%|          | 2/273 [00:02<05:22,  0.84it/s, v_num=mruy, train_/loss_step=0.272]
Validation: |          | 0/? [00:00<?, ?it/s]
Validation:   0%|          | 0/304 [00:00<?, ?it/s]
Validation DataLoader 0:   0%|          | 0/304 [00:00<?, ?it/s]
Validation DataLoader 0:   0%|          | 1/304 [00:00<00:03, 76.29it/s]
Validation DataLoader 0:   1%|          | 2/304 [00:00<01:14,  4.06it/s]
Validation DataLoader 0:   1%|          | 3/304 [00:01<01:42,  2.95it/s]
Validation DataLoader 0:   1%|▏         | 4/304 [00:01<01:54,  2.62it/s]
Validation DataLoader 0:   2%|▏         | 5/304 [00:02<02:00,  2.47it/s]
Validation DataLoader 0:   2%|▏         | 6/304 [00:02<02:04,  2.40it/s]
Validation DataLoader 0:   2%|▏         | 7/304 [00:02<02:06,  2.35it/s]
Validation DataLoader 0:   3%|▎         | 8/304 [00:03<02:07,  2.32it/s]
Validation DataLoader 0:   3%|▎         | 9/304 [00:03<02:08,  2.30it/s]
Validation DataLoader 0:   3%|▎         | 10/304 [00:04<02:08,  2.29it/s]
Validation Da

## Scratch

In [None]:
import torch
import torch.nn as nn

# Two-layer ReLU MLP applied to the last axis of the input
hidden_channels = 5
layers = [
    nn.Linear(2, hidden_channels),
    nn.ReLU(),
    nn.Linear(hidden_channels, hidden_channels),
    nn.ReLU(),
]
net = nn.Sequential(*layers).float()  # float32 parameters

# Dummy input: 5 sets of 10 elements with 2 features each
input_tensor = torch.ones((5, 10, 2))

# Forward pass; the linear layers act on the trailing axis only
output = net(input_tensor)
print(output)

In [None]:
# Sanity check of the sklearn metrics on a toy example
true_values = np.array([1, 1., 1])
pred_values = np.array([0.0, 1, 1])

# Return both values as a tuple: a cell only displays its last expression, so a
# bare precision_score(...) line followed by recall_score(...) would be discarded.
precision_score(true_values, pred_values), recall_score(true_values, pred_values)

In [None]:
from torch import nn
import torch.nn.functional as F
from massspecgym.models.base import Stage
from dreams.api import PreTrainedModel
from dreams.models.dreams.dreams import DreaMS as DreaMSModel

# Example forward pass (not needed to explicitly initialize the DataLoader if you are using MassSpecGym)
from massspecgym.data.datasets import MassSpecDataset
from massspecgym.data.transforms import SpecTokenizer
from torch.utils.data import DataLoader

dataset = MassSpecDataset(
    spec_transform=SpecTokenizer(n_peaks=n_peaks),
    mol_transform = MolToHalogensVector()
)
dataloader = DataLoader(dataset, batch_size=4)
model = HalogenDetectorDreams()

dummy_batch = next(iter(dataloader))
# The model's forward expects the spectra tensor, not the whole batch dict
# (this mirrors `x = batch["spec"]` in the model's `step`).
with torch.no_grad():
    dummy_output = model(dummy_batch["spec"])
print(dummy_output)  # Should print a tensor of shape (4, 1): one fluorine probability per spectrum


In [None]:
# BCELoss sanity check: sigmoid turns raw scores into probabilities, then the
# mean binary cross-entropy against the 0/1 targets is computed.
m = nn.Sigmoid()
loss = nn.BCELoss(reduction='mean')
# Named `logits` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
logits = torch.tensor([[2.0, 3.0, 5.0], [2.0, 3.0, 5.0]])
print(m(logits))
target = torch.tensor([[1.0, 0.0, 1.0], [1.0, 0.0, 0.0]])
print(target)
output = loss(m(logits), target)
output

In [None]:
import numpy as np
# Count positions where both 0/1 vectors are non-zero (the true-positive count)
pairs = [
    (np.array([1, 1, 1]), np.array([0, 1, 1])),  # should return 2
    (np.array([1, 1, 0]), np.array([0, 0, 1])),  # should return 0
]
for a1, a2 in pairs:
    both_set = np.logical_and(a1, a2)
    print(np.sum(both_set))

In [None]:
# Check whether the Apple MPS backend is usable by allocating a small tensor on it
if not torch.backends.mps.is_available():
    print ("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)

In [None]:
import time
# GPU timing
if torch.backends.mps.is_available():
    # Drain any work queued earlier so it is not charged to this measurement
    torch.mps.synchronize()
    start_time = time.perf_counter()

    a = torch.ones(4000,4000, device="mps")
    for _ in range(200):
        a +=a

    # MPS kernels run asynchronously: wait for them to finish before reading the
    # clock, otherwise only the time to enqueue the work would be measured.
    torch.mps.synchronize()

    elapsed_time = time.perf_counter() - start_time
    print( "GPU Time: ", elapsed_time)
else:
    print("MPS device not found.")