## Fluorine/PFAS Model

In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import numpy, sys
import wandb
from pathlib import Path
import torch.optim as optim

from massspecgym.data import MassSpecDataset, MassSpecDataModule
from massspecgym.data.transforms import SpecTokenizer, MolFingerprinter
from massspecgym.models.base import Stage
from massspecgym.models.retrieval.base import MassSpecGymModel
from sklearn.metrics import precision_score, recall_score
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.utilities import grad_norm

from torch import nn
import torch.nn.functional as F
from massspecgym.models.base import Stage
from dreams.api import PreTrainedModel
from dreams.models.dreams.dreams import DreaMS as DreaMSModel
from torchmetrics.classification import BinaryPrecision, BinaryRecall, BinaryAccuracy

import numpy as np

numpy.set_printoptions(threshold=sys.maxsize)
torch.set_float32_matmul_precision('high')


In [2]:
import numpy as np
from rdkit import Chem
from massspecgym.data.transforms import MolToHalogensVector, MolToPFASVector

# Example usage: run both label transforms over the same two SMILES strings
# and print each resulting vector (halogen transform first, then PFAS).
for transform_cls in (MolToHalogensVector, MolToPFASVector):
    checker = transform_cls()  # instantiate the transform under test
    for smiles_string in ("CC(F)(F)F", "CCBr"):
        halogen_vector = checker.from_smiles(smiles_string)
        print(halogen_vector)

[1 0 0 0]
[0 0 1 0]
[1 0 0 0]
[0 0 0 0]


In [3]:
# Seed every RNG Lightning knows about so repeated runs are comparable.
pl.seed_everything(0)

# Set to True to run against the small debug files (and batch size 1 below).
DEBUG = False

# The debug run points at explicit example files; otherwise both paths stay None.
mgf_pth, split_pth = (
    (
        Path("/teamspace/studios/this_studio/MassSpecGym/data/debug/example_5_spectra.mgf"),
        Path("/teamspace/studios/this_studio/MassSpecGym/data/debug/example_5_spectra_split.tsv"),
    )
    if DEBUG
    else (None, None)
)

# Remember the MPS device when the backend is available; None means
# "no MPS", in which case the model code falls back to CUDA.
mps_device = torch.device("mps") if torch.backends.mps.is_available() else None

Global seed set to 0


In [4]:
# final model containing the network definition
import pandas as pd

# Upper bound on how many true-positive and how many false-negative validation
# examples HalogenDetectorDreamsTest.on_validation_epoch_end samples and writes
# to its identifier text files.
n_examples_to_plot = 20

class HalogenDetectorDreamsTest(MassSpecGymModel):
    """Binary detector built from a pre-trained DreaMS encoder plus one linear head.

    The model reads column 0 of the ``batch["mol"]`` label vector as the binary
    target, predicts a probability from the first token embedding of the DreaMS
    output, and is trained with a focal loss. Precision / recall / accuracy / F1
    are tracked separately for training and validation. After every validation
    epoch a sample of true-positive and false-negative identifiers is written to
    ``true_positive_identifiers.txt`` and ``false_negative_identifiers.txt`` in
    the current working directory.

    Args:
        alpha: Focal-loss weight of the positive class; the negative class gets
            ``1 - alpha``.
        gamma: Focal-loss focusing exponent.
        batch_size: Batch size reported to Lightning when logging the loss.
        threshold: Probability at or above which a spectrum is called positive.
        *args, **kwargs: Forwarded unchanged to ``MassSpecGymModel`` (e.g. ``lr``).
    """

    def __init__(
        self,
        alpha: float=0.25,
        gamma: float=0.5,
        batch_size: int=64,
        threshold: float=0.5,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        # Per-class focal-loss weights indexed by the 0/1 target: [negative, positive].
        # A non-persistent buffer follows the module to whatever device Lightning
        # picks (CPU / CUDA / MPS) instead of hard-coding `.cuda()`, and it adds no
        # key to the state_dict, so checkpoints keep the same set of keys.
        self.register_buffer(
            "alpha", torch.tensor([1 - alpha, alpha]), persistent=False
        )
        self.gamma = gamma
        self.batch_size = batch_size
        self.threshold = threshold
        print(f"Training with threshold: {self.threshold}, alpha: {self.alpha}, gamma: {self.gamma}, batch_size: {self.batch_size}")

        # Metrics
        self.train_precision = BinaryPrecision()
        self.train_recall = BinaryRecall()
        self.val_precision = BinaryPrecision()
        self.val_recall = BinaryRecall()
        self.train_accuracy = BinaryAccuracy()
        self.val_accuracy = BinaryAccuracy()

        # Validation debug buffers. Created here so they always exist, even if a
        # validation batch hook fires before `on_validation_epoch_start`.
        self.all_predicted_probs = []
        self.all_true_labels = []
        self.all_identifiers = []
        self.all_spectra = []

        # loading the DreaMS model weights from the internet
        self.main_model = PreTrainedModel.from_ckpt(
            # ckpt_path should be replaced with the path to the ssl_model.ckpt model downloaded from https://zenodo.org/records/10997887
            ckpt_path="https://zenodo.org/records/10997887/files/ssl_model.ckpt?download=1", ckpt_cls=DreaMSModel, n_highest_peaks=60
        ).model.train()
        self.lin_out = nn.Linear(1024, 1) # for F

    def forward(self, x):
        """Return the predicted probability for each spectrum, shape ``[batch_size, 1]``."""
        # Token 0 of the encoder output is used as the spectrum-level embedding
        # (the precursor peak token).
        precursor_embedding = self.main_model(x)[:, 0, :]
        return torch.sigmoid(self.lin_out(precursor_embedding))

    def step(
        self, batch: dict, stage: Stage
    ) -> dict:
        """Compute the focal loss for one batch.

        Args:
            batch: Must contain ``"spec"`` (model input) and ``"mol"`` (label
                vectors whose column 0 is the binary target).
            stage: Current stage; not used by the loss itself.

        Returns:
            ``{"loss": <scalar tensor>}``, the mean focal loss over the batch.
        """
        x = batch["spec"]  # shape: [batch_size, num_peaks + 1, 2]

        # Column 0 of the label vector is the target being predicted.
        true_values = batch["mol"][:, 0]  # shape [batch_size]

        # Flatten [batch_size, 1] -> [batch_size]. Unlike a bare `.squeeze()`, this
        # keeps a 1-D tensor when the batch holds a single sample, so the shapes of
        # prediction and target always agree (no special DEBUG branch needed).
        predicted_probs = self.forward(x).reshape(-1)
        true_values = true_values.to(predicted_probs.dtype)

        ### Focal Loss: https://amaarora.github.io/posts/2020-06-29-FocalLoss.html ###
        # Increase loss for minority misclassification (F = 1 but predicted as 0) and
        # decreases loss for majority class misclassification (F = 0 but predicted as 1)
        # Our MassSpecGym training data is skewed with only 5% of molecules containing Fluorine
        bce = F.binary_cross_entropy(predicted_probs, true_values, reduction='none')
        targets = true_values.long()
        at = self.alpha.gather(0, targets.reshape(-1))  # class weight per sample
        pt = torch.exp(-bce)  # model probability assigned to the true class
        focal = at * (1 - pt) ** self.gamma * bce
        return { 'loss': focal.mean() }

    def on_batch_end(
        self, outputs: dict, batch: dict, batch_idx: int, stage: Stage
    ) -> None:
        """Update stage metrics, collect validation debug data, and log the loss.

        Args:
            outputs: Dict returned by ``step``; its ``"loss"`` entry is logged.
            batch: Same batch that was passed to ``step``.
            batch_idx: Index of the batch (unused).
            stage: Current stage; its prefix selects train vs. validation metrics.
        """
        x = batch["spec"] # shape: [batch_size, num_peaks + 1, 2]
        # Column 0 of the label vector is the target being predicted.
        true_labels = batch["mol"][:, 0] # shape [batch_size]

        # Second forward pass, used only for metrics: run it without autograd so
        # no graph is built and kept alive during training.
        with torch.no_grad():
            pred_probs = self.forward(x).reshape(-1) # shape [batch_size]

        # thresholding
        pred_bool_labels = (pred_probs >= self.threshold).long()

        prefix = stage.to_pref()
        if prefix == 'train_':
            self.train_precision.update(pred_bool_labels, true_labels)
            self.train_recall.update(pred_bool_labels, true_labels)
            self.train_accuracy.update(pred_bool_labels, true_labels)
        elif prefix == 'val_':
            self.val_precision.update(pred_bool_labels, true_labels)
            self.val_recall.update(pred_bool_labels, true_labels)
            self.val_accuracy.update(pred_bool_labels, true_labels)

            ## debugging the false negatives ##
            # Identifiers are optional in the batch; fall back to placeholders.
            identifiers = batch.get("identifier", ["NA"] * len(true_labels))

            self.all_predicted_probs.extend(pred_probs.tolist())
            self.all_true_labels.extend(true_labels.tolist())
            self.all_identifiers.extend(identifiers)
            self.all_spectra.extend([s.detach().cpu().numpy() for s in x])

        self.log_dict({ f"{prefix}/loss": outputs['loss'] },
                prog_bar=True,
                on_epoch=True,
                batch_size=self.batch_size
        )

    @staticmethod
    def _f1_score(precision, recall):
        """Harmonic mean of precision and recall; 0 when both are zero."""
        denominator = precision + recall
        return (2 * precision * recall) / denominator if denominator != 0 else 0

    def _reset_metrics_train(self):
        """Clear the training metric states for the next epoch."""
        self.train_precision.reset()
        self.train_recall.reset()
        self.train_accuracy.reset()

    def _reset_metrics_val(self):
        """Clear the validation metric states and the collected debug lists."""
        self.val_precision.reset()
        self.val_recall.reset()
        self.val_accuracy.reset()
        self.all_predicted_probs = []  # reset the list of predicted probabilities for validation
        self.all_true_labels = []
        self.all_identifiers = []
        self.all_spectra = []

    def on_train_epoch_start(self) -> None:
        self._reset_metrics_train()

    def on_validation_epoch_start(self) -> None:
        self._reset_metrics_val()

    def on_train_epoch_end(self) -> None:
        """Log the epoch-level training precision, recall, accuracy and F1."""
        precision = self.train_precision.compute()
        recall = self.train_recall.compute()
        accuracy = self.train_accuracy.compute()
        self.log_dict({
                "train_/precision": precision,
                "train_/recall": recall,
                "train_/accuracy": accuracy,
                "train_/f1_score": self._f1_score(precision, recall)
            },
            prog_bar=True,
            on_epoch=True,
            on_step=False
        )

    def on_validation_epoch_end(self) -> None:
        """Log validation metrics and dump sampled TP / FN identifiers to text files."""
        precision = self.val_precision.compute()
        recall = self.val_recall.compute()
        accuracy = self.val_accuracy.compute()
        self.log_dict({
                "val_/precision": precision,
                "val_/recall": recall,
                "val_/accuracy": accuracy,
                "val_/f1_score": self._f1_score(precision, recall)
            },
            prog_bar=True,
            on_epoch=True,
            on_step=False
        )

        # Stop if no predictions
        if len(self.all_predicted_probs) == 0:
            return

        # Create a dataframe of predictions
        df_preds = pd.DataFrame({
            "identifier": self.all_identifiers,
            "true_label": self.all_true_labels,
            "pred_prob": self.all_predicted_probs,
            "spec": self.all_spectra
        })
        df_preds["pred_label"] = (df_preds["pred_prob"] >= self.threshold).astype(int)

        # Identify True Positives and False Negatives
        is_positive = df_preds["true_label"] == 1
        df_tp = df_preds[is_positive & (df_preds["pred_label"] == 1)]
        df_fn = df_preds[is_positive & (df_preds["pred_label"] == 0)]

        print("\n🧪 Validation Summary:")
        print(f"  True Positives (TP): {len(df_tp)}")
        print(f"  False Negatives (FN): {len(df_fn)}")

        # ---- Randomly sample examples (fixed seed so the sample is repeatable) ----
        tp_samples = df_tp.sample(min(n_examples_to_plot, len(df_tp)), random_state=42)
        fn_samples = df_fn.sample(min(n_examples_to_plot, len(df_fn)), random_state=42)

        # ---- Write identifiers; each file is overwritten on every validation epoch ----
        tp_filename = "true_positive_identifiers.txt"
        fn_filename = "false_negative_identifiers.txt"

        # Write True Positive identifiers
        with open(tp_filename, "w", encoding="utf-8") as f:
            f.write("True Positive Identifiers:\n")
            for i, ident in enumerate(tp_samples["identifier"].tolist(), 1):
                f.write(f"  {i}. {ident}\n")

        # Write False Negative identifiers
        with open(fn_filename, "w", encoding="utf-8") as f:
            f.write("False Negative Identifiers:\n")
            for i, ident in enumerate(fn_samples["identifier"].tolist(), 1):
                f.write(f"  {i}. {ident}\n")

        print(f"✅ True Positive identifiers written to {tp_filename}")
        print(f"✅ False Negative identifiers written to {fn_filename}")

In [5]:
# removed adduct due to a str error
class TestMassSpecDataset(MassSpecDataset):
    """``MassSpecDataset`` whose items carry ``precursor_mz`` but no ``adduct``.

    The adduct entry was dropped from the returned item because of a str error;
    the rest of the item is assembled from the spectrum/molecule transforms and
    the optional ``mol_freq`` / ``identifier`` metadata.
    """

    def __getitem__(
        self, i: int, transform_spec: bool = True, transform_mol: bool = True
    ) -> dict:
        """Build the item for row ``i``.

        Args:
            i: Index into ``self.spectra`` and ``self.metadata``.
            transform_spec: Apply ``self.spec_transform`` when it is set.
            transform_mol: Apply ``self.mol_transform`` when it is set.

        Returns:
            Dict of item fields; every non-string value is converted with
            ``torch.as_tensor(..., dtype=self.dtype)``.
        """
        raw_spec = self.spectra[i]
        row = self.metadata.iloc[i]
        smiles = row["smiles"]

        item = {}

        # --- spectrum: untouched, one transform, or a dict of named transforms ---
        if not (transform_spec and self.spec_transform):
            item["spec"] = raw_spec
        elif isinstance(self.spec_transform, dict):
            item.update(
                (name, raw_spec if fn is None else fn(raw_spec))
                for name, fn in self.spec_transform.items()
            )
        else:
            item["spec"] = self.spec_transform(raw_spec)

        # --- molecule: same three cases, applied to the SMILES string ---
        if not (transform_mol and self.mol_transform):
            item["mol"] = smiles
        elif isinstance(self.mol_transform, dict):
            item.update(
                (name, smiles if fn is None else fn(smiles))
                for name, fn in self.mol_transform.items()
            )
        else:
            item["mol"] = self.mol_transform(smiles)

        # --- metadata copied into the item (adduct removed due to a str error) ---
        item["precursor_mz"] = row["precursor_mz"]

        if self.return_mol_freq:
            item["mol_freq"] = row["mol_freq"]

        if self.return_identifier:
            item["identifier"] = row["identifier"]

        # TODO: this should be refactored
        # Strings stay as they are; everything else becomes a tensor.
        for field in list(item):
            value = item[field]
            if isinstance(value, str):
                continue
            item[field] = torch.as_tensor(value, dtype=self.dtype)

        return item

## Training Code

In [None]:
from pytorch_lightning.loggers import WandbLogger

torch.set_float32_matmul_precision('high')

# ---- Hyperparameters ----
max_epochs = 1
n_peaks = 60
threshold = 0.9
alpha = 0.25
gamma = 0.75
lr = 1e-5
num_iterations = 1

# Debug runs process one spectrum at a time.
batch_size = 1 if DEBUG else 64

for i in range(num_iterations):
    # ---- Data: tokenised spectra paired with PFAS label vectors ----
    dataset = TestMassSpecDataset(
        spec_transform=SpecTokenizer(n_peaks=n_peaks),
        mol_transform=MolToPFASVector(),
        #pth='/teamspace/studios/this_studio/files/merged_massspec_nist20_with_fold.tsv'
        pth='/teamspace/studios/this_studio/files/merged_massspec_nist20_pfas_labeled_oversampled_df.tsv'
    )
    data_module = MassSpecDataModule(
        dataset=dataset,
        batch_size=batch_size,
        split_pth=split_pth,
        num_workers=4
    )

    # ---- Model ----
    model = HalogenDetectorDreamsTest(
        threshold=threshold,
        alpha=alpha,
        gamma=gamma,
        batch_size=batch_size,
        lr=lr
    )

    # ---- Logging: create the wandb run and record the run's hyperparameters ----
    wandb_logger = WandbLogger(project='PFASDetection-FocalLoss-MergedMassSpecNIST20OECDWith_PFASExceptions')
    for cfg_key, cfg_value in {
        "batch_size": batch_size,
        "n_peaks": n_peaks,
        "threshold": threshold,
        "alpha": alpha,
        "gamma": gamma,
    }.items():
        wandb_logger.experiment.config[cfg_key] = cfg_value

    trainer = Trainer(
        accelerator="auto",
        devices="auto",
        max_epochs=max_epochs,
        logger=wandb_logger,
        val_check_interval=0.2,
    )

    # ---- Baseline validation before any training ----
    # prepare_data() / setup() must be called by hand when validating before fit.
    data_module.prepare_data()
    data_module.setup()
    trainer.validate(model, datamodule=data_module)

    # ---- Train ----
    trainer.fit(model, datamodule=data_module)

    # [optional] finish the wandb run, necessary in notebooks
    wandb.finish()

## Detecting PFAS from mzML files

In [8]:
# alpha = 0.25, gamma = 0.75 model
# Path of the Lightning checkpoint to restore (absolute path on the training studio).
ckpt_path = '/teamspace/studios/this_studio/MassSpecGym/scripts/HalogenDetection-FocalLoss-MergedMassSpecNIST20_NISTNew_OECD/2fei8ahc/checkpoints/epoch=0-step=9285.ckpt'
# Rebuild the detector from the checkpoint. This replaces the `model` variable
# created by the training cell; the constructor runs again, so its
# "Training with threshold: ..." line is printed here as well.
model = HalogenDetectorDreamsTest.load_from_checkpoint(ckpt_path)
# Show the module tree of the restored model.
print(model)

Training with threshold: 0.9, alpha: tensor([0.7500, 0.2500], device='cuda:0'), gamma: 0.75, batch_size: 64
HalogenDetectorDreamsTest(
  (train_precision): BinaryPrecision()
  (train_recall): BinaryRecall()
  (val_precision): BinaryPrecision()
  (val_recall): BinaryRecall()
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (main_model): DreaMS(
    (fourier_enc): FourierFeatures()
    (ff_fourier): FeedForward(
      (ff): Sequential(
        (0): Linear(in_features=11994, out_features=512, bias=True)
        (1): Dropout(p=0.1, inplace=False)
        (2): ReLU()
        (3): Linear(in_features=512, out_features=512, bias=True)
        (4): Dropout(p=0.1, inplace=False)
        (5): ReLU()
        (6): Linear(in_features=512, out_features=512, bias=True)
        (7): Dropout(p=0.1, inplace=False)
        (8): ReLU()
        (9): Linear(in_features=512, out_features=512, bias=True)
        (10): Dropout(p=0.1, inplace=False)
        (11): ReLU()
        (12): Lin

In [9]:
from pathlib import Path
from tqdm import tqdm
from dreams.utils.data import MSData
from dreams.api import dreams_predictions, PreTrainedModel
from dreams.models.heads.heads import BinClassificationHead
from dreams.utils.io import append_to_stem
import pandas as pd
import numpy as np
from pathlib import Path
from dreams.utils.dformats import DataFormatA
from dreams.utils.data import MSData
from dreams.utils.io import append_to_stem

# Load model
#model = HalogenDetectorDreamsTest.load_from_checkpoint(ckpt_path)

def find_PFAS(in_pth):
    """Load an mzML file and return its high-quality spectra as a DataFrame.

    Every spectrum is run through ``DataFormatA.val_spec``; the spectra whose
    result is ``'All checks passed'`` are saved to an HDF5 file next to the
    input (``append_to_stem(in_pth, 'high_quality')`` with an ``.hdf5`` suffix),
    reloaded, and converted with ``to_pandas()``.

    The PFAS prediction step is currently commented out, so the returned frame
    has no ``PFAS_preds`` column.

    Args:
        in_pth: Path of the ``.mzML`` file, e.g.
            ``Path('/teamspace/studios/this_studio/SLI23_040.mzML')``.

    Returns:
        The DataFrame of high-quality spectra, or ``None`` when the file could
        not be parsed (``MSData.from_mzml`` raised ``ValueError``).
    """
    # Only used by the disabled prediction call further down.
    n_highest_peaks = 60

    print(f'Processing {in_pth}...')

    # ---- Load ----
    try:
        msdata = MSData.from_mzml(in_pth, verbose_parser=True)
    except ValueError as e:
        print(f'Skipping {in_pth} because of {e}.')
        return None

    # Spectra (m/z and intensity arrays) and their precursor m/z values.
    spectra, prec_mzs = msdata['spectrum'], msdata['precursor_mz']

    # ---- Quality control ----
    # Ref: https://dreams-docs.readthedocs.io/en/latest/tutorials/spectral_quality.html
    validator = DataFormatA()
    quality_lvls = []
    for spec, prec_mz in zip(spectra, prec_mzs):
        quality_lvls.append(validator.val_spec(spec, prec_mz, return_problems=True))

    # Summary: how many spectra passed everything vs. failed each check.
    print(pd.Series(quality_lvls).value_counts())

    # ---- Keep only the spectra that passed and persist them ----
    hq_pth = append_to_stem(in_pth, 'high_quality').with_suffix('.hdf5')
    passed_idx = np.where(np.array(quality_lvls) == 'All checks passed')[0]
    msdata.form_subset(idx=passed_idx, out_pth=hq_pth)

    # Read the freshly written subset back in.
    msdata_hq = MSData.load(hq_pth)
    df = msdata_hq.to_pandas()

    # ---- Prediction step (disabled) ----
    #  f_preds = dreams_predictions(
    #      spectra=msdata_hq,
    #      model_ckpt=model,
    #      n_highest_peaks=n_highest_peaks
    #  )
    #  df[f'PFAS_preds'] = f_preds
    #
    # Store predictions
    # df.to_csv(append_to_stem(in_pth, 'PFAS_preds').with_suffix('.csv'), index=False)
    return df

Training with threshold: 0.9, alpha: tensor([0.7500, 0.2500], device='cuda:0'), gamma: 0.75, batch_size: 64


In [10]:
#find_PFAS(Path('/teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_36.mzML'))

In [12]:
import os
import pandas as pd

def scan_and_run_pfas(directory, output_csv="pfas_hits.csv", threshold=0.95):
    """Run ``find_PFAS`` on every ``.mzML`` file in a directory and collect the hits.

    Files are visited in sorted name order. For each file the DataFrame returned
    by ``find_PFAS`` is filtered to rows with ``PFAS_preds >= threshold``; the
    surviving rows get a ``file_path`` column and are concatenated across files.
    Per-file failures are reported and skipped so one bad file does not abort
    the scan.

    Side effects:
        * Writes the total number of spectra returned by ``find_PFAS`` (over all
          files that loaded) to ``total_spectra_len.txt`` in the working directory.
        * Writes the combined hits to ``output_csv`` when there is at least one.

    Args:
        directory: Directory to scan (not recursive).
        output_csv: Destination CSV for the combined hits.
        threshold: Minimum ``PFAS_preds`` value for a row to count as a hit.

    Returns:
        DataFrame of all hits, or an empty DataFrame when nothing was found.
    """
    all_hits = []   # one DataFrame of hits per file
    total_len = 0   # spectra seen across all successfully loaded files

    # sorted(): os.listdir order is arbitrary, so sort for reproducible logs/output.
    for fname in sorted(os.listdir(directory)):
        if not fname.lower().endswith(".mzml"):
            continue

        file_path = os.path.join(directory, fname)
        print(f"Processing: {file_path}")

        try:
            df = find_PFAS(Path(file_path))

            # find_PFAS returns None (after printing the reason) for files it
            # could not parse; skip those explicitly instead of tripping over
            # an AttributeError below.
            if df is None:
                continue

            # Count every loaded spectrum, whether or not predictions exist.
            total_len += len(df)

            # Confirm required column exists
            if "PFAS_preds" not in df.columns:
                print(f"  ⚠ Warning: no PFAS_preds column in {fname}, skipping.")
                continue

            # Filter based on threshold and remember which file the rows came from
            df_hits = df[df["PFAS_preds"] >= threshold].copy()
            df_hits["file_path"] = file_path

            # Only append if non-empty
            if not df_hits.empty:
                all_hits.append(df_hits)

        except Exception as e:
            # Deliberately best-effort: report and move on to the next file.
            print(f"❌ Error processing {fname}: {e}")

    print(f"Total spectra: {total_len}")
    with open("total_spectra_len.txt", "w") as file_object:
        # write() only accepts str; passing the int directly raises TypeError.
        file_object.write(str(total_len))

    # Combine all records
    if not all_hits:
        print("\n🚫 No PFAS candidates found in any file.")
        return pd.DataFrame()  # empty

    final_df = pd.concat(all_hits, ignore_index=True)
    final_df.to_csv(output_csv, index=False)
    print(f"\n✨ Done. Found {len(final_df)} PFAS-like entries.")
    print(f"Output saved to: {output_csv}")
    return final_df


# Example usage:
# Scan the Moorea24 mzML directory with the default output CSV name
# ("pfas_hits.csv") and the default hit threshold (0.95).
final_results = scan_and_run_pfas("/teamspace/studios/this_studio/Moorea24_MSRun_mzml/")

Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank10.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank10.mzML...


Reading Blank10.mzML: 4408it [00:00, 24942.65it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank10 into memory (3577 spectra)...
All checks passed                      2898
Number of high intensity peaks >= 3     667
Precursor m/z <= 1000.0                   8
Intensity amplitude >= 20.0               2
m/z range <= 1000.0                       2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank11.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank11.mzML...


Reading Blank11.mzML: 4406it [00:00, 26950.20it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank11 into memory (3546 spectra)...
All checks passed                      2847
Number of high intensity peaks >= 3     694
Precursor m/z <= 1000.0                   4
Intensity amplitude >= 20.0               1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank12.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank12.mzML...


Reading Blank12.mzML: 4411it [00:00, 29931.74it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank12 into memory (3558 spectra)...
All checks passed                      2714
Number of high intensity peaks >= 3     836
Precursor m/z <= 1000.0                   7
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank13.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank13.mzML...


Reading Blank13.mzML: 4408it [00:00, 29914.96it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank13 into memory (3559 spectra)...
All checks passed                      2883
Number of high intensity peaks >= 3     663
Precursor m/z <= 1000.0                   9
Intensity amplitude >= 20.0               4
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank14.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank14.mzML...


Reading Blank14.mzML: 4400it [00:00, 28874.14it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank14 into memory (3495 spectra)...
All checks passed                      2621
Number of high intensity peaks >= 3     860
Intensity amplitude >= 20.0               9
Precursor m/z <= 1000.0                   4
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank15.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank15.mzML...


Reading Blank15.mzML: 4409it [00:00, 24467.37it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank15 into memory (3559 spectra)...
All checks passed                      2901
Number of high intensity peaks >= 3     647
Intensity amplitude >= 20.0               6
Precursor m/z <= 1000.0                   5
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank15r.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank15r.mzML...


Reading Blank15r.mzML: 4391it [00:00, 29557.55it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank15r into memory (3434 spectra)...
All checks passed                      2649
Number of high intensity peaks >= 3     779
Precursor m/z <= 1000.0                   4
Intensity amplitude >= 20.0               1
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank16.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank16.mzML...


Reading Blank16.mzML: 4415it [00:00, 10994.40it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank16 into memory (3582 spectra)...
All checks passed                      2846
Number of high intensity peaks >= 3     726
Precursor m/z <= 1000.0                   5
Intensity amplitude >= 20.0               4
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank17.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank17.mzML...


Reading Blank17.mzML: 4400it [00:00, 29477.34it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank17 into memory (3518 spectra)...
All checks passed                      2649
Number of high intensity peaks >= 3     865
Precursor m/z <= 1000.0                   3
Intensity amplitude >= 20.0               1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank17r.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank17r.mzML...


Reading Blank17r.mzML: 4413it [00:00, 24274.48it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank17r into memory (3529 spectra)...
All checks passed                      2830
Number of high intensity peaks >= 3     693
Precursor m/z <= 1000.0                   4
Intensity amplitude >= 20.0               2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank18.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank18.mzML...


Reading Blank18.mzML: 4412it [00:00, 26303.30it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank18 into memory (3535 spectra)...
All checks passed                      2811
Number of high intensity peaks >= 3     711
Intensity amplitude >= 20.0              10
Precursor m/z <= 1000.0                   3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank19.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank19.mzML...


Reading Blank19.mzML: 4397it [00:00, 29385.90it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank19 into memory (3483 spectra)...
All checks passed                      2726
Number of high intensity peaks >= 3     749
Precursor m/z <= 1000.0                   5
Intensity amplitude >= 20.0               2
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank20.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank20.mzML...


Reading Blank20.mzML: 4416it [00:00, 28373.97it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank20 into memory (3431 spectra)...
All checks passed                      2751
Number of high intensity peaks >= 3     670
Precursor m/z <= 1000.0                   6
Intensity amplitude >= 20.0               3
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank21.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank21.mzML...


Reading Blank21.mzML: 4401it [00:00, 27887.89it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank21 into memory (3555 spectra)...
All checks passed                      2934
Number of high intensity peaks >= 3     599
m/z range <= 1000.0                      10
Precursor m/z <= 1000.0                   9
Intensity amplitude >= 20.0               3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank22.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank22.mzML...


Reading Blank22.mzML: 4402it [00:00, 25616.72it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank22 into memory (3552 spectra)...
All checks passed                      2920
Number of high intensity peaks >= 3     584
m/z range <= 1000.0                      33
Precursor m/z <= 1000.0                  11
Intensity amplitude >= 20.0               4
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank23.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank23.mzML...


Reading Blank23.mzML: 4406it [00:00, 28686.55it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank23 into memory (3566 spectra)...
All checks passed                      2814
Number of high intensity peaks >= 3     703
Precursor m/z <= 1000.0                  30
m/z range <= 1000.0                      17
Intensity amplitude >= 20.0               2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank24.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank24.mzML...


Reading Blank24.mzML: 4403it [00:00, 26718.75it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank24 into memory (3568 spectra)...
All checks passed                      2846
Number of high intensity peaks >= 3     638
m/z range <= 1000.0                      49
Precursor m/z <= 1000.0                  30
Intensity amplitude >= 20.0               5
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank25.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank25.mzML...


Reading Blank25.mzML: 4391it [00:00, 28853.82it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank25 into memory (3569 spectra)...
All checks passed                      2588
Number of high intensity peaks >= 3     708
m/z range <= 1000.0                     162
Precursor m/z <= 1000.0                 111
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank26.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank26.mzML...


Reading Blank26.mzML: 4404it [00:00, 28319.51it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank26 into memory (3551 spectra)...
All checks passed                      2614
Number of high intensity peaks >= 3     820
m/z range <= 1000.0                      77
Precursor m/z <= 1000.0                  40
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank27.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank27.mzML...


Reading Blank27.mzML: 4372it [00:00, 30343.57it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank27 into memory (3293 spectra)...
All checks passed                      2533
Number of high intensity peaks >= 3     605
m/z range <= 1000.0                      90
Precursor m/z <= 1000.0                  63
Intensity amplitude >= 20.0               2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank27_r_cln_src.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank27_r_cln_src.mzML...


Reading Blank27_r_cln_src.mzML: 4403it [00:00, 26872.21it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank27_r_cln_src into memory (3565 spectra)...
All checks passed                      2790
Number of high intensity peaks >= 3     667
Precursor m/z <= 1000.0                  70
m/z range <= 1000.0                      31
Intensity amplitude >= 20.0               7
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank28.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank28.mzML...


Reading Blank28.mzML: 4401it [00:00, 28886.40it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank28 into memory (3569 spectra)...
All checks passed                      2877
Number of high intensity peaks >= 3     586
Precursor m/z <= 1000.0                  57
m/z range <= 1000.0                      43
Intensity amplitude >= 20.0               6
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank29.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank29.mzML...


Reading Blank29.mzML: 4399it [00:00, 30761.75it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank29 into memory (3533 spectra)...
All checks passed                      2727
Number of high intensity peaks >= 3     731
Precursor m/z <= 1000.0                  36
m/z range <= 1000.0                      35
Intensity amplitude >= 20.0               4
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank30.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank30.mzML...


Reading Blank30.mzML: 4406it [00:00, 25776.36it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank30 into memory (3517 spectra)...
All checks passed                      2847
Number of high intensity peaks >= 3     615
m/z range <= 1000.0                      36
Precursor m/z <= 1000.0                  16
Intensity amplitude >= 20.0               3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank31.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank31.mzML...


Reading Blank31.mzML: 4398it [00:00, 29238.70it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank31 into memory (3471 spectra)...
All checks passed                      2864
Number of high intensity peaks >= 3     551
m/z range <= 1000.0                      32
Precursor m/z <= 1000.0                  18
Intensity amplitude >= 20.0               6
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank32.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank32.mzML...


Reading Blank32.mzML: 4392it [00:00, 30377.37it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank32 into memory (3434 spectra)...
All checks passed                      2733
Number of high intensity peaks >= 3     661
m/z range <= 1000.0                      23
Precursor m/z <= 1000.0                  13
Intensity amplitude >= 20.0               4
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank33.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank33.mzML...


Reading Blank33.mzML: 4396it [00:00, 30714.65it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank33 into memory (3450 spectra)...
All checks passed                      2729
Number of high intensity peaks >= 3     700
m/z range <= 1000.0                      17
Intensity amplitude >= 20.0               2
Precursor m/z <= 1000.0                   2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank33_5.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank33_5.mzML...


Reading Blank33_5.mzML: 4397it [00:00, 31016.67it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank33_5 into memory (3425 spectra)...
All checks passed                      2729
Number of high intensity peaks >= 3     682
m/z range <= 1000.0                      11
Precursor m/z <= 1000.0                   3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34.mzML...


Reading Blank34.mzML: 4383it [00:00, 11809.71it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank34 into memory (3427 spectra)...
All checks passed                      2775
Number of high intensity peaks >= 3     643
m/z range <= 1000.0                       6
Intensity amplitude >= 20.0               2
Precursor m/z <= 1000.0                   1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34_5.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34_5.mzML...


Reading Blank34_5.mzML: 4269it [00:00, 24679.89it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank34_5 into memory (2953 spectra)...
All checks passed                      2283
Number of high intensity peaks >= 3     662
Intensity amplitude >= 20.0               8
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34_5_flowcheck.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank34_5_flowcheck.mzML...


Reading Blank34_5_flowcheck.mzML: 4379it [00:00, 30896.34it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank34_5_flowcheck into memory (3214 spectra)...
All checks passed                      2656
Number of high intensity peaks >= 3     536
m/z range <= 1000.0                      11
Intensity amplitude >= 20.0               8
Precursor m/z <= 1000.0                   3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank35.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank35.mzML...


Reading Blank35.mzML: 4372it [00:00, 29014.20it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank35 into memory (3340 spectra)...
All checks passed                      2716
Number of high intensity peaks >= 3     620
Intensity amplitude >= 20.0               2
Precursor m/z <= 1000.0                   1
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank4_5.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank4_5.mzML...


Reading Blank4_5.mzML: 4400it [00:00, 24420.08it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank4_5 into memory (3418 spectra)...
All checks passed                      2891
Number of high intensity peaks >= 3     516
Intensity amplitude >= 20.0               4
Precursor m/z <= 1000.0                   4
m/z range <= 1000.0                       3
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank5.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank5.mzML...


Reading Blank5.mzML: 4424it [00:00, 26388.57it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank5 into memory (3452 spectra)...
All checks passed                      2875
Number of high intensity peaks >= 3     566
Precursor m/z <= 1000.0                   6
Intensity amplitude >= 20.0               5
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank52_5.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank52_5.mzML...


Reading Blank52_5.mzML: 4392it [00:00, 29534.23it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank52_5 into memory (3399 spectra)...
All checks passed                      2753
Number of high intensity peaks >= 3     620
Precursor m/z <= 1000.0                  17
Intensity amplitude >= 20.0               8
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank6.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank6.mzML...


Reading Blank6.mzML: 4399it [00:00, 27309.43it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank6 into memory (3539 spectra)...
All checks passed                      2848
Number of high intensity peaks >= 3     676
Intensity amplitude >= 20.0               8
Precursor m/z <= 1000.0                   5
m/z range <= 1000.0                       2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank7.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank7.mzML...


Reading Blank7.mzML: 4406it [00:00, 26224.78it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank7 into memory (3565 spectra)...
All checks passed                      2884
Number of high intensity peaks >= 3     654
Precursor m/z <= 1000.0                  19
Intensity amplitude >= 20.0               7
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank8.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank8.mzML...


Reading Blank8.mzML: 4410it [00:00, 24158.34it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank8 into memory (3578 spectra)...
All checks passed                      2867
Number of high intensity peaks >= 3     698
Precursor m/z <= 1000.0                   7
Intensity amplitude >= 20.0               4
m/z range <= 1000.0                       2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank9.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank9.mzML...


Reading Blank9.mzML: 4416it [00:00, 29789.18it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank9 into memory (3541 spectra)...
All checks passed                      2878
Number of high intensity peaks >= 3     650
Precursor m/z <= 1000.0                   8
Intensity amplitude >= 20.0               4
m/z range <= 1000.0                       1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/BlankN2cng_cln_src.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/BlankN2cng_cln_src.mzML...


Reading BlankN2cng_cln_src.mzML: 4248it [00:00, 37909.13it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset BlankN2cng_cln_src into memory (2630 spectra)...
All checks passed                      1769
Number of high intensity peaks >= 3     735
Intensity amplitude >= 20.0             101
Precursor m/z <= 1000.0                  19
m/z range <= 1000.0                       6
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_36.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_36.mzML...


Reading Blank_36.mzML: 4367it [00:00, 31000.93it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_36 into memory (3397 spectra)...
All checks passed                      2655
Number of high intensity peaks >= 3     742
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_37.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_37.mzML...


Reading Blank_37.mzML: 4373it [00:00, 30674.24it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_37 into memory (3391 spectra)...
All checks passed                      2723
Number of high intensity peaks >= 3     633
m/z range <= 1000.0                      30
Intensity amplitude >= 20.0               3
Precursor m/z <= 1000.0                   2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_38.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_38.mzML...


Reading Blank_38.mzML: 4365it [00:00, 28015.60it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_38 into memory (3378 spectra)...
All checks passed                      2679
Number of high intensity peaks >= 3     646
Precursor m/z <= 1000.0                  32
m/z range <= 1000.0                      20
Intensity amplitude >= 20.0               1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_39.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_39.mzML...


Reading Blank_39.mzML: 4392it [00:00, 29337.15it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_39 into memory (3463 spectra)...
All checks passed                      2795
Number of high intensity peaks >= 3     617
m/z range <= 1000.0                      26
Precursor m/z <= 1000.0                  23
Intensity amplitude >= 20.0               2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_40.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_40.mzML...


Reading Blank_40.mzML: 4370it [00:00, 30138.98it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_40 into memory (3381 spectra)...
All checks passed                      2718
Number of high intensity peaks >= 3     616
Precursor m/z <= 1000.0                  24
m/z range <= 1000.0                      21
Intensity amplitude >= 20.0               2
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_41.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_41.mzML...


Reading Blank_41.mzML: 4393it [00:00, 28160.15it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_41 into memory (3321 spectra)...
All checks passed                      2823
Number of high intensity peaks >= 3     440
m/z range <= 1000.0                      26
Precursor m/z <= 1000.0                  20
Intensity amplitude >= 20.0              12
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_42.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_42.mzML...


Reading Blank_42.mzML: 4354it [00:00, 25045.02it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_42 into memory (3220 spectra)...
All checks passed                      2611
Number of high intensity peaks >= 3     573
Precursor m/z <= 1000.0                  20
m/z range <= 1000.0                      15
Intensity amplitude >= 20.0               1
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_43.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_43.mzML...


Reading Blank_43.mzML: 4325it [00:00, 33732.01it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_43 into memory (3192 spectra)...
All checks passed                      2678
Number of high intensity peaks >= 3     469
Precursor m/z <= 1000.0                  20
m/z range <= 1000.0                      15
Intensity amplitude >= 20.0              10
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_44.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_44.mzML...


Reading Blank_44.mzML: 4291it [00:00, 31870.91it/s]
  return max(peak_list[1]) / min(peak_list[1])


Loading dataset Blank_44 into memory (3117 spectra)...
All checks passed                      2603
Number of high intensity peaks >= 3     469
Precursor m/z <= 1000.0                  17
m/z range <= 1000.0                      17
Intensity amplitude >= 20.0              11
Name: count, dtype: int64
Processing: /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_45.mzML
Processing /teamspace/studios/this_studio/Moorea24_MSRun_mzml/Blank_45.mzML...


Reading Blank_45.mzML: 552it [00:00, 51356.55it/s]


KeyboardInterrupt: 

## Merging NIST20 and MassSpecGym

In [None]:
import pandas as pd

# Location of the pickled NIST20/MoNA table; replace with your actual file path
file_path = '/teamspace/studios/this_studio/MassSpecGym/NIST20_MoNA_A_all_with_F_Murcko_split_MCE_test_minimum_cols.pkl'

# Load the pickle file (used as a DataFrame below).
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted source.
nist20_df = pd.read_pickle(file_path)

# Check the result: column names, dtypes and non-null counts
nist20_df.info()

In [None]:
# Preview the first three rows of the loaded table
nist20_df.head(3)

In [None]:
# Filter rows where the 'ID' starts with "NIST20"; .copy() makes the result an independent frame
nist20_df = nist20_df[nist20_df['ID'].str.startswith("NIST20")].copy()

# Summarize what is left after the filter
nist20_df.info()

In [77]:
from massspecgym.utils import load_massspecgym
# Load the table returned by load_massspecgym(); reset_index() turns its index into a regular column
massspec_df = load_massspecgym().reset_index()

In [None]:
# Show a single row to see which columns are available
massspec_df.head(1)

In [None]:
import pandas as pd

# -----------------------------
# STEP 1: Preprocess nist20_df
# -----------------------------
# Work on a copy so the columns added below are written to a fresh frame
nist20_df = nist20_df.copy()

# Split 'PARSED PEAKS' into two columns: element 0 -> 'mzs', element 1 -> 'intensities'
nist20_df['mzs'] = nist20_df['PARSED PEAKS'].apply(lambda x: x[0])
nist20_df['intensities'] = nist20_df['PARSED PEAKS'].apply(lambda x: x[1])

# Build a MassSpec-compatible DataFrame from NIST20
nist20_converted = pd.DataFrame({
    'identifier': nist20_df['ID'],
    'mzs': nist20_df['mzs'],
    'intensities': nist20_df['intensities'],
    'smiles': nist20_df['SMILES'],
    'inchikey': None,  # Not available in NIST20
    'formula': nist20_df['FORMULA'],
    'precursor_formula': nist20_df['FORMULA'],  # Assume it's the same
    'parent_mass': nist20_df['PRECURSOR M/Z'],  # Approximate: reuses the precursor m/z
    'precursor_mz': nist20_df['PRECURSOR M/Z'],
    'adduct': '[M+H]+',  # NOTE(review): hard-coded for every row; confirm all NIST20 records are [M+H]+
    'instrument_type': None,
    'collision_energy': None,
    'fold': nist20_df['fold'],
    'simulation_challenge': False  # NIST20 is real, not simulated
})

# -----------------------------
# STEP 2: Normalize MassSpec df
# -----------------------------
# Both frames are reduced to this column list, in this order, before concatenation
expected_columns = [
    'identifier', 'mzs', 'intensities', 'smiles', 'inchikey', 'formula', 'precursor_formula',
    'parent_mass', 'precursor_mz', 'adduct', 'instrument_type',
    'collision_energy', 'fold', 'simulation_challenge'
]

nist20_converted = nist20_converted[expected_columns]
massspec_gym_df = massspec_df.copy()
massspec_gym_df = massspec_gym_df[expected_columns]

# -----------------------------
# STEP 3: Merge the datasets
# -----------------------------
# MassSpecGym rows first, then NIST20 rows; ignore_index=True renumbers the rows from 0
merged_df = pd.concat([massspec_gym_df, nist20_converted], ignore_index=True)

# -----------------------------
# STEP 4: Save merged dataset
# -----------------------------
# Save as a pickle (relative path, so it is written to the current working directory)
merged_df.to_pickle('merged_massspec_nist20.pkl')

# Check result
print(f"Merged dataset shape: {merged_df.shape}")


## Murcko Histogram Split

In [None]:
import pandas as pd

# Replace with your actual file path
file_path = '/teamspace/studios/this_studio/files/merged_massspec_nist20_with_nist_new.tsv'

# Read the tab-separated table into `df`
df = pd.read_csv(file_path, sep='\t')

In [None]:
# Check the result: show the last two rows of the loaded table
df.tail(2)

In [4]:
# Load the necessary libraries
from rdkit import Chem
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from dreams.algorithms.murcko_hist import murcko_hist
from dreams.utils.data import MSData, evaluate_split
from dreams.utils.plots import init_plotting
from dreams.definitions import *
tqdm.pandas()
%load_ext autoreload
%autoreload 2

In [None]:
# Raw string: the SMILES contains a backslash, and "\S" is not a valid Python escape sequence
# (recent Python versions warn about it). The string value itself is unchanged.
hist = murcko_hist.murcko_hist(Chem.MolFromSmiles(r'O=C(O)[C@@H]1/N=C(\SC1)c2sc3cc(O)ccc3n2'), show_mol_scaffold=True)
print('Murcko histogram:', hist)

In [None]:
# Keep one row per distinct SMILES string.
# SMILES is presumably a column-name constant provided by the star-import from dreams.definitions - confirm.
df_us = df.drop_duplicates(subset=[SMILES]).copy()  # Uniquify SMILES

# Compute Murcko histograms (progress_apply is the tqdm-enabled variant of apply)
# NOTE(review): Chem.MolFromSmiles returns None for an unparsable SMILES; how murcko_hist
# handles None is not visible here.
df_us['MurckoHist'] = df_us[SMILES].progress_apply(
    lambda x: murcko_hist.murcko_hist(Chem.MolFromSmiles(x))
)

# Convert dictionaries to strings for easier handling (strings can be counted and grouped on)
df_us['MurckoHistStr'] = df_us['MurckoHist'].astype(str)

In [None]:
# Compare the number of distinct molecules with the number of distinct histogram strings
print('Num. unique SMILES:', df_us[SMILES].nunique(), 'Num. unique Murcko histograms:', df_us['MurckoHistStr'].nunique())
print('Top 20 most common Murcko histograms:')
# value_counts() is ordered by descending frequency, so the first 20 entries are the most common
df_us['MurckoHistStr'].value_counts()[:20]

In [None]:
# Group by MurckoHistStr and aggregate: per histogram, the number of SMILES and the SMILES themselves
df_gb = df_us.groupby('MurckoHistStr').agg(
    count=(SMILES, 'count'),
    smiles_list=(SMILES, list)
).reset_index()

# Convert MurckoHistStr back to MurckoHist by evaluating the string form
# NOTE(review): eval executes arbitrary expressions; tolerable only because these strings
# were produced in this notebook via astype(str), never from external input.
df_gb['MurckoHist'] = df_gb['MurckoHistStr'].apply(eval)

# Sort by 'count' in descending order and reset index
df_gb = df_gb.sort_values('count', ascending=False).reset_index(drop=True)

df_gb

In [None]:
median_i = len(df_gb) // 2
cum_val_mols = 0
val_mols_frac = 0.15  # Approximately 15% of the molecules go to validation set
val_idx, train_idx = [], []

# Iterate from median to start, assigning molecules to train or val sets
for i in range(median_i, -1, -1):
    current_hist = df_gb.iloc[i]['MurckoHist']
    # True as soon as are_sub_hists(...) holds against any group already placed in validation
    is_val_subhist = any(
        murcko_hist.are_sub_hists(current_hist, df_gb.iloc[j]['MurckoHist'], k=3, d=4)
        for j in val_idx
    )

    # A group joins validation only if it matches no validation group AND the validation
    # budget (checked before adding this group) is not yet exceeded; otherwise it goes to train.
    if not is_val_subhist and cum_val_mols / len(df_us) <= val_mols_frac:
        cum_val_mols += df_gb.iloc[i]['count']
        val_idx.append(i)
    else:
        train_idx.append(i)

# Add remaining indices to train set
train_idx.extend(range(median_i + 1, len(df_gb)))
assert len(train_idx) + len(val_idx) == len(df_gb)

# Map SMILES to their assigned fold
val_idx_set = set(val_idx)  # constant-time membership test instead of scanning the list per group
smiles_to_fold = {}
for i, row in df_gb.iterrows():
    fold = 'val' if i in val_idx_set else 'train'
    for smiles in row['smiles_list']:
        smiles_to_fold[smiles] = fold
df[FOLD] = df[SMILES].map(smiles_to_fold)

# Display fold distributions
print('Distribution of spectra:')
display(df[FOLD].value_counts(normalize=True))
print('Distribution of SMILES:')
display(df.drop_duplicates(subset=[SMILES])[FOLD].value_counts(normalize=True))

In [None]:
# Evaluate the split with 4 workers; the result is indexed by fold name ('val' is read below)
eval_res = evaluate_split(df, n_workers=4)
init_plotting(figsize=(3, 3))
# Histogram of the validation-fold values returned by evaluate_split
# (per the axis labels: max Tanimoto similarity of each validation molecule to the training set)
sns.histplot(eval_res['val'], bins=100)
plt.xlabel('Max Tanimoto similarity to training set')
plt.ylabel('Num. validation set molecules')
plt.show()

In [None]:
# Column overview plus the number of distinct InChIKeys (nunique ignores missing values)
df.info()
print('Num. unique inchikey:', df['inchikey'].nunique())

In [None]:
# Count non-null SMILES entries per InChIKey.
# NOTE(review): groupby drops rows whose key is missing by default, so records without an
# inchikey do not appear in this table.
df_t = df.groupby('inchikey').agg(
    count=(SMILES, 'count')
).reset_index()

# Most frequent InChIKeys first; the plain reset_index() keeps the pre-sort row numbers in an extra 'index' column
df_t = df_t.sort_values(by='count', ascending=False).reset_index()
df_t

In [None]:
# Preview the first five rows
df.head(5)

In [58]:
def remove_zero_peaks(mzs, intensities):
    """Drop peaks with a zero m/z or a zero intensity and order the rest by m/z.

    Args:
        mzs: iterable of m/z values.
        intensities: iterable of intensities, paired position by position with
            ``mzs`` (pairing stops at the shorter of the two).

    Returns:
        A ``(mzs, intensities)`` pair of lists sorted by ascending m/z; peaks with
        equal m/z keep their input order. Two empty lists when nothing survives.
    """
    surviving = sorted(
        (peak for peak in zip(mzs, intensities) if peak[0] != 0 and peak[1] != 0),
        key=lambda peak: peak[0],
    )
    if len(surviving) == 0:
        return [], []

    sorted_mzs = [mz for mz, _ in surviving]
    sorted_intensities = [inten for _, inten in surviving]
    return sorted_mzs, sorted_intensities

# Apply to entire DataFrame: each row's (mzs, intensities) pair is cleaned and written back
# to the same two columns.
# NOTE(review): remove_zero_peaks zips its inputs element by element, so this assumes every
# cell holds a sequence of numbers. If `df` came straight from a TSV the cells may still be
# strings - confirm they were parsed first.
df[['mzs', 'intensities']] = df.apply(
    lambda row: pd.Series(remove_zero_peaks(row['mzs'], row['intensities'])),
    axis=1
)

In [None]:
# (Disabled) export of the table with fold assignments; presumably the source of the file
# reloaded below - confirm.
#df.to_csv('merged_massspec_nist20_nist_new_with_fold.tsv', sep='\t')
import pandas as pd

# Reload the saved table from disk into a separate frame
test_df = pd.read_csv('/teamspace/studios/this_studio/files/merged_massspec_nist20_nist_new_with_fold.tsv', sep='\t')


In [None]:
# Number of rows in each fold of the reloaded table (rows with a missing fold are not counted)
df_t = test_df.groupby('fold').agg(
    count=('fold', 'count')
).reset_index()
df_t

## Detect PFAS in dataset

In [1]:
from rdkit import Chem
from rdkit.Chem import rdchem

# Definition of PFAS based on OECD: https://pubs.acs.org/doi/10.1021/acs.est.1c06896
def is_pfas_oecd(smiles: str) -> int:
    """Flag a molecule as PFAS (1) or not (0) with a per-carbon fluorination test.

    A molecule is flagged when at least one carbon atom
      * has two or more fluorine neighbours,
      * carries no hydrogens (implicit or explicit) and no Cl/Br/I neighbour,
      * is SP3-hybridised with only single bonds to its neighbours, and
      * when it has exactly two fluorines, also has at least one non-fluorine neighbour.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        1 if such a carbon is found, otherwise 0. Also 0 when the SMILES cannot be
        parsed or when anything raises during the check (errors are swallowed on
        purpose so the function can be applied over a whole column).
    """
    excluded_halogens = ("Cl", "Br", "I")
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return 0

        for atom in mol.GetAtoms():
            # Only carbon atoms can be the CF2/CF3 centre
            if atom.GetAtomicNum() != 6:
                continue

            neighbors = atom.GetNeighbors()
            symbols = [nbr.GetSymbol() for nbr in neighbors]

            # Fewer than two fluorines: neither CF2 nor CF3
            fluorine_count = symbols.count("F")
            if fluorine_count < 2:
                continue

            # Any hydrogen (implicit + explicit) or Cl/Br/I on this carbon disqualifies it
            if atom.GetTotalNumHs() > 0:
                continue
            if any(sym in excluded_halogens for sym in symbols):
                continue

            # Require sp3 and all single bonds (rules out alkenes like TFE)
            is_sp3 = atom.GetHybridization() == rdchem.HybridizationType.SP3
            if not is_sp3:
                continue
            only_single_bonds = all(
                mol.GetBondBetweenAtoms(atom.GetIdx(), nbr.GetIdx()).GetBondType() == rdchem.BondType.SINGLE
                for nbr in neighbors
            )
            if not only_single_bonds:
                continue

            # Three or more fluorines: accepted as is
            if fluorine_count >= 3:
                return 1

            # Exactly two fluorines: "-CF2-" must have something other than F attached
            if len(symbols) - fluorine_count >= 1:
                return 1

        return 0
    except Exception:
        return 0


In [None]:
import pandas as pd
from rdkit import Chem

# Load your dataframe (the table that already carries the 'fold' column used further down)
df = pd.read_csv('/teamspace/studios/this_studio/files/merged_massspec_nist20_nist_new_with_fold.tsv', sep='\t')

# Add PFAS col: 1/0 flag per row, computed from the SMILES by is_pfas_oecd
df['is_PFAS'] = df['smiles'].apply(is_pfas_oecd)

# View how many were identified (counts rows/spectra, not unique molecules)
print(f"Identified {df['is_PFAS'].sum()} potential PFAS compounds out of {len(df)} total.")

# (Disabled) export of the table including the new is_PFAS column
#df.to_csv('merged_massspec_nist20_nist_new_with_pfas_fold.tsv', sep='\t')

# Optionally: get only the PFAS rows
pfas_df = df[df['is_PFAS'] == 1]


In [None]:
from rdkit.Chem import Draw
import random as r
import pandas as pd

# Distinct PFAS SMILES in each fold
unique_pfas_train = pfas_df[pfas_df['fold'] == 'train']['smiles'].unique()
unique_pfas_val = pfas_df[pfas_df['fold'] == 'val']['smiles'].unique()
print(f"Train uniq PFAS = {len(unique_pfas_train)}, Val uniq PFAS = {len(unique_pfas_val)}")

# Number of PFAS rows (spectra) in each fold
pfas_train = len(pfas_df[pfas_df['fold'] == 'train'])
pfas_val = len(pfas_df[pfas_df['fold'] == 'val'])
print(f"Train PFAS = {pfas_train}, Val PFAS = {pfas_val}")


# Render one randomly chosen training PFAS molecule as a visual spot check
# NOTE(review): r.choice raises IndexError if the train fold contains no PFAS SMILES.
print(f"Drawing a random molecule from train")
smiles_list = unique_pfas_train.tolist()
m = Chem.MolFromSmiles(r.choice(smiles_list))
img = Draw.MolToImage(m)
img

In [None]:
#pfas_df.to_csv('pfas_only_records.csv', sep='\t')
#only_val_df = pfas_df[pfas_df['fold'] == 'val']
#only_val_df.to_csv('pfas_only_records_val.tsv', sep='\t')

In [None]:
# Load dataset
# NOTE(review): TestMassSpecDataset and HalogenDetectorDreamsTest are not defined in this part
# of the notebook; an earlier cell must define them before this one runs.
# NOTE(review): the commented-out export above writes 'pfas_only_records.csv', while this
# reads 'pfas_only_records.tsv' from a different directory - confirm the file exists.
pfas_dataset = TestMassSpecDataset(
    spec_transform=SpecTokenizer(n_peaks=60),
    mol_transform = MolToHalogensVector(),
    pth='/teamspace/studios/this_studio/pfas_only_records.tsv'
)

print(len(pfas_dataset))

# Init data module
pfas_data_module = MassSpecDataModule(
    dataset=pfas_dataset,
    batch_size=64,
    num_workers=1
)
pfas_data_module.setup()

# Restore the trained model from its checkpoint
ckpt_path = '/teamspace/studios/this_studio/HalogenDetection-FocalLoss-MergedMassSpecNIST20/opi4lx8s/checkpoints/epoch=0-step=8920.ckpt'
model = HalogenDetectorDreamsTest.load_from_checkpoint(ckpt_path)


# Run a validation pass of the restored model over the PFAS-only data module
trainer = Trainer(accelerator="auto", devices="auto", max_epochs=1)
trainer.validate(model=model, datamodule=pfas_data_module)

## Playground

In [None]:
import torch

# Fluorine Model
## threshold - 0.9
## /teamspace/studios/this_studio/HalogenDetection-FocalLoss-MergedMassSpecNIST20/opi4lx8s/checkpoints/epoch=0-step=8920.ckpt
## PFAS Model
## threshold - 0.9
## /teamspace/studios/this_studio/PFASDetection-FocalLoss-MergedMassSpecNIST20OECDWith_PFASExceptions/7zi45xm4/checkpoints/epoch=0-step=8920.ckpt

# Path to your checkpoint file
ckpt_path = '/teamspace/studios/this_studio/MassSpecGym/notebooks/fluorine_model_nohead.ckpt'
#ckpt_path = '/teamspace/studios/this_studio/HalogenDetection-FocalLoss-MergedMassSpecNIST20/opi4lx8s/checkpoints/epoch=0-step=8920.ckpt'

# Load the checkpoint onto the CPU so no GPU is needed just to inspect it
checkpoint = torch.load(ckpt_path, map_location='cpu')

# Print available metadata keys
print("Checkpoint keys:")
print(checkpoint.keys())

# Optionally, display specific metadata if available.
# Guard on the same key that is read, so a checkpoint without stored hyper-parameters is
# skipped instead of raising KeyError.
if 'hyper_parameters' in checkpoint:
    print(f"Model hyper_parameters {checkpoint['hyper_parameters']}")


In [None]:
# Path to your checkpoint file
# Fluorine Model
## threshold - 0.9
import torch
from collections import OrderedDict

ckpt_path = '/teamspace/studios/this_studio/HalogenDetection-FocalLoss-MergedMassSpecNIST20/opi4lx8s/checkpoints/epoch=0-step=8920.ckpt'

# Load the Fluorine model checkpoint (a mapping; 'state_dict' is read from it below)
model = torch.load(ckpt_path, map_location='cpu')

# Shallow-copy the top-level mapping so the loaded checkpoint object itself is left unchanged
new_model = dict(model)

# Inspect keys
state_dict = new_model['state_dict']  # may contain 'state_dict', 'model_state_dict', etc.

# Remove head layer (lin_out): keep every entry whose name does not mention it
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    if "lin_out" not in k:  # or "fc" if named differently
        new_state_dict[k] = v
    else:
        print("skipping lin_out layer")

new_model['state_dict'] = new_state_dict

# Save modified checkpoint (relative path: written to the current working directory)
torch.save(new_model, "fluorine_model_nohead.ckpt")

# Report a summary rather than dumping every tensor of the state dict into the cell output
print(f"✅ Stripped last layer and saved new checkpoint ({len(new_state_dict)} of {len(state_dict)} entries kept).")



import pandas as pd
# Reload the merged table from disk.
# NOTE(review): the merge cell writes 'merged_massspec_nist20.pkl' to the working directory,
# whereas this reads it from .../files/ - confirm the file was moved there.
df_unique = pd.read_pickle('/teamspace/studios/this_studio/files/merged_massspec_nist20.pkl')

In [None]:
# Preview the first two rows of the reloaded table
df_unique.head(2)

In [16]:
# Split the merged table by data source, using the identifier prefix
df_massspec = df_unique[df_unique["identifier"].str.startswith("MassSpecGym")]
df_nist = df_unique[df_unique["identifier"].str.startswith("NIST20")]

In [None]:
# Distinct InChIKeys and SMILES per source (nunique ignores missing values)
# NOTE(review): if this table is the one built in the merge cell, the NIST20 rows were given
# inchikey=None there, so num_inchi_nist would be 0 - confirm before interpreting it.
num_inchi_ms = df_massspec["inchikey"].nunique()
num_inchi_nist = df_nist["inchikey"].nunique()
num_smil_ms = df_massspec["smiles"].nunique()
num_smil_nist = df_nist["smiles"].nunique()

print(f"num_inchi_ms = {num_inchi_ms}, num_inchi_nist = {num_inchi_nist}, num_smil_ms = {num_smil_ms}, num_smil_nist = {num_smil_nist}")

In [18]:
import numpy as np
import pandas as pd
import itertools
import urllib
import json
import time
import ase
import rdkit
import base64
from io import BytesIO
from tqdm import tqdm
from rdkit import DataStructs, RDLogger
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import rdchem, Draw, rdMolDescriptors, QED, Crippen, Lipinski
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Contrib.SA_Score import sascorer
from rdkit.Chem.Descriptors import ExactMolWt
from collections import defaultdict
from typing import List, Optional
from pathlib import Path
import dreams.utils.misc as utils


def show_mols(mols, legends='new_indices', smiles_in=None, svg=False, sort_by_legend=False, max_mols=500,
              legend_float_decimals=4, mols_per_row=6, save_pth: Optional[Path] = None):
    """
    Draws a grid of skeletal structures of the given molecules and returns the image.

    :param mols: list of rdkit molecules (or of SMILES strings, see `smiles_in`)
    :param legends: labels for the molecules. Either 'new_indices' for default numbering,
                   'masses' for molecular weights (ExactMolWt), a callable applied to each
                   molecule, or a list / numpy array / pandas series with one label per molecule
    :param smiles_in: True - SMILES inputs, False - RDKit mols, None - determine automatically
                      (treated as SMILES only if every element is a string)
    :param svg: True - return svg image, False - return png image
    :param sort_by_legend: True - sort molecules by legend values (raw values, before formatting)
    :param max_mols: maximum number of molecules to show
    :param legend_float_decimals: number of decimal places to show for float legends
    :param mols_per_row: number of molecules per row to show
    :param save_pth: optional path to save the image to (SVG text if `svg`, otherwise via the
                     image object's own `save` method)
    :return: the object returned by `Draw.MolsToGridImage`
    :raises ValueError: if `legends` is an unknown string or an unsupported type
    """
    disable_rdkit_log()

    # Materialize once so the input can be scanned, indexed and re-ordered below
    mols = list(mols)

    if smiles_in is None:
        smiles_in = all(isinstance(e, str) for e in mols)

    if smiles_in:
        mols = [Chem.MolFromSmiles(e) for e in mols]

    invalid_legends_msg = (f'Invalid legends: {legends!r} of type {type(legends)}. Must be a list, numpy array, '
                           'pandas series, a callable, or "new_indices" or "masses".')
    if isinstance(legends, str):
        if legends == 'new_indices':
            legends = list(range(len(mols)))
        elif legends == 'masses':
            legends = [ExactMolWt(m) for m in mols]
        else:
            # An unknown keyword would otherwise be iterated character by character
            raise ValueError(invalid_legends_msg)
    elif callable(legends):
        legends = [legends(e) for e in mols]
    elif isinstance(legends, (list, np.ndarray, pd.Series)):
        # Keep the raw values: sorting must compare numbers as numbers, and float formatting
        # below can only apply to values that are still floats
        legends = list(legends)
    else:
        raise ValueError(invalid_legends_msg)

    if sort_by_legend:
        idx = np.argsort(legends).tolist()
        legends = [legends[i] for i in idx]
        mols = [mols[i] for i in idx]

    # Floats (including numpy floats) get a fixed number of decimals; everything else is str()'d
    legends = [f'{l:.{legend_float_decimals}f}' if isinstance(l, (float, np.floating)) else str(l)
               for l in legends]

    img = Draw.MolsToGridImage(mols, maxMols=max_mols, legends=legends, molsPerRow=min(max_mols, mols_per_row),
                         useSVG=svg, returnPNG=False)

    if save_pth:
        if svg:
            # Depending on the environment the SVG arrives either as an object exposing the
            # markup in `.data` or as the markup string itself
            svg_text = getattr(img, 'data', img)
            with open(save_pth, 'w') as f:
                f.write(svg_text)
        else:
            # NOTE(review): assumes the non-SVG result is an image object with a `save` method
            # (a PIL image when returnPNG=False) - confirm for your RDKit setup
            img.save(save_pth)

    return img


def mol_to_formula(mol, as_dict=False):
    """
    Returns the molecular formula of an RDKit molecule as computed by `rdMolDescriptors.CalcMolFormula`.

    :param mol: RDKit molecule
    :param as_dict: True - return the formula converted with `formula_to_dict`, False - return the formula string
    """
    formula_str = rdMolDescriptors.CalcMolFormula(mol)
    if as_dict:
        return formula_to_dict(formula_str)
    return formula_str


def smiles_to_formula(s, as_dict=False, invalid_mol_smiles=''):
    """
    Converts a SMILES string to a molecular formula.

    :param s: SMILES string
    :param as_dict: True - return the formula converted with `formula_to_dict`, False - return it as is
    :param invalid_mol_smiles: value used in place of the formula when the SMILES cannot be parsed into a molecule.
                               If None, no substitution is made and the parsing result is passed to RDKit as is.
    """
    parsed = Chem.MolFromSmiles(s)
    substitute = not parsed and invalid_mol_smiles is not None
    formula = invalid_mol_smiles if substitute else rdMolDescriptors.CalcMolFormula(parsed)
    return formula_to_dict(formula) if as_dict else formula


class MolPropertyCalculator:
    """
    Computes a fixed set of RDKit molecular descriptors and converts their values to and from
    min-max normalized form using the per-property bounds stored in `self.min_maxs`.
    """

    def __init__(self):
        # (min, max) estimates from the training part of MoNA and NIST20 Murcko histograms split
        bounds = {
            'AtomicLogP': (-13.054800000000025, 26.849200000000053),
            'NumHAcceptors': (0.0, 36.0),
            'NumHDonors': (0.0, 20.0),
            'PolarSurfaceArea': (0.0, 585.0300000000002),
            'NumRotatableBonds': (0.0, 68.0),
            'NumAromaticRings': (0.0, 8.0),
            'NumAliphaticRings': (0.0, 22.0),
            'FractionCSP3': (0.0, 1.0),
            'QED': (0.0, 1.0),  # commented-out alternative: (0.008950206972239864, 0.9479380820623227)
            'SyntheticAccessibility': (1.0, 10.0),  # commented-out alternative: (1.0549172379947862, 8.043981630210263)
            'BertzComplexity': (2.7548875021634682, 3748.669248605835),
        }
        self.min_maxs = {name: {'min': lo, 'max': hi} for name, (lo, hi) in bounds.items()}
        self.prop_names = list(self.min_maxs.keys())

    def mol_to_props(self, mol, min_max_norm=False):
        """
        Computes all properties for a molecule.

        :param mol: RDKit molecule
        :param min_max_norm: True - return values normalized with `normalize_props`, False - return raw values
        :return: dictionary mapping property names to values
        """
        raw_values = {
            'AtomicLogP': Crippen.MolLogP(mol),
            'NumHAcceptors': Lipinski.NumHAcceptors(mol),
            'NumHDonors': Lipinski.NumHDonors(mol),
            'PolarSurfaceArea': rdMolDescriptors.CalcTPSA(mol),
            'NumRotatableBonds': Lipinski.NumRotatableBonds(mol),
            'NumAromaticRings': Lipinski.NumAromaticRings(mol),
            'NumAliphaticRings': Lipinski.NumAliphaticRings(mol),
            'FractionCSP3': Lipinski.FractionCSP3(mol),
            'QED': QED.qed(mol),
            'SyntheticAccessibility': sascorer.calculateScore(mol),
            'BertzComplexity': rdkit.Chem.GraphDescriptors.BertzCT(mol)
        }
        return self.normalize_props(raw_values) if min_max_norm else raw_values

    def normalize_prop(self, prop, prop_name):
        """Maps a raw property value to (prop - min) / (max - min) using the bounds of `prop_name`."""
        limits = self.min_maxs[prop_name]
        lo, hi = limits['min'], limits['max']
        return (prop - lo) / (hi - lo)

    def denormalize_prop(self, prop, prop_name, do_not_add_min=False):
        """
        Inverse of `normalize_prop`.

        :param prop: normalized value
        :param prop_name: name of the property whose bounds are used
        :param do_not_add_min: True - only rescale by (max - min) without shifting by min
        """
        limits = self.min_maxs[prop_name]
        rescaled = prop * (limits['max'] - limits['min'])
        if do_not_add_min:
            return rescaled
        return rescaled + limits['min']

    def normalize_props(self, props):
        """Applies `normalize_prop` to every entry of a {property name: value} dictionary."""
        return dict((name, self.normalize_prop(value, name)) for name, value in props.items())

    def denormalize_props(self, props):
        """Applies `denormalize_prop` to every entry of a {property name: value} dictionary."""
        return dict((name, self.denormalize_prop(value, name)) for name, value in props.items())

    def __len__(self):
        """Number of supported properties."""
        return len(self.prop_names)


def formula_to_dict(formula):
    """
    Transforms chemical formula string to dictionary mapping elements to their frequencies
    e.g. 'C15H24' -> {'C': 15, 'H': 24}

    Square brackets and charge annotations are discarded before parsing. A trailing charge may carry a
    magnitude (e.g. 'Ca+2'); the magnitude is removed together with the sign so that it is not counted
    as atoms of the last element, e.g. 'Ca+2' -> {'Ca': 1} rather than {'Ca': 2}.

    :param formula: chemical formula string
    :return: defaultdict(int) mapping element symbols to their counts
    """
    import re  # local import keeps this function self-contained

    elem_count = defaultdict(int)

    formula = formula.replace('[', '').replace(']', '')
    # Remove the trailing charge including its magnitude first; stripping only the sign would leave
    # the magnitude glued to the last element ('Ca+2' -> 'Ca2').
    formula = re.sub(r'[+-]\d*$', '', formula)
    # Any remaining signs are dropped as before.
    formula = formula.replace('+', '').replace('-', '')

    for elem, count in ase.formula.Formula(formula).count().items():
        elem_count[elem] += count

    return elem_count


def rdkit_fp(mol, fp_size=4096):
    """
    Computes the default RDKit fingerprint of a molecule.

    :param mol: RDKit molecule
    :param fp_size: fingerprint size, passed to RDKit as `fpSize`
    """
    fingerprint = Chem.RDKFingerprint(mol, fpSize=fp_size)
    return fingerprint


def tanimoto_sim(fp1, fp2):
    """Tanimoto similarity (not distance) of two RDKit fingerprints, via `DataStructs.TanimotoSimilarity`."""
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity


def rdkit_mol_sim(m1, m2, fp_size=4096):
    """
    Tanimoto similarity of two RDKit molecules on their default RDKit fingerprints.

    :param m1: first RDKit molecule
    :param m2: second RDKit molecule
    :param fp_size: fingerprint size used for both molecules
    """
    fp1 = rdkit_fp(m1, fp_size=fp_size)
    fp2 = rdkit_fp(m2, fp_size=fp_size)
    return tanimoto_sim(fp1, fp2)


def rdkit_smiles_sim(s1, s2, fp_size=4096):
    """
    Tanimoto similarity of two SMILES strings on default RDKit fingerprints.

    :param s1: first SMILES string
    :param s2: second SMILES string
    :param fp_size: fingerprint size used for both molecules
    """
    mol1 = Chem.MolFromSmiles(s1)
    mol2 = Chem.MolFromSmiles(s2)
    return rdkit_mol_sim(mol1, mol2, fp_size=fp_size)


def morgan_fp(mol, binary=True, fp_size=4096, radius=2, as_numpy=True):
    """
    Computes a Morgan fingerprint of a molecule.

    :param mol: RDKit molecule
    :param binary: True - bit vector fingerprint (`GetMorganFingerprintAsBitVect`),
                   False - hashed fingerprint (`GetHashedMorganFingerprint`)
    :param fp_size: fingerprint size, passed to RDKit as `nBits`
    :param radius: Morgan radius
    :param as_numpy: True - convert the fingerprint with `rdkit_fp_to_np`, False - return the RDKit object
    """
    build_fp = Chem.GetMorganFingerprintAsBitVect if binary else Chem.GetHashedMorganFingerprint
    fingerprint = build_fp(mol, radius=radius, nBits=fp_size)
    return rdkit_fp_to_np(fingerprint) if as_numpy else fingerprint


def maccs_fp(mol, as_numpy=True):
    """
    Computes the MACCS keys fingerprint of a molecule.

    NOTE: Since indexing of MACCS keys starts from 1, when converting to numpy array with `as_numpy`, the first element
          is removed, so the resulting array has 166 elements instead of 167.

    :param mol: RDKit molecule
    :param as_numpy: True - return a numpy array without the first element, False - return the RDKit object
    """
    keys = GenMACCSKeys(mol)
    if not as_numpy:
        return keys
    return rdkit_fp_to_np(keys)[1:]


def fp_func_from_str(s):
    """
    Builds a fingerprint function (molecule -> float numpy array) from its string name.

    :param s: Name of the form "fp_<type>_<n_bits>", e.g. "fp_rdkit_2048", "fp_morgan_2048" or "fp_maccs_166".
              The number of bits is parsed for every type but is not passed on for "maccs".
    :raises ValueError: if the fingerprint type is not one of "rdkit", "morgan" or "maccs"
    """
    _, fp_type, n_bits = s.split('_')
    n_bits = int(n_bits)

    def _rdkit(mol):
        return np.array(rdkit_fp(mol, fp_size=n_bits), dtype=float)

    def _morgan(mol):
        return morgan_fp(mol, fp_size=n_bits).astype(float, copy=False)

    def _maccs(mol):
        return maccs_fp(mol).astype(float, copy=False)

    fp_funcs = {'rdkit': _rdkit, 'morgan': _morgan, 'maccs': _maccs}
    if fp_type not in fp_funcs:
        raise ValueError(f'Invalid fingerprint function name: "{s}".')
    return fp_funcs[fp_type]


def morgan_mol_sim(m1, m2, fp_size=4096, radius=2):
    """
    Tanimoto similarity of two RDKit molecules on their Morgan fingerprints (default binary variant of `morgan_fp`).

    :param m1: first RDKit molecule
    :param m2: second RDKit molecule
    :param fp_size: fingerprint size used for both molecules
    :param radius: Morgan radius used for both molecules
    """
    fp1 = morgan_fp(m1, fp_size=fp_size, radius=radius, as_numpy=False)
    fp2 = morgan_fp(m2, fp_size=fp_size, radius=radius, as_numpy=False)
    return tanimoto_sim(fp1, fp2)


def morgan_smiles_sim(s1, s2, fp_size=4096, radius=2):
    """
    Tanimoto similarity of two SMILES strings on Morgan fingerprints; see `morgan_mol_sim`.

    :param s1: first SMILES string
    :param s2: second SMILES string
    :param fp_size: fingerprint size used for both molecules
    :param radius: Morgan radius used for both molecules
    """
    mol1 = Chem.MolFromSmiles(s1)
    mol2 = Chem.MolFromSmiles(s2)
    return morgan_mol_sim(mol1, mol2, fp_size=fp_size, radius=radius)


def rdkit_fp_to_np(fp):
    """
    Converts an RDKit fingerprint to a numpy array with `DataStructs.ConvertToNumpyArray`.

    :param fp: RDKit fingerprint object
    :return: numpy array created with dtype int32 and filled by RDKit
    """
    # Start from an empty array; it is passed to RDKit, which writes the fingerprint into it.
    target = np.zeros((0,), dtype=np.int32)
    DataStructs.ConvertToNumpyArray(fp, target)
    return target


def np_to_rdkit_fp(fp):
    """
    Converts a numpy fingerprint to an RDKit fingerprint created from a bit string.

    :param fp: numpy array; values are rounded and cast to int before being joined into the bit string
    """
    bits = fp.round().astype(int, copy=False)
    return DataStructs.cDataStructs.CreateFromBitString(''.join(bits.astype(str)))


def mol_to_inchi14(mol: Chem.Mol):
    """
    Returns the part of the molecule's InChIKey preceding the first hyphen
    (the 14-character first block of a standard InChIKey).

    :param mol: RDKit molecule
    """
    inchikey = Chem.MolToInchiKey(mol)
    first_block, _, _ = inchikey.partition('-')
    return first_block


def smiles_to_inchi14(s):
    """
    Parses a SMILES string and returns the first block of its InChIKey; see `mol_to_inchi14`.

    :param s: SMILES string
    """
    mol = Chem.MolFromSmiles(s)
    return mol_to_inchi14(mol)


def generate_fragments(mol: Chem.Mol, max_cuts: int = None):
    """
    Generates all possible fragments of a molecule up to a certain number of bond cuts or without the restriction if
    `max_cuts` is not specified.

    Every combination of 1, 2, ... bonds is removed from a copy of the molecule and the resulting connected
    components are collected as SMILES strings, so identical fragments are kept only once.

    :param mol: an RDKit molecule object
    :param max_cuts: the maximum number of bonds to cut
    :return a list of RDKit Mol objects parsed back from the unique fragment SMILES; fragments whose SMILES
            cannot be parsed are dropped
    """

    bonds = mol.GetBonds()
    # bonds = [bond for bond in bonds if bond.GetBondType() in [rdchem.BondType.SINGLE, rdchem.BondType.DOUBLE]]
    fragment_smiles = set()
    for n_cuts in range(1, len(bonds) + 1):

        if max_cuts and n_cuts > max_cuts:
            break

        for bonds_to_cut in itertools.combinations(bonds, n_cuts):
            editable = rdchem.RWMol(mol)
            for bond in bonds_to_cut:
                editable.RemoveBond(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

            # Split into connected components without sanitizing them and store each one as SMILES.
            pieces = Chem.GetMolFrags(editable, asMols=True, sanitizeFrags=False)
            fragment_smiles.update(Chem.MolToSmiles(piece) for piece in pieces)

    return [frag for frag in map(Chem.MolFromSmiles, fragment_smiles) if frag is not None]


def generate_spectrum(mol: Chem.Mol, prec_mz: float = None, fragments: List = None, max_cuts: int = None):
    """
    Generates an MS/MS spectrum by exhaustively simulating the m/z values of theoretical fragments of a given molecule.
    The algorithm is very simplistic since it considers only subgraph-like fragments, does not consider isotopes, etc.

    :param mol: An RDKit molecule object.
    :param prec_mz: The m/z value of a molecule. If not specified, it is calculated as the sum of the
                    exact molecular weight of the molecule and 1.
    :param fragments: A list of RDKit Mol objects representing pre-generated fragments of the molecule. If not specified
                     (or empty), the function will generate the fragments automatically.
    :param max_cuts: The maximum number of bonds to cut when generating fragments. If not specified, all possible
                     fragments will be generated without any restriction on the number of cuts.
    :return: A spectrum represented as a numpy array with two columns: m/z values and their respective intensities.
    """

    # Simulate the m/z of "protonated adduct" when no precursor m/z is given
    precursor = prec_mz if prec_mz else ExactMolWt(mol) + 1

    # Fragment the molecule unless fragments were supplied
    frags = fragments if fragments else generate_fragments(mol, max_cuts=max_cuts)

    # Each peak position is the precursor m/z minus the exact mass of a fragment, rounded to an integer
    nominal_masses = np.round(np.array([precursor - ExactMolWt(frag) for frag in frags]))

    # Unit-width bins; intensity of a peak is the number of fragments falling into its bin.
    # NOTE(review): the last bin edge is ceil(max) - 1, so the largest mass appears to fall outside the
    # histogram - confirm this is intended.
    bin_edges = np.arange(0, np.ceil(max(nominal_masses)), 1)
    counts, edges = np.histogram(nominal_masses, bins=bin_edges)

    return np.stack([edges[1:], counts]).T


def closest_mz_frags(query_mz, frags, n=1, mass_shift=1, return_masses=False, print_masses=True):
    """
    Selects the fragments whose shifted exact masses are picked by `utils.get_closest_values` for `query_mz`
    (presumably the `n` closest ones - verify against that helper).

    :param query_mz: m/z value to match
    :param frags: list of RDKit molecules
    :param n: number of fragments to select; with n == 1 a single fragment (and mass) is returned instead of lists
    :param mass_shift: constant added to the exact mass of every fragment
    :param return_masses: True - return a (fragments, masses) pair, False - return fragments only
    :param print_masses: True - print the selected masses
    """
    shifted_masses = [ExactMolWt(frag) + mass_shift for frag in frags]
    selected = utils.get_closest_values(shifted_masses, query_mz, n=n, return_idx=True)
    best_frags = [frags[i] for i in selected]
    best_masses = [shifted_masses[i] for i in selected]
    if n == 1:
        best_frags, best_masses = best_frags[0], best_masses[0]
    if print_masses:
        print(best_masses)
    return (best_frags, best_masses) if return_masses else best_frags


def disable_rdkit_log():
    """Raises the RDKit logger level to CRITICAL so that less severe messages are not emitted."""
    RDLogger.logger().setLevel(RDLogger.CRITICAL)


def np_classify(smiles: List[str], progress_bar=True, sleep_each_n_requests=100):
    """
    Requests a classification for each SMILES from the NPClassifier web service
    (https://npclassifier.ucsd.edu/classify).

    :param smiles: list of SMILES strings
    :param progress_bar: True - wrap the iteration in tqdm
    :param sleep_each_n_requests: sleep for one second before every n-th request (except the very first one)
    :return: list with one decoded JSON response per SMILES, without the entries whose key contains 'fp'
    """
    responses = []
    iterator = tqdm(smiles) if progress_bar else smiles
    for idx, smi in enumerate(iterator):
        # Pause periodically between batches of requests
        if idx > 0 and idx % sleep_each_n_requests == 0:
            time.sleep(1)
        print(smi)
        request_url = f'https://npclassifier.ucsd.edu/classify?smiles={urllib.parse.quote(smi)}'
        with urllib.request.urlopen(request_url) as url:
            payload = json.load(url)
            # Keep everything except the entries whose key contains 'fp'
            responses.append({key: value for key, value in payload.items() if 'fp' not in key})
    return responses


def mol_to_img_str(mol, svg_size=200):
    """
    Supposed to be used with `pyvis` for showing molecule images as graph nodes.

    Draws the molecule as a square SVG without clearing the background and returns it as a
    base64-encoded `data:image/svg+xml` URI string.

    :param mol: RDKit molecule
    :param svg_size: width and height of the SVG drawing
    """
    drawer = rdMolDraw2D.MolDraw2DSVG(svg_size, svg_size)
    options = drawer.drawOptions()
    options.clearBackground = False
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()

    svg_bytes = drawer.GetDrawingText().encode()
    # base64 output is pure ASCII, so decoding yields exactly the characters of the encoded payload
    encoded = base64.b64encode(svg_bytes).decode('ascii')
    return f"data:image/svg+xml;base64,{encoded}"


def formula_is_carbohydrate(formula):
    """
    Checks that a formula contains no elements other than C, H and O.

    :param formula: dictionary mapping element symbols to counts
    :return: True if every key is one of 'C', 'H', 'O' (also True for an empty formula)
    """
    allowed_elements = {'C', 'H', 'O'}
    return allowed_elements.issuperset(formula.keys())


def formula_is_halogenated(formula):
    """
    Checks whether a formula contains at least one F, Cl, Br or I atom.

    :param formula: dictionary mapping element symbols to counts
    :return: True if the total count of the four halogens is positive
    """
    n_halogens = 0
    for elem in ('F', 'Cl', 'Br', 'I'):
        # Membership test first, so that missing elements are never looked up
        if elem in formula:
            n_halogens += formula[elem]
    return n_halogens > 0


def formula_type(f):
    """
    Assigns a coarse type label to a chemical formula based on the elements it contains.

    The checks are applied in order and the first match wins: empty formula, only C/H/O, only C/H/O/N,
    only C/H/O/N/S with both N and S present, any halogen, everything else.

    :param f: formula as a string (converted with `formula_to_dict`) or as an element -> count dictionary
    :return: one of the label strings returned below
    """
    if isinstance(f, str):
        f = formula_to_dict(f)

    if not f:
        return 'No formula'
    if formula_is_carbohydrate(f):
        return 'Carbohydrate'

    elements = set(f.keys())
    if elements <= {'C', 'H', 'O', 'N'}:
        return 'Carbohydrate with nitrogen'
    if elements <= {'C', 'H', 'O', 'N', 'S'} and 'N' in f and 'S' in f:
        return 'Carbohydrate with nitrogen and sulfur'
    if formula_is_halogenated(f):
        return 'Compound with halogens'
    return 'Other'


def get_mol_mass(mol):
    """
    Returns the exact molecular weight of an RDKit molecule (`ExactMolWt`).

    :param mol: RDKit molecule
    """
    exact_mass = ExactMolWt(mol)
    return exact_mass

In [None]:
# Commented-out code that would derive the frames used below from a merged pickle:
#df_massspec = df_unique[df_unique["identifier"].str.startswith("MassSpecGym")]
#df_nist = df_unique[df_unique["identifier"].str.startswith("NIST20")]
#df_unique = pd.read_pickle('merged_massspec_nist20.pkl')

# NOTE(review): `df_nist` is not created in this cell, so it must already exist in the kernel -
# confirm that an earlier cell defines it, otherwise Restart & Run All fails here.
# Adds an `inchikey` column (first InChIKey block computed from the `smiles` column); modifies `df_nist` in place.
df_nist['inchikey'] = df_nist['smiles'].apply(smiles_to_inchi14)

In [None]:
# Number of distinct `inchikey` values in each dataset (the NIST count is printed first)
num_inchi_ms = df_massspec["inchikey"].nunique()
num_inchi_nist = df_nist["inchikey"].nunique()
print(f"NIST unique # inchikeys: {num_inchi_nist}")
print(f"MassSpecGym unique # inchikeys: {num_inchi_ms}")

In [None]:
# Overlap between the SMILES of our PFAS training records and the PFAS suspect list from data.gov
import pandas as pd

# Load both TSV files (tab-separated; absolute paths of the studio environment)
df_records = pd.read_csv("/teamspace/studios/this_studio/files/pfas_only_records.tsv", sep='\t')
df_suspects = pd.read_csv("/teamspace/studios/this_studio/files/PFAS_suspect_list_data_gov.tsv", sep='\t')

# Preview column names
print("Records columns:", df_records.columns.tolist())
print("Suspects columns:", df_suspects.columns.tolist())

# Extract unique, whitespace-stripped, non-missing SMILES from each table
# (records are restricted to the 'train' fold; the two files name the column differently)
smiles_records = df_records[df_records['fold'] == 'train']['smiles'].dropna().str.strip().unique()
smiles_suspects = df_suspects['SMILES'].dropna().str.strip().unique()

# Convert to sets for comparison
# NOTE(review): the comparison is on raw SMILES strings, so the same molecule written differently in the
# two files would not be counted as overlapping - confirm whether both sources are canonicalized the same way.
set_records = set(smiles_records)
set_suspects = set(smiles_suspects)

# Find overlap
overlap = set_suspects.intersection(set_records)

# Report results: sizes of both sets, size of the overlap, then the overlapping SMILES in sorted order
print(f"Total in PFAS_Suspect_List: {len(set_suspects)}")
print(f"Total in pfas_only_records: {len(set_records)}")
print(f"Overlapping SMILES: {len(overlap)}")

for smile in sorted(overlap):
        print(smile)

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdDepictor  # imported explicitly instead of relying on `Chem.rdDepictor` being loaded

# SMILES of the molecule to draw: a perfluorinated C8 carboxylic acid (PFOA)
smiles = "C(=O)(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)O"

# Create RDKit molecule object; MolFromSmiles returns None when the SMILES cannot be parsed
mol = Chem.MolFromSmiles(smiles)

if mol is None:
    raise ValueError("SMILES string is invalid or could not be parsed.")

# Compute 2D coordinates for drawing
rdDepictor.Compute2DCoords(mol)

# Draw the molecule to a PNG file in the current working directory.
# The original cell repeated the parse/draw/save steps a second time with identical inputs;
# the duplicate only rewrote the same file and printed the message again, so it was removed.
Draw.MolToFile(mol, "molecule.png", size=(400, 400))

print("Saved structure as molecule.png")
