In [None]:
# Internal package that removes spectra with fewer than 5 peaks
from metabolomics.cleaner.spectrum_cleaner import SpectrumCleaner
import pandas as pd
import numpy as np

# Load data

In [1]:
# Root data directory, relative to the notebook's working directory.
# Paths in this notebook are built as f"{DATA_DIR}/processed/...", so the trailing
# slash here produces a doubled separator ("../data//processed/...").
# NOTE(review): harmless on POSIX file systems as far as I know — confirm for other targets.
DATA_DIR = "../data/"

In [6]:
# Input: the processed ENPKG table with MS data attached.
# It can be produced in 1_data_stats.ipynb, following the same steps that create
# enpkg_m+h+only_with_ms_data.pq but without applying the M+H adduct filter.
raw_spectra_path = f"{DATA_DIR}/processed/enpkg_with_ms_data.pq"
data = pd.read_parquet(raw_spectra_path)

In [7]:
# Derive the ion mode from the sign of the charge: strictly positive charges are
# labelled "positive", every other value (including 0) is labelled "negative".
data["ion_mode"] = np.where(data["charge"] > 0, "positive", "negative")

# Rename the raw columns; the new m/z and intensity names are the ones the
# cleaning step is pointed at.
column_renames = {
    "mz_list": "normalized_mzs",
    "i_list": "normalized_intensities",
    "precursor_mz": "ms2_precursor_mz",
    "smiles": "smiles_2d",
}
data = data.rename(columns=column_renames)

# Clean data (remove spectra with fewer than 5 peaks)

In [None]:
# Run the internal spectrum cleaner over the whole frame.
# The keyword arguments tell it which columns hold the m/z values and the
# intensities, and which column should receive its per-row messages.
cleaner_fields = {
    "mzs_field": "normalized_mzs",
    "intensities_field": "normalized_intensities",
    "message_col": "cleaner_message",
}
cleaner = SpectrumCleaner()
data = cleaner.clean_df(data, **cleaner_fields)

In [10]:
# Tally how many rows received each cleaner message.
data["cleaner_message"].value_counts()

cleaner_message
[]                         1006233
[number of peaks 4 < 5]        292
[number of peaks 3 < 5]        168
[number of peaks 2 < 5]         82
[number of peaks 1 < 5]         22
Name: count, dtype: int64

In [17]:
# Keep only the rows that still have m/z values after cleaning.
# NOTE(review): presumably the cleaner nulls out normalized_mzs for rejected
# spectra — confirm against SpectrumCleaner.clean_df.
cleaned_data = data[data["normalized_mzs"].notna()]

In [None]:
# Add a unique row id ("id_0", "id_1", ...) to the frame that gets written to disk.
# The column is assigned on `cleaned_data`, not `data`: `cleaned_data` is a separate
# filtered frame, so a column added to `data` at this point would never reach the
# saved file. `.assign` returns a new frame, which also avoids pandas'
# SettingWithCopyWarning on the filtered selection.
cleaned_data = cleaned_data.assign(
    row_uid=[f"id_{position}" for position in range(len(cleaned_data))]
)

In [19]:
# Persist the cleaned spectra next to the input file.
# The path must be an f-string so that {DATA_DIR} is interpolated; as a plain
# string it points at a directory literally named "{DATA_DIR}".
cleaned_data.to_parquet(f"{DATA_DIR}/processed/enpkg_with_ms_data_cleaned.pq")