## Read NIST20 Rows

In [None]:
import pandas as pd

# Replace with your actual file path
file_path = '/teamspace/studios/this_studio/MassSpecGym/NIST20_MoNA_A_all_with_F_Murcko_split_MCE_test_minimum_cols.pkl'

# Load the pickle file into a DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(file_path)

# Keep only the rows whose 'ID' starts with "NIST20".
# na=False treats missing IDs as "no match"; without it the mask would contain
# NaN and boolean indexing would raise a ValueError.
nist20_df = df[df['ID'].str.startswith("NIST20", na=False)]

# Optional: check the filtered DataFrame.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would add a stray "None").
nist20_df.info()

In [None]:
# Show the first row of the filtered NIST20 frame (rendered as the cell output in a notebook).
nist20_df.head(1)

## Read MassSpecGym

In [None]:
from massspecgym.utils import load_massspecgym
# Load the MassSpecGym dataset; the result is used as a DataFrame below.
massspec_df = load_massspecgym()
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would add a stray "None").
massspec_df.info()

In [None]:
# Show the first row of the MassSpecGym frame (rendered as the cell output in a notebook).
massspec_df.head(1)

## Merge both datasets

In [None]:
import pandas as pd

# STEP 1: Prepare NIST20 DataFrame
# -------------------------------
# Copy first so the column assignments below land on an independent frame.
nist20_df = nist20_df.copy()
# Each 'PARSED PEAKS' entry is indexed positionally: element [0] becomes 'mzs'
# and element [1] becomes 'intensities'.
nist20_df['mzs'] = nist20_df['PARSED PEAKS'].apply(lambda peaks: peaks[0])
nist20_df['intensities'] = nist20_df['PARSED PEAKS'].apply(lambda peaks: peaks[1])

# Create a new DataFrame in the MassSpecGym format.
# Fields NIST20 does not provide here are filled with None.
nist20_converted = pd.DataFrame({
    'mzs': nist20_df['mzs'],
    'intensities': nist20_df['intensities'],
    'smiles': nist20_df['SMILES'],
    'formula': nist20_df['FORMULA'],
    'precursor_formula': nist20_df['FORMULA'],
    'precursor_mz': nist20_df['PRECURSOR M/Z'],
    # NOTE: placeholder -- this copies the precursor m/z; replace it with the
    # monoisotopic mass if a true parent mass is needed.
    'parent_mass': nist20_df['PRECURSOR M/Z'],
    'fold': nist20_df['fold'],
    'inchikey': None,
    'adduct': None,
    'instrument_type': None,
    'collision_energy': None,
    'simulation_challenge': False,
})

# STEP 2: Standardize Column Order (to match MassSpecGym)
# -------------------------------------------------------
expected_columns = [
    'mzs', 'intensities', 'smiles', 'inchikey', 'formula', 'precursor_formula',
    'parent_mass', 'precursor_mz', 'adduct', 'instrument_type',
    'collision_energy', 'fold', 'simulation_challenge'
]

# Reorder both DataFrames. Any MassSpecGym column that is not listed in
# `expected_columns` is dropped by this selection.
nist20_converted = nist20_converted[expected_columns]
massspec_df = massspec_df[expected_columns]

# STEP 3: Concatenate the two DataFrames
# -------------------------------------
merged_df = pd.concat([massspec_df, nist20_converted], ignore_index=True)

# Optional: Shuffle if needed before training (fixed seed keeps the order reproducible)
merged_df = merged_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Save or pass `merged_df` to your PyTorch Lightning DataModule.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would add a stray "None").
merged_df.info()


In [None]:
# Show the last two rows of the merged frame (rendered as the cell output in a notebook).
merged_df.tail(2)

In [None]:
import pandas as pd

# Export step (left disabled): uncomment to (re)write the TSV that is read back below.
#merged_df.to_csv('/teamspace/studios/this_studio/nist20_massspecgym_dataset.tsv', sep='\t', index=False)
file_path = '/teamspace/studios/this_studio/nist20_massspecgym_dataset.tsv'
# Load the tab-separated file into a DataFrame
result_df = pd.read_csv(file_path, sep='\t')

# Optional: check the reloaded DataFrame.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would add a stray "None").
result_df.info()

In [None]:
# Show the last two rows of the reloaded frame (rendered as the cell output in a notebook).
result_df.tail(2)