In [1]:
import random
from pyteomics import mgf
from tqdm import tqdm

def split_for_training(fn: str) -> None:
    """
    Splits an MGF file into training and validation subsets. Ensures that
    peptides in the validation set are unique and do not overlap with the training set.

    The split is done per peptide sequence (the ``seq`` parameter of each
    spectrum): every spectrum of a sequence chosen for validation goes to the
    validation file, all other spectra go to the training file. Spectra that
    have no ``seq`` parameter or an empty m/z array are written to neither file.

    The validation set holds roughly 2% of the unique sequences
    (``2 * max(1, n_unique // 100)``), capped at the number of unique
    sequences available.

    Output files are written next to the input: ``<name>_valid.mgf`` and
    ``<name>_train.mgf``, where ``<name>`` is ``fn`` without its trailing
    ``.mgf`` (or ``fn`` unchanged if it does not end in ``.mgf``).

    Args:
        fn (str): The input MGF file name.
    """
    # Only strip a *trailing* ".mgf". A blanket str.replace would also rewrite
    # directory components containing ".mgf" and, for names without ".mgf",
    # would make the output path equal the input path and truncate the input.
    suffix = ".mgf"
    stem = fn[:-len(suffix)] if fn.endswith(suffix) else fn
    valid_path = f"{stem}_valid.mgf"
    train_path = f"{stem}_train.mgf"

    # Seed before the first random call: the shuffle below determines the
    # order of the unique-sequence list, so seeding only before the sampling
    # step would not make the split reproducible.
    random.seed(385)

    # Read all spectra into memory; the input is closed as soon as reading is done.
    with open(fn, "r") as source_f:
        sps = mgf.read(source_f, convert_arrays=1, read_charges=False, dtype='float32', use_index=False)
        list_of_spectra = [sp for sp in tqdm(sps, desc="Reading spectra", unit="spectrum")]

    # Shuffle spectra to randomize distribution
    random.shuffle(list_of_spectra)

    # Extract unique sequences from spectra, keeping first-seen order
    seq_all = [sp['params']['seq'] for sp in list_of_spectra if 'seq' in sp['params']]
    seq_all = list(dict.fromkeys(seq_all))  # Remove duplicate sequences

    # Validation set size: twice 1% of the unique sequences (at least 2),
    # never more than the population so random.sample cannot raise ValueError
    # on very small or sequence-less inputs.
    valid_count = min(len(seq_all), max(1, len(seq_all) // 100) * 2)

    # Randomly sample sequences for the validation set
    valid_list = set(random.sample(seq_all, valid_count))

    # Write spectra to the appropriate output files based on their sequence.
    # The context manager closes both outputs even if writing fails midway.
    with open(valid_path, "w") as valid_f, open(train_path, "w") as training_f:
        for sp in tqdm(list_of_spectra, desc="Splitting spectra", unit="spectrum"):
            param = sp['params']
            if 'seq' in param and len(sp['m/z array']) > 0:
                if param['seq'] in valid_list:
                    write_spectrum_to_file(valid_f, param, sp)
                else:
                    write_spectrum_to_file(training_f, param, sp)

def write_spectrum_to_file(file, params: dict, spectrum: dict) -> None:
    """
    Writes a spectrum to the given file in MGF format.

    The record consists of ``BEGIN IONS``, the TITLE, PEPMASS, CHARGE, SCANS,
    RTINSECONDS and SEQ header lines, one ``"<m/z> <intensity>"`` line per
    peak, and ``END IONS``. Peaks are paired with ``zip``, so if the two
    arrays differ in length the extra entries of the longer one are ignored.

    The whole record is assembled in memory and written with a single
    ``write`` call, so a missing key never leaves a truncated record
    (``BEGIN IONS`` without ``END IONS``) in the output.

    Args:
        file (file object): The file to write to.
        params (dict): The spectrum parameters. Must contain 'title',
            'pepmass' (indexable; element 0 is written), 'charge', 'scans',
            'rtinseconds' and 'seq'.
        spectrum (dict): The spectrum data including m/z and intensity arrays
            under the keys 'm/z array' and 'intensity array'.

    Raises:
        KeyError: If a required key is missing from ``params`` or
            ``spectrum``; nothing is written in that case.
    """
    # Build the header first: any KeyError surfaces here, before any output.
    record = [
        "BEGIN IONS\n",
        f"TITLE={params['title']}\n",
        f"PEPMASS={params['pepmass'][0]}\n",
        f"CHARGE={params['charge']}\n",
        f"SCANS={params['scans']}\n",
        f"RTINSECONDS={params['rtinseconds']}\n",
        f"SEQ={params['seq']}\n",
    ]
    record.extend(
        f"{mz} {intensity}\n"
        for mz, intensity in zip(spectrum['m/z array'], spectrum['intensity array'])
    )
    record.append("END IONS\n")

    # One write per spectrum instead of one per peak.
    file.write("".join(record))

if __name__ == "__main__":
    # Script entry point: split the hard-coded dataset and report progress
    # on stdout before and after the (long-running) split.
    mgf_path = "./data/nine_species_all.mgf"

    print("Starting the splitting process...")
    split_for_training(mgf_path)
    print("Splitting completed. Check the output files.")

Starting the splitting process...


Reading spectra: 1528127spectrum [04:01, 6325.33spectrum/s] 
Splitting spectra: 100%|██████████| 1528127/1528127 [06:35<00:00, 3865.49spectrum/s]


Splitting completed. Check the output files.
