In [None]:
import re

import pandas as pd
import pyteomics.mgf
from rdkit.Chem import PandasTools
from tqdm.notebook import tqdm

import pandas_utils as pu
import rdkit_mol_identifiers as rdkit

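## Read multiple metadata library files with an `inchikey` column
Each file must provide the `inchikey` and `monoisotopic_mass` columns; the mass is used only for filtering, and only the InChIKey is kept.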

In [None]:
# Metadata tables, one per compound library. The short name on the left is
# reused as the column label in the merged presence table.
libraries = dict([
  ("mce", r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv"),
  ("nih", r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned_plate7_removed.tsv"),
  ("nencka_mce", r"C:\git\msn_library\data\iocb_libraries\Radim_mce_complete_cleaned.tsv"),
  ("iocb_peptide", r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv"),
])

In [None]:
# Build one single-column "presence" table per metadata library (index = InChIKey,
# one boolean column named after the library), then align them side by side.
dfs = []
for library_name, metadata_path in libraries.items():
  metadata = pu.read_dataframe(metadata_path)[["inchikey", "monoisotopic_mass"]].copy()
  # Keep only rows whose monoisotopic mass is above 114; the mass column is
  # needed for this filter only and is dropped right after.
  above_mass_cutoff = metadata["monoisotopic_mass"] > 114
  presence = (
    metadata[above_mass_cutoff]
    .drop(columns=["monoisotopic_mass"])
    .dropna(subset="inchikey")
    .drop_duplicates(["inchikey"])
    .set_index(["inchikey"])
    .assign(**{library_name: True})
  )
  # NOTE: extra "acquired" / "collaborators" flag columns were sketched here
  # previously but are currently disabled.
  dfs.append(presence)

# Outer alignment on the InChIKey index: a cell is True where the library
# contains the compound and NaN otherwise.
merged_df = pd.concat(dfs, axis=1)
# Number of libraries containing each InChIKey (count of non-null cells per row).
merged_df["entries"] = merged_df.count(axis=1)
merged_df["split_inchikey"] = list(map(rdkit.split_inchikey, merged_df.index))

merged_df

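## Read multiple spectral library (MGF) files with an `inchikey` field
Only the InChIKey of each spectrum is needed; if no spectrum in a file has an `inchikey` field, `inchiaux` is used instead.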

In [None]:
# Spectral library files (MGF), one per compound library and polarity. The name
# on the left is reused as the column label in the merged presence table.
spectral_libraries = dict([
  ("mce_pos", r"C:\git\msn_library\library\20231031_mce_library_pos_all_lib_MSn.mgf"),
  ("mce_neg", r"C:\git\msn_library\library\20231030_mce_library_neg_all_lib_MSn.mgf"),
  ("nih_pos", r"C:\git\msn_library\library\20231031_nih_library_pos_all_lib_MSn.mgf"),
  ("nih_neg", r"C:\git\msn_library\library\20231031_nih_library_neg_all_lib_MSn.mgf"),
  ("nencka_mce_pos", r"C:\git\msn_library\library\20231130_nencka_mce_library_pos_all_lib_MSn.mgf"),
  ("nencka_mce_neg", r"C:\git\msn_library\library\20231130_nencka_mce_library_neg_all_lib_MSn.mgf"),
  ("iocb_peptide_pos", r"C:\git\msn_library\library\20231130_iocb_peptide_library_pos_all_lib_MSn.mgf"),
  ("iocb_peptide_neg", r"C:\git\msn_library\library\20231130_iocb_peptide_library_neg_all_lib_MSn.mgf"),
])

In [None]:
# Build one single-column "presence" table per spectral library (index = InChIKey,
# one boolean column named after the library), then align them side by side.
# NOTE: this rebinds `dfs` and `merged_df`, replacing the metadata-based table
# built from `libraries` earlier in the notebook.
dfs = []
for key, value in spectral_libraries.items():
  rows = []
  counter = 0  # entries the reader yielded as None

  # Collect the header parameters of every spectrum; peak data is not needed.
  with pyteomics.mgf.MGF(value) as f_in:
    for spectrum_dict in tqdm(f_in, desc=key):
      if spectrum_dict is not None:
        rows.append(spectrum_dict["params"])
      else:
        counter += 1

  # Surface skipped entries instead of silently discarding the count.
  if counter:
    print(f"{key}: skipped {counter} empty entries in {value}")

  df = pd.DataFrame(rows)
  if "inchikey" not in df.columns:
    # Fallback used when no spectrum in the file carries an "inchikey" field.
    # NOTE(review): presumably the exporter stored the InChIKey under
    # "inchiaux" in these files - confirm.
    if "inchiaux" not in df.columns:
      # Also reached when the file contains no spectra at all.
      raise KeyError(
        f"{key}: neither 'inchikey' nor 'inchiaux' found in the spectra of {value}"
      )
    df["inchikey"] = df["inchiaux"]
  df = df[["inchikey"]].copy()
  df = df.dropna(subset="inchikey").drop_duplicates(["inchikey"]).set_index(["inchikey"])
  df[key] = True
  dfs.append(df)


# Outer alignment on the InChIKey index: a cell is True where the library
# contains at least one spectrum of the compound and NaN otherwise.
merged_df = pd.concat(dfs, axis=1)
# Number of spectral libraries containing each InChIKey (non-null cells per row).
merged_df["entries"] = merged_df.count(axis=1)
merged_df["split_inchikey"] = [rdkit.split_inchikey(inchikey) for inchikey in merged_df.index]


merged_df
