In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import pyteomics.mgf
import seaborn as sns
from tqdm.notebook import tqdm

import pandas_utils as pu
import well_plate



In [None]:
# MGF spectral-library exports, one per compound library; the "_pos_" files
# are loaded by the positive-mode cell below.
acquisition_pos = dict(
  mce_bioactive=r"C:\git\msn_library\library\20231031_mce_library_pos_all_lib_MSn.mgf",
  nih_natural_product=r"C:\git\msn_library\library\20231031_nih_library_pos_all_lib_MSn.mgf",
  mce_scaffold_library=r"C:\git\msn_library\library\20231130_nencka_mce_library_pos_all_lib_MSn.mgf",
  iocb_peptide=r"C:\git\msn_library\library\20231130_iocb_peptide_library_pos_all_lib_MSn.mgf",
)

# Same libraries, "_neg_" exports; keys mirror acquisition_pos.
acquisition_neg = dict(
  mce_bioactive=r"C:\git\msn_library\library\20231030_mce_library_neg_all_lib_MSn.mgf",
  nih_natural_product=r"C:\git\msn_library\library\20231031_nih_library_neg_all_lib_MSn.mgf",
  mce_scaffold_library=r"C:\git\msn_library\library\20231130_nencka_mce_library_neg_all_lib_MSn.mgf",
  iocb_peptide=r"C:\git\msn_library\library\20231130_iocb_peptide_library_neg_all_lib_MSn.mgf",
)


# Cleaned metadata tables (TSV) for the same library keys; each key is later
# written into the "library" column of the combined metadata frame.
libraries = dict(
  mce_bioactive=r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv",
  nih_natural_product=r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned_plate7_removed.tsv",
  mce_scaffold_library=r"C:\git\msn_library\data\iocb_libraries\Radim_mce_complete_cleaned.tsv",
  iocb_peptide=r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv",
)

In [None]:
def read_mgf(infile) -> pd.DataFrame:
  """Load the per-spectrum metadata of an MGF file into a DataFrame.

  Each spectrum's ``params`` dict becomes one row; nothing else from the
  spectrum is kept. Entries the reader yields as ``None`` are skipped.

  Column normalisation applied to the result:
    * ``inchikey`` is copied from ``inchiaux`` when absent.
    * ``compound_name`` is copied from ``name`` when absent.
    * ``unique_sample_id`` is the ``pluskal..._id`` token found in ``usi``
      (only added when a ``usi`` column exists).

  When a fallback column is missing as well (e.g. the file contains no
  spectra), the target column is created with ``None`` instead of raising
  ``KeyError``. Rows whose ``usi`` is missing or does not contain the token
  get ``None`` as ``unique_sample_id`` instead of aborting the whole load.

  :param infile: MGF source, passed unchanged to ``pyteomics.mgf.MGF``
  :return: DataFrame with one row per spectrum
  """
  # The full match equals the original "pluskal{}_id".format(group(1)).
  sample_id_pattern = re.compile(r'pluskal.*?_id')

  with pyteomics.mgf.MGF(infile) as f_in:
    rows = [spectrum["params"] for spectrum in tqdm(f_in) if spectrum is not None]

  df = pd.DataFrame(rows)

  for column, fallback in (("inchikey", "inchiaux"), ("compound_name", "name")):
    if column not in df.columns:
      df[column] = df[fallback] if fallback in df.columns else None

  if "usi" in df.columns:
    def extract_sample_id(usi):
      # usi is NaN (a float) for spectra that lack the field entirely
      if not isinstance(usi, str):
        return None
      match = sample_id_pattern.search(usi)
      return match.group(0) if match else None

    df["unique_sample_id"] = [extract_sample_id(usi) for usi in df["usi"]]
  return df

In [None]:
# Load every library's metadata table, tag each row with its library key and
# stack the tables into one frame.
dfs = []
for key, value in libraries.items():
  # Build the frame as one chain of new objects rather than assigning a column
  # onto a boolean-filtered frame, which triggers pandas' SettingWithCopyWarning.
  df = (
    pu.read_dataframe(value)
    # keep rows with monoisotopic_mass above 114; rows with a missing mass
    # compare False and are dropped too (reason for the cutoff is not stated
    # in this notebook — TODO document)
    .loc[lambda frame: frame["monoisotopic_mass"] > 114]
    .assign(library=key)
    # one row per (compound, sample) pair within a library
    .drop_duplicates(["inchikey", "unique_sample_id"])
  )
  dfs.append(df)


metadata_df = pd.concat(dfs, ignore_index=True)


metadata_df

In [None]:
# Unique (inchikey, unique_sample_id) pairs found in each positive-mode MGF.
positive = []
for key, value in acquisition_pos.items():
  df = (
    read_mgf(value)
    .loc[:, ["inchikey", "unique_sample_id"]]
    .drop_duplicates(subset=["inchikey", "unique_sample_id"])
  )
  positive.append(df)


# Stack the per-library frames under a fresh 0..n-1 index.
positive_df = pd.concat(positive, ignore_index=True)


positive_df

In [None]:
# Unique (inchikey, unique_sample_id) pairs found in each negative-mode MGF.
negative = []
for key, value in acquisition_neg.items():
  df = (
    read_mgf(value)
    .loc[:, ["inchikey", "unique_sample_id"]]
    .drop_duplicates(subset=["inchikey", "unique_sample_id"])
  )
  negative.append(df)


# Stack the per-library frames under a fresh 0..n-1 index.
negative_df = pd.concat(negative, ignore_index=True)


negative_df

In [None]:
# Labels for the positive/negative merge indicator: left = positive frame,
# right = negative frame.
mapper = {"left_only": "positive", "right_only": "negative", "both": "both"}

# Labels for the metadata/detection merge indicator: left = metadata,
# right = detected pairs.
detect = {"left_only": "missing", "right_only": "error", "both": "detected"}


# Outer join keeps every pair seen in either polarity; the indicator column
# records which side(s) each pair came from.
detected_df = positive_df.merge(
  negative_df,
  how="outer",
  on=["unique_sample_id", "inchikey"],
  indicator="polarity",
)

# Left join keeps every metadata row, whether or not it was detected.
metadata_detected_df = metadata_df.merge(
  detected_df,
  how="left",
  on=["unique_sample_id", "inchikey"],
  indicator="detected",
)

# Plain Python lookups on purpose: rows without a detection carry NaN in
# "polarity", which falls through to the "missing" default.
metadata_detected_df["polarity"] = [
  mapper.get(flag, "missing") for flag in metadata_detected_df["polarity"]
]
metadata_detected_df["detected"] = [
  detect.get(flag, "") for flag in metadata_detected_df["detected"]
]
metadata_detected_df

In [None]:
# Number of non-null inchikey values per detection status.
metadata_detected_df.groupby("detected")["inchikey"].count()

In [None]:
# Metadata rows that matched at least one acquired spectrum.
filtered = metadata_detected_df.loc[metadata_detected_df["detected"].eq("detected")]

In [None]:
# Metadata rows with no matching spectrum in either polarity.
missing = metadata_detected_df.loc[metadata_detected_df["detected"].eq("missing")]
missing

In [None]:
# Write the three result tables: the full annotated metadata, the rows flagged
# "detected", and the rows flagged "missing".
pu.save_dataframe(metadata_detected_df, r"C:\git\msn_library\data\acquisition_results\all_lib_no_filter.tsv")
pu.save_dataframe(filtered, r"C:\git\msn_library\data\acquisition_results\all_lib_only_detected.tsv")
pu.save_dataframe(missing, r"C:\git\msn_library\data\acquisition_results\all_lib_missing.tsv")

In [None]:
# Display the combined library metadata (before the detection merge).
metadata_df

In [None]:
# Display the library key assigned to each metadata row.
metadata_df["library"]