In [None]:
import pandas as pd

import pyteomics.mgf
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import pandas_utils as pu
import well_plate

from mapper import combine_polarity

In [None]:
# Output tables written by the save cell further down in this notebook:
# - outfile_no_filter: the complete annotated metadata table
# - outfile_detected:  only the rows whose "detected" flag is "detected"
# - outfile_missing:   only the rows whose "detected" flag is "missing"
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
outfile_no_filter =  r"C:\git\msn_library\data\acquisition_results\all_lib_no_filter.tsv"
outfile_detected = r"C:\git\msn_library\data\acquisition_results\all_lib_only_detected.tsv"
outfile_missing =  r"C:\git\msn_library\data\acquisition_results\all_lib_missing.tsv"


In [None]:
# Input files, keyed by library id. All three dictionaries use the same keys.

# MGF spectral library files read with read_mgf (the "pos" acquisitions).
acquisition_pos = {
  "mce_bioactive": r"C:\git\msn_library\library\20231031_mce_library_pos_all_lib_MSn.mgf",
  "nih_natural_product": r"C:\git\msn_library\library\20231031_nih_library_pos_all_lib_MSn.mgf",
  "mce_scaffold_library": r"C:\git\msn_library\library\20231130_nencka_mce_library_pos_all_lib_MSn.mgf",
  "iocb_peptide": r"C:\git\msn_library\library\20231130_iocb_peptide_library_pos_all_lib_MSn.mgf"
}

# MGF spectral library files read with read_mgf (the "neg" acquisitions).
acquisition_neg = {
  "mce_bioactive": r"C:\git\msn_library\library\20231030_mce_library_neg_all_lib_MSn.mgf",
  "nih_natural_product": r"C:\git\msn_library\library\20231031_nih_library_neg_all_lib_MSn.mgf",
  "mce_scaffold_library": r"C:\git\msn_library\library\20231130_nencka_mce_library_neg_all_lib_MSn.mgf",
  "iocb_peptide": r"C:\git\msn_library\library\20231130_iocb_peptide_library_neg_all_lib_MSn.mgf"
}


# Cleaned metadata tables (TSV) loaded with pu.read_dataframe; each row set is
# tagged with its dictionary key in a "library" column when loaded.
libraries = {
  "mce_bioactive": r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv",
  "nih_natural_product": r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned_plate7_removed.tsv",
  "mce_scaffold_library": r"C:\git\msn_library\data\iocb_libraries\Radim_mce_complete_cleaned.tsv",
  "iocb_peptide": r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv"
}

In [None]:
def read_mgf(infile) -> pd.DataFrame:
  """Load the per-spectrum parameters of an MGF file into a DataFrame.

  The ``params`` dict of every spectrum becomes one row. The frame is then
  normalised:

  * ``inchikey`` is copied from ``inchiaux`` when the file has no ``inchikey``.
  * ``compound_name`` is copied from ``name`` when the file has no
    ``compound_name``.
  * ``unique_sample_id`` is derived when a ``usi`` column exists: it holds the
    first ``pluskal..._id`` token found in each USI, or NaN when a USI
    contains no such token.

  Args:
    infile: MGF source handed to ``pyteomics.mgf.MGF``.

  Returns:
    DataFrame with one row per spectrum.

  Raises:
    KeyError: if a fallback column (``inchiaux`` or ``name``) is required but
      not present in the file.
  """
  with pyteomics.mgf.MGF(infile) as f_in:
    # Entries the reader yields as None are skipped instead of failing on them.
    rows = [spectrum["params"] for spectrum in tqdm(f_in) if spectrum is not None]

  df = pd.DataFrame(rows)
  if "inchikey" not in df.columns:
    df["inchikey"] = df["inchiaux"]
  if "compound_name" not in df.columns:
    df["compound_name"] = df["name"]
  if "usi" in df.columns:
    # str.extract uses regex *search* semantics, so the first
    # "pluskal..._id" token is kept. USIs without a token (or non-string
    # values) become NaN rather than aborting the whole import.
    df["unique_sample_id"] = df["usi"].astype(str).str.extract(
      r"(pluskal.*?_id)", expand=False
    )
  return df

In [None]:
# Lower monoisotopic-mass cutoff applied to every library table.
# NOTE(review): the rationale for 114 is not documented in this notebook — confirm.
MIN_MONOISOTOPIC_MASS = 114

# Load each library's metadata table, keep compounds above the mass cutoff,
# tag the rows with their library id and drop repeated compound/sample pairs.
dfs = []
for library_id, library_path in libraries.items():
  library_df = (
    pu.read_dataframe(library_path)
    .loc[lambda frame: frame["monoisotopic_mass"] > MIN_MONOISOTOPIC_MASS]
    # assign() returns a new frame, so no column is written onto a filtered
    # slice (the pattern that triggers SettingWithCopyWarning).
    .assign(library=library_id)
    .drop_duplicates(["inchikey", "unique_sample_id"])
  )
  dfs.append(library_df)


metadata_df = pd.concat(dfs, ignore_index=True)


metadata_df

In [None]:
# One frame per "pos" MGF file, reduced to the distinct
# (inchikey, unique_sample_id) pairs found in that file.
positive = [
  read_mgf(mgf_path)[["inchikey", "unique_sample_id"]].drop_duplicates(
    ["inchikey", "unique_sample_id"]
  )
  for mgf_path in acquisition_pos.values()
]


positive_df = pd.concat(positive, ignore_index=True)


positive_df

In [None]:
# One frame per "neg" MGF file, reduced to the distinct
# (inchikey, unique_sample_id) pairs found in that file.
negative = [
  read_mgf(mgf_path)[["inchikey", "unique_sample_id"]].drop_duplicates(
    ["inchikey", "unique_sample_id"]
  )
  for mgf_path in acquisition_neg.values()
]


negative_df = pd.concat(negative, ignore_index=True)


negative_df

In [None]:
# Translation of the merge-indicator categories of the positive/negative merge
# (left = positive_df, right = negative_df).
mapper = {"left_only": "positive", "right_only": "negative", "both": "both"}

# Translation of the merge-indicator categories of the metadata merge
# (left = metadata_df, right = detected_df). With how="left" pandas never
# emits "right_only", so "error" is only a safeguard label.
detect = {"left_only": "missing", "right_only": "error", "both": "detected"}

merge_keys = ["unique_sample_id", "inchikey"]

# Which compound/sample pairs were seen in positive mode, negative mode, or both.
detected_df = positive_df.merge(
  negative_df, how="outer", on=merge_keys, indicator="polarity"
)
# Attach that information to every metadata row; rows without a partner keep
# an empty polarity, which is labelled "missing" below.
metadata_detected_df = metadata_df.merge(
  detected_df, how="left", on=merge_keys, indicator="detected"
)
metadata_detected_df["polarity"] = [
  mapper.get(flag, "missing") for flag in metadata_detected_df["polarity"]
]
metadata_detected_df["detected"] = [
  detect.get(flag, "") for flag in metadata_detected_df["detected"]
]
metadata_detected_df

In [None]:
# Number of rows with a non-null inchikey for each "detected" label.
metadata_detected_df.groupby("detected")["inchikey"].count()

In [None]:
# Rows whose compound/sample pair was found in the acquisition data.
filtered = metadata_detected_df.loc[metadata_detected_df["detected"].eq("detected")]

In [None]:
# Rows present in the metadata but absent from the acquisition data.
missing = metadata_detected_df.loc[metadata_detected_df["detected"].eq("missing")]
missing

In [None]:
# Persist the full annotated table and its "detected" / "missing" subsets.
for table, target_path in (
  (metadata_detected_df, outfile_no_filter),
  (filtered, outfile_detected),
  (missing, outfile_missing),
):
  pu.save_dataframe(table, target_path)

## Statistics

In [None]:
# Reload the complete annotated table from disk so the statistics below start
# from the saved file rather than from in-memory state.
df = pu.read_dataframe(outfile_no_filter)

In [None]:
# Show the reloaded table.
df

In [None]:
# Group the table by library and polarity; because two columns are used, each
# group key is a (library, polarity) tuple.
sub = df.groupby(["library", "polarity"])

In [None]:
# Fold the polarity of every row into one combined value per inchikey,
# starting each structure from "missing".
unique_dict = {}
for structure_key, row_polarity in zip(df["inchikey"], df["polarity"]):
  previous_polarity = unique_dict.get(structure_key, "missing")
  unique_dict[structure_key] = combine_polarity(previous_polarity, row_polarity)

df["new_polarity"] = list(map(unique_dict.get, df["inchikey"]))

# Keep one row per (new_polarity, inchikey); sorting by "detected" first decides
# which duplicate survives, and sort_index restores the original row order.
df = (
  df.sort_values(by=["detected"])
  .drop_duplicates(["new_polarity", "inchikey"])
  .sort_index()
)
df

In [None]:
def extract_row(libid: str, df: pd.DataFrame) -> dict:
  """Summarise one table of annotated compounds as a statistics row.

  Args:
    libid: Label stored in the row's ``library`` field.
    df: Table with at least the columns ``inchikey``, ``unique_sample_id``
      and ``detected``.

  Returns:
    Dict with the label, the number of rows, the number of distinct
    ``inchikey`` values, and the number of distinct
    (``unique_sample_id``, ``inchikey``) pairs flagged ``"detected"``.
  """
  # Everything is derived from the frame passed in, so the function does not
  # depend on notebook-level variables.
  unique_df = df.drop_duplicates(["inchikey"])
  detected_rows = df[df["detected"] == "detected"].drop_duplicates(
    ["unique_sample_id", "inchikey"]
  )
  return {
    "library": libid,
    "total_compounds (unique)": len(df),
    "unique_structures": len(unique_df),
    "detected_compounds": len(detected_rows),
  }

In [None]:
# One statistics row per group in `sub`, followed by a "Summary" row for `df`.
# `sub` is grouped by two columns, so each group key (and therefore the
# "library" value of these rows) is a (library, polarity) tuple.
# NOTE(review): `sub` was created before `df` was de-duplicated, so the group
# rows and the "Summary" row are computed from different tables — confirm
# this is intended.
lib_rows = [extract_row(group_key, group_frame) for group_key, group_frame in sub]

# get summary stats
lib_rows.append(extract_row("Summary", df))
statistic_df = pd.DataFrame(lib_rows)

In [None]:
# Show the assembled statistics table.
statistic_df