In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import pyteomics.mgf
import seaborn as sns
from tqdm.notebook import tqdm

import pandas_utils as pu
import well_plate

In [None]:
def save_fig(file_name):
  """Save the current matplotlib figure to the figures folder as PNG, PDF and SVG.

  :param file_name: base name of the three output files, without extension
  """
  # one (path template, extra savefig arguments) pair per output format;
  # only the PNG export is given an explicit resolution
  outputs = (
    (r"C:\git\msn_library\figures/{}.png", {"dpi": 300}),
    (r"C:\git\msn_library\figures/{}.pdf", {}),
    (r"C:\git\msn_library\figures/{}.svg", {}),
  )
  for path_template, savefig_kwargs in outputs:
    plt.savefig(path_template.format(file_name), **savefig_kwargs)

In [None]:
def read_mgf(infile) -> pd.DataFrame:
  """Load the parameters of every spectrum in an MGF file into a data frame.

  One row is built from the ``params`` dictionary of each spectrum; entries that the
  reader yields as ``None`` are skipped. Afterwards the columns ``inchikey``,
  ``compound_name`` and ``monoisotopic_mass`` are guaranteed to exist: when one of them
  is absent it is copied from ``inchiaux``, ``name`` or ``exactmass`` respectively.
  If a ``usi`` column exists, ``unique_sample_id`` is filled with the ``pluskal..._id``
  token found in each USI, or with ``None`` where the USI is missing or has no such token.

  :param infile: MGF source handed to ``pyteomics.mgf.MGF``
  :return: data frame with one row of spectrum parameters per spectrum
  :raises KeyError: if an expected column and its fallback column are both absent
  """
  with pyteomics.mgf.MGF(infile) as f_in:
    rows = [spectrum["params"] for spectrum in tqdm(f_in) if spectrum is not None]

  df = pd.DataFrame(rows)

  # (expected column, column it is copied from when absent)
  # NOTE(review): confirm that `inchiaux` really carries the InChIKey in these files
  fallbacks = (
    ("inchikey", "inchiaux"),
    ("compound_name", "name"),
    ("monoisotopic_mass", "exactmass"),
  )
  for column, fallback in fallbacks:
    if column not in df.columns:
      df[column] = df[fallback]

  if "usi" in df.columns:
    sample_id_pattern = re.compile(r"pluskal.*?_id")

    def sample_id(usi):
      # a missing USI (NaN) or one without the token must not abort the whole import
      match = sample_id_pattern.search(usi) if isinstance(usi, str) else None
      return match.group(0) if match else None

    df["unique_sample_id"] = [sample_id(usi) for usi in df["usi"]]
  return df

In [None]:
# Input and output locations used by the cells below.
# NOTE(review): these are absolute Windows paths; adjust them before running elsewhere.
# MGF spectral library parsed into `positive_df` by read_mgf
# (presumably the positive ion mode acquisition — confirm).
positive_file = r"C:\git\msn_library\library\20231130_iocb_peptide_library_pos_all_lib_MSn.mgf"
# MGF spectral library parsed into `negative_df` by read_mgf
# (presumably the negative ion mode acquisition — confirm).
negative_file = r"C:\git\msn_library\library\20231130_iocb_peptide_library_neg_all_lib_MSn.mgf"
# Library metadata table loaded into `libdf` through pu.read_dataframe.
libraryfile = r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv"
# Destination of the per-sample comparison table written through pu.save_dataframe.
outfile = r"C:\git\msn_library\data\iocb_libraries\20231130_iocb_peptide_library_acquisition_comparison.tsv"


## NIH
# Alternative inputs, presumably for the NIH library; currently disabled.
# NOTE(review): `outfile` is not overridden here, so enabling these lines would still
# write the comparison to the IOCB path above.
# positive_file = r"C:\git\msn_library\library\20231031_nih_library_pos_all_lib_MSn.mgf"
# negative_file = r"C:\git\msn_library\library\20231031_nih_library_neg_all_lib_MSn.mgf"
# libraryfile = r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned.tsv"

In [None]:
# Parse both MGF libraries with the same reader, then load the library metadata table.
positive_df, negative_df = [read_mgf(mgf_path) for mgf_path in (positive_file, negative_file)]
libdf = pu.read_dataframe(libraryfile)

In [None]:
# Keep library rows with a monoisotopic mass above 114 that are not on plate "07P".
# NOTE(review): the reason for these two cut-offs is not visible in this notebook — confirm.
keep_row = libdf["monoisotopic_mass"].gt(114) & libdf["plate_id"].ne("07P")
libdf = libdf[keep_row].copy()

# Columns carried into the comparison table.
library_columns = [
  "unique_sample_id", "well_location", "plate_id", "monoisotopic_mass", "compound_name",
  "inchikey", "molecular_species", "classyfire_superclass",
  "npclassifier_class_results", "npclassifier_superclass_results", "npclassifier_pathway_results",
  "logp",
]

# One row per (sample, structure) pair, restored to the original row order at the end.
# NOTE(review): sort_values(by="none") needs a column (or index level) literally labelled
# "none"; the sort decides which duplicate survives because the first row is kept.
libdf_filtered = (
  libdf.sort_values(by="none")[library_columns]
  .drop_duplicates(["unique_sample_id", "inchikey"])
  .sort_index()
)
libdf_filtered

In [None]:
# ## in case no classyfire results
# libdf = libdf[(libdf["monoisotopic_mass"] > 114) & (libdf["plate_id"] != "07P")].copy()
# libdf_filtered = libdf.sort_values(by="none")
# libdf_filtered = libdf_filtered[
#   ["unique_sample_id", "well_location", "plate_id", "monoisotopic_mass", "compound_name",
#    "inchikey", "molecular_species", "npclassifier_class_results", "npclassifier_superclass_results",
#    "npclassifier_pathway_results", "logp"]].drop_duplicates(
#     ["unique_sample_id", "inchikey"]).sort_index()

In [None]:
# Keep a single row per (structure, sample) pair in each of the two spectral tables.
filtered_positive, filtered_negative = [
  spectra.drop_duplicates(["inchikey", "unique_sample_id"])
  for spectra in (positive_df, negative_df)
]

In [None]:
# Labels for the merge indicator of the positive (left) versus negative (right) table.
mapper = dict(left_only="positive", right_only="negative", both="both")

# Labels for the merge indicator of the library (left) versus the acquired spectra (right).
detect = dict(left_only="missing", right_only="error", both="detected")

# Reduce each spectral table to an index of unique (sample, structure) pairs.
pair_key = ["unique_sample_id", "inchikey"]
dfs = []
for df in (filtered_positive, filtered_negative):
  df = df[["inchikey", "unique_sample_id"]].drop_duplicates(pair_key).set_index(pair_key)
  dfs.append(df)

# Outer join of the two indexes: "exist" records in which table(s) a pair occurs.
outer = dfs[0].merge(dfs[1], how="outer", left_index=True, right_index=True,
                     suffixes=("", "_data"), indicator="exist").reset_index()
# Left join onto the library: "detected" records whether a library row found a partner.
df_diff_filtered = libdf_filtered.merge(outer, how="left", on=["unique_sample_id", "inchikey"],
                                        indicator="detected")
# df_diff_filtered = df_diff_filtered[["unique_sample_id", "inchikey", "exist", "smiles", "well_location", "plate_id"]]
# Library rows without a partner carry no "exist" value and therefore fall back to "missing".
df_diff_filtered["exist"] = [mapper.get(flag, "missing") for flag in df_diff_filtered["exist"]]
df_diff_filtered["detected"] = [detect.get(flag, "") for flag in df_diff_filtered["detected"]]
df_diff_filtered

In [None]:
# Non-null entries per column for each "exist" label (positive / negative / both / missing).
df_diff_filtered.groupby("exist").count()

In [None]:
# Library rows that found no partner in either spectral table.
missing_df = df_diff_filtered.loc[df_diff_filtered["detected"].eq("missing")]
missing_df

In [None]:
# Non-null entries per column of the missing rows, grouped by sample.
missing_df.groupby(["unique_sample_id"]).count()

In [None]:
# Show the library rows that belong to three specific samples.
samples_of_interest = [
  "pluskal_mce_1D3_K16_id",
  "pluskal_mce_1D3_K17_id",
  "pluskal_mce_1D3_K18_id",
]
libdf_filtered[libdf_filtered["unique_sample_id"].isin(samples_of_interest)]

In [None]:
# Number of library rows per sample, most frequent first.
libdf_filtered["unique_sample_id"].value_counts()

## Count the compounds in each well by detection status (positive, negative, both, missing)

In [None]:
# Contingency table: one row per sample, one column per "exist" label, cells hold counts.
comparison_df = pd.crosstab(
  index=df_diff_filtered["unique_sample_id"],
  columns=df_diff_filtered["exist"],
).reset_index()

In [None]:
# Split every sample id on "_" once; the third token is used as the plate id and the
# fourth as the well location (e.g. "pluskal_mce_1D3_K16_id" gives "1D3" and "K16").
id_parts = [str(sample_id).split("_") for sample_id in comparison_df["unique_sample_id"]]
comparison_df["plate_id"] = [parts[2] for parts in id_parts]
comparison_df["well_location"] = [parts[3] for parts in id_parts]

In [None]:
# Display the per-sample comparison table.
comparison_df

In [None]:
# NOTE(review): this only references the bound method; without parentheses nothing is
# de-duplicated and the cell merely displays the method's repr. Probably a leftover — confirm.
libdf.drop_duplicates

In [None]:
# Write the per-sample comparison table to `outfile`.
pu.save_dataframe(comparison_df, outfile)

In [None]:
# Display the table parsed from `positive_file`.
# NOTE(review): this renders the whole frame; `.head()` may be enough for inspection.
positive_df

In [None]:
def extract_stats(libdf, ionmode_df):
  """Summarise how one spectral table covers the library.

  :param libdf: library table with the columns ``unique_sample_id`` and ``inchikey``
  :param ionmode_df: spectral table with one row per spectrum and the columns
    ``unique_sample_id``, ``inchikey``, ``mslevel``, ``precursor_purity``,
    ``quality_chimeric``, ``num peaks``, ``quality_explained_intensity`` and
    ``quality_explained_signals``
  :return: dict mapping each statistic name to its value; a ratio whose denominator is
    zero (empty library or empty spectral table) is reported as NaN instead of raising
  """
  pair_key = ["unique_sample_id", "inchikey"]

  def ratio(numerator, denominator):
    # an empty table must yield NaN rather than a ZeroDivisionError
    return numerator / denominator if denominator else float("nan")

  # every count is computed once and reused by the ratios below
  library_pairs = len(libdf.drop_duplicates(pair_key))
  detected_pairs = len(ionmode_df.drop_duplicates(pair_key))
  total_spectra = len(ionmode_df)
  # compare as text so the level is matched whether it was parsed as "2" or as 2
  ms2_spectra = int((ionmode_df["mslevel"].astype(str) == "2").sum())
  # everything that is not explicitly "PASSED" (including missing values) counts as chimeric
  chimeric_spectra = int((ionmode_df["quality_chimeric"] != "PASSED").sum())

  results = {
    "injections": libdf["unique_sample_id"].nunique(),
    "detected_compounds": detected_pairs,
    "detected_compounds_%": ratio(detected_pairs, library_pairs) * 100,
    "unique_detected_compounds": len(ionmode_df.drop_duplicates(["inchikey"])),
    "ms2": ms2_spectra,
    "msn": total_spectra,
    "ms2/annotated comp": ratio(ms2_spectra, detected_pairs),
    "msn/annotated comp": ratio(total_spectra, detected_pairs),
    "precursor purity_%": ionmode_df["precursor_purity"].astype("float").mean() * 100,
    "chimeric_%": ratio(chimeric_spectra, total_spectra) * 100,
    "average_num_signals": ionmode_df["num peaks"].astype("int").mean(),
    "average_explained_intensity_by_formula": ionmode_df["quality_explained_intensity"].astype(
      "float").mean() * 100,
    "average_explained_signals_by_formula": ionmode_df["quality_explained_signals"].astype(
      "float").mean() * 100,
    # "explained by substructures":,
  }
  return results


pos = extract_stats(libdf, positive_df)
neg = extract_stats(libdf, negative_df)

# Library-wide figures that combine both spectral tables. The dict is deliberately not
# named `sum`: that would shadow the builtin for every later cell of the notebook.
library_pairs = libdf.drop_duplicates(["unique_sample_id", "inchikey"])
detected_pairs = df_diff_filtered[df_diff_filtered["detected"] == "detected"]

summary = {
  "total_compounds": len(library_pairs),
  "unique_structures": len(libdf.drop_duplicates(["inchikey"])),
  "detected_compounds_combined": len(detected_pairs),
  "detected_compounds_combined_%": len(detected_pairs) / len(library_pairs) * 100,
  "unique_detected_inchikeys": len(detected_pairs.drop_duplicates(["inchikey"])),
  "ms2_combined": (pos["ms2"] + neg["ms2"]),
  "MSn": (pos["msn"] + neg["msn"]),
}

# First pass: values only, one line per statistic (positive and negative tab-separated).
for value in summary.values():
  print(f"{value}")
for key, value in pos.items():
  neg_value = neg.get(key)
  print(f"{value}\t{neg_value}")

print("\n")
# Second pass: the same values, each line prefixed with the statistic name.
for key, v in summary.items():
  print("{}\t{}".format(key, v))
for key, v in pos.items():
  neg_value = neg.get(key)
  print("{}\t{}\t{}".format(key, v, neg_value))