In [None]:
import pandas as pd

import pyteomics.mgf
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import pandas_utils as pu

In [None]:
# Output locations for the three result tables written further down.
# Every library compound, annotated with polarity and detection status:
outfile_no_filter = r"C:\git\msn_library\data\acquisition_results\20241003_7libraries_no_filter.tsv"
# Only the compounds flagged as "detected":
outfile_detected = r"C:\git\msn_library\data\acquisition_results\20241003_7libraries_only_detected.tsv"
# Only the compounds flagged as "missing":
outfile_missing = r"C:\git\msn_library\data\acquisition_results\20241003_7libraries_missing.tsv"


In [None]:
# Positive-mode MSn spectral libraries (MGF), keyed by short library name.
acquisition_pos = dict(
  mcebio=r"C:\git\msn_library\library\20241003_mcebio_pos_msn.mgf",
  nihnp=r"C:\git\msn_library\library\20241003_nihnp_pos_msn.mgf",
  mcescaf=r"C:\git\msn_library\library\20241003_mcescaf_pos_msn.mgf",
  otavapep=r"C:\git\msn_library\library\20241003_otavapep_pos_msn.mgf",
  mcedrug=r"C:\git\msn_library\library\20241003_mcedrug_pos_msn.mgf",
  enammol=r"C:\git\msn_library\library\20241003_enammol_pos_msn.mgf",
  enamdisc=r"C:\git\msn_library\library\20241003_enamdisc_pos_msn.mgf",
)

# Negative-mode MSn spectral libraries (MGF), keyed by short library name.
acquisition_neg = dict(
  mcebio=r"C:\git\msn_library\library\20241003_mcebio_neg_msn.mgf",
  nihnp=r"C:\git\msn_library\library\20241003_nihnp_neg_msn.mgf",
  mcescaf=r"C:\git\msn_library\library\20241003_mcescaf_neg_msn.mgf",
  otavapep=r"C:\git\msn_library\library\20241003_otavapep_neg_msn.mgf",
  mcedrug=r"C:\git\msn_library\library\20241003_mcedrug_neg_msn.mgf",
  enammol=r"C:\git\msn_library\library\20241003_enammol_neg_msn.mgf",
  enamdisc=r"C:\git\msn_library\library\20241003_enamdisc_neg_msn.mgf",
)


# Cleaned compound metadata tables (TSV), keyed by the same short library
# names used for the acquisition MGF files.
libraries = dict(
  mcebio=r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv",
  nihnp=r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned_plate7_removed.tsv",
  mcescaf=r"C:\git\msn_library\data\iocb_libraries\MCE\5k_scaffold\mcescaf_cleaned.tsv",
  otavapep=r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv",
  mcedrug=r"C:\git\msn_library\data\iocb_libraries\fda_approved_drugs\mcedrug\mcedrug_cleaned.tsv",
  enammol=r"C:\git\msn_library\data\iocb_libraries\Veverka_group\enammol_cleaned.tsv",
  enamdisc=r"C:\git\msn_library\data\iocb_libraries\radim_enamine_10k_diversity\enamdisc_10k_cleaned.tsv",
)

In [None]:
def read_mgf(infile) -> pd.DataFrame:
  """Read the per-spectrum metadata of an MGF file into a DataFrame.

  Only the ``params`` dictionary of each spectrum is kept (one row per
  spectrum); peak data is discarded.

  Column normalisation applied to the result:

  * ``inchikey`` is copied from ``inchiaux`` when the file has no
    ``inchikey`` field.
  * ``compound_name`` is copied from ``name`` when the file has no
    ``compound_name`` field.
  * ``unique_sample_id`` is derived from ``usi`` (when that column exists)
    as the first ``pluskal..._id`` substring. Rows whose USI is missing or
    does not contain that pattern get ``None`` instead of aborting the read.

  :param infile: path to the MGF file (read as UTF-8)
  :return: DataFrame with one row per spectrum
  :raises KeyError: if ``inchikey``/``inchiaux`` or ``compound_name``/``name``
    are both absent from the spectra
  """
  import re

  # Equivalent to rebuilding "pluskal{}_id" around the lazily captured middle
  # part: the whole match already is that string.
  sample_id_pattern = re.compile(r'pluskal.*?_id')

  def _sample_id(usi):
    # A spectrum without a usi entry becomes NaN (a float) once the rows are
    # tabulated, so anything that is not a string has no sample id.
    if not isinstance(usi, str):
      return None
    found = sample_id_pattern.search(usi)
    return found.group(0) if found else None

  with pyteomics.mgf.MGF(infile, encoding='utf-8') as f_in:
    # Entries yielded as None are skipped.
    rows = [spectrum_dict["params"] for spectrum_dict in tqdm(f_in) if spectrum_dict is not None]

  df = pd.DataFrame(rows)
  if "inchikey" not in df.columns:
    df["inchikey"] = df["inchiaux"]
  if "compound_name" not in df.columns:
    df["compound_name"] = df["name"]
  if "usi" in df.columns:
    df["unique_sample_id"] = [_sample_id(usi) for usi in df["usi"]]
  return df


def combine_polarity(old, new):
  """Merge a newly observed polarity label into an existing one.

  ``"both"`` on either side yields ``"both"``. ``"positive"`` combined with
  ``"negative"`` (in either order) also yields ``"both"``. A ``new`` value of
  ``"positive"`` or ``"negative"`` otherwise replaces ``old``; any other
  ``new`` value leaves ``old`` unchanged.

  :param old: polarity label recorded so far
  :param new: polarity label to merge in
  :return: the combined polarity label
  """
  if old == "both":
    return "both"
  if new == "both":
    return new

  # Each single polarity paired with the label that upgrades it to "both".
  for polarity, counterpart in (("positive", "negative"), ("negative", "positive")):
    if new == polarity:
      if old == counterpart:
        return "both"
      return polarity

  # Unrecognised new label: keep whatever was there before.
  return old

In [None]:
# Load every library's compound table, keep compounds with a monoisotopic
# mass above 114, tag each row with its library name and drop repeated
# (inchikey, unique_sample_id) pairs within a library.
dfs = []
for library_name, library_path in libraries.items():
  df = (
    pu.read_dataframe(library_path)
    # .loc with a callable plus .assign builds a new frame, so the library
    # column is never written into a filtered slice of the loaded table
    # (which is what triggers pandas' SettingWithCopyWarning).
    .loc[lambda frame: frame["monoisotopic_mass"] > 114]
    .assign(library=library_name)
    .drop_duplicates(["inchikey", "unique_sample_id"])
  )
  dfs.append(df)


metadata_df = pd.concat(dfs, ignore_index=True)


metadata_df

In [None]:
# Collect the distinct (inchikey, unique_sample_id) pairs found in each
# positive-mode MGF library and stack them into one table.
positive = []
for mgf_path in acquisition_pos.values():
  df = read_mgf(mgf_path)
  df = df.loc[:, ["inchikey", "unique_sample_id"]].drop_duplicates(subset=["inchikey", "unique_sample_id"])
  positive.append(df)


positive_df = pd.concat(positive, ignore_index=True)


positive_df

In [None]:
# Collect the distinct (inchikey, unique_sample_id) pairs found in each
# negative-mode MGF library and stack them into one table.
negative = []
for mgf_path in acquisition_neg.values():
  df = read_mgf(mgf_path)
  df = df.loc[:, ["inchikey", "unique_sample_id"]].drop_duplicates(subset=["inchikey", "unique_sample_id"])
  negative.append(df)


negative_df = pd.concat(negative, ignore_index=True)


negative_df

In [None]:
# Translation of the outer-merge indicator (positive = left, negative = right)
# into a polarity label.
mapper = {
  "left_only": "positive",
  "right_only": "negative",
  "both": "both",
}

# Translation of the left-merge indicator (metadata = left, acquisitions =
# right) into a detection status.
detect = {
  "left_only": "missing",
  "right_only": "error",
  "both": "detected",
}


# Step 1: which sample/compound pairs were acquired in which polarity.
detected_df = positive_df.merge(
  negative_df,
  how="outer",
  on=["unique_sample_id", "inchikey"],
  indicator="polarity",
)

# Step 2: attach that information to every library compound.
metadata_detected_df = metadata_df.merge(
  detected_df,
  how="left",
  on=["unique_sample_id", "inchikey"],
  indicator="detected",
)

# Step 3: replace the raw merge indicators by readable labels. Compounds
# without any acquisition have no polarity value and fall back to "missing".
for column, labels, fallback in (("polarity", mapper, "missing"), ("detected", detect, "")):
  metadata_detected_df[column] = [labels.get(flag, fallback) for flag in metadata_detected_df[column]]
metadata_detected_df

In [None]:
# Quick look at the identifiers next to the assigned polarity label.
metadata_detected_df.loc[:, ["unique_sample_id", "inchikey", "polarity"]]

In [None]:
# Number of rows with a non-null inchikey per detection status.
metadata_detected_df.groupby("detected")["inchikey"].count()

In [None]:
# Rows for compounds that were found in the acquired data.
filtered = metadata_detected_df.loc[lambda frame: frame["detected"] == "detected"]
filtered

In [None]:
# Rows for compounds that have no match in the acquired data.
missing = metadata_detected_df.loc[lambda frame: frame["detected"] == "missing"]
missing

In [None]:
# Write the full table and its detected / missing subsets, in that order.
for frame, destination in (
  (metadata_detected_df, outfile_no_filter),
  (filtered, outfile_detected),
  (missing, outfile_missing),
):
  pu.save_dataframe(frame, destination)

In [None]:
# Display the merged table again (the same frame that the save cell wrote to outfile_no_filter).
metadata_detected_df

## Replace library names

In [None]:
# df["library"] = df["library"].replace({"01_mce_bioactive":"mcebio", "02_mce_scaffold_library":"mcescaf", "03_nih_natural_product":"nihnp", "04_iocb_peptide":"otavapep"})

In [None]:
# `df` is only the leftover loop variable of the MGF-reading cells and holds
# just "inchikey" and "unique_sample_id", so df["library"] raises KeyError on
# a fresh run. The library labels live in the merged metadata table.
metadata_detected_df["library"]

In [None]:
# pu.save_dataframe(df, file)

In [None]:
# Reload the unfiltered result table from disk (outfile_no_filter) instead of
# reusing the in-memory metadata_detected_df.
detected = pu.read_dataframe(outfile_no_filter)
detected

In [None]:
# Columns kept in the final exported table, in output order.
result_columns = [
  "inchikey",
  "split_inchikey",
  "canonical_smiles",
  "isomeric_smiles",
  "compound_name",
  "synonyms",
  "monoisotopic_mass",
  "logp",
  "polarity",
  "library",
  "unique_sample_id",
]
# NOTE: this rebinds `filtered`, which earlier held the detected-only rows.
filtered = detected.loc[:, result_columns]
filtered

In [None]:
# filtered[filtered.duplicated(["inchikey"], keep=False)].drop_duplicates(["inchikey", "polarity"])

In [None]:
# Write the column-reduced table of all seven libraries.
result_outfile = r"C:\git\msn_library\data\acquisition_results\result_seven_compound_libraries.tsv"
pu.save_dataframe(filtered, result_outfile)