## You need to have a unique_sample_id

In [None]:
import pandas as pd


from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import pandas_utils as pu
import well_plate

from mapper import combine_polarity
import stats_utils
from library_utils import read_mgf

In [None]:
# Run configuration for this comparison.
# `lib` is forwarded to read_mgf(..., lib=lib) when the two MGF files are loaded.
lib = "diana" # add start of your unique_sample_id if available

# Inputs: one MGF library per polarity plus the plate metadata table.
positive_file = r"C:\git\msn_library\library\20241020_diana_pos_msn.mgf"
negative_file = r"C:\git\msn_library\library\20241020_diana_neg_msn.mgf"
metadata_file = r"C:\git\msn_library\data\DIANA\diana_plate1_test_standardized_A3_Z3.tsv"
# Output: destination of the per-well comparison table written with pu.save_dataframe.
outfile = r"C:\git\msn_library\data\acquisition_results\plate_ionization_comparison\20241020_diana_test_acquisition_comparison.tsv"

In [None]:
# Load the spectra of both polarities and the library metadata.
# NOTE(review): the later cells read "unique_sample_id" and "inchikey" from all
# three tables -- confirm that read_mgf / the metadata file provide them.
positive_df = read_mgf(positive_file, lib=lib)
negative_df = read_mgf(negative_file, lib=lib)
libdf = pu.read_dataframe(metadata_file)

In [None]:
# Show the positive-mode table as the cell output for a quick visual check.
positive_df

## Remove compounds that were missing and keep each inchikey only once in the same well (if not already done during cleanup)

In [None]:
# Optional step: only needed if the acquisition method already missed compounds.
# Keeps the rows whose monoisotopic_mass is strictly greater than 114.
libdf = libdf.loc[libdf["monoisotopic_mass"].gt(114)].copy()
# Independent copy used as the starting point for the de-duplicated table.
libdf_filtered = libdf.copy()

In [None]:
# Important if not already done during the cleanup: keep one row per
# (unique_sample_id, inchikey). Sorting by the "none" column first decides which
# duplicate survives (the first one after sorting); sort_index() then orders the
# remaining rows by their index again.
# NOTE(review): "none" is not created in this notebook -- presumably a per-row
# count of missing values added during metadata cleanup; confirm it exists.
libdf_filtered = (
  libdf.sort_values(by="none")
  .drop_duplicates(subset=["unique_sample_id", "inchikey"])
  .sort_index()
)

## Mapping detection (by ionization mode; can also be used for comparing different methods, e.g., APCI vs. ESI, SOLVENT A vs. SOLVENT B, ...)

In [None]:
# Translates the merge indicator of the positive/negative outer join
# (left = positive table, right = negative table) into a polarity label.
mapper = {"left_only": "positive", "right_only": "negative", "both": "both"}

# Translates the merge indicator of the library/detection left join
# (left = library metadata, right = detected compounds) into a detection label.
detect = {"left_only": "missing", "right_only": "error", "both": "detected"}

key_columns = ["unique_sample_id", "inchikey"]

# One key-only frame per polarity: unique (unique_sample_id, inchikey) pairs as index.
dfs = [
  mode_df[["inchikey", "unique_sample_id"]]
  .drop_duplicates(key_columns)
  .set_index(key_columns)
  for mode_df in (positive_df, negative_df)
]
positive_keys, negative_keys = dfs

# Outer join on the index; the "polarity" indicator records in which table(s) a pair occurs.
outer = pd.merge(
  positive_keys,
  negative_keys,
  how="outer",
  left_index=True,
  right_index=True,
  suffixes=("", "_data"),
  indicator="polarity",
).reset_index()

# Left join keeps every library row; the "detected" indicator records whether a match was found.
df_diff_filtered = pd.merge(
  libdf_filtered,
  outer,
  on=key_columns,
  how="left",
  indicator="detected",
)

# Library rows without a match carry no polarity indicator and fall back to "missing".
df_diff_filtered["polarity"] = [
  mapper.get(label, "missing") for label in df_diff_filtered["polarity"]
]
df_diff_filtered["detected"] = [
  detect.get(label, "") for label in df_diff_filtered["detected"]
]
df_diff_filtered

In [None]:
# Non-null counts of inchikey and unique_sample_id for every polarity label.
df_diff_filtered.groupby("polarity")[["inchikey", "unique_sample_id"]].count()

## Get number of detected and missing in each well

In [None]:
# Count the rows per polarity label for every (unique_sample_id, plate_id, well_location).
comparison_df = pd.crosstab(
  index=[
    df_diff_filtered["unique_sample_id"],
    df_diff_filtered["plate_id"],
    df_diff_filtered["well_location"],
  ],
  columns=df_diff_filtered["polarity"],
).reset_index()

In [None]:
# Show the per-well polarity counts as the cell output.
comparison_df

In [None]:
# Write the per-well comparison table to the configured output path.
pu.save_dataframe(comparison_df, outfile)

In [None]:
def extract_stats(libdf, ionmode_df):
  """Summarise library coverage and spectral quality for one acquisition mode.

  Args:
    libdf: Library metadata table; the "unique_sample_id" and "inchikey"
      columns are read.
    ionmode_df: Spectra table of one mode; the columns "unique_sample_id",
      "inchikey", "mslevel", "precursor_purity", "quality_chimeric",
      "num peaks", "quality_explained_intensity" and
      "quality_explained_signals" are read.

  Returns:
    dict mapping a metric label to its value. The two "detected_compounds"
    entries are two-element lists: [counted per (unique_sample_id, inchikey)
    pair, counted per inchikey]. All other entries are scalars. A ratio whose
    denominator is zero (no spectra or no library rows) is NaN instead of
    raising ZeroDivisionError.
  """

  def _ratio(numerator, denominator):
    # NaN for an empty denominator, matching what the .mean() based metrics
    # already yield on an empty table.
    return numerator / denominator if denominator else float("nan")

  pair_key = ["unique_sample_id", "inchikey"]

  # Every count is computed once and reused by the metrics below.
  detected_pairs = len(ionmode_df.drop_duplicates(pair_key))
  detected_structures = len(ionmode_df.drop_duplicates(["inchikey"]))
  library_pairs = len(libdf.drop_duplicates(pair_key))
  library_structures = len(libdf.drop_duplicates(["inchikey"]))

  total_spectra = len(ionmode_df)
  # mslevel is compared against the string "2".
  # NOTE(review): assumes read_mgf keeps mslevel as text -- confirm.
  ms2_spectra = len(ionmode_df[ionmode_df["mslevel"] == "2"])
  # Everything that is not exactly "PASSED" counts as chimeric.
  chimeric_spectra = len(ionmode_df[ionmode_df["quality_chimeric"] != "PASSED"])

  results = {
    "detected_compounds (unique)": [detected_pairs, detected_structures],
    "detected_compounds_% (unique)": [
      _ratio(detected_pairs, library_pairs) * 100,
      _ratio(detected_structures, library_structures) * 100,
    ],
    "ms2": ms2_spectra,
    "msn": total_spectra,
    "ms2/annotated comp": _ratio(ms2_spectra, detected_pairs),
    "msn/annotated comp": _ratio(total_spectra, detected_pairs),
    "precursor purity_%": ionmode_df["precursor_purity"].astype("float").mean() * 100,
    "chimeric_%": _ratio(chimeric_spectra, total_spectra) * 100,
    "average_num_signals": ionmode_df["num peaks"].astype("int").mean(),
    "average_explained_intensity_by_formula": ionmode_df["quality_explained_intensity"].astype(
      "float").mean() * 100,
    "average_explained_signals_by_formula": ionmode_df["quality_explained_signals"].astype(
      "float").mean() * 100,
    # "explained by substructures":,
  }
  return results


# Per-mode statistics, both computed against the same library table.
pos = extract_stats(libdf, positive_df)
neg = extract_stats(libdf, negative_df)

# Shared counts, computed once.
pair_key = ["unique_sample_id", "inchikey"]
total_compounds = len(libdf.drop_duplicates(pair_key))
unique_structures = len(libdf.drop_duplicates(["inchikey"]))
detected_rows = df_diff_filtered[df_diff_filtered["detected"] == "detected"]
detected_structures = len(detected_rows.drop_duplicates(["inchikey"]))

# Combined statistics over both modes. Named `summary` rather than `sum` so the
# builtin sum() is not shadowed for the rest of the session.
# Two-element lists hold [per (unique_sample_id, inchikey) pair, per inchikey].
summary = {
  "injections": libdf["unique_sample_id"].nunique(),
  "total_compounds": total_compounds,
  "unique_structures": unique_structures,
  "detected_compounds_combined (unique)": [len(detected_rows), detected_structures],
  "detected_compounds_combined_% (unique)": [
    len(detected_rows) / total_compounds * 100,
    detected_structures / unique_structures * 100,
  ],
  "ms2_combined": (pos["ms2"] + neg["ms2"]),
  "MSn": (pos["msn"] + neg["msn"]),
}

# First pass: values only (combined value per line, then positive<TAB>negative).
for value in summary.values():
  print(f"{value}")
for key, value in pos.items():
  print(f"{value}\t{neg.get(key)}")

print("\n")
# Second pass: the same rows, each prefixed with its metric label.
for key, value in summary.items():
  print(f"{key}\t{value}")
for key, value in pos.items():
  print(f"{key}\t{value}\t{neg.get(key)}")