In [None]:
import pandas as pd
import pandas_utils as pu

import numpy as np

## Strategy

- Library spectra with single MS2 scans (ungrouped)
- Import into mzmine and export for FBMN
- Reference compound spectra (annotation export), keeping inchi_key and id for the subsequent merges
- Merge metadata by inchikey
- Merge public library matches (best Tanimoto and best MCES match) on id
- Merge into the FBMN network in Cytoscape on id

In [None]:
# Identifiers shared by every library-matching file of this run. They are defined once
# so that pointing the notebook at another export only requires editing these lines.
RUN_DATE = "20231031"
MATCH_DIR = r"C:\git\msn_library\data\library_matching" + "\\" + RUN_DATE
RUN_TAG = f"{RUN_DATE}_mce_library_pos_all_lib_MS2.mgf__48069_spectra__single_scans"

# Compound metadata table (TSV).
metadata_file = r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv"
# Spectral-library export, best-match table, and the merged output written by this notebook.
spectral_lib_file = rf"{MATCH_DIR}\lib_{RUN_TAG}.csv"
match_file = rf"{MATCH_DIR}\match_top10_{RUN_TAG}_bestmatch.csv"
outfile = rf"{MATCH_DIR}\match_top10_{RUN_TAG}_final_results.csv"

In [None]:
# Metadata: sort ascending on the "none" column, then keep the first row per inchikey.
# NOTE(review): "none" is used literally as a column name — confirm it exists in the TSV.
metadata_df = (
    pu.read_dataframe(metadata_file)
    .sort_values(by=["none"], ascending=True)
    .drop_duplicates(subset="inchikey", keep="first")
)

# Spectral library: align the key column name with the metadata table.
spectral_lib_df = (
    pu.read_dataframe(spectral_lib_file)
    .rename(columns={"inchi_key": "inchikey"})
)

# Match table: drop the columns listed here before the later merge.
match_df = (
    pu.read_dataframe(match_file)
    .drop(columns=["smiles", "compound_name", "adduct"])
)

In [None]:
# Show the metadata table after de-duplication on inchikey.
metadata_df

In [None]:
# Show the spectral-library table (inchi_key already renamed to inchikey).
spectral_lib_df

In [None]:
# Spot-check: the library row(s) with id 32985 before any merge.
spectral_lib_df.loc[spectral_lib_df["id"].eq(32985)]

In [None]:
# Left join: every spectral-library row is kept and metadata is attached via inchikey.
# Column names present in both tables get the "_spectra" suffix on the library side.
merged = spectral_lib_df.merge(
    metadata_df,
    how="left",
    on="inchikey",
    suffixes=("_spectra", ""),
)

In [None]:
# Show the library table after the metadata join.
merged

In [None]:
# Spot-check: id 32985 after the metadata join.
merged.loc[merged["id"].eq(32985)]

In [None]:
# Show the match table (smiles, compound_name and adduct were dropped on load).
match_df

In [None]:
# Attach the match table on (id, inchikey); library rows without a match row keep NaN.
# Overlapping column names coming from match_df get the "_public_match" suffix.
# NOTE: this cell reassigns `merged` from itself, so re-running it without first
# re-running the metadata merge joins match_df a second time.
merged = merged.merge(
    match_df,
    how="left",
    on=["id", "inchikey"],
    suffixes=("", "_public_match"),
)

In [None]:
# Show the table after the match join.
merged

In [None]:
# Spot-check: id 32985 after the match join.
merged.loc[merged["id"].eq(32985)]

In [None]:
# Write the fully merged table to outfile; later cells read it back from there.
pu.save_dataframe(merged, outfile)

In [None]:
# Largest best_tanimoto value across all rows (NaN entries are skipped by max()).
merged["best_tanimoto"].max()

In [None]:
# Spot-check: metadata row for one specific InChIKey.
metadata_df.loc[metadata_df["inchikey"].eq("LVVKXRQZSRUVPY-HNNXBMFYSA-N")]

In [None]:
# Reload the merged results from disk rather than reusing the in-memory `merged` frame.
df_metadata = pu.read_dataframe(outfile)

In [None]:
# NOTE(review): df_cytoscape is read here but is not assigned in any cell visible above.
# Presumably it was loaded in a cell that has since been removed; on a fresh kernel this
# line would raise NameError. TODO: restore the cell that loads the Cytoscape table.
# Keep only the "name" column, which is the join key used against "id" further down.
df_cytoscape = df_cytoscape[["name"]]
df_cytoscape

In [None]:
# Show the results table as reloaded from outfile.
df_metadata

In [None]:
# Left join of the results onto the Cytoscape names: "id" on the results side is matched
# against "name" on the Cytoscape side; rows without a matching name keep NaN.
df = df_metadata.merge(
    df_cytoscape,
    how="left",
    left_on="id",
    right_on="name",
)

In [None]:
# Show the two join-key columns side by side to eyeball which ids found a name.
df[["name", "id"]]

In [None]:
# Spot-check: the joined row(s) whose Cytoscape name is 35339.
df.loc[df["name"].eq(35339)]

In [None]:
# Write the results/Cytoscape join to the cytoscape_networking folder of this run.
pu.save_dataframe(df, r"C:\git\msn_library\data\library_matching\20231031\cytoscape_networking\mce_top10_fbmn_all_metadata_matches.csv")

In [None]:
# Reload the results from outfile again (same read as earlier in the notebook) so the
# match evaluation below starts from the on-disk table.
df_metadata = pu.read_dataframe(outfile)

In [None]:
# Show the freshly reloaded results table.
df_metadata

In [None]:
# Bin edges applied to best_mces and the label assigned to each interval:
#   [0, 0.1]  -> exact          (0.1, 3.5] -> similar      (3.5, 10] -> less_similar
#   (10, 50]  -> no_similar     (50, inf]  -> no_match
evaluation_bins = [0, 0.1, 3.5, 10, 50, np.inf]
group_names = ["exact", "similar", "less_similar", "no_similar", "no_match"]

# Rows whose best_mces is missing fall outside every bin and come back as NaN from
# pd.cut; they are labelled "no_match" (already one of the categories).
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a temporary
# object, warns on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
df_metadata["match_evaluation"] = pd.cut(
    df_metadata["best_mces"],
    bins=evaluation_bins,
    labels=group_names,
    include_lowest=True,
    right=True,
).fillna("no_match")

In [None]:
# Show the results table with the new match_evaluation column.
df_metadata

In [None]:
# Overwrite outfile with the current df_metadata, including the match_evaluation column.
pu.save_dataframe(df_metadata, outfile)

In [None]:
# Rows whose "match" flag is exactly True; missing values are excluded.
any_match = df_metadata.loc[df_metadata["match"].eq(True)]

In [None]:
# Spot-check: matched rows for one compound, looked up by its library-side name.
any_match.loc[any_match["compound_name_spectra"].eq("ACETOPHENAZINE")]

In [None]:
# Matched rows whose equal_inchikey flag is not True (missing values count as "not True").
# The mask is built from any_match itself: a mask taken from df_metadata has a different
# index, which forces pandas to reindex it implicitly and emits a UserWarning.
no_direct_match = any_match[any_match["equal_inchikey"] != True]
no_direct_match

In [None]:
def _percent(part, whole):
    """Return part / whole as a percentage, or NaN when whole is 0."""
    return part / whole * 100 if whole else float("nan")


def _n_unique_structures(frame):
    """Return the number of rows left after de-duplicating frame on inchikey."""
    return len(frame.drop_duplicates(["inchikey"]))


# Subsets of the matched rows, computed once and reused for count / percent / unique.
inchikey_hits = any_match[any_match["equal_inchikey"] == True]
tanimoto_hits = any_match[any_match["best_tanimoto"] >= 0.85]
mces_hits = any_match[any_match["best_mces"] < 4]

statistics = {
    "number_scans": len(df_metadata),
    "unique_structures": _n_unique_structures(df_metadata),
    "has match": len(any_match),
    "has match%": _percent(len(any_match), len(df_metadata)),
    "has match unique structures": _n_unique_structures(any_match),
    "has match unique structures%": _percent(
        _n_unique_structures(any_match), _n_unique_structures(df_metadata)
    ),
    "match inchikey": len(inchikey_hits),
    "match inchikey%": _percent(len(inchikey_hits), len(any_match)),
    # NOTE(review): this entry filters df_metadata, not any_match, unlike its neighbours.
    # Kept as-is; confirm whether that is intended.
    "match inchikey unique": _n_unique_structures(
        df_metadata[df_metadata["equal_inchikey"] == True]
    ),
    "match tanimoto": len(tanimoto_hits),
    "match tanimoto%": _percent(len(tanimoto_hits), len(any_match)),
    "match tanimoto unique structures": _n_unique_structures(tanimoto_hits),
    "match mces": len(mces_hits),
    "match mces%": _percent(len(mces_hits), len(any_match)),
    "match mces unique structures": _n_unique_structures(mces_hits),
}

# First pass: values only, one per line.
for value in statistics.values():
    print(f"{value}")

# Second pass: tab-separated key/value pairs.
print("\n")
for key, value in statistics.items():
    print("{}\t{}".format(key, value))

In [None]:
# Narrow view of the results: identifiers, names, a few metadata fields and match scores.
df_metadata_sub = df_metadata[
    [
        "id",
        "compound_name",
        "compound_name_spectra",
        "adduct",
        "any_phase",
        "monoisotopic_mass",
        "unique_sample_id",
        "natural_product",
        "best_tanimoto",
        "best_mces",
        "match_evaluation",
    ]
]

In [None]:
# Spot-check: id 32985 in the column subset.
df_metadata_sub.loc[df_metadata_sub["id"].eq(32985)]

In [None]:
# Spot-check: the same id 32985 in the full results table, for comparison.
df_metadata.loc[df_metadata["id"].eq(32985)]

In [None]:
# Write the column subset next to the full results file ("_subset" suffix in the name).
pu.save_dataframe(df_metadata_sub, r"C:\git\msn_library\data\library_matching\20231031\match_top10_20231031_mce_library_pos_all_lib_MS2.mgf__48069_spectra__single_scans_final_results_subset.csv" )