In [1]:
import pandas as pd

import pyteomics.mgf
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import pandas_utils as pu
import well_plate

In [2]:
def save_fig(file_name):
  plt.savefig(r"C:\git\msn_library\figures/{}.png".format(file_name), dpi=300)
  plt.savefig(r"C:\git\msn_library\figures/{}.pdf".format(file_name))
  plt.savefig(r"C:\git\msn_library\figures/{}.svg".format(file_name))

In [3]:
def read_mgf(infile) -> pd.DataFrame:
  import re
  rows = []
  counter = 0

  with pyteomics.mgf.MGF(infile) as f_in:
    for spectrum_dict in tqdm(f_in):
      if spectrum_dict is not None:
        rows.append(spectrum_dict["params"])
      else:
        counter += 1

  df = pd.DataFrame(rows)
  if "inchikey" not in df.columns:
    df["inchikey"] = df["inchiaux"]
  if "compound_name" not in df.columns:
    df["compound_name"] = df["name"]
  if "monoisotopic_mass" not in df.columns:
    df["monoisotopic_mass"] = df["exactmass"]
  #
  if "usi" in df.columns:
    df["unique_sample_id"] = ["pluskal{}_id".format(re.search(r'pluskal(.*?)_id', usi).group(1)) for
                              usi in df["usi"]]
  return df

In [6]:
positive_file = r"C:\git\msn_library\library\20231031_mce_library_pos_all_lib_MSn.mgf"
negative_file = r"C:\git\msn_library\library\20231030_mce_library_neg_all_lib_MSn.mgf"
libraryfile = r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv"
outfile = r"C:\git\msn_library\data\library\20231031_mce_library_acquisition_comparison.tsv"


## NIH
# positive_file = r"C:\git\msn_library\library\20231031_nih_library_pos_all_lib_MSn.mgf"
# negative_file = r"C:\git\msn_library\library\20231031_nih_library_neg_all_lib_MSn.mgf"
# libraryfile = r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned.tsv"

In [7]:
positive_df = read_mgf(positive_file)
negative_df = read_mgf(negative_file)
libdf = pu.read_dataframe(libraryfile)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  df = pd.read_csv(file, sep="\t")


In [6]:
# libdf = libdf[(libdf["monoisotopic_mass"] > 114) & (libdf["plate_id"] != "07P")].copy()
# libdf_filtered = libdf.sort_values(by="none")
# libdf_filtered = libdf_filtered[
#   ["unique_sample_id", "well_location", "plate_id", "monoisotopic_mass", "compound_name",
#    "inchikey", "molecular_species", "classyfire_superclass",
#    "npclassifier_class_results", "npclassifier_superclass_results", "npclassifier_pathway_results",
#    "logp"]].drop_duplicates(["unique_sample_id", "inchikey"]).sort_index()
# libdf_filtered

KeyError: "['classyfire_superclass'] not in index"

In [8]:
## in case no classyfire results
libdf = libdf[(libdf["monoisotopic_mass"] > 114) & (libdf["plate_id"] != "07P")].copy()
libdf_filtered = libdf.sort_values(by="none")
libdf_filtered = libdf_filtered[
  ["unique_sample_id", "well_location", "plate_id", "monoisotopic_mass", "compound_name",
   "inchikey", "molecular_species", "npclassifier_class_results", "npclassifier_superclass_results",
   "npclassifier_pathway_results", "logp"]].drop_duplicates(
    ["unique_sample_id", "inchikey"]).sort_index()

In [9]:
filtered_positive = positive_df.drop_duplicates(["inchikey", "unique_sample_id"])
filtered_negative = negative_df.drop_duplicates(["inchikey", "unique_sample_id"])

In [10]:
mapper = {
  "left_only": "positive",
  "right_only": "negative",
  "both": "both",
}

detect = {
  "left_only": "missing",
  "right_only": "error",
  "both": "detected",
}

dfs = []

for df in [filtered_positive, filtered_negative]:
  df = df[["inchikey", "unique_sample_id"]].drop_duplicates(
      ["unique_sample_id", "inchikey"]).set_index(
      ["unique_sample_id", "inchikey"])
  dfs.append(df)

outer = pd.merge(dfs[0], dfs[1], indicator="exist", how="outer", left_index=True,
                 right_index=True, suffixes=("", "_data")).reset_index()
df_diff_filtered = pd.merge(libdf_filtered, outer, how="left", on=["unique_sample_id", "inchikey"],
                            indicator="detected")
# df_diff_filtered = df_diff_filtered[["unique_sample_id", "inchikey", "exist", "smiles", "well_location", "plate_id"]]
df_diff_filtered["exist"] = [mapper.get(ex, "missing") for ex in df_diff_filtered["exist"]]
df_diff_filtered["detected"] = [detect.get(ex, "") for ex in df_diff_filtered["detected"]]
df_diff_filtered

Unnamed: 0,unique_sample_id,well_location,plate_id,monoisotopic_mass,compound_name,inchikey,molecular_species,npclassifier_class_results,npclassifier_superclass_results,npclassifier_pathway_results,logp,exist,detected
0,pluskal_mce_1D1_A1_id,A1,1D1,249.082350,TG003,BGVLELSCIHASRV-QPEQYQDCSA-N,NEUTRAL,Simple indole alkaloids,Tryptophan alkaloids,Alkaloids,2.10657,positive,detected
1,pluskal_mce_1D1_A1_id,A1,1D1,583.158047,Fostemsavir,SWMDAPWAQQTBOG-UHFFFAOYSA-N,ACID,Carboline alkaloids,Tryptophan alkaloids,Alkaloids,-0.07685,both,detected
2,pluskal_mce_1D1_A1_id,A1,1D1,268.019667,NQO1 substrate,PZUSGRHVYDQLHR-UHFFFAOYSA-N,NEUTRAL,,,,1.11070,negative,detected
3,pluskal_mce_1D1_A1_id,A1,1D1,561.115771,ALK inhibitor 1,FTSDLONCFCQDGA-UHFFFAOYSA-N,,,,Alkaloids,0.92133,both,detected
4,pluskal_mce_1D1_A1_id,A1,1D1,272.032957,CCT007093,KPFZCKDPBMGECB-WGDLNXRISA-N,,,,,4.53008,positive,detected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10311,pluskal_mce_1D3_L18_id,L18,1D3,659.126254,1326-12-1,GJPMVYADDJKGMB-UHFFFAOYSA-O,,,,,3.45767,missing,missing
10312,pluskal_mce_1D3_L19_id,L19,1D3,1199.984546,9050-30-0,HOUUJEJIRQVYJN-UFWJCPMYSA-N,,Aminosugars,Aminosugars and aminoglycosides,Carbohydrates,-10.14740,missing,missing
10313,pluskal_mce_1D3_L19_id,L19,1D3,273.172879,37326-33-3,LVVRHAVPVOVPGG-UHFFFAOYSA-N,,Tetracyclic diterpenoids,Diterpenoids,Terpenoids,3.50758,missing,missing
10314,pluskal_mce_1D3_L19_id,L19,1D3,623.013205,Heparin lithium salt,HXSDFQWQRCUQHF-UHFFFAOYSA-N,,,,Carbohydrates,-4.94240,missing,missing


In [11]:
df_diff_filtered.groupby("exist").count()

Unnamed: 0_level_0,unique_sample_id,well_location,plate_id,monoisotopic_mass,compound_name,inchikey,molecular_species,npclassifier_class_results,npclassifier_superclass_results,npclassifier_pathway_results,logp,detected
exist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
both,3572,3572,3572,3572,3572,3572,3086,1663,1891,3267,3572,3572
missing,2023,2023,2023,2023,2023,2023,1504,1264,1327,1806,2023,2023
negative,1220,1220,1220,1220,1220,1220,1013,732,761,1091,1220,1220
positive,3501,3501,3501,3501,3501,3501,3068,1614,1925,3232,3501,3501


In [12]:
missing_df = df_diff_filtered[df_diff_filtered["detected"] == "missing"]
missing_df

Unnamed: 0,unique_sample_id,well_location,plate_id,monoisotopic_mass,compound_name,inchikey,molecular_species,npclassifier_class_results,npclassifier_superclass_results,npclassifier_pathway_results,logp,exist,detected
5,pluskal_mce_1D1_A1_id,A1,1D1,283.132077,Brevianamide F,RYFZBPVMVYTEKZ-KBPBESRZSA-N,NEUTRAL,Dipeptides;Indole diketopiperazine alkaloids (...,Peptide alkaloids;Small peptides,Alkaloids;Amino acids and Peptides,0.71667,missing,missing
14,pluskal_mce_1D1_A2_id,A2,1D1,170.105528,Etiracetam,HPHUVLMMVZITSG-UHFFFAOYSA-N,NEUTRAL,Pyrrolidine alkaloids,Ornithine alkaloids,Alkaloids,-0.12730,missing,missing
15,pluskal_mce_1D1_A2_id,A2,1D1,242.094294,fenoprofen,RDJGLLICXDHJDY-UHFFFAOYSA-N,ACID,Simple phenolic acids,Phenolic acids (C6-C1),Shikimates and Phenylpropanoids,2.68747,missing,missing
24,pluskal_mce_1D1_A3_id,A3,1D1,254.105528,Nepafenac,QEFAQIPZVLVERP-UHFFFAOYSA-N,NEUTRAL,,,Alkaloids,0.98286,missing,missing
27,pluskal_mce_1D1_A3_id,A3,1D1,459.186632,134-35-0,ZNOVTXRBGFNYRX-ABLWVSNPSA-N,ACID,pteridine alkaloids,Pseudoalkaloids,Alkaloids,-0.93496,missing,missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10311,pluskal_mce_1D3_L18_id,L18,1D3,659.126254,1326-12-1,GJPMVYADDJKGMB-UHFFFAOYSA-O,,,,,3.45767,missing,missing
10312,pluskal_mce_1D3_L19_id,L19,1D3,1199.984546,9050-30-0,HOUUJEJIRQVYJN-UFWJCPMYSA-N,,Aminosugars,Aminosugars and aminoglycosides,Carbohydrates,-10.14740,missing,missing
10313,pluskal_mce_1D3_L19_id,L19,1D3,273.172879,37326-33-3,LVVRHAVPVOVPGG-UHFFFAOYSA-N,,Tetracyclic diterpenoids,Diterpenoids,Terpenoids,3.50758,missing,missing
10314,pluskal_mce_1D3_L19_id,L19,1D3,623.013205,Heparin lithium salt,HXSDFQWQRCUQHF-UHFFFAOYSA-N,,,,Carbohydrates,-4.94240,missing,missing


In [13]:
missing_df.groupby(["unique_sample_id"]).count()

Unnamed: 0_level_0,well_location,plate_id,monoisotopic_mass,compound_name,inchikey,molecular_species,npclassifier_class_results,npclassifier_superclass_results,npclassifier_pathway_results,logp,exist,detected
unique_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pluskal_mce_1D1_A12_id,1,1,1,1,1,1,0,0,1,1,1,1
pluskal_mce_1D1_A13_id,1,1,1,1,1,1,0,0,1,1,1,1
pluskal_mce_1D1_A14_id,1,1,1,1,1,1,1,1,1,1,1,1
pluskal_mce_1D1_A16_id,2,2,2,2,2,2,0,0,2,2,2,2
pluskal_mce_1D1_A19_id,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
pluskal_mce_1D3_L3_id,5,5,5,5,5,3,3,3,4,5,5,5
pluskal_mce_1D3_L5_id,3,3,3,3,3,3,2,2,3,3,3,3
pluskal_mce_1D3_L7_id,4,4,4,4,4,3,3,3,4,4,4,4
pluskal_mce_1D3_L8_id,3,3,3,3,3,2,2,2,3,3,3,3


In [14]:
libdf_filtered[(libdf_filtered["unique_sample_id"] == "pluskal_mce_1D3_K16_id") | (
    libdf_filtered["unique_sample_id"] == "pluskal_mce_1D3_K17_id") | (
                   libdf_filtered["unique_sample_id"] == "pluskal_mce_1D3_K18_id")]

Unnamed: 0,unique_sample_id,well_location,plate_id,monoisotopic_mass,compound_name,inchikey,molecular_species,npclassifier_class_results,npclassifier_superclass_results,npclassifier_pathway_results,logp
10230,pluskal_mce_1D3_K16_id,K16,1D3,189.111341,L-Homocitrulline,XIGSAGMEBXLVJJ-YFKPBYRVSA-N,,Aminoacids,Small peptides,Amino acids and Peptides,-0.7631
10232,pluskal_mce_1D3_K16_id,K16,1D3,138.008195,(3-Methyloxiran-2-yl)phosphonic acid,YMDXZJFXQJVXBF-UHFFFAOYSA-N,ACID,,,,-0.0911
10233,pluskal_mce_1D3_K16_id,K16,1D3,115.063329,L-proline,ONIBWKKTOPOVIA-BYPYZUCNSA-N,ZWITTERION,Aminoacids;Dipeptides,Small peptides,Amino acids and Peptides,-0.177
10234,pluskal_mce_1D3_K16_id,K16,1D3,169.085127,1-Methyl-L-histidine,BRMWTNUJHUMWMS-LURJTMIESA-N,ZWITTERION,Aminoacids,Small peptides,Amino acids and Peptides,-0.68021
10235,pluskal_mce_1D3_K16_id,K16,1D3,116.010959,maleic acid,VZCYOOQTPOCHFL-UPHRSURJSA-N,ACID,Dicarboxylic acids,Fatty Acids and Conjugates,Fatty acids,-0.2882
10236,pluskal_mce_1D3_K16_id,K16,1D3,153.009579,L-Cysteinesulfinic acid,ADVPTQAUNPRNPO-REOHCLBHSA-N,ZWITTERION,Aminoacids,Small peptides,Amino acids and Peptides,-1.38
10237,pluskal_mce_1D3_K16_id,K16,1D3,180.089878,4-Amino-L-Phenylalanine,CMUHFUGDYMFHEI-QMMMGPOBSA-N,ZWITTERION,Aminoacids,Small peptides,Amino acids and Peptides,-0.21212
10238,pluskal_mce_1D3_K16_id,K16,1D3,196.058303,gluconic acid,RGHNJXZEOKUKBD-SQOUGZDYSA-N,ACID,Monosaccharides,Saccharides,Carbohydrates,-3.4931
10239,pluskal_mce_1D3_K16_id,K16,1D3,169.085127,3-Methyl-L-histidine,JDHILDINMRGULE-LURJTMIESA-N,,Aminoacids,Small peptides,Amino acids and Peptides,-0.68021
10241,pluskal_mce_1D3_K17_id,K17,1D3,125.095297,1-Methylhistamine,FHQDWPCFSJMNCT-UHFFFAOYSA-N,BASE,Imidazole alkaloids,Histidine alkaloids,Alkaloids,-0.13341


In [15]:
libdf_filtered["unique_sample_id"].value_counts()

pluskal_mce_1D1_A1_id     10
pluskal_mce_1D2_L12_id    10
pluskal_mce_1D2_K23_id    10
pluskal_mce_1D2_K24_id    10
pluskal_mce_1D2_L1_id     10
                          ..
pluskal_mce_1D3_E23_id     5
pluskal_mce_1D3_I9_id      5
pluskal_mce_1D3_E24_id     4
pluskal_mce_1D3_K17_id     4
pluskal_mce_1D3_L16_id     4
Name: unique_sample_id, Length: 1051, dtype: int64

## Get number of detected and missing in each well

In [16]:
comparison_df = pd.crosstab(df_diff_filtered.unique_sample_id, df_diff_filtered.exist).reset_index()

In [17]:
comparison_df["plate_id"] = [str(id).split("_")[2] for id in comparison_df["unique_sample_id"]]
comparison_df["well_location"] = [str(id).split("_")[3] for id in comparison_df["unique_sample_id"]]

In [18]:
comparison_df

exist,unique_sample_id,both,missing,negative,positive,plate_id,well_location
0,pluskal_mce_1D1_A10_id,5,0,1,4,1D1,A10
1,pluskal_mce_1D1_A11_id,4,0,3,3,1D1,A11
2,pluskal_mce_1D1_A12_id,4,1,1,4,1D1,A12
3,pluskal_mce_1D1_A13_id,5,1,1,2,1D1,A13
4,pluskal_mce_1D1_A14_id,6,1,1,2,1D1,A14
...,...,...,...,...,...,...,...
1046,pluskal_mce_1D3_L5_id,0,3,0,7,1D3,L5
1047,pluskal_mce_1D3_L6_id,0,0,0,9,1D3,L6
1048,pluskal_mce_1D3_L7_id,0,4,0,6,1D3,L7
1049,pluskal_mce_1D3_L8_id,0,3,0,7,1D3,L8


In [22]:
libdf.drop_duplicates

Unnamed: 0,Batch No.,Biological Activity,Catalog Number,Clinical Information,Formula,M.Wt,PathWay,Plate,Quantity,Research Area,...,topical,unichem_id,unichem_url,unii,usan_stem_definition,withdrawn,zinc_id,well_location,plate_id,unique_sample_id
0,188197.0,,HY-Q50247,,C17H23N3O6S,397.45,,HYCPK56697,10mM * 50uL,,...,,1201984.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000019821108,A3,5000,pluskal_nencka_mce_5000_A3_id
1,204216.0,,HY-Q36553,,C18H14N4O2,318.33,,HYCPK56697,10mM * 50uL,,...,,15458904.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000000325918,A3,5000,pluskal_nencka_mce_5000_A3_id
2,174830.0,,HY-Q43409,,C22H36N2O3,376.53,,HYCPK56697,10mM * 50uL,,...,,23545050.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000004868625,A3,5000,pluskal_nencka_mce_5000_A3_id
3,200418.0,,HY-Q33952,,C25H21FN4O3S,476.52,,HYCPK56697,10mM * 50uL,,...,,1943635.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000009368976,A3,5000,pluskal_nencka_mce_5000_A3_id
4,209719.0,,HY-Q02827,,C22H30N4O5S,462.56,,HYCPK56697,10mM * 50uL,,...,,67954673.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000021711391,A3,5000,pluskal_nencka_mce_5000_A3_id
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,17295.0,N-Desmethyl imatinib (Norimatinib) is a metabo...,HY-G0017,No Development Reported,C28H29N7O,479.58,Metabolic Enzyme/Protease,HYCPK56712,10mM * 50uL,Others,...,False,23185450.0,https://www.ebi.ac.uk/unichem/compoundsources?...,6GOH0N63QD,,False,ZINC000021981222,P7,5001,pluskal_nencka_mce_5001_P7_id
4996,8959.0,"Olanzapine (LY170053) is a selective, orally a...",HY-14541,Launched,C17H20N4S,312.43,Apoptosis; Autophagy; GPCR/G Protein; Neuronal...,HYCPK56712,10mM * 50uL,Neurological Disease; Cancer,...,False,27240.0,https://www.ebi.ac.uk/unichem/compoundsources?...,N7U69T4SZR,tricyclic compounds,False,ZINC000052957434,P7,5001,pluskal_nencka_mce_5001_P7_id
4997,209174.0,,HY-Q08716,,C24H32N6O,420.55,,HYCPK56712,10mM * 50uL,,...,,95037788.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,,P7,5001,pluskal_nencka_mce_5001_P7_id
4998,176497.0,,HY-Q07691,,C20H16ClNO3,353.80,,HYCPK56712,10mM * 50uL,,...,,6389763.0,https://www.ebi.ac.uk/unichem/compoundsources?...,,,,ZINC000057134982,P7,5001,pluskal_nencka_mce_5001_P7_id


In [29]:
libdf.drop_duplicates(["unique_sample_id", "inchikey"]).value()

AttributeError: 'DataFrame' object has no attribute 'value'

In [33]:
# pu.save_dataframe(comparison_df, outfile)

In [25]:
positive_df

Unnamed: 0,name,description,exactmass,formula,inchi,inchiaux,smiles,feature_id,mslevel,rtinseconds,...,msn_collision_energies,msn_precursor_mzs,msn_fragmentation_methods,msn_isolation_windows,other_matched_compounds,other_matched_compounds_names,inchikey,compound_name,monoisotopic_mass,unique_sample_id
0,ethynylcytidine,MCE bioactive compounds,267.085521,C11H13N3O5,InChI=1S/C11H13N3O5/c1-2-11(18)6(5-15)19-9(8(1...,JFIWEPHGRUDAJN-DYUFWOLASA-N,C#C[C@@]1(O)[C@@H](CO)O[C@@H](n2ccc(N)nc2=O)[C...,-1,2,69.34,...,,,,,,,JFIWEPHGRUDAJN-DYUFWOLASA-N,ethynylcytidine,267.085521,pluskal_mce_1D1_A13_id
1,ethynylcytidine,MCE bioactive compounds,267.085521,C11H13N3O5,InChI=1S/C11H13N3O5/c1-2-11(18)6(5-15)19-9(8(1...,JFIWEPHGRUDAJN-DYUFWOLASA-N,C#C[C@@]1(O)[C@@H](CO)O[C@@H](n2ccc(N)nc2=O)[C...,-1,2,69.34,...,,,,,,,JFIWEPHGRUDAJN-DYUFWOLASA-N,ethynylcytidine,267.085521,pluskal_mce_1D1_A13_id
2,ethynylcytidine,MCE bioactive compounds,267.085521,C11H13N3O5,InChI=1S/C11H13N3O5/c1-2-11(18)6(5-15)19-9(8(1...,JFIWEPHGRUDAJN-DYUFWOLASA-N,C#C[C@@]1(O)[C@@H](CO)O[C@@H](n2ccc(N)nc2=O)[C...,412,2,69.34,...,,,,,,,JFIWEPHGRUDAJN-DYUFWOLASA-N,ethynylcytidine,267.085521,pluskal_mce_1D1_A13_id
3,ethynylcytidine,MCE bioactive compounds,267.085521,C11H13N3O5,InChI=1S/C11H13N3O5/c1-2-11(18)6(5-15)19-9(8(1...,JFIWEPHGRUDAJN-DYUFWOLASA-N,C#C[C@@]1(O)[C@@H](CO)O[C@@H](n2ccc(N)nc2=O)[C...,411,2,69.34,...,,,,,,,JFIWEPHGRUDAJN-DYUFWOLASA-N,ethynylcytidine,267.085521,pluskal_mce_1D1_A13_id
4,ethynylcytidine,MCE bioactive compounds,267.085521,C11H13N3O5,InChI=1S/C11H13N3O5/c1-2-11(18)6(5-15)19-9(8(1...,JFIWEPHGRUDAJN-DYUFWOLASA-N,C#C[C@@]1(O)[C@@H](CO)O[C@@H](n2ccc(N)nc2=O)[C...,410,2,69.34,...,,,,,,,JFIWEPHGRUDAJN-DYUFWOLASA-N,ethynylcytidine,267.085521,pluskal_mce_1D1_A13_id
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373055,Oglemilast,MCE bioactive compounds,514.992103,C20H13Cl2F2N3O5S,"InChI=1S/C20H13Cl2F2N3O5S/c1-33(29,30)27-9-2-4...",OKFDRAHPFKMAJH-UHFFFAOYSA-N,CS(=O)(=O)Nc1cc2c(cc1)oc1c(OC(F)F)ccc(C(=O)Nc3...,-1,4,112.10,...,"[30.0, 40.0, 60.0]","[515.998291015625, 163.966125488281, 146.00027...","[HCD, HCD, HCD]","[1.2000000476839432, 2.0, 2.200000047684]",,,OKFDRAHPFKMAJH-UHFFFAOYSA-N,Oglemilast,514.992103,pluskal_mce_1D3_L8_id
373056,Oglemilast,MCE bioactive compounds,514.992103,C20H13Cl2F2N3O5S,"InChI=1S/C20H13Cl2F2N3O5S/c1-33(29,30)27-9-2-4...",OKFDRAHPFKMAJH-UHFFFAOYSA-N,CS(=O)(=O)Nc1cc2c(cc1)oc1c(OC(F)F)ccc(C(=O)Nc3...,-1,4,112.10,...,"[30.0, 40.0, 60.0]","[515.998291015625, 163.966125488281, 146.00027...","[HCD, HCD, HCD]","[1.2000000476839432, 2.0, 2.200000047684]",,,OKFDRAHPFKMAJH-UHFFFAOYSA-N,Oglemilast,514.992103,pluskal_mce_1D3_L8_id
373057,Oglemilast,MCE bioactive compounds,514.992103,C20H13Cl2F2N3O5S,"InChI=1S/C20H13Cl2F2N3O5S/c1-33(29,30)27-9-2-4...",OKFDRAHPFKMAJH-UHFFFAOYSA-N,CS(=O)(=O)Nc1cc2c(cc1)oc1c(OC(F)F)ccc(C(=O)Nc3...,-1,4,112.10,...,"[30.0, 40.0, 40.0]","[515.998291015625, 163.966125488281, 146.00027...","[HCD, HCD, HCD]","[1.2000000476839432, 2.0, 2.200000047684]",,,OKFDRAHPFKMAJH-UHFFFAOYSA-N,Oglemilast,514.992103,pluskal_mce_1D3_L8_id
373058,Oglemilast,MCE bioactive compounds,514.992103,C20H13Cl2F2N3O5S,"InChI=1S/C20H13Cl2F2N3O5S/c1-33(29,30)27-9-2-4...",OKFDRAHPFKMAJH-UHFFFAOYSA-N,CS(=O)(=O)Nc1cc2c(cc1)oc1c(OC(F)F)ccc(C(=O)Nc3...,509,4,112.10,...,"[30.0, 40.0, 60.0]","[515.99951171875, 163.966461181641, 146.000228...","[HCD, HCD, HCD]","[1.2000000476839432, 2.0, 2.200000047684]",,,OKFDRAHPFKMAJH-UHFFFAOYSA-N,Oglemilast,514.992103,pluskal_mce_1D3_L8_id


In [56]:
negative_df["num peaks"].astype("int").mean()

12.083721866819499

In [59]:
def extract_stats(libdf, ionmode_df):
  results = {
    "total_compounds": len(libdf.drop_duplicates(["unique_sample_id", "inchikey"])),
    "unique_structures": len(libdf.drop_duplicates(["inchikey"])),
    "annotated_compounds": len(ionmode_df.drop_duplicates(["unique_sample_id", "inchikey"])),
    "annotated_compounds_%":  len(ionmode_df.drop_duplicates(["unique_sample_id", "inchikey"]))/len(libdf.drop_duplicates(["unique_sample_id", "inchikey"]))*100,
    "unique_annotated_compounds": len(ionmode_df.drop_duplicates(["inchikey"])), 
    "samples": libdf["unique_sample_id"].nunique(),
    "MS2": len(ionmode_df[ionmode_df["mslevel"] == "2"]),
    "MSn": len(ionmode_df),
    "MS2/annotated comp": len(ionmode_df[ionmode_df["mslevel"] == "2"])/len(ionmode_df.drop_duplicates(["unique_sample_id", "inchikey"])),
    "MSn/annotated comp": len(ionmode_df)/len(ionmode_df.drop_duplicates(["unique_sample_id", "inchikey"])),
    "purity_%": ionmode_df["precursor_purity"].astype("float").mean()*100,
    "chimeric_%": len(ionmode_df[ionmode_df["quality_chimeric"] != "PASSED"])/len(ionmode_df)*100,
    # "explained by formula":,
    # "explained by substructures":,
    "average_num_signals": ionmode_df["num peaks"].astype("int").mean()
  }
  return results

pos = extract_stats(libdf, positive_df)
neg = extract_stats(libdf, negative_df)


for key, value in pos.items():
  neg_value = neg.get(key)
  print(f"{value}\t{neg_value}")

print("\n")
for key, v in pos.items():
  neg_value = neg.get(key)
  print("{}\t{}\t{}".format(key, v, neg_value))

10316	10316
9682	9682
7074	4793
68.573090345095	46.46180690189996
6642	4542
1051	1051
48069	36759
373060	129118
6.795165394402035	7.669309409555602
52.73678258411083	26.938869184227
98.04833951061578	97.79915437251626
1.8026590896906662	1.6124785080314132
20.94510534498472	12.083721866819499


total_compounds	10316	10316
unique_structures	9682	9682
annotated_compounds	7074	4793
annotated_compounds_%	68.573090345095	46.46180690189996
unique_annotated_compounds	6642	4542
samples	1051	1051
MS2	48069	36759
MSn	373060	129118
MS2/annotated comp	6.795165394402035	7.669309409555602
MSn/annotated comp	52.73678258411083	26.938869184227
purity_%	98.04833951061578	97.79915437251626
chimeric_%	1.8026590896906662	1.6124785080314132
average_num_signals	20.94510534498472	12.083721866819499
