In [None]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_utils as pu
import seaborn as sns


In [None]:
def save_fig(file_name, out_dir=r"C:\git\msn_library\figures/all"):
  """Save the current matplotlib figure as PNG, PDF and SVG.

  Args:
    file_name: Base name of the output files, without extension.
    out_dir: Directory that receives the three files. Defaults to the
      directory this notebook has always written to. It is created
      (including parents) when it does not exist yet.
  """
  # Font type 42 stores text in the PDF as TrueType fonts.
  plt.rcParams["pdf.fonttype"] = 42
  plt.tight_layout()

  target_dir = Path(out_dir)
  # savefig does not create missing directories, so make sure the target exists.
  target_dir.mkdir(parents=True, exist_ok=True)

  # Only the raster output gets an explicit resolution.
  per_format_kwargs = (
    ("png", {"dpi": 300}),
    ("pdf", {}),
    ("svg", {}),
  )
  for extension, extra_kwargs in per_format_kwargs:
    plt.savefig(target_dir / "{}.{}".format(file_name, extension),
                bbox_inches="tight", transparent=True, **extra_kwargs)

# Configure the seaborn theme (style "ticks", font scale 1) for the plotting cells.
sns.set_theme(font_scale=1, style="ticks")


In [None]:
# Library id -> path of the cleaned TSV table that is loaded for that library.
# The key order of this dict is also passed as hue_order by the histogram
# helpers (count_hist / count_hist_percentile), so reordering the entries
# changes the hue order used in those plots.
libraries = {
  "04_alpha_helix_peptidomimetic_library": r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv",
  "03_nih_natural_product": r"C:\git\msn_library\data\nih\nih_library_new_headers_cleaned_plate7_removed.tsv",
  "02_mce_scaffold_library": r"C:\git\msn_library\data\iocb_libraries\Radim_mce_complete_cleaned.tsv",
  "01_mce_bioactive_library": r"C:\git\msn_library\data\library\mce_library_all_cleaned.tsv",
}

In [None]:
# Load every library table, keep rows with a monoisotopic mass above 114,
# drop repeated (inchikey, unique_sample_id) pairs and tag each row with the
# id of the library it came from.
dfs = []
for lib_id, tsv_path in libraries.items():
  df = (
    pu.read_dataframe(tsv_path)
    .copy()
    .loc[lambda frame: frame["monoisotopic_mass"] > 114]
    .drop_duplicates(["inchikey", "unique_sample_id"])
    # assign() returns a new frame, so no column is set on a filtered slice.
    .assign(library=lib_id)
  )
  dfs.append(df)

# One table for all libraries with a fresh 0..n-1 index.
merged_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Display the concatenated table of all libraries.
merged_df

In [None]:
# Group the merged rows by library id; iterated below to build one statistics row per library.
sub = merged_df.groupby("library")

In [None]:
def extract_row(libid: str, df: pd.DataFrame) -> dict:
  """Build one summary row of counts for a library.

  ``total_compounds`` is the number of rows in ``df``. Every other count is
  taken after reducing ``df`` to the first row per ``inchikey``.

  Args:
    libid: Value stored under the ``"library"`` key of the returned row.
    df: Table holding the annotation columns read below.

  Returns:
    Dict mapping statistic name to count, in a fixed key order.
  """
  unique_df = df.drop_duplicates(["inchikey"])

  def n_rows(mask) -> int:
    # Number of unique structures selected by a boolean mask.
    return int(mask.sum())

  row = {
    "library": libid,
    "total_compounds": len(df),
    "unique_structures": len(unique_df),
    "NPs flagged in ChEMBL": n_rows(unique_df["natural_product"] == 1),
    "Present in LOTUS": n_rows(unique_df["lotus_ncbi_id"].notnull()),
    "Present in Dictionary of NPs": n_rows(unique_df["dnp"].notnull()),
    "any_clinical_phase": n_rows(unique_df["any_phase"] == True),
    "clinical phase 4": n_rows(unique_df["clinical_phase"] == 4),
  }

  # One count per molecular species label, keyed by its lower-case name.
  species = unique_df["molecular_species"]
  species_keys = (
    ("neutral", "NEUTRAL"),
    ("acid", "ACID"),
    ("base", "BASE"),
    ("zwitterion", "ZWITTERION"),
  )
  for key, label in species_keys:
    row[key] = n_rows(species == label)

  row["glycoside"] = n_rows(unique_df["fg_n_glycoside"] > 0)
  row["np_glycoside"] = n_rows(unique_df["npclassifier_isglycoside"] == True)
  return row



In [None]:
# One statistics row per library group ...
lib_rows = [extract_row(libid, groupdf) for libid, groupdf in sub]

# ... followed by a "Summary" row computed over the whole merged table.
lib_rows.append(extract_row("Summary", merged_df))

statistic_df = pd.DataFrame(lib_rows)

In [None]:
statistic_df

In [None]:
# One row per (inchikey, unique_sample_id) pair across all libraries.
merged_df_unique = merged_df.drop_duplicates(
  subset=["inchikey", "unique_sample_id"],
)
# One row per inchikey across all libraries.
merged_df_unique_structure = merged_df.drop_duplicates(subset=["inchikey"])

In [None]:
# Display the table reduced to one row per inchikey.
merged_df_unique_structure

In [None]:
# Rows with fg_n_glycoside > 0, restricted to the glycoside / flavan / flavone
# columns and the npclassifier_isglycoside column.
merged_df_unique_structure.loc[
  merged_df_unique_structure["fg_n_glycoside"] > 0,
  ["smiles", "fg_n_glycoside", "fg_n_flavan", "fg_n_flavone", "npclassifier_isglycoside"],
]

In [None]:
# Identifier columns next to a selection of functional-group count columns.
merged_df_unique_structure[
  [
    "smiles",
    "compound_name",
    "fg_n_sulfuric_acid_and_ester",
    "fg_n_sulfate",
    "fg_n_sulfuric_acid_diester",
    "fg_n_amidinium",
    "fg_n_quart_amine",
    "fg_n_general_amide",
    "fg_n_carbamate",
    "unique_sample_id",
    "monoisotopic_mass",
  ]
]

In [None]:
file_name = "monoisotopic_mass_distribution"

# Stacked histogram of monoisotopic mass with one hue per library,
# x-axis limited to 0-1000.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
  data=merged_df_unique_structure,
  x="monoisotopic_mass",
  hue="library",
  multiple="stack",
  kde=False,
  ax=ax,
)
ax.set_xlim(0, 1000)
save_fig(file_name)

In [None]:
# Show the unique structures with a monoisotopic mass above 500.
merged_df_unique_structure[merged_df_unique_structure["monoisotopic_mass"] > 500]

In [None]:
# Unique structures with more than 8 fluorine atoms, with their halogen counts and library.
merged_df_unique_structure.loc[
  merged_df_unique_structure["at_n_F"] > 8,
  ["smiles", "at_n_F", "at_n_Cl", "at_n_Br", "library"],
]


In [None]:
from matplotlib.ticker import MaxNLocator

def count_hist(col, ax):
  """Draw a stacked per-library histogram of the positive values of ``col``.

  Reads the notebook-level ``moddf`` table and ``libraries`` dict when called.

  Args:
    col: Name of the column in ``moddf`` to plot.
    ax: Matplotlib axes to draw on.
  """
  positive_rows = moddf[moddf[col] > 0]
  sns.histplot(
    data=positive_rows,
    x=col,
    hue="library",
    hue_order=libraries.keys(),
    multiple="stack",
    binwidth=1,
    legend=False,
    ax=ax,
  )

# def count_hist_percentile(col, ax):
#   tempdf = moddf[moddf[col] > 0]
#   # lower_percentile = np.percentile(tempdf[col], 5)
#   upper_percentile = np.percentile(tempdf[col], 95) +2.5
#   sns.histplot(data=tempdf, x=col, hue="library", hue_order=libraries.keys(), multiple="stack", ax=ax, binwidth=1, binrange=(0.5, upper_percentile), legend=False).set_xlim(0.5, upper_percentile)
  
def count_hist_percentile(col, ax):
  """Plot a stacked per-library histogram of the positive counts in ``col``.

  Rows of the notebook-level ``moddf`` with ``col > 0`` are drawn with one bar
  per integer value. The x-axis runs from 0.5 to the 95th percentile of those
  values plus 2.5, and its major ticks are restricted to integers.

  When no row has a positive value the panel is only labelled and left empty.

  Args:
    col: Name of a count column in ``moddf``.
    ax: Matplotlib axes to draw on.
  """
  tempdf = moddf[moddf[col] > 0]
  if tempdf.empty:
    # Nothing to plot: a percentile of an empty selection gives no usable
    # axis limit, so skip drawing instead of failing on this panel.
    ax.set_xlabel(col)
    return

  upper_percentile = np.percentile(tempdf[col], 95) + 2.5
  sns.histplot(
    data=tempdf,
    x=col,
    hue="library",
    hue_order=list(libraries),
    multiple="stack",
    ax=ax,
    binwidth=1,
    legend=False,
    discrete=True,
  )
  ax.set_xlim(0.5, upper_percentile)
  ax.xaxis.set_major_locator(MaxNLocator(integer=True))

In [None]:
# Independent copy of the structure-deduplicated table; the histogram helpers
# and the plotting cells read their data from this frame.
moddf = merged_df_unique_structure.copy()

In [None]:
file_name = "statistics_metadata_no_legend"

# Grid of 15 x 4 axes (60 panels). Fewer histograms than that are drawn; the
# panels left over are removed from the figure before it is saved.
num_rows = 15
num_cols = 4
num_histograms = num_rows * num_cols

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 20))

# Flat view of the grid so panels can be addressed with one running index.
axes_flat = axes.flatten()

# Fixed hue order, matching count_hist_percentile, so every panel assigns the
# same colour to a library even when that library has no rows in a subset.
hue_order = list(libraries)

# Three individually configured panels: mass, logp and clinical phase.
i = 0
sns.histplot(data=moddf[moddf["monoisotopic_mass"] <= 1000], x="monoisotopic_mass", hue="library", hue_order=hue_order, multiple="stack", binwidth=20, ax=axes_flat[i], legend=False).set_xlim(100, 1000)
i += 1
sns.histplot(data=moddf, x="logp", hue="library", hue_order=hue_order, multiple="stack", binwidth=0.2, ax=axes_flat[i], legend=False).set_xlim(-5, 8.5)
i += 1
sns.histplot(data=moddf[moddf["clinical_phase"] > 0], x="clinical_phase", hue="library", hue_order=hue_order, multiple="stack", binwidth=1, ax=axes_flat[i], legend=False)
i += 1

# Functional-group count columns (prefixed with "fg_n_" in the table).
columns = [
  "glycoside",
  "flavan",
  "flavone",
  "hydroxy",
  "hydroxy_aliphatic",
  "carboxylic_acid",
  "ester",
  "lactone",
  "prim_amine",
  "second_amine",
  "tert_amine",
  "quart_amine",
  "amide",
  "carbamate",
  "enamine",
  "aromatic_amine",
  "amino_acid",
  "guanidine",
  "nitro",
  "hydrazine",
  "hydrazone",
  "azo_nitrogen",
  "sulfuric_acid_and_ester",
  "sulfone",
  # "sulfoxide",
  "phosphoric_acid",
  "steroid"
]
for col in columns:
  col = f"fg_n_{col}"
  count_hist_percentile(col, axes_flat[i])
  i += 1

# Atom count columns (prefixed with "at_n_" in the table).
columns = [
  "C",
  "H",
  "N",
  "O",
  "P",
  "S",
  "F",
  "Cl",
  "Br",
  "I",
]
for col in columns:
  col = f"at_n_{col}"
  count_hist_percentile(col, axes_flat[i])
  i += 1

# Remove the panels of the grid that received no histogram.
len_data_cols = i
for j in range(len_data_cols, num_histograms):
  fig.delaxes(axes_flat[j])
save_fig(file_name)


# Adjust layout and display the plot
# plt.tight_layout()
# plt.show()



In [None]:
file_name = "extracted_statistics"

# Grid of 4 x 3 axes (12 panels); panels that receive no histogram are removed
# from the figure before it is saved.
num_rows = 4
num_cols = 3
num_histograms = num_rows * num_cols

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 20))

# Flat view of the grid so panels can be addressed with one running index.
axes_flat = axes.flatten()

# Fixed hue order, matching count_hist_percentile, so every panel assigns the
# same colour to a library even when that library has no rows in a subset.
hue_order = list(libraries)

i = 0
sns.histplot(data=moddf[moddf["monoisotopic_mass"] <= 1000], x="monoisotopic_mass", hue="library", hue_order=hue_order, multiple="stack", binwidth=20, ax=axes_flat[i], legend=False, kde=False).set_xlim(100, 1000)
i += 1
sns.histplot(data=moddf, x="logp", hue="library", hue_order=hue_order, multiple="stack", binwidth=0.2, ax=axes_flat[i], legend=False).set_xlim(-5, 8.5)
i += 1

# Count panels, one per column, in display order.
for col in (
  "fg_n_glycoside",
  "fg_n_hydroxy_aliphatic",
  "fg_n_amino_acid",
  "at_n_O",
  "at_n_N",
  "at_n_F",
  "at_n_Cl",
):
  count_hist_percentile(col, axes_flat[i])
  i += 1

# Remove the panels of the grid that received no histogram.
len_data_cols = i
for j in range(len_data_cols, num_histograms):
  fig.delaxes(axes_flat[j])
save_fig(file_name)

In [None]:
file_name = "extracted_statistics_cumulative"

# Grid of 4 x 3 axes (12 panels); panels that receive no histogram are removed
# from the figure before it is saved.
num_rows = 4
num_cols = 3
num_histograms = num_rows * num_cols

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 10))

# Flat view of the grid so panels can be addressed with one running index.
axes_flat = axes.flatten()

# Fixed hue order, matching count_hist_percentile, so every panel assigns the
# same colour to a library even when that library has no rows in a subset.
hue_order = list(libraries)

i = 0
# Mass panel: stacked counts on the panel's own axis, per-library cumulative
# counts drawn as unfilled polygons on a twin y-axis.
sns.histplot(data=moddf[moddf["monoisotopic_mass"] <= 1000], x="monoisotopic_mass", hue="library", hue_order=hue_order, multiple="stack", binwidth=20, ax=axes_flat[i], legend=False, kde=False).set_xlim(100, 1000)
ax2 = axes_flat[i].twinx()
sns.histplot(data=moddf[moddf["monoisotopic_mass"] <= 1000], x="monoisotopic_mass", hue="library", hue_order=hue_order, element="poly", fill=False, binwidth=20, ax=ax2, cumulative=True, legend=False).set_xlim(100, 1000)

i += 1
# logp panel: same layout as the mass panel.
sns.histplot(data=moddf, x="logp", hue="library", hue_order=hue_order, multiple="stack", binwidth=0.2, ax=axes_flat[i], legend=False).set_xlim(-5, 8.5)
ax2 = axes_flat[i].twinx()
sns.histplot(data=moddf, x="logp", hue="library", hue_order=hue_order, element="poly", fill=False, binwidth=0.2, ax=ax2, cumulative=True, legend=False).set_xlim(-5, 8.5)

i += 1
# Count panels, one per column, in display order.
for col in (
  "fg_n_glycoside",
  "fg_n_hydroxy_aliphatic",
  "fg_n_amino_acid",
  "at_n_O",
  "at_n_N",
  "at_n_F",
  "at_n_Cl",
):
  count_hist_percentile(col, axes_flat[i])
  i += 1

# Remove the panels of the grid that received no histogram.
len_data_cols = i
for j in range(len_data_cols, num_histograms):
  fig.delaxes(axes_flat[j])

save_fig(file_name)