In [None]:
import pandas as pd
import pandas_utils as pu
from pandas_utils import notnull
import seaborn as sns
import matplotlib.pyplot as plt

from meta_constants import MetaColumns


In [None]:
def save_fig(file_name):
    """Save the current matplotlib figure as PNG, PDF and SVG.

    The three files are written to the figures sub-folder named after the
    notebook-level ``library`` variable, which must be defined before the
    first call. The sub-folder is created when it does not exist yet.

    Args:
        file_name: Base name of the output files, without extension.
    """
    import os  # local import: only needed here to create the output folder

    # Font type 42 stores text in the PDF as TrueType fonts (see matplotlib rcParams docs).
    plt.rcParams["pdf.fonttype"] = 42

    out_dir = r"C:\git\msn_library\figures/{}".format(library)
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    # Only the PNG gets an explicit resolution; PDF and SVG use the defaults.
    for extension, extra_kwargs in (("png", {"dpi": 300}), ("pdf", {}), ("svg", {})):
        plt.savefig(
            "{}/{}.{}".format(out_dir, file_name, extension),
            bbox_inches="tight",
            transparent=True,
            **extra_kwargs,
        )
# Apply the seaborn "white" style with a font scale of 1 to the figures created in this notebook.
sns.set_theme(font_scale=1, style="white")

In [None]:
# Name of the library being analysed; save_fig uses it as the figures sub-folder.
library = "iocb_peptide"
# Cleaned library table that is read into `df` below.
# NOTE(review): absolute, machine-specific path — adjust it before running on another machine.
file = r"C:\git\msn_library\data\iocb_libraries\iocb_peptide_library_cleaned.tsv"

In [None]:
# Read the cleaned library table from the configured path.
df = pu.read_dataframe(file)

# Keep only rows whose monoisotopic mass is above 114.
df = df.loc[df["monoisotopic_mass"] > 114]

# NOTE(review): this sorts by a column literally named "none" — presumably a
# per-row count of missing values so that the most complete row comes first;
# confirm against the file that produces the table.
df = df.sort_values(by="none", ascending=True)

# After sorting, keep the first row of every (sample, structure) pair.
df = df.drop_duplicates(subset=["unique_sample_id", "inchikey"])
df

In [None]:
# Show the natural_product column on its own (kept as a one-column DataFrame).
df.loc[:, ["natural_product"]]

In [None]:
# Look up a single structure by InChIKey and show its names and sample ids.
df.loc[
    df["inchikey"] == "OOGJQPCLVADCPB-HXUWFJFHSA-N",
    ["compound_name", "input_name", "unique_sample_id"],
]

In [None]:
# One row per distinct InChIKey (the first occurrence in df is kept).
unique_df = df.drop_duplicates(subset=["inchikey"])

In [None]:
def _count_unique(mask):
    """Return how many rows of ``unique_df`` are selected by a boolean mask."""
    return len(unique_df[mask])


# Summary counts: the first entry is over all rows of df, the rest over unique_df.
statistics = {
    "total_compounds": len(df),
    "unique_structures": len(unique_df),
    "any_clinical_phase": _count_unique(unique_df["any_phase"] == True),
    "clinic": _count_unique(unique_df["clinical_phase"] == 4),
    "natural_product": _count_unique(unique_df["natural_product"] == 1),
    "no_natural_product": _count_unique(unique_df["natural_product"] == 0),
    # Rows whose natural_product flag is neither 0 nor 1.
    "unknown": _count_unique(
        (unique_df["natural_product"] != 0) & (unique_df["natural_product"] != 1)
    ),
    "lotus": _count_unique(unique_df["lotus_ncbi_id"].notnull()),
    # One entry per molecular species label, keyed by the lower-cased label.
    **{
        species.lower(): _count_unique(unique_df["molecular_species"] == species)
        for species in ("NEUTRAL", "ACID", "BASE", "ZWITTERION")
    },
}

# First the bare values, one per line.
for value in statistics.values():
    print(f"{value}")

print("\n")
# Then the same numbers as tab-separated key/value pairs.
for key, v in statistics.items():
    print("{}\t{}".format(key, v))

In [None]:
# From here on df itself holds a single row per InChIKey (first occurrence kept).
df = df.drop_duplicates(subset="inchikey")

In [None]:
# df[["compound_name", "input_name", "inchikey", "pubchem_cid", "Clinical Information", "any_phase", "clinical_phase", "chembl_clinical_phase", "broad_clinical_phase", "drugbank_clinical_phase", "drugcentral_clinical_phase"]]

In [None]:
def create_counts_bars(df, column, new_name=None, width = 8, height_per_item = 0.2):
    """Count the values of one column and save the counts as a bar chart.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column whose values are counted.
        new_name: Label used for the category axis and for the saved file
            name; falls back to ``column`` when empty or None.
        width: Figure width passed on to ``creat_bars``.
        height_per_item: Figure height added per counted category, passed on
            to ``creat_bars``.
    """
    if not new_name:
        new_name = column

    counts_df = count_values(df, column, new_name)
    creat_bars(counts_df, new_name, width, height_per_item)


def creat_bars(df, column, width = 8, height_per_item = 0.2):
    """Draw a bar chart of a counts table and save it via ``save_fig``.

    Args:
        df: DataFrame with a "counts" column and a category column.
        column: Name of the category column; also used as the file name
            handed to ``save_fig``.
        width: Figure width.
        height_per_item: Height added to the figure for every row of ``df``.
    """
    # The figure grows with the number of rows on top of a fixed 0.4 base height.
    height = 0.4 + height_per_item * len(df)
    plt.figure(figsize=(width, height))
    ax = sns.barplot(x="counts", y=column, data=df, palette="mako")
    # Label every bar container instead of only the first one: newer seaborn
    # releases may create one container per colour level, and indexing [0]
    # raises IndexError when no container was drawn at all.
    for container in ax.containers:
        ax.bar_label(container)
    sns.despine()
    save_fig(column)


def count_values(df, column, new_name=None):
    """Build a two-column table with the value counts of ``column``.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column whose values are counted.
        new_name: Name given to the category column of the result; falls
            back to ``column`` when empty or None.

    Returns:
        DataFrame with the categories in a column named ``new_name`` and
        their frequencies in a column named "counts", in the order produced
        by ``Series.value_counts``.
    """
    label = new_name if new_name else column
    frequencies = df[column].value_counts()
    frequencies = frequencies.rename_axis(label)
    return frequencies.reset_index(name="counts")


In [None]:
# NPClassifier charts: one count chart per annotation column
cols = [
    "npclassifier_class_results",
    "npclassifier_superclass_results",
    "npclassifier_pathway_results",
]

for col in cols:
    create_counts_bars(df, col)

In [None]:
# ClassyFire charts: one count chart per annotation column
cols = [
    "classyfire_class",
    "classyfire_superclass",
    "classyfire_subclass",
    "classyfire_molecular_framework",
]

for col in cols:
    create_counts_bars(df, col)

In [None]:
# # NPAtlas charts
# cols = ['npatlas_original_name', 'npatlas_original_organism']
# 
# for col in cols:
#     create_counts_bars(df, col)

In [None]:
def plot_clinical_phase(counts_df, file_name, ylim=None):
    """Draw the clinical-phase count chart and save it via ``save_fig``.

    Args:
        counts_df: DataFrame with a "clinic" column and a "counts" column.
        file_name: Base file name handed to ``save_fig``.
        ylim: Optional ``(lower, upper)`` limits for the y axis; the axis is
            left at its automatic range when None.

    Returns:
        The matplotlib Axes the bars were drawn on.
    """
    plt.figure(figsize=(8, 4))
    ax = sns.barplot(x="clinic", y="counts", data=counts_df, palette="mako")
    # Label every bar container, not only the first one, so no bar is left
    # without a label when seaborn splits the bars over several containers.
    for container in ax.containers:
        ax.bar_label(container)
    sns.despine()
    if ylim is not None:
        ax.set_ylim(*ylim)
    save_fig(file_name)
    return ax


# Number of rows per clinical_phase value, as a "clinic" / "counts" table.
clinicalphase = count_values(df, "clinical_phase", "clinic")

# Full-range chart.
ax = plot_clinical_phase(clinicalphase, "clinical_phase")

# Same chart with the y axis cut at 60 so that the small bars stay readable.
ax = plot_clinical_phase(clinicalphase, "clinical_phase_zoom", ylim=(0, 60))

In [None]:
# Headline counts over df.
# NOTE(review): the "unique_inchikeys" label assumes df was already reduced to
# one row per InChIKey by an earlier cell — confirm the cells ran in order.
results = {
    "unique_inchikeys": len(df),
    "any_phase": len(df.loc[df["any_phase"] == True]),
    "drugs": len(df.loc[df["clinical_phase"] == 4]),
    "lotus": len(df.loc[df["lotus_ncbi_id"].notnull()]),
}

# Reshape the dict into a two-column table: "class" (the key) and "counts".
general = pd.Series(results, name="counts").rename_axis("class").reset_index()
general
    


In [None]:
def _top_classes(column, limit=10):
    """Return the first ``limit`` rows of the count table of ``column`` in df."""
    return count_values(df, column, "class").head(limit)


# Every table uses "class" as its category column so they can be stacked later.
# drugcentral_pharma_class = count_values(df, "drugcentral_stem", "class").head(10)
natural_product_df = _top_classes("natural_product")
pathway_df = _top_classes(MetaColumns.npclassifier_pathway_results)
cl_superclass_df = _top_classes(MetaColumns.classyfire_superclass)
molecular_species_df = _top_classes("molecular_species")

In [None]:
# Display the NPClassifier pathway count table for a visual check.
pathway_df

In [None]:
# Stack the summary tables row-wise; each one has a "class" and a "counts" column.
# The original row labels are kept, so index values repeat across the parts.
merged = pd.concat(
    [
        general,
        natural_product_df,
        molecular_species_df,
        pathway_df,
        cl_superclass_df,
    ],
    axis=0,
)

In [None]:
# Display the stacked summary table for a visual check.
merged

In [None]:
# Plot the stacked summary table; creat_bars saves the figure under the name "class".
creat_bars(
    merged,
    "class",
    width=8,
    height_per_item=0.2,
)

In [None]:
# Show the identifying and classification columns side by side.
df.loc[
    :,
    ["compound_name", "chembl_id", "natural_product", "clinical_phase", "molecular_species"],
]