In [None]:
import pandas as pd
from metadata_cleanup import add_molid_columns

In [None]:
# Common prefix of the exported tables; each name below appends its own suffix.
file = "mce_library_all_"
final = f"{file}final.tsv"
approved_file = f"{file}approved.tsv"
phase_file = f"{file}phase.tsv"

# Identifier column sets; the "exact" variant additionally includes exact_mass.
id_columns = ["Product Name", "lib_plate_well", "inchi_key"]
id_columns_exact = [*id_columns, "exact_mass"]

In [None]:
# Read the cleaned main library and the additional-compounds table (both TSV).
main_df, add_df = (
    pd.read_csv(path, sep="\t")
    for path in ("data/mce_library_cleaned.tsv", "data/mce_library_add_compounds_cleaned.tsv")
)

# Stack both tables under a fresh 0..n-1 index, add the molecule id columns and
# drop the "mol" column (presumably created by add_molid_columns -- confirm).
combined = pd.concat([main_df, add_df], ignore_index=True)
df = add_molid_columns(combined).drop(columns=["mol"])

# lib_plate_well with a trailing "." appended.
df["lib_plate_well_unique"] = df["lib_plate_well"] + "."
df

In [None]:
## Getting a compound name for each product
# Candidate name columns in priority order: the first one holding a value wins.
preferred_name = [
    "compound_name",
    "drugbank_name",
    "broad_pert_iname",
    "drugcentral_name",
    "Product Name",
]


def find_name(row):
    """Return the first non-null value among the ``preferred_name`` columns.

    Args:
        row: Mapping-like object with a ``get`` method (e.g. a DataFrame row
            or a dict). Columns that are absent are treated as missing.

    Returns:
        The value of the highest-priority column that is neither ``None`` nor
        NaN, or ``None`` when no such column exists.
    """
    candidates = (row.get(column) for column in preferred_name)
    # pd.notnull is False for None as well as NaN, so one check covers both.
    return next((name for name in candidates if pd.notnull(name)), None)

# Row by row, overwrite compound_name with the value chosen by find_name.
df["compound_name"] = df.apply(find_name, axis=1)
df

## Comparing clinical_phase and clinical_phase_description with the DrugBank approval status

In [None]:
def map_drugbank_approval(status):
    """Map a DrugBank status value to a numeric phase.

    Args:
        status: Any value; it is converted with ``str`` before comparison.

    Returns:
        ``4`` when the string form is exactly ``"approved"`` or
        ``"withdrawn"``, otherwise ``None``.
    """
    return 4 if str(status) in ("approved", "withdrawn") else None


# Put the external approval annotations on the same numeric scale as
# clinical_phase: 4 for an approved/withdrawn DrugBank status and 4 for any
# non-null drugcentral_administration value.
if "drugbank_approved" in df.columns:
    df["drugbank_approved_number"] = [map_drugbank_approval(status) for status in df["drugbank_approved"]]
    has_drugbank_status = df["drugbank_approved"].notna()
else:
    df["drugbank_approved_number"] = None
    # The column is optional (see the check above), so it must not be read
    # unconditionally further down; without it no row has a DrugBank status.
    has_drugbank_status = pd.Series(False, index=df.index)

if "drugcentral_administration" in df.columns:
    df["drugcentral_administration_number"] = [4 if pd.notnull(status) else None for status in df["drugcentral_administration"]]
else:
    df["drugcentral_administration_number"] = None

# A helper column that holds only None is stored with object dtype. Convert
# both helpers to numeric (None -> NaN) so the row-wise max below operates on
# numbers only instead of depending on how object columns are reduced.
for helper_column in ("drugbank_approved_number", "drugcentral_administration_number"):
    df[helper_column] = pd.to_numeric(df[helper_column])

# Highest phase reported by any of the three sources (NaN values are skipped).
df["clinical_phase"] = df[['clinical_phase', 'drugbank_approved_number', 'drugcentral_administration_number']].max(
        axis=1)

# df[["Product Name", "compound_name", "CAS No.", 'Source', "exact_mass", 'formula', "lib_plate_well", "clinical_phase", "clinical_phase_description", "clinical_phase2", "drugbank_approved", "drugbank_approved_number", "drugcentral_date_of_approval", "drugcentral_administration_number"]]
# True when a DrugBank status of any kind is recorded or the merged phase is > 0.
df["any_phase"] = has_drugbank_status | (df["clinical_phase"] > 0)
# Number of missing cells per row; used to prefer the most complete duplicate.
df["none"] = df.isnull().sum(axis=1)
# Sorting by "none" first makes drop_duplicates (keep first) retain the row with
# the fewest missing values; the result is then ordered by descending phase.
df = df.sort_values(by="none", ascending=True).drop_duplicates(["Product Name", "lib_plate_well", "exact_mass"]).sort_values(by="clinical_phase", ascending=False)
df

In [None]:
# Show the rows whose merged clinical_phase equals 4.
is_approved = df["clinical_phase"] == 4
df.loc[is_approved]

In [None]:
# Split off two subsets by merged clinical phase: exactly 4, and anything above 0.
clinical_phase = df["clinical_phase"]
approved_df = df.loc[clinical_phase == 4]
phase_df = df.loc[clinical_phase > 0]

In [None]:
# Display the subset with clinical_phase == 4.
approved_df

In [None]:
# Display the subset with clinical_phase > 0. Note: phase_df is reassigned in a
# later cell (selection on any_phase) before it is written to disk.
phase_df

In [None]:
# Write the clinical_phase == 4 subset as a tab-separated file without the index.
approved_path = f"data/final_tables/{approved_file}"
approved_df.to_csv(approved_path, sep="\t", index=False)

In [None]:
# Write the full de-duplicated table as a tab-separated file without the index.
final_path = f"data/final_tables/{final}"
df.to_csv(final_path, sep="\t", index=False)

In [None]:
# Redefine phase_df as every row flagged by the boolean any_phase column.
has_any_phase = df["any_phase"]
phase_df = df.loc[has_any_phase]
phase_df

In [None]:
# Write the any_phase subset as a tab-separated file without the index.
phase_path = f"data/final_tables/{phase_file}"
phase_df.to_csv(phase_path, sep="\t", index=False)

In [None]:
# Rows whose Product Name occurs more than once; keep=False marks every occurrence.
repeated_name = df.duplicated("Product Name", keep=False)
double = df[repeated_name]
inspect_columns = ["lib_plate_well", "Product Name", "Smiles", "exact_mass", "Source", "drugbank_name"]
double[inspect_columns]

In [None]:
# Non-null counts of every column, per distinct drugbank_approved value.
by_drugbank_status = df.groupby("drugbank_approved")
by_drugbank_status.count()

In [None]:
# Reload the exported table from disk. The path is built from `final`, the same
# name used when the table was written, so the read cannot drift from the write
# if the file prefix is changed.
df = pd.read_csv("data/final_tables/{}".format(final), sep="\t")
# Show the rows ordered by their number of missing cells (column "none").
df.sort_values(by="none")

In [None]:
# Look up the row(s) of one specific well by its lib_plate_well_unique value.
well_id = "pluskal_mce_1D1_A18."
df.loc[df["lib_plate_well_unique"] == well_id]

In [None]:
# One row per (Product Name, lib_plate_well) pair; the first occurrence is kept.
well_key = ["Product Name", "lib_plate_well"]
drop_df = df.drop_duplicates(subset=well_key)

In [None]:
# Display the table de-duplicated on Product Name and lib_plate_well.
drop_df

In [None]:
# Keep rows with 114 <= exact_mass <= 2000; rows without a mass fail both tests.
exact_mass = drop_df["exact_mass"]
filtered_df = drop_df.loc[(exact_mass >= 114) & (exact_mass <= 2000)]

In [None]:
# Display the rows that passed the exact_mass range filter.
filtered_df

In [None]:
# Rows of the de-duplicated table that have no exact_mass value.
mass_missing = drop_df["exact_mass"].isna()
none_df = drop_df.loc[mass_missing]

In [None]:
# Write the rows without exact_mass as a tab-separated file without the index.
none_path = "data/final_tables/mce_library_none.tsv"
none_df.to_csv(none_path, index=False, sep="\t")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def save_fig(file_name):
    """Save the current matplotlib figure as PNG (300 dpi), PDF and SVG.

    The files are written to ``figures/mce_msn_library/<file_name>.<ext>``.
    The output directory is created when it does not exist yet, so the call
    no longer fails with ``FileNotFoundError`` on a fresh checkout.

    Args:
        file_name: Base name of the output files, without extension.
    """
    import os

    out_dir = "figures/mce_msn_library"
    os.makedirs(out_dir, exist_ok=True)

    # Only the raster output gets an explicit resolution.
    plt.savefig("{}/{}.png".format(out_dir, file_name), dpi=300)
    for extension in ("pdf", "svg"):
        plt.savefig("{}/{}.{}".format(out_dir, file_name, extension))

# Global seaborn theme for all following plots: "ticks" style, doubled font scale.
sns.set_theme(style="ticks", font_scale=2)

In [None]:
# Histogram of exact_mass for the mass-filtered library: 25-wide bins with a
# KDE overlay, x axis fixed to 0-2000, saved through save_fig.
file_name = "library_histo_exact_mass_cleaned"
plt.figure(figsize=(13, 10))
hist_options = dict(data=filtered_df, x="exact_mass", binwidth=25, kde=True)
ax = sns.histplot(**hist_options)
ax.set(xlim=(0, 2000), xlabel="Exact Mass")
save_fig(file_name)