In [1]:
import pandas as pd
import numpy as np

In [2]:
# Paths to input data. Both are relative paths, so they resolve against the
# working directory the notebook kernel runs in.
# Directory of type III pKa prediction submissions (per its name); it is not
# referenced by the cells visible in this part of the notebook -- TODO confirm it is used later.
PKA_TYPEIII_SUBMISSIONS_DIR_PATH = '../../predictions/typeIII_predictions'
# CSV file with the experimental pKa values; loaded into `df_exp` via pd.read_csv.
EXPERIMENTAL_DATA_FILE_PATH = '../../experimental_data/pKa_experimental_values.csv'

In [3]:
# Load the experimental pKa table. Judging by the displayed output it holds one
# row per molecule, with "pKa1".."pKa3" mean/SEM columns plus identifier columns.
df_exp = pd.read_csv(EXPERIMENTAL_DATA_FILE_PATH)
df_exp

Unnamed: 0,Molecule ID,pKa1 mean,pKa1 SEM,pKa2 mean,pKa2 SEM,pKa3 mean,pKa3 SEM,Assay Type,Experimental Molecule ID,canonical isomeric SMILES
0,SM01,9.53,0.01,,,,,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
1,SM02,5.03,0.01,,,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M02,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
2,SM03,7.02,0.01,,,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M03,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
3,SM04,6.02,0.01,,,,,UV-metric pKa,M04,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl
4,SM05,4.59,0.01,,,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M05,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)N3CCCCC3
5,SM06,3.03,0.04,11.74,0.01,,,UV-metric pKa,M06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br
6,SM07,6.08,0.01,,,,,UV-metric pKa,M07,c1ccc(cc1)CNc2c3ccccc3ncn2
7,SM08,4.22,0.01,,,,,UV-metric pKa,M08,Cc1ccc2c(c1)c(c(c(=O)[nH]2)CC(=O)O)c3ccccc3
8,SM09,5.37,0.01,,,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M09,COc1cccc(c1)Nc2c3ccccc3ncn2.Cl
9,SM10,9.02,0.01,,,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M10,c1ccc(cc1)C(=O)NCC(=O)Nc2nc3ccccc3s2


In [17]:
# Reorganize experimental data: each output row represents one pKa.
# The source table stores up to three pKas per molecule in columns named
# "pKa<N> mean" / "pKa<N> SEM" (N = 1..3).
data = []

for _, row in df_exp.iterrows():
    mol_id = row["Molecule ID"]

    for pKa_number in (1, 2, 3):
        pKa_mean = row["pKa{} mean".format(pKa_number)]

        # All molecules are expected to have at least 1 pKa, so pKa1 is always
        # appended. pKa2 and pKa3 are appended only if a value exists; each one
        # is checked independently so a missing pKa2 cannot hide a pKa3.
        # pd.isna is used because it also handles non-float cells (e.g. None).
        if pKa_number > 1 and pd.isna(pKa_mean):
            continue

        data.append({
            "Molecule ID": mol_id,
            "pKa mean": pKa_mean,
            "pKa SEM": row["pKa{} SEM".format(pKa_number)],
            "Assay Type": row["Assay Type"],
            "Experimental Molecule ID": row["Experimental Molecule ID"],
            "canonical isomeric SMILES": row["canonical isomeric SMILES"],
            # Unique pKa ID in the form SM##_pKa#
            "pKa ID": "{}_pKa{}".format(mol_id, pKa_number),
        })

# Transform into Pandas DataFrame and save the stacked table next to the input data.
df_exp_stacked = pd.DataFrame(data=data)
df_exp_stacked.to_csv("../../experimental_data/pKa_experimental_values_stacked.csv", index=False)




In [18]:
def reorganize_experimental_pKa_dataframe(
        dataframe,
        output_file_path="../../experimental_data/pKa_experimental_values_stacked.csv"):
    """Reorganize experimental data dataframe so that each row represents one pKa.

    Each row is also assigned a unique pKa ID in the form of SM##_pKa#.
    The input dataframe is not modified.

    Args:
        dataframe: Pandas DataFrame of experimental pKas with one row per
            molecule. It must contain the columns "Molecule ID",
            "pKa1 mean", "pKa1 SEM", "pKa2 mean", "pKa2 SEM", "pKa3 mean",
            "pKa3 SEM", "Assay Type", "Experimental Molecule ID" and
            "canonical isomeric SMILES".
        output_file_path: Path of the CSV file the stacked table is written
            to (without the index). Pass None to skip writing a file.

    Returns:
        Pandas DataFrame with one row per pKa and the columns "Molecule ID",
        "pKa mean", "pKa SEM", "Assay Type", "Experimental Molecule ID",
        "canonical isomeric SMILES" and "pKa ID".

    Raises:
        KeyError: If one of the required columns is missing.
    """
    records = []

    for _, row in dataframe.iterrows():
        mol_id = row["Molecule ID"]

        for pKa_number in (1, 2, 3):
            pKa_mean = row["pKa{} mean".format(pKa_number)]

            # All molecules are expected to have at least 1 pKa, so pKa1 is
            # always appended. pKa2 and pKa3 are appended only if a value
            # exists; each is checked independently so a missing pKa2 cannot
            # hide a pKa3. pd.isna also copes with non-float cells (e.g. None).
            if pKa_number > 1 and pd.isna(pKa_mean):
                continue

            records.append({
                "Molecule ID": mol_id,
                "pKa mean": pKa_mean,
                "pKa SEM": row["pKa{} SEM".format(pKa_number)],
                "Assay Type": row["Assay Type"],
                "Experimental Molecule ID": row["Experimental Molecule ID"],
                "canonical isomeric SMILES": row["canonical isomeric SMILES"],
                "pKa ID": "{}_pKa{}".format(mol_id, pKa_number),
            })

    # Transform into Pandas DataFrame.
    df_exp_stacked = pd.DataFrame(data=records)

    # Persist the stacked table unless the caller opted out.
    if output_file_path is not None:
        df_exp_stacked.to_csv(output_file_path, index=False)

    return df_exp_stacked

# Build the one-row-per-pKa table from the experimental data. Note that the
# call also writes the stacked table to a CSV file as a side effect.
df_exp_pKaID = reorganize_experimental_pKa_dataframe(df_exp)
df_exp_pKaID 

Unnamed: 0,Assay Type,Experimental Molecule ID,Molecule ID,canonical isomeric SMILES,pKa ID,pKa SEM,pKa mean
0,UV-metric pKa,M01,SM01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,SM01_pKa1,0.01,9.53
1,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M02,SM02,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F,SM02_pKa1,0.01,5.03
2,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M03,SM03,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3,SM03_pKa1,0.01,7.02
3,UV-metric pKa,M04,SM04,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl,SM04_pKa1,0.01,6.02
4,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M05,SM05,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)N3CCCCC3,SM05_pKa1,0.01,4.59
5,UV-metric pKa,M06,SM06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br,SM06_pKa1,0.04,3.03
6,UV-metric pKa,M06,SM06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br,SM06_pKa2,0.01,11.74
7,UV-metric pKa,M07,SM07,c1ccc(cc1)CNc2c3ccccc3ncn2,SM07_pKa1,0.01,6.08
8,UV-metric pKa,M08,SM08,Cc1ccc2c(c1)c(c(c(=O)[nH]2)CC(=O)O)c3ccccc3,SM08_pKa1,0.01,4.22
9,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M09,SM09,COc1cccc(c1)Nc2c3ccccc3ncn2.Cl,SM09_pKa1,0.01,5.37


In [5]:
# reorganize experimental data
df_exp_reorg = df_exp.melt(id_vars = ["Molecule ID", "Assay Type" , "Experimental Molecule ID",
                       "canonical isomeric SMILES"])
df_exp_reorg.loc[df_exp_reorg["Molecule ID"] == "SM01"]

Unnamed: 0,Molecule ID,Assay Type,Experimental Molecule ID,canonical isomeric SMILES,variable,value
0,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa1 mean,9.53
24,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa1 SEM,0.01
48,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa2 mean,
72,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa2 SEM,
96,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa3 mean,
120,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa3 SEM,


In [6]:
# Label every melted row with the statistic it holds: the second word of the
# "variable" column (e.g. "pKa1 mean" -> "mean", "pKa1 SEM" -> "SEM").
# The column is built in one vectorized step so it holds strings from the
# start (no float NaN placeholder that is later overwritten row by row) and
# does not depend on the frame having a default 0..n-1 index.
df_exp_reorg["statistic"] = df_exp_reorg["variable"].str.split(" ").str[1]

# A pKa ID in the form of SM##_pKa# can be derived the same way when needed:
# df_exp_reorg["pKa ID"] = (df_exp_reorg["Molecule ID"] + "_"
#                           + df_exp_reorg["variable"].str.split(" ").str[0])

df_exp_reorg.loc[df_exp_reorg["Molecule ID"] == "SM01"]

Unnamed: 0,Molecule ID,Assay Type,Experimental Molecule ID,canonical isomeric SMILES,variable,value,statistic
0,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa1 mean,9.53,mean
24,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa1 SEM,0.01,SEM
48,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa2 mean,,mean
72,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa2 SEM,,SEM
96,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa3 mean,,mean
120,SM01,UV-metric pKa,M01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,pKa3 SEM,,SEM


In [None]:
# Reshape the long table so that each pKa ID is one row with separate "SEM"
# and "mean" columns taken from the "statistic"/"value" pairs.
# NOTE(review): pivoting on "statistic" with "value" as the cell values is the
# presumed intent of this step -- confirm.
# The identifier columns are carried through the reshape so the per-molecule
# filter at the end still works. set_index + unstack performs the pivot and
# also runs on pandas versions whose DataFrame.pivot does not accept a list
# of index columns.
# NOTE: this cell rebinds df_exp_reorg; re-run the melt and statistic cells
# before running it a second time.
pKa_label = df_exp_reorg["variable"].str.split(" ").str[0]
df_exp_reorg = (
    df_exp_reorg
    .assign(**{"pKa ID": df_exp_reorg["Molecule ID"] + "_" + pKa_label})
    .set_index(["pKa ID", "Molecule ID", "Assay Type", "Experimental Molecule ID",
                "canonical isomeric SMILES", "statistic"])["value"]
    .unstack("statistic")
    .reset_index()
)
# Drop the leftover "statistic" label on the column axis.
df_exp_reorg.columns.name = None

df_exp_reorg.loc[df_exp_reorg["Molecule ID"] == "SM01"]