## Organize pKa replicate experiment results

This notebook reads `pKa_replicate_experimental_results.csv` and creates
`pKa_results_of_replicate_experiments.csv`, which incorporates the following changes:
1. Experiment report names are updated with Molecule ID prefixes
2. Canonical isomeric SMILES of the molecules are added
3. pKa values are reported with 2 decimals

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory name for the renamed report files (presumably where the PDFs with
# Molecule ID prefixes live -- not read anywhere in the visible cells).
path_to_report_files = "experiment_reports_with_molecule_ID/"

# Raw replicate table read by this notebook, and the cleaned table it writes.
output_path = "pKa_results_of_replicate_experiments.csv"
input_path = "pKa_replicate_experimental_results.csv"

# Load the replicate measurements and preview the first rows.
df_exp = pd.read_csv(filepath_or_buffer=input_path)
df_exp.head()

Unnamed: 0,Molecule ID,pKa1,pKa2,pKa3,Assay Type,Experiment ID,Experimental Molecule ID,Experiment Report
0,SM01,9.54,,,UV-metric pKa,17I-15024,M01,17I-15024_M01_UV-metric pKa_report
1,SM01,9.53,,,UV-metric pKa,17I-15025,M01,17I-15025_M01_UV-metric pKa_report
2,SM01,9.53,,,UV-metric pKa,17I-16001,M01,17I-16001_M01_UV-metric pKa_report
3,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22022,M02,17I-22022_M02_UV-metric psKa_report
4,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22023,M02,17I-22023_M02_UV-metric psKa_report


In [3]:
# Update the experiment report names: prefix each one with its Molecule ID and
# append the ".pdf" extension.
#
# Whole-column string concatenation aligns rows by index label, so it stays
# correct for any index. The previous row loop wrote through
# `.loc[<enumerate counter>, ...]`, which only addresses the intended rows when
# the index happens to be the default 0..n-1 range.
#
# NOTE: this cell is not idempotent -- running it a second time without
# reloading `df_exp` prefixes the names again.
df_exp["Experiment Report"] = (
    df_exp["Molecule ID"] + "_" + df_exp["Experiment Report"] + ".pdf"
)

df_exp.head()

Unnamed: 0,Molecule ID,pKa1,pKa2,pKa3,Assay Type,Experiment ID,Experimental Molecule ID,Experiment Report
0,SM01,9.54,,,UV-metric pKa,17I-15024,M01,SM01_17I-15024_M01_UV-metric pKa_report.pdf
1,SM01,9.53,,,UV-metric pKa,17I-15025,M01,SM01_17I-15025_M01_UV-metric pKa_report.pdf
2,SM01,9.53,,,UV-metric pKa,17I-16001,M01,SM01_17I-16001_M01_UV-metric pKa_report.pdf
3,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22022,M02,SM02_17I-22022_M02_UV-metric psKa_report.pdf
4,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22023,M02,SM02_17I-22023_M02_UV-metric psKa_report.pdf


In [4]:
# Table that pairs each SAMPL6 Molecule ID with its canonical isomeric SMILES.
path_to_smiles_table = "molecule_ID_and_SMILES.csv"

# Read the lookup table and display it.
df_smiles = pd.read_csv(filepath_or_buffer=path_to_smiles_table)
df_smiles

Unnamed: 0,SAMPL6 Molecule ID,canonical isomeric SMILES
0,SM01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
1,SM02,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
2,SM03,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
3,SM04,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl
4,SM05,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)N3CCCCC3
5,SM06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br
6,SM07,c1ccc(cc1)CNc2c3ccccc3ncn2
7,SM08,Cc1ccc2c(c1)c(c(c(=O)[nH]2)CC(=O)O)c3ccccc3
8,SM09,COc1cccc(c1)Nc2c3ccccc3ncn2.Cl
9,SM10,c1ccc(cc1)C(=O)NCC(=O)Nc2nc3ccccc3s2


In [5]:
# Add SMILES to experiment table.
#
# Build a Molecule ID -> SMILES lookup once instead of filtering `df_smiles`
# for every experiment row. `keep="first"` mirrors the previous behaviour of
# taking the first matching entry when an ID is listed more than once.
smiles_by_molecule_id = (
    df_smiles
    .drop_duplicates(subset="SAMPL6 Molecule ID", keep="first")
    .set_index("SAMPL6 Molecule ID")["canonical isomeric SMILES"]
)

# Fail loudly when an experiment refers to a molecule that has no SMILES entry,
# rather than silently leaving a missing value in the output table.
has_smiles = df_exp["Molecule ID"].isin(smiles_by_molecule_id.index)
if not has_smiles.all():
    unknown_ids = df_exp.loc[~has_smiles, "Molecule ID"].unique().tolist()
    raise KeyError(f"No SMILES found for Molecule ID(s): {unknown_ids}")

# Record the corresponding SMILES for every experiment in one aligned step.
# Creating the column directly as strings avoids pre-filling it with the
# `np.NaN` alias (removed in NumPy 2.0) and then upcasting a float column
# element by element.
df_exp["canonical isomeric SMILES"] = df_exp["Molecule ID"].map(smiles_by_molecule_id)

df_exp

Unnamed: 0,Molecule ID,pKa1,pKa2,pKa3,Assay Type,Experiment ID,Experimental Molecule ID,Experiment Report,canonical isomeric SMILES
0,SM01,9.54,,,UV-metric pKa,17I-15024,M01,SM01_17I-15024_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
1,SM01,9.53,,,UV-metric pKa,17I-15025,M01,SM01_17I-15025_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
2,SM01,9.53,,,UV-metric pKa,17I-16001,M01,SM01_17I-16001_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
3,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22022,M02,SM02_17I-22022_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
4,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22023,M02,SM02_17I-22023_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
5,SM02,5.02,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22024,M02,SM02_17I-22024_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
6,SM03,7.01,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19004,M03,SM03_17I-19004_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
7,SM03,7.01,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19005,M03,SM03_17I-19005_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
8,SM03,7.03,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19006,M03,SM03_17I-19006_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
9,SM04,6.03,,,UV-metric pKa,17I-18018,M04,SM04_17I-18018_M04_UV-metric pKa_report.pdf,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl


In [6]:
# Make sure pKa values are reported with 2 decimals.
def _format_pka(value):
    """Return `value` as a fixed two-decimal string, or "" when it is missing."""
    return "" if pd.isna(value) else format(value, ".2f")


# Apply the same formatting to every pKa column. Replacing each column as a
# whole (instead of writing strings cell by cell into a float column) avoids
# per-element dtype upcasting and does not depend on the index labels being
# 0..n-1.
#
# NOTE: the columns hold strings afterwards, so this cell expects numeric
# input; reload `df_exp` before running it a second time.
for pka_column in ("pKa1", "pKa2", "pKa3"):
    df_exp[pka_column] = df_exp[pka_column].map(_format_pka)

df_exp

Unnamed: 0,Molecule ID,pKa1,pKa2,pKa3,Assay Type,Experiment ID,Experimental Molecule ID,Experiment Report,canonical isomeric SMILES
0,SM01,9.54,,,UV-metric pKa,17I-15024,M01,SM01_17I-15024_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
1,SM01,9.53,,,UV-metric pKa,17I-15025,M01,SM01_17I-15025_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
2,SM01,9.53,,,UV-metric pKa,17I-16001,M01,SM01_17I-16001_M01_UV-metric pKa_report.pdf,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3
3,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22022,M02,SM02_17I-22022_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
4,SM02,5.04,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22023,M02,SM02_17I-22023_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
5,SM02,5.02,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-22024,M02,SM02_17I-22024_M02_UV-metric psKa_report.pdf,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F
6,SM03,7.01,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19004,M03,SM03_17I-19004_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
7,SM03,7.01,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19005,M03,SM03_17I-19005_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
8,SM03,7.03,,,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",17I-19006,M03,SM03_17I-19006_M03_UV-metric psKa_report.pdf,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3
9,SM04,6.03,,,UV-metric pKa,17I-18018,M04,SM04_17I-18018_M04_UV-metric pKa_report.pdf,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl


In [8]:
# Write the organized table to disk without the DataFrame index column.
df_exp.to_csv(output_path, index=False)