## For the analysis of type III macroscopic pKa submissions

### TO DO
1. parsing submission files
2. parsing experiment CSV
3. matching of experimental pKas to predicted pKas
    - minimum-error principle
    - keep the order of pKa values
4. calculate RMSE and MUE for each molecule

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

#### Calculating root mean squared error (RMSE)

$ RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}{({pred}_i - {exp}_i})} $

In [2]:
# calculate RMSE
d = [0.000, 0.166, 0.333]
p = [0.000, 0.254, 0.998]

def rmse(predicted_values, experimental_values):
    return np.sqrt(((predicted_values - experimental_values) ** 2).mean())

rmse = rmse(np.array(d), np.array(p))
print("RMSE: " + str(rmse))

RMSE: 0.387284994115


#### Calculating mean absolute error (MAE)

$ MAE = \frac{1}{n}\sum_{i=1}^{n}{ |{ {pred}_i - {exp}_i } | } $

In [3]:
def mae(predicted_values, experimental_values):
    return abs(predicted_values - experimental_values).mean()

mae = mae(np.array(d), np.array(p))
print("MAE: ", str(mae))

MAE:  0.251


### 3. How to match experimental pKas to predicted pKas?

#### Matchning predicted pKas to minimum error prediction
For each molecule
1. Determine the number of experimental pKas
2. Determine the number of predicted pKas
3. Map each predicted pKa to closest experimental value (minimum absolute error)
4. If multiple predicted pKas are mapped to the same experimental value, then only the one with smallest absolute error will be analyzed.  

In [4]:
pred_pKa1 = 2.1
pred_pKa2 = 7.4
pred_pKa3 = 9.0
pred_pKa4 = 9.5

exp_pKa1 = 3.5
exp_pKa2 = 6.7

exp_pKas = np.array([exp_pKa1, exp_pKa2])
pred_pKas = np.array([pred_pKa1, pred_pKa2, pred_pKa3, pred_pKa4])

# create a dataframe to store absolute errors for all possible experimental and predicted pKa matches
# columns: experimental pKa
# rows: predicted pKa
df_abs_error = pd.DataFrame(index = pred_pKas, columns = exp_pKas)

# iterate over predicted pKas to find the experimental pKa that gives the minimum absolute error.
for i, pred_pKa in enumerate(pred_pKas):
    for j, exp_pKa in enumerate(exp_pKas):
        absolute_error = np.abs( pred_pKa - exp_pKa)
        df_abs_error.loc[pred_pKa,exp_pKa] = absolute_error
        
df_abs_error

Unnamed: 0,3.5,6.7
2.1,1.4,4.6
7.4,3.9,0.7
9.0,5.5,2.3
9.5,6.0,2.8


In [5]:
# Find the nearest experimental pKa for each predicted pKa
df_pKa_match = pd.DataFrame()
df_pKa_match["pred pKa"] = np.NaN
df_pKa_match["matched exp pKa"] = np.NaN
df_pKa_match["absolute error"] = np.NaN

for i, pred_pKa in enumerate(pred_pKas):
    min_abs_error = min(df_abs_error.loc[pred_pKa, :])
    
    # Find the column name (experimental pKa) that corresponds to minimum absolute error
    matched_exp_pKa = df_abs_error.loc[:, df_abs_error.loc[pred_pKa,:].values == min_abs_error].columns.values[0]
    #print("pred pKa: ", pred_pKa, " exp pKa: ", matched_exp_pKa)
    df_pKa_match.loc[i, "pred pKa"]= pred_pKa
    df_pKa_match.loc[i, "matched exp pKa"]= matched_exp_pKa
    df_pKa_match.loc[i, "absolute error"] = min_abs_error
    
# If multiple predicted pKas are matched to same experimental pKa, keep the closer match
# The unmatched predicted pKa will be assigned exp pKa np.NaN
df_pKa_match['duplicate_match'] = df_pKa_match.duplicated("matched exp pKa", keep=False)
df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,6.7,2.3,True
3,9.5,6.7,2.8,True


In [6]:
# Among dublicate matches, find the predicted pKa with minimum absolute error
df_dublicate_matches = df_pKa_match.loc[df_pKa_match["duplicate_match"] == True]
min_abs_error_of_duplicates = min(df_dublicate_matches.loc[:, "absolute error"])

for row in df_dublicate_matches.iterrows():
    index = row[0]
    abs_error = row[1]["absolute error"]
    pred_pKa = row[1]["pred pKa"]
    
    # for dublicates with bigger absolute error, modify matched exp pKa to np.NaN
    if abs_error == min_abs_error_of_duplicates:
        continue
    else:
        df_pKa_match.loc[index, "matched exp pKa"] = np.NaN

df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,,2.3,True
3,9.5,,2.8,True


In [7]:
def match_exp_and_pred_pKas(exp_pKas, pred_pKas):
    """
    Finds closest match between N experimental and M predicted pKas, based on 
    minimum absolute error. If multiple predicted pKas are mapped to the 
    same experimental value, predicted pKa with smallest pKa will be matched to
    experimental pKa and others will be matched to NaN.
    
    Args:
        exp_pKas: Numpy array of experimental pKas
        pred_pKas: Numpy array of predicted pKas
    
    Returns:
        Pandas DataFrame with predicted pKas and matched experimental pKa columns
    
    """
    
    # create a dataframe to store absolute errors for all possible experimental and predicted pKa matches
    # columns: experimental pKa
    # rows: predicted pKa
    df_abs_error = pd.DataFrame(index = pred_pKas, columns = exp_pKas)

    # iterate over predicted pKas to find the experimental pKa that gives the minimum absolute error.
    for i, pred_pKa in enumerate(pred_pKas):
        for j, exp_pKa in enumerate(exp_pKas):
            absolute_error = np.abs( pred_pKa - exp_pKa)
            df_abs_error.loc[pred_pKa,exp_pKa] = absolute_error

    # Find the nearest experimental pKa for each predicted pKa
    df_pKa_match = pd.DataFrame()
    df_pKa_match["pred pKa"] = np.NaN
    df_pKa_match["matched exp pKa"] = np.NaN
    df_pKa_match["absolute error"] = np.NaN

    for i, pred_pKa in enumerate(pred_pKas):
        min_abs_error = min(df_abs_error.loc[pred_pKa, :])

        # Find the column name (experimental pKa) that corresponds to minimum absolute error
        matched_exp_pKa = df_abs_error.loc[:, df_abs_error.loc[pred_pKa,:].values == min_abs_error].columns.values[0]
        #print("pred pKa: ", pred_pKa, " exp pKa: ", matched_exp_pKa)
        df_pKa_match.loc[i, "pred pKa"]= pred_pKa
        df_pKa_match.loc[i, "matched exp pKa"]= matched_exp_pKa
        df_pKa_match.loc[i, "absolute error"] = min_abs_error

    # If multiple predicted pKas are matched to same experimental pKa, keep the closer match
    # The unmatched predicted pKa will be assigned exp pKa np.NaN
    df_pKa_match['duplicate_match'] = df_pKa_match.duplicated("matched exp pKa", keep=False)
    
    # Among dublicate matches, find the predicted pKa with minimum absolute error
    df_dublicate_matches = df_pKa_match.loc[df_pKa_match["duplicate_match"] == True]
    min_abs_error_of_duplicates = min(df_dublicate_matches.loc[:, "absolute error"])

    for row in df_dublicate_matches.iterrows():
        index = row[0]
        abs_error = row[1]["absolute error"]
        pred_pKa = row[1]["pred pKa"]

        # for dublicates with bigger absolute error, modify matched exp pKa to np.NaN
        if abs_error == min_abs_error_of_duplicates:
            continue
        else:
            df_pKa_match.loc[index, "matched exp pKa"] = np.NaN
    
    return df_pKa_match 

df_pKa_match  = match_exp_and_pred_pKas(exp_pKas, pred_pKas)
df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,,2.3,True
3,9.5,,2.8,True


#### Second version of matching function

In [2]:
def match_exp_and_pred_pKas(pred_pKas, exp_pKas, exp_pKa_SEMs, exp_pKa_IDs):
    """
    Finds closest match between N experimental and M predicted pKas, based on 
    minimum absolute error. If multiple predicted pKas are mapped to the 
    same experimental value, predicted pKa with smallest pKa will be matched to
    experimental pKa and others will be matched to NaN.
    
    Args:
        pred_pKas: Numpy array of predicted pKas
        exp_pKas: Numpy array of experimental pKa means
        exp_pKa_SEMs: Numpy array of experimental pKa SEM values
        exp_pKa_IDs: Numpy array of pKa IDs assigned to experimental pKa values
        
    Returns:
        Pandas DataFrame with predicted pKas and matched experimental pKa columns
    
    """
    
    # create a dataframe to store absolute errors for all possible experimental and predicted pKa matches
    # columns: experimental pKa
    # rows: predicted pKa
    df_abs_error = pd.DataFrame(index = pred_pKas, columns = exp_pKas)

    # iterate over predicted pKas to find the experimental pKa that gives the minimum absolute error.
    for i, pred_pKa in enumerate(pred_pKas):
        for j, exp_pKa in enumerate(exp_pKas):
            absolute_error = np.abs( pred_pKa - exp_pKa)
            df_abs_error.loc[pred_pKa,exp_pKa] = absolute_error
    #print("Data frame of absolute error:")
    #print(df_abs_error)

    # Find the nearest experimental pKa for each predicted pKa
    df_pKa_match = pd.DataFrame()
    df_pKa_match["pred pKa"] = np.NaN
    df_pKa_match["matched exp pKa"] = np.NaN
    df_pKa_match["absolute error"] = np.NaN

    for i, pred_pKa in enumerate(pred_pKas):
        min_abs_error = min(df_abs_error.loc[pred_pKa, :])

        # Find the column name (experimental pKa) that corresponds to minimum absolute error
        matched_exp_pKa = df_abs_error.loc[:, df_abs_error.loc[pred_pKa,:].values == min_abs_error].columns.values[0]
        #print("pred pKa: ", pred_pKa, " exp pKa: ", matched_exp_pKa)
        df_pKa_match.loc[i, "pred pKa"]= pred_pKa
        df_pKa_match.loc[i, "matched exp pKa"]= matched_exp_pKa
        df_pKa_match.loc[i, "absolute error"] = min_abs_error

    # If multiple predicted pKas are matched to same experimental pKa, keep the closer match
    # The unmatched predicted pKa will be assigned exp pKa np.NaN
    df_pKa_match['duplicate_match'] = df_pKa_match.duplicated("matched exp pKa", keep=False)
    
    # Among dublicate matches, find the predicted pKa with minimum absolute error
    df_dublicate_matches = df_pKa_match.loc[df_pKa_match["duplicate_match"] == True]
    
    if df_dublicate_matches.shape[0] > 1:
        #print(df_dublicate_matches)
        min_abs_error_of_duplicates = min(df_dublicate_matches.loc[:, "absolute error"])
    elif df_dublicate_matches.shape[0] == 1:
        min_abs_error_of_duplicates = df_pKa_match.loc[:,"absolute error"].values

    for row in df_dublicate_matches.iterrows():
        index = row[0]
        abs_error = row[1]["absolute error"]
        pred_pKa = row[1]["pred pKa"]

        # for dublicates with bigger absolute error, modify matched exp pKa to np.NaN
        if abs_error == min_abs_error_of_duplicates:
            continue
        else:
            df_pKa_match.loc[index, "matched exp pKa"] = np.NaN
            
    # Drop the row with NaN experimental matched pKa
    df_pKa_match = df_pKa_match.dropna().reset_index(drop=True)
    
    # Add experimental pKa SEM and pKa ID to the dataframe for matched predictions
    df_pKa_match["exp pKa SEM"] = np.NaN
    df_pKa_match["pKa ID"] = np.NaN
    
    for i,row in enumerate(df_pKa_match.iterrows()): # iterate over matched pKas
        matched_exp_pKa = row[1]["matched exp pKa"]
        
        # find the matching experimental pKa SEM and pKa ID
        for j, pKa in enumerate(exp_pKas):
            if pKa == matched_exp_pKa:
                #print("index: ",j)
                exp_pKa_SEM = exp_pKa_SEMs[j]
                exp_pKa_ID = exp_pKa_IDs[j]
        
        # store experimental pKa SEM and pKa ID on the dataframe
        df_pKa_match.loc[i, "exp pKa SEM"] = exp_pKa_SEM
        df_pKa_match.loc[i, "pKa ID"] = exp_pKa_ID
        
    return df_pKa_match 


def add_pKa_IDs_to_matching_predictions(df_pred, df_exp):
    """Add pKa ID column to dataframe of predictions based on 
    the minimum error match to experimental pKas.
    
    Args:
        df_pred: Pandas Dataframe of pKa predictions
        df_exp: Pandas Dataframe of experimental pKas (stacked)
        
    Returns:
        A dataframe of predicted pKa values that gave the best match to experimental values. 
        Other predicted pKa values are ignored. 
        
    """
    
    # iterate over molecule IDs of the submission
    df_pred["pKa ID"] = np.NaN

    for i, row in enumerate(df_pred.iterrows()):
        mol_id = row[1]["Molecule ID"]

        # slice prediction and experimental data dataframes by molecule ID to detect the number of predicted pKas for each molecule
        df_pred_mol = df_pred[df_pred["Molecule ID"]== mol_id]
        df_exp_mol = df_exp[df_exp["Molecule ID"]== mol_id]

        # Create numpy array of predicted pKas
        pred_pKas  = np.array(df_pred_mol.loc[:,"pKa mean"].values)

        # Create numpy array of experimental pKa means, pKa SEM and pKa_ID
        exp_pKa_means = np.array(df_exp_mol.loc[:, "pKa mean"].values)
        exp_pKa_SEMs = np.array(df_exp_mol.loc[:, "pKa SEM"].values)
        exp_pKa_IDs = np.array(df_exp_mol.loc[:, "pKa ID"].values)

        # Match predicted pKas to experimental pKa that gives the smallest error
        df_pKa_match  = match_exp_and_pred_pKas(pred_pKas, exp_pKa_means, exp_pKa_SEMs, exp_pKa_IDs)
        #print("Data frame of pKa match:")
        #print(df_pKa_match, "\n")
    
        # Add matched pKa IDs to prediction data frame
        for index, row in enumerate(df_pKa_match.iterrows()):
            pred_pKa = row[1]["pred pKa"]
            pKa_ID = row[1]["pKa ID"]
            #print(pred_pKa, pKa_ID)

            # store in the correct position in prediction dataframe
            df_pred.loc[df_pred["pKa mean"] == pred_pKa, "pKa ID"] = pKa_ID

    # Drop predicted pKas that didn't match to experimental values
    df_pred_matched = df_pred.dropna(subset=["pKa ID"]).reset_index(drop=True)
    
    return df_pred_matched 


In [3]:
path_to_example_prediction = "prediction_example.csv"
df_pred = pd.read_csv(path_to_example_prediction)

path_to_experimental_data = "pKa_experimental_values_stacked.csv"
df_exp = pd.read_csv(path_to_experimental_data)

df_pred_matched = add_pKa_IDs_to_matching_predictions(df_pred, df_exp)
print(df_pred_matched)


   Molecule ID  pKa mean  pKa SEM     pKa ID
0         SM01      9.12     1.12  SM01_pKa1
1         SM02      4.05     1.47  SM12_pKa1
2         SM03      7.12     2.22  SM03_pKa1
3         SM04      5.56     1.13  SM07_pKa1
4         SM05      5.35     2.22  SM05_pKa1
5         SM06      2.18     1.12  SM06_pKa1
6         SM06     10.33     2.22  SM06_pKa2
7         SM07      5.56     1.13  SM07_pKa1
8         SM08      4.11     2.22  SM08_pKa1
9         SM09      4.05     1.47  SM12_pKa1
10        SM10      9.93     0.40  SM10_pKa1
11        SM11      3.87     2.00  SM11_pKa1
12        SM12      4.05     1.47  SM12_pKa1
13        SM13      4.27     2.22  SM13_pKa1
14        SM14      6.35     2.22  SM14_pKa2
15        SM14      3.37     2.22  SM14_pKa1
16        SM15      5.82     1.12  SM15_pKa1
17        SM15      8.71     1.47  SM15_pKa2
18        SM16      4.71     0.95  SM16_pKa1
19        SM16      9.85     2.22  SM16_pKa2
20        SM17      4.90     0.75  SM17_pKa1
21        

In [8]:
path_to_example_prediction = "prediction_example.csv"
df_pred = pd.read_csv(path_to_example_prediction)
df_pred.head()

Unnamed: 0,Molecule ID,pKa mean,pKa SEM
0,SM01,9.12,1.12
1,SM02,3.51,0.86
2,SM02,4.05,1.47
3,SM03,4.95,2.22
4,SM03,7.12,2.22


In [9]:
path_to_experimental_data = "pKa_experimental_values_stacked.csv"
df_exp = pd.read_csv(path_to_experimental_data)
df_exp.head(15)

Unnamed: 0,Assay Type,Experimental Molecule ID,Molecule ID,canonical isomeric SMILES,pKa ID,pKa SEM,pKa mean
0,UV-metric pKa,M01,SM01,c1cc2c(cc1O)c3c(o2)C(=O)NCCC3,SM01_pKa1,0.01,9.53
1,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M02,SM02,c1ccc2c(c1)c(ncn2)Nc3cccc(c3)C(F)(F)F,SM02_pKa1,0.01,5.03
2,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M03,SM03,c1ccc(cc1)Cc2nnc(s2)NC(=O)c3cccs3,SM03_pKa1,0.01,7.02
3,UV-metric pKa,M04,SM04,c1ccc2c(c1)c(ncn2)NCc3ccc(cc3)Cl,SM04_pKa1,0.01,6.02
4,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M05,SM05,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)N3CCCCC3,SM05_pKa1,0.01,4.59
5,UV-metric pKa,M06,SM06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br,SM06_pKa1,0.04,3.03
6,UV-metric pKa,M06,SM06,c1cc2cccnc2c(c1)NC(=O)c3cc(cnc3)Br,SM06_pKa2,0.01,11.74
7,UV-metric pKa,M07,SM07,c1ccc(cc1)CNc2c3ccccc3ncn2,SM07_pKa1,0.01,6.08
8,UV-metric pKa,M08,SM08,Cc1ccc2c(c1)c(c(c(=O)[nH]2)CC(=O)O)c3ccccc3,SM08_pKa1,0.01,4.22
9,"UV-metric psKa, Yasuda-Shedlovsky Extrapolation",M09,SM09,COc1cccc(c1)Nc2c3ccccc3ncn2.Cl,SM09_pKa1,0.01,5.37


In [80]:
# iterate over molecule IDs of the submission
df_pred["pKa ID"] = np.NaN
    
for i, row in enumerate(df_pred.iterrows()):
    mol_id = row[1]["Molecule ID"]
    
    # slice prediction and experimental data dataframes by molecule ID to detect the number of predicted pKas for each molecule
    df_pred_mol = df_pred[df_pred["Molecule ID"]== mol_id]
    df_exp_mol = df_exp[df_exp["Molecule ID"]== mol_id]
    
    # Create numpy array of predicted pKas
    pred_pKas  = np.array(df_pred_mol.loc[:,"pKa mean"].values)
    
    # Create numpy array of experimental pKa means, pKa SEM and pKa_ID
    exp_pKa_means = np.array(df_exp_mol.loc[:, "pKa mean"].values)
    exp_pKa_SEMs = np.array(df_exp_mol.loc[:, "pKa SEM"].values)
    exp_pKa_IDs = np.array(df_exp_mol.loc[:, "pKa ID"].values)
    
    # Match predicted pKas to experimental pKa that gives the smallest error
    df_pKa_match  = match_exp_and_pred_pKas(pred_pKas, exp_pKa_means, exp_pKa_SEMs, exp_pKa_IDs)
    print("Data frame of pKa match:")
    print(df_pKa_match)
    print()
    
    # Add matched pKa IDs to prediction data frame
    for index, row in enumerate(df_pKa_match.iterrows()):
        pred_pKa = row[1]["pred pKa"]
        pKa_ID = row[1]["pKa ID"]
        print(pred_pKa, pKa_ID)
        
        # store in the correct position in prediction dataframe
        df_pred.loc[df_pred["pKa mean"] == pred_pKa, "pKa ID"] = pKa_ID
        
    # Drop predicted pKas that didn't match to experimental values
    df_pred_matched = df_pred.dropna(subset=["pKa ID"]).reset_index(drop=True)
    

Data frame of pKa match:
   pred pKa  matched exp pKa  absolute error  duplicate_match  exp pKa SEM  \
0      9.12             9.53            0.41            False         0.01   

      pKa ID  
0  SM01_pKa1  

9.12 SM01_pKa1
Data frame of pKa match:
   pred pKa  matched exp pKa  absolute error  duplicate_match  exp pKa SEM  \
0      4.05             5.03            0.98             True         0.01   

      pKa ID  
0  SM02_pKa1  

4.05 SM02_pKa1
Data frame of pKa match:
   pred pKa  matched exp pKa  absolute error  duplicate_match  exp pKa SEM  \
0      4.05             5.03            0.98             True         0.01   

      pKa ID  
0  SM02_pKa1  

4.05 SM02_pKa1
Data frame of pKa match:
   pred pKa  matched exp pKa  absolute error  duplicate_match  exp pKa SEM  \
0      7.12             7.02             0.1             True         0.01   

      pKa ID  
0  SM03_pKa1  

7.12 SM03_pKa1
Data frame of pKa match:
   pred pKa  matched exp pKa  absolute error  duplicate_match  

In [81]:
df_pred

Unnamed: 0,Molecule ID,pKa mean,pKa SEM,pKa ID
0,SM01,9.12,1.12,SM01_pKa1
1,SM02,3.51,0.86,
2,SM02,4.05,1.47,SM12_pKa1
3,SM03,4.95,2.22,
4,SM03,7.12,2.22,SM03_pKa1
5,SM04,3.88,2.22,
6,SM04,5.56,1.13,SM07_pKa1
7,SM05,2.0,0.6,
8,SM05,5.35,2.22,SM05_pKa1
9,SM06,4.0,0.73,


In [82]:
df_pred_matched

Unnamed: 0,Molecule ID,pKa mean,pKa SEM,pKa ID
0,SM01,9.12,1.12,SM01_pKa1
1,SM02,4.05,1.47,SM12_pKa1
2,SM03,7.12,2.22,SM03_pKa1
3,SM04,5.56,1.13,SM07_pKa1
4,SM05,5.35,2.22,SM05_pKa1
5,SM06,2.18,1.12,SM06_pKa1
6,SM06,10.33,2.22,SM06_pKa2
7,SM07,5.56,1.13,SM07_pKa1
8,SM08,4.11,2.22,SM08_pKa1
9,SM09,4.05,1.47,SM12_pKa1


In [31]:
exp = np.array([3, 2])
exp

array([3, 2])

In [57]:
picked = 2
for i, n in enumerate(exp):
    if n == picked:
        print("index: ",i)
        print("value: ", exp[i])

index:  1
value:  2


2