## For the analysis of type III macroscopic pKa submissions

### TO DO
1. parsing submission files
2. parsing experiment CSV
3. matching of experimental pKas to predicted pKas
    - minimum-error principle
    - keep the order of pKa values
4. calculate RMSE and MAE (mean absolute error, also called mean unsigned error, MUE) for each molecule

In [28]:
import pandas as pd
import numpy as np
from scipy import stats

#### Calculating root mean squared error (RMSE)

$ RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}{({pred}_i - {exp}_i)^2}} $

In [29]:
# Example values used to demonstrate the error metrics.
# The names d and p are kept because the MAE cell reuses them.
d = [0.000, 0.166, 0.333]
p = [0.000, 0.254, 0.998]

def rmse(predicted_values, experimental_values):
    """
    Calculates the root mean squared error (RMSE) between two sets of values.

    Args:
        predicted_values: Array-like (list or numpy array) of predicted values
        experimental_values: Array-like of experimental values with the same
            length as predicted_values

    Returns:
        RMSE as a float
    """
    # np.asarray lets plain Python lists be passed as well as numpy arrays
    errors = np.asarray(predicted_values, dtype=float) - np.asarray(experimental_values, dtype=float)
    return np.sqrt((errors ** 2).mean())

# Store the result under its own name: assigning it to `rmse` would replace the
# function with a float, and any later call to rmse() would fail.
rmse_value = rmse(d, p)
print("RMSE: " + str(rmse_value))

RMSE: 0.387284994115


#### Calculating mean absolute error (MAE)

$ MAE = \frac{1}{n}\sum_{i=1}^{n}{ |{ {pred}_i - {exp}_i } | } $

In [101]:
def mae(predicted_values, experimental_values):
    """
    Calculates the mean absolute error (MAE) between two sets of values.

    Args:
        predicted_values: Array-like (list or numpy array) of predicted values
        experimental_values: Array-like of experimental values with the same
            length as predicted_values

    Returns:
        MAE as a float
    """
    # np.asarray lets plain Python lists be passed as well as numpy arrays
    errors = np.asarray(predicted_values, dtype=float) - np.asarray(experimental_values, dtype=float)
    return np.abs(errors).mean()

# Keep the result in its own variable so that the mae() function is not
# overwritten by its return value and stays callable afterwards.
mae_value = mae(np.array(d), np.array(p))
print("MAE: ", str(mae_value))

MAE:  0.251


### 3. How to match experimental pKas to predicted pKas?

#### Matching predicted pKas to experimental pKas by minimum error
For each molecule
1. Determine the number of experimental pKas
2. Determine the number of predicted pKas
3. Map each predicted pKa to closest experimental value (minimum absolute error)
4. If multiple predicted pKas are mapped to the same experimental value, then only the one with smallest absolute error will be analyzed.  

In [79]:
# Example predicted and experimental pKa values for one molecule
pred_pKa1 = 2.1
pred_pKa2 = 7.4
pred_pKa3 = 9.0
pred_pKa4 = 9.5

exp_pKa1 = 3.5
exp_pKa2 = 6.7

exp_pKas = np.array([exp_pKa1, exp_pKa2])
pred_pKas = np.array([pred_pKa1, pred_pKa2, pred_pKa3, pred_pKa4])

# create a dataframe to store absolute errors for all possible experimental and predicted pKa matches
# columns: experimental pKa
# rows: predicted pKa
# Broadcasting an (M, 1) column of predictions against a (1, N) row of experimental
# values fills the whole M x N table at once. This keeps the values as floats and
# avoids writing cells one by one through float labels, which becomes ambiguous
# when a pKa value is repeated.
df_abs_error = pd.DataFrame(
    np.abs(pred_pKas[:, np.newaxis] - exp_pKas[np.newaxis, :]),
    index=pred_pKas,
    columns=exp_pKas,
)

df_abs_error

Unnamed: 0,3.5,6.7
2.1,1.4,4.6
7.4,3.9,0.7
9.0,5.5,2.3
9.5,6.0,2.8


In [87]:
# Find the nearest experimental pKa for each predicted pKa
match_records = []
for pred_pKa in pred_pKas:
    # Cast to float so that idxmin also works if the error table holds object dtype
    abs_errors = df_abs_error.loc[pred_pKa, :].astype(float)

    # idxmin gives the column label (experimental pKa) of the minimum absolute
    # error; when two experimental pKas are equally close the first one is taken
    matched_exp_pKa = abs_errors.idxmin()
    match_records.append({
        "pred pKa": pred_pKa,
        "matched exp pKa": matched_exp_pKa,
        "absolute error": abs_errors.min(),
    })

# Build the table once from the collected rows instead of growing it row by row.
# This also avoids the np.NaN alias, which was removed in NumPy 2.0.
df_pKa_match = pd.DataFrame(match_records, columns=["pred pKa", "matched exp pKa", "absolute error"])

# Flag predicted pKas that share their matched experimental pKa with another
# prediction; only the closest of them should keep the match.
df_pKa_match['duplicate_match'] = df_pKa_match.duplicated("matched exp pKa", keep=False)
df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,6.7,2.3,True
3,9.5,6.7,2.8,True


In [97]:
# Among duplicate matches, find the predicted pKa with minimum absolute error.
# The minimum is taken separately for each experimental pKa: a single minimum over
# all duplicates would discard every match of any other experimental pKa that also
# has several predictions mapped to it. Grouping also works when there are no
# duplicates at all (each prediction is then the best match of its own group).
abs_errors = df_pKa_match["absolute error"].astype(float)
best_match_rows = abs_errors.groupby(df_pKa_match["matched exp pKa"]).idxmin()

# for duplicates with bigger absolute error, modify matched exp pKa to NaN
# (on an exact tie only the first prediction keeps the match, so that each
# experimental pKa is matched at most once)
is_best_match = df_pKa_match.index.isin(best_match_rows)
df_pKa_match.loc[~is_best_match, "matched exp pKa"] = np.nan

df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,,2.3,True
3,9.5,,2.8,True


In [98]:
def match_exp_and_pred_pKas(exp_pKas, pred_pKas):
    """
    Finds closest match between N experimental and M predicted pKas, based on
    minimum absolute error. If multiple predicted pKas are mapped to the
    same experimental value, the predicted pKa with the smallest absolute error
    will be matched to the experimental pKa and the others will be matched to NaN.
    On an exact tie only the first of the tied predictions keeps the match, so
    each experimental pKa is matched at most once.

    Args:
        exp_pKas: Numpy array (or other array-like) of experimental pKas
        pred_pKas: Numpy array (or other array-like) of predicted pKas

    Returns:
        Pandas DataFrame with one row per predicted pKa and the columns
        "pred pKa", "matched exp pKa", "absolute error" and "duplicate_match".
        "absolute error" is the distance to the nearest experimental pKa, also
        for predictions whose match was set to NaN. "duplicate_match" is True
        when another prediction had the same nearest experimental pKa.
        If there are no experimental pKas, every prediction is matched to NaN.

    """
    exp_values = np.asarray(exp_pKas, dtype=float).ravel()
    pred_values = np.asarray(pred_pKas, dtype=float).ravel()
    n_pred = pred_values.size

    # Nothing to match against (or nothing to match): return the same columns
    # with every prediction unmatched instead of failing on an empty minimum.
    if exp_values.size == 0 or n_pred == 0:
        return pd.DataFrame({
            "pred pKa": pred_values,
            "matched exp pKa": np.full(n_pred, np.nan),
            "absolute error": np.full(n_pred, np.nan),
            "duplicate_match": np.zeros(n_pred, dtype=bool),
        })

    # Absolute errors for all possible experimental and predicted pKa matches
    # rows: predicted pKa, columns: experimental pKa
    # Working on positions rather than on pKa values used as labels keeps this
    # correct when the same predicted pKa appears more than once.
    abs_errors = np.abs(pred_values[:, np.newaxis] - exp_values[np.newaxis, :])

    # Find the nearest experimental pKa for each predicted pKa
    # (argmin takes the first experimental pKa when two are equally close)
    nearest_exp = abs_errors.argmin(axis=1)
    df_pKa_match = pd.DataFrame({
        "pred pKa": pred_values,
        "matched exp pKa": exp_values[nearest_exp],
        "absolute error": abs_errors[np.arange(n_pred), nearest_exp],
    })

    # Flag predictions that share their matched experimental pKa with another prediction
    df_pKa_match['duplicate_match'] = df_pKa_match.duplicated("matched exp pKa", keep=False)

    # For each experimental pKa keep only the prediction with minimum absolute error.
    # The minimum must be taken per experimental pKa: one minimum over all
    # duplicates would drop every match of a second duplicated experimental pKa,
    # and would fail outright when there are no duplicates.
    best_match_rows = df_pKa_match.groupby("matched exp pKa")["absolute error"].idxmin()
    is_best_match = df_pKa_match.index.isin(best_match_rows)
    df_pKa_match.loc[~is_best_match, "matched exp pKa"] = np.nan

    return df_pKa_match

# Run the matching function on the example experimental and predicted pKas
# and display the resulting table.
df_pKa_match = match_exp_and_pred_pKas(exp_pKas=exp_pKas, pred_pKas=pred_pKas)
df_pKa_match

Unnamed: 0,pred pKa,matched exp pKa,absolute error,duplicate_match
0,2.1,3.5,1.4,False
1,7.4,6.7,0.7,True
2,9.0,,2.3,True
3,9.5,,2.8,True
