## Testing how align_pKa function works for sequencial predicted and experimental pKa mathcing.

aling_pKa function from:
https://github.com/choderalab/titrato/blob/master/titrato/titrato.py#L556

In [35]:
import numpy as np
import pandas as pd
import logging
from copy import deepcopy

# Make sure to install typing_extensions
from typing import (
    Dict,
    List,
    Iterator,
    Union,
    Iterable,
    Tuple,
    Any,
    Optional,
    Callable,
    TypeVar,
)


# Only used to fill in the nans
def fixed_cost_solve(x1: float, x2: float, cost: float = 10.0) -> Tuple[float, float]:
    """Fill any missing value with the other value, plus an offset"""
    if np.isnan(x1):
        return x2 + cost, x2
    elif np.isnan(x2):
        return x1, x1 + cost
    else:
        return x1, x2

    
def squared_loss(x1: float, x2: float):
    """Returns the squared difference between two numbers"""
    return (x1 - x2) ** 2


def align_pka(
    experimental_pkas: np.ndarray,
    predicted_pkas: np.ndarray,
    cost_function: Callable[[float, float], float], # type annotation
):
    """Align pKas sequentialy and find the alignment that minimizes the cost.
    
    Parameters
    ----------
    experimental_pkas - Pandas DF with pKa and SEM columhs
    predicted_pkas - Pandas DF with pKa and SEM columhs
    cost_function - function to calculate the cost of any invidual mapping.    
    """
    n_experimental = experimental_pkas["pKa"].values.size
    n_predicted = predicted_pkas["pKa"].values.size
    # biggest size, and additional zero
    num_ka = max([n_experimental, n_predicted]) # making pKa errors same size
    exp = np.empty(num_ka)
    pred = np.empty(num_ka)
    sem_exp = np.empty(num_ka)
    sem_pred = np.empty(num_ka)
    pred[:] = np.nan
    exp[:] = np.nan
    sem_pred[:] = np.nan
    sem_exp[:] = np.nan

    experimental_pkas["pKa"].values.sort()
    predicted_pkas["pKa"].values.sort()
    exp[:n_experimental] = experimental_pkas["pKa"].values
    pred[:n_predicted] = predicted_pkas["pKa"].values

    sem_pred[:n_predicted] = predicted_pkas["SEM"].values
    sem_exp[:n_experimental] = experimental_pkas["SEM"].values

    min_cost = 1.0e14
    solution = deepcopy(pred)
    sol_sem = deepcopy(sem_pred)
    solution_cost = np.empty(num_ka)
    for _ in range(num_ka):
        pred = np.roll(pred, 1)
        sem_pred = np.roll(sem_pred, 1)
        cost = []
        for e1, p1 in np.array([deepcopy(exp), deepcopy(pred)]).T:
            e1, p1 = fixed_cost_solve(e1, p1, 14.00)
            cost.append(cost_function(e1, p1))
        total_cost = np.sum(cost)
        if total_cost < min_cost:
            solution = deepcopy(pred)
            min_cost = total_cost
            solution_cost = cost
            sol_sem = deepcopy(sem_pred)

    return pd.DataFrame.from_dict(
        {
            "Experimental": exp,
            "Experimental SEM": sem_exp,
            "Predicted": solution,
            "Predicted SEM": sol_sem,
            "Cost": solution_cost,
        }
    )

In [42]:
exp_pKa_values = np.array([2.0,3.0,4.0])
exp_pKa_SEMs = np.array([0.01,0.01,0.01])
pred_pKa_values = np.array([2.1,3.1,4.1])
pred_pKa_SEMs = np.array([0.5,0.5,0.5])

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values,
                         "SEM": exp_pKa_SEMs})

pred_pKas = pd.DataFrame({"pKa": pred_pKa_values,
                          "SEM": pred_pKa_SEMs})

align_pka(experimental_pkas = exp_pKas, predicted_pkas = pred_pKas, cost_function = squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.0,0.01,2.1,0.5,0.01
1,3.0,0.01,3.1,0.5,0.01
2,4.0,0.01,4.1,0.5,0.01


## Test cases (SM18)
Test if prediction pairs sometimes mismatched by hungarian method 
are matched in sequencial order with sequencial matching algorithm

SM18 prediction of 0hxtm method   
EXP: [9.58, 2.15]  PRED: [1.84, 0.5]  

SM18 prediction of yqkga method   
EXP: [2.15, 9.58, 11.02]  PRED: [2.14, 7.53, 9.26]  

In [43]:
# SM18 prediction of 0hxtm method   
# EXP: [9.58, 2.15]  PRED: [1.84, 0.5]  
        
exp_pKa_values = np.array([9.58, 2.15])
exp_pKa_SEMs = np.array([0.01,0.01])
pred_pKa_values = np.array([1.84, 0.5])
pred_pKa_SEMs = np.array([0.5,0.5])

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values,
                         "SEM": exp_pKa_SEMs})

pred_pKas = pd.DataFrame({"pKa": pred_pKa_values,
                          "SEM": pred_pKa_SEMs})

align_pka(experimental_pkas = exp_pKas, predicted_pkas = pred_pKas, cost_function = squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.15,0.01,0.5,0.5,2.7225
1,9.58,0.01,1.84,0.5,59.9076


In [44]:
# SM18 prediction of yqkga method   
# EXP: [2.15, 9.58, 11.02]  PRED: [2.14, 7.53, 9.26]  

exp_pKa_values = np.array([2.15,9.58,11.02])
exp_pKa_SEMs = np.array([0.01,0.01,0.01])
pred_pKa_values = np.array([2.14,7.53,9.26])
pred_pKa_SEMs = np.array([0.5,0.5,0.5])

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values,
                         "SEM": exp_pKa_SEMs})

pred_pKas = pd.DataFrame({"pKa": pred_pKa_values,
                          "SEM": pred_pKa_SEMs})

align_pka(experimental_pkas = exp_pKas, predicted_pkas = pred_pKas, cost_function = squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.15,0.01,2.14,0.5,0.0001
1,9.58,0.01,7.53,0.5,4.2025
2,11.02,0.01,9.26,0.5,3.0976


## Do hungarian matching and sequencial matching give the same results?
Take the hungarian collection and rematch by sequencial matching.
From sampl6-physicochemical-properties reporistory  
commit 389a9540 "Rerun type III analysis 20190913."

In [47]:
#https://github.com/choderalab/sampl6-physicochemical-properties/blob/master/analysis_of_pKa_predictions/analysis_of_typeIII_predictions/analysis_outputs_hungarian/typeIII_submission_collection.csv
hungarian_collection = "typeIII_submission_collection_hungarian_3891954.csv"

df_hungarian = pd.read_csv(hungarian_collection, index_col=0)
df_hungarian.head()

Unnamed: 0,$\Delta$pKa error (calc - exp),Molecule ID,name,pKa (calc),pKa (exp),pKa ID,pKa SEM (calc),pKa SEM (exp),participant,receipt_id
0,0.61,SM01,Full quantum chemical calculation of free ener...,10.14,9.53,SM01_pKa1,0.35,0.01,Philipp Pracht,xvxzd
1,-0.1,SM02,Full quantum chemical calculation of free ener...,4.93,5.03,SM02_pKa1,0.27,0.01,Philipp Pracht,xvxzd
2,0.5,SM03,Full quantum chemical calculation of free ener...,7.52,7.02,SM03_pKa1,0.31,0.01,Philipp Pracht,xvxzd
3,-0.85,SM04,Full quantum chemical calculation of free ener...,5.17,6.02,SM04_pKa1,0.28,0.01,Philipp Pracht,xvxzd
4,-0.23,SM05,Full quantum chemical calculation of free ener...,4.36,4.59,SM05_pKa1,0.27,0.01,Philipp Pracht,xvxzd


In [86]:
df_hungarian.columns.values

array(['$\\Delta$pKa error (calc - exp)', 'Molecule ID', 'name',
       'pKa (calc)', 'pKa (exp)', 'pKa ID', 'pKa SEM (calc)',
       'pKa SEM (exp)', 'participant', 'receipt_id'], dtype=object)

In [94]:
# Create a list construct sequencial matching collection. It will be converted to pandas dataframe in the end.
sequencial_collection_list = []

# Iterate through each submission
receipt_ids = set(df_hungarian["receipt_id"])

for receipt_id in receipt_ids:
#for receipt_id in ["xvxzd"]:
    df_1method = df_hungarian[df_hungarian["receipt_id"] == receipt_id]
    
    method_name =  df_1method["name"].values[0]
    participant = df_1method["participant"].values[0]
    
    molecule_ids = list(df_1method["Molecule ID"])
    # remove repeating molecule IDs
    molecule_ids = list(dict.fromkeys(molecule_ids))

    for molecule_id in molecule_ids:
    #for molecule_id in ["SM18"]:
        df_1method_1mol = df_1method[df_1method["Molecule ID"] == molecule_id]
        
        exp_pKa_values = df_1method_1mol["pKa (exp)"].values
        exp_pKa_SEMs = df_1method_1mol["pKa SEM (exp)"].values
        
        pred_pKa_values = df_1method_1mol["pKa (calc)"].values
        pred_pKa_SEMs = df_1method_1mol["pKa SEM (calc)"].values    
       
        exp_pKas = pd.DataFrame({"pKa": exp_pKa_values,
                         "SEM": exp_pKa_SEMs})

        pred_pKas = pd.DataFrame({"pKa": pred_pKa_values,
                          "SEM": pred_pKa_SEMs})
        
        df_aligned = align_pka(experimental_pkas = exp_pKas, predicted_pkas = pred_pKas, cost_function = squared_loss)
        
        for i,row in df_aligned.iterrows():
            exp_pKa = row["Experimental"]
            exp_pKa_SEM = row["Experimental SEM"]
            pred_pKa = row["Predicted"]
            pred_pKa_SEM = row["Predicted SEM"]
            pKa_error = pred_pKa - exp_pKa
            
            # Find pKa ID by matching by experimental pKa value
            pKa_ID = df_1method_1mol[df_1method_1mol["pKa (exp)"] == exp_pKa]["pKa ID"].values[0]

            # Creat the entry for sequencial matching collection dataframe
            sequencial_collection_list.append([pKa_error, molecule_id, method_name, pred_pKa, exp_pKa, pKa_ID,
                                              pred_pKa_SEM, exp_pKa_SEM, participant, receipt_id])
        
    
# Convert to pandas dataframe
df_sequencial = pd.DataFrame(sequencial_collection_list , columns = df_hungarian.columns.values)
df_sequencial.head()

Unnamed: 0,$\Delta$pKa error (calc - exp),Molecule ID,name,pKa (calc),pKa (exp),pKa ID,pKa SEM (calc),pKa SEM (exp),participant,receipt_id
0,-0.44,SM01,"EC-RISM/MP2/6-311+G(d,p)-P3NI-phi-noThiols-2par",9.09,9.53,SM01_pKa1,1.77,0.01,Stefan Kast,nb004
1,-1.15,SM02,"EC-RISM/MP2/6-311+G(d,p)-P3NI-phi-noThiols-2par",3.88,5.03,SM02_pKa1,1.77,0.01,Stefan Kast,nb004
2,2.58,SM03,"EC-RISM/MP2/6-311+G(d,p)-P3NI-phi-noThiols-2par",9.6,7.02,SM03_pKa1,1.77,0.01,Stefan Kast,nb004
3,-1.5,SM04,"EC-RISM/MP2/6-311+G(d,p)-P3NI-phi-noThiols-2par",4.52,6.02,SM04_pKa1,1.77,0.01,Stefan Kast,nb004
4,1.73,SM05,"EC-RISM/MP2/6-311+G(d,p)-P3NI-phi-noThiols-2par",6.32,4.59,SM05_pKa1,1.77,0.01,Stefan Kast,nb004


In [93]:
df_sequencial.to_csv("typeIII_submission_collection_sequencial_matching_after_hungarian.csv")



### Compare if sequencial and hungarian algorithms produce the same exp-pred pKa pairs?
I will compare predicted pKa values matched to each pKa ID to check if the matching is the same.

In [105]:
matching_comparison_list = []

# Iterate through methods
receipt_ids = set(df_hungarian["receipt_id"])

#for receipt_id in receipt_ids:
for receipt_id in ["xvxzd"]:
    df_hungarian_1method = df_hungarian[df_hungarian["receipt_id"] == receipt_id]
    df_sequencial_1method = df_sequencial[df_sequencial["receipt_id"] == receipt_id]
    
    # Iterate through pKa ID
    pKa_ids = list(df_hungarian_1method["pKa ID"])
    for pKa_id in pKa_ids:
        
        #Slice data frames by pKa IDs 
        hungarian_row = df_hungarian_1method[df_hungarian_1method["pKa ID"] == pKa_id]
        hungarian_pred_pKa = hungarian_row["pKa (calc)"].values[0]
        hungarian_exp_pKa = hungarian_row["pKa (exp)"].values[0]
        
        sequencial_row = df_sequencial_1method[df_sequencial_1method["pKa ID"] == pKa_id]
        sequencial_pred_pKa = sequencial_row["pKa (calc)"].values[0]
        sequencial_exp_pKa = sequencial_row["pKa (exp)"].values[0]
        
        molecule_id = hungarian_row["Molecule ID"].values[0]
        receipt_id = hungarian_row["receipt_id"].values[0]
        
        if hungarian_pred_pKa == sequencial_pred_pKa:
            equal_match = True
        else:
            equal_match = False
            
        matching_comparison_list.append([molecule_id, equal_match, pKa_id, hungarian_exp_pKa, 
                                         sequencial_exp_pKa, hungarian_pred_pKa, sequencial_pred_pKa, receipt_id ])
            


# Convert to pandas dataframe
df_hun_seq_comparison  = pd.DataFrame(matching_comparison_list , columns = ["Molecule ID", "Equal match", "pKa ID",
                                                                           "Seq pKa (exp)","Hun pKa (exp)",
                                                                           "Seq pKa (calc)", "Hun pKa (calc)", "receipt_id"])
df_hun_seq_comparison.head()

Unnamed: 0,Molecule ID,Equal match,pKa ID,Seq pKa (exp),Hun pKa (exp),Seq pKa (calc),Hun pKa (calc),receipt_id
0,SM01,True,SM01_pKa1,9.53,9.53,10.14,10.14,xvxzd
1,SM02,True,SM02_pKa1,5.03,5.03,4.93,4.93,xvxzd
2,SM03,True,SM03_pKa1,7.02,7.02,7.52,7.52,xvxzd
3,SM04,True,SM04_pKa1,6.02,6.02,5.17,5.17,xvxzd
4,SM05,True,SM05_pKa1,4.59,4.59,4.36,4.36,xvxzd


In [106]:
# Are there any row with Equal match == False?

df_hun_seq_comparison[df_hun_seq_comparison["Equal match"] == False]

Unnamed: 0,Molecule ID,Equal match,pKa ID,Seq pKa (exp),Hun pKa (exp),Seq pKa (calc),Hun pKa (calc),receipt_id
