## Testing how the align_pka function works for sequential matching of predicted and experimental pKa values.

The align_pka function is taken from:
https://github.com/choderalab/titrato/blob/master/titrato/titrato.py#L556

In [35]:
import numpy as np
import pandas as pd
import logging
from copy import deepcopy

# Make sure to install typing_extensions
from typing import (
    Dict,
    List,
    Iterator,
    Union,
    Iterable,
    Tuple,
    Any,
    Optional,
    Callable,
    TypeVar,
)


# Only used to fill in the nans
def fixed_cost_solve(x1: float, x2: float, cost: float = 10.0) -> Tuple[float, float]:
    """Replace a missing (NaN) member of a pair with the other member plus ``cost``.

    Parameters
    ----------
    x1 - first value of the pair, may be NaN
    x2 - second value of the pair, may be NaN
    cost - offset added to the present value to stand in for the missing one

    Returns
    -------
    The pair ``(x1, x2)`` with a NaN entry substituted; the pair is returned
    unchanged when neither value is NaN. ``x1`` is checked first, so if both
    are NaN the result is derived from ``x2`` and is still NaN.
    """
    first_is_missing = np.isnan(x1)

    # Common case: nothing to fill in.
    if not first_is_missing and not np.isnan(x2):
        return x1, x2

    if first_is_missing:
        return x2 + cost, x2

    # Only the second value is missing.
    return x1, x1 + cost

    
def squared_loss(x1: float, x2: float):
    """Squared-error cost: the square of the difference ``x1 - x2``."""
    difference = x1 - x2
    return difference ** 2


def align_pka(
    experimental_pkas: pd.DataFrame,
    predicted_pkas: pd.DataFrame,
    cost_function: Callable[[float, float], float],
    missing_cost: float = 14.0,
) -> pd.DataFrame:
    """Align pKas sequentially and return the alignment with the lowest total cost.

    Both sets of pKas are sorted in ascending order and padded with NaN to a
    common length. The padded predictions are then shifted cyclically one
    position at a time (``np.roll``); for every shift the pairwise costs are
    summed, and the shift with the smallest total is kept.

    Parameters
    ----------
    experimental_pkas - pandas DataFrame with "pKa" and "SEM" columns
    predicted_pkas - pandas DataFrame with "pKa" and "SEM" columns
    cost_function - function that returns the cost of one (experimental,
        predicted) pairing
    missing_cost - offset passed to ``fixed_cost_solve`` to stand in for a
        value that has no partner (NaN padding)

    Returns
    -------
    pandas DataFrame with the columns "Experimental", "Experimental SEM",
    "Predicted", "Predicted SEM" and "Cost", one row per aligned position.
    Unmatched positions hold NaN in the pKa/SEM columns of the shorter set.

    Notes
    -----
    The input DataFrames are not modified. Rows are sorted as a whole, so each
    SEM stays attached to the pKa it belongs to.
    """
    # sort_values returns a new frame, leaving the caller's data untouched and
    # keeping every SEM next to its own pKa. A stable sort makes the order of
    # tied pKa values deterministic.
    experimental_sorted = experimental_pkas.sort_values("pKa", kind="mergesort")
    predicted_sorted = predicted_pkas.sort_values("pKa", kind="mergesort")

    n_experimental = len(experimental_sorted)
    n_predicted = len(predicted_sorted)
    # Pad the shorter set with NaN so both arrays have the same length.
    num_ka = max(n_experimental, n_predicted)

    exp = np.full(num_ka, np.nan)
    pred = np.full(num_ka, np.nan)
    sem_exp = np.full(num_ka, np.nan)
    sem_pred = np.full(num_ka, np.nan)

    exp[:n_experimental] = experimental_sorted["pKa"].to_numpy()
    sem_exp[:n_experimental] = experimental_sorted["SEM"].to_numpy()
    pred[:n_predicted] = predicted_sorted["pKa"].to_numpy()
    sem_pred[:n_predicted] = predicted_sorted["SEM"].to_numpy()

    # Start from "no alignment found": infinite cost and NaN per-pair costs,
    # so the result never exposes uninitialised memory.
    min_cost = np.inf
    solution = pred.copy()
    sol_sem = sem_pred.copy()
    solution_cost = np.full(num_ka, np.nan)

    for _ in range(num_ka):
        # np.roll returns a fresh array, so a stored best solution is never
        # altered by later shifts.
        pred = np.roll(pred, 1)
        sem_pred = np.roll(sem_pred, 1)

        cost = [
            cost_function(*fixed_cost_solve(e1, p1, missing_cost))
            for e1, p1 in zip(exp, pred)
        ]
        total_cost = np.sum(cost)

        if total_cost < min_cost:
            min_cost = total_cost
            solution = pred
            sol_sem = sem_pred
            solution_cost = cost

    return pd.DataFrame.from_dict(
        {
            "Experimental": exp,
            "Experimental SEM": sem_exp,
            "Predicted": solution,
            "Predicted SEM": sol_sem,
            "Cost": solution_cost,
        }
    )

In [42]:
# Baseline example: three experimental pKas, each prediction offset by 0.1.
exp_pKa_values = np.array([2.0, 3.0, 4.0])
pred_pKa_values = np.array([2.1, 3.1, 4.1])

# Every value in a set shares the same SEM.
exp_pKa_SEMs = np.full(3, 0.01)
pred_pKa_SEMs = np.full(3, 0.5)

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values, "SEM": exp_pKa_SEMs})
pred_pKas = pd.DataFrame({"pKa": pred_pKa_values, "SEM": pred_pKa_SEMs})

# Last expression of the cell, so the notebook displays the aligned table.
align_pka(exp_pKas, pred_pKas, squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.0,0.01,2.1,0.5,0.01
1,3.0,0.01,3.1,0.5,0.01
2,4.0,0.01,4.1,0.5,0.01


## Test cases (SM18)
Test whether prediction pairs that are sometimes mismatched by the Hungarian method
are matched in sequential order by the sequential matching algorithm.

SM18 prediction of 0hxtm method   
EXP: [9.58, 2.15]  PRED: [1.84, 0.5]  

SM18 prediction of yqkga method   
EXP: [2.15, 9.58, 11.02]  PRED: [2.14, 7.53, 9.26]  

In [43]:
# SM18, method 0hxtm
#   EXP:  [9.58, 2.15]
#   PRED: [1.84, 0.5]
# Both inputs are deliberately given in unsorted order.

exp_pKa_values = np.array([9.58, 2.15])
pred_pKa_values = np.array([1.84, 0.5])

# Every value in a set shares the same SEM.
exp_pKa_SEMs = np.full(2, 0.01)
pred_pKa_SEMs = np.full(2, 0.5)

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values, "SEM": exp_pKa_SEMs})
pred_pKas = pd.DataFrame({"pKa": pred_pKa_values, "SEM": pred_pKa_SEMs})

# Last expression of the cell, so the notebook displays the aligned table.
align_pka(exp_pKas, pred_pKas, squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.15,0.01,0.5,0.5,2.7225
1,9.58,0.01,1.84,0.5,59.9076


In [44]:
# SM18, method yqkga
#   EXP:  [2.15, 9.58, 11.02]
#   PRED: [2.14, 7.53, 9.26]

exp_pKa_values = np.array([2.15, 9.58, 11.02])
pred_pKa_values = np.array([2.14, 7.53, 9.26])

# Every value in a set shares the same SEM.
exp_pKa_SEMs = np.full(3, 0.01)
pred_pKa_SEMs = np.full(3, 0.5)

exp_pKas = pd.DataFrame({"pKa": exp_pKa_values, "SEM": exp_pKa_SEMs})
pred_pKas = pd.DataFrame({"pKa": pred_pKa_values, "SEM": pred_pKa_SEMs})

# Last expression of the cell, so the notebook displays the aligned table.
align_pka(exp_pKas, pred_pKas, squared_loss)

Unnamed: 0,Experimental,Experimental SEM,Predicted,Predicted SEM,Cost
0,2.15,0.01,2.14,0.5,0.0001
1,9.58,0.01,7.53,0.5,4.2025
2,11.02,0.01,9.26,0.5,3.0976
