In [26]:
import numpy as np
import pandas as pd
import scipy.optimize
import scipy.stats
import seaborn as sns
from matplotlib import pyplot as plt

def hungarian_matching(pred_pKas, exp_pKa_means, exp_pKa_SEMs, exp_pKa_IDs):
    """Match predicted pKas to experimental pKas with the Hungarian algorithm.

    The assignment minimizes the total squared error between matched
    predicted and experimental pKa values. The cost matrix is padded with
    zero-cost dummy rows/columns to make it square, so surplus predicted or
    experimental values are simply left unmatched.

    Original source by Kiril Lanevskij (ACD Labs).

    Args:
        pred_pKas: sequence of predicted pKa values.
        exp_pKa_means: sequence of experimental pKa values.
        exp_pKa_SEMs: sequence parallel to ``exp_pKa_means``; copied into the
            "pKa SEM" column (presumably the standard error of each mean).
        exp_pKa_IDs: sequence parallel to ``exp_pKa_means``; copied into the
            "pKa ID" column.

    Returns:
        pandas.DataFrame with one row per matched pair and the columns
        "pKa ID", "pKa SEM", "pKa mean" and "pred pKa". Rows follow the order
        of the experimental values. The frame is empty when either input has
        no values.
    """
    # Column order mirrors the outputs recorded in this notebook.
    columns = ["pKa ID", "pKa SEM", "pKa mean", "pred pKa"]

    n_exp = len(exp_pKa_means)
    n_pred = len(pred_pKas)
    # The cost matrix is square, sized by whichever set of pKas is larger.
    size = max(n_exp, n_pred)
    if size == 0:
        # linear_sum_assignment rejects an empty (non 2-D) cost matrix.
        return pd.DataFrame(columns=columns)

    # Rows => experimental pKas, columns => predicted pKas.
    # Out-of-bounds (padding) cells keep a cost of zero.
    cost = np.zeros((size, size), dtype=float)
    if n_exp and n_pred:
        experimental = np.asarray(exp_pKa_means, dtype=float)
        predicted = np.asarray(pred_pKas, dtype=float)
        # Cost is defined as the squared error of each candidate pair.
        cost[:n_exp, :n_pred] = (predicted[np.newaxis, :] - experimental[:, np.newaxis]) ** 2

    row_indices, col_indices = scipy.optimize.linear_sum_assignment(cost)

    # Collect all matches first and build the frame once; pairs that involve
    # a padding row or column are not real matches and are skipped.
    records = [
        {
            "pred pKa": pred_pKas[col_id],
            "pKa mean": exp_pKa_means[row_id],
            "pKa SEM": exp_pKa_SEMs[row_id],
            "pKa ID": exp_pKa_IDs[row_id],
        }
        for row_id, col_id in zip(row_indices, col_indices)
        if row_id < n_exp and col_id < n_pred
    ]
    matched = pd.DataFrame(records, columns=columns)

    # The assignment is one-to-one, so rows can only be identical when the
    # experimental inputs repeat the same (mean, SEM, ID) entry and the
    # repeats are matched to equal predicted values. Keep only the first.
    return matched.drop_duplicates(keep="first")

In [27]:
# Experimental pKas [2, 9] vs. predicted pKas [12, 9], squared-error cost.
# Pairing 2->9 and 9->12 costs 49 + 9 = 58, which beats
# pairing 2->12 and 9->9 at 100 + 0 = 100.
hungarian_matching(
    pred_pKas=np.array([12, 9]),
    exp_pKa_means=np.array([2, 9]),
    exp_pKa_SEMs=np.array([1, 2]),
    exp_pKa_IDs=["pKa1", "pKa2"],
)

Unnamed: 0,pKa ID,pKa SEM,pKa mean,pred pKa
0,pKa1,1.0,2.0,9.0
1,pKa2,2.0,9.0,12.0


In [28]:
def hungarian_matching_absolute_cost(pred_pKas, exp_pKa_means, exp_pKa_SEMs, exp_pKa_IDs):
    """Match predicted pKas to experimental pKas with the Hungarian algorithm.

    The assignment minimizes the total absolute error between matched
    predicted and experimental pKa values. Different pairings can have the
    same total absolute error; in that case the result is whichever optimal
    pairing the solver returns. The cost matrix is padded with zero-cost
    dummy rows/columns to make it square, so surplus predicted or
    experimental values are simply left unmatched.

    Original source by Kiril Lanevskij (ACD Labs).

    Args:
        pred_pKas: sequence of predicted pKa values.
        exp_pKa_means: sequence of experimental pKa values.
        exp_pKa_SEMs: sequence parallel to ``exp_pKa_means``; copied into the
            "pKa SEM" column (presumably the standard error of each mean).
        exp_pKa_IDs: sequence parallel to ``exp_pKa_means``; copied into the
            "pKa ID" column.

    Returns:
        pandas.DataFrame with one row per matched pair and the columns
        "pKa ID", "pKa SEM", "pKa mean" and "pred pKa". Rows follow the order
        of the experimental values. The frame is empty when either input has
        no values.
    """
    # Column order mirrors the outputs recorded in this notebook.
    columns = ["pKa ID", "pKa SEM", "pKa mean", "pred pKa"]

    n_exp = len(exp_pKa_means)
    n_pred = len(pred_pKas)
    # The cost matrix is square, sized by whichever set of pKas is larger.
    size = max(n_exp, n_pred)
    if size == 0:
        # linear_sum_assignment rejects an empty (non 2-D) cost matrix.
        return pd.DataFrame(columns=columns)

    # Rows => experimental pKas, columns => predicted pKas.
    # Out-of-bounds (padding) cells keep a cost of zero.
    cost = np.zeros((size, size), dtype=float)
    if n_exp and n_pred:
        experimental = np.asarray(exp_pKa_means, dtype=float)
        predicted = np.asarray(pred_pKas, dtype=float)
        # Cost is defined as the absolute error of each candidate pair.
        cost[:n_exp, :n_pred] = np.abs(predicted[np.newaxis, :] - experimental[:, np.newaxis])

    row_indices, col_indices = scipy.optimize.linear_sum_assignment(cost)

    # Collect all matches first and build the frame once; pairs that involve
    # a padding row or column are not real matches and are skipped.
    records = [
        {
            "pred pKa": pred_pKas[col_id],
            "pKa mean": exp_pKa_means[row_id],
            "pKa SEM": exp_pKa_SEMs[row_id],
            "pKa ID": exp_pKa_IDs[row_id],
        }
        for row_id, col_id in zip(row_indices, col_indices)
        if row_id < n_exp and col_id < n_pred
    ]
    matched = pd.DataFrame(records, columns=columns)

    # The assignment is one-to-one, so rows can only be identical when the
    # experimental inputs repeat the same (mean, SEM, ID) entry and the
    # repeats are matched to equal predicted values. Keep only the first.
    return matched.drop_duplicates(keep="first")

In [34]:
# Experimental pKas [9, 12] vs. predicted pKas [2, 9], absolute-error cost.
# Both possible pairings total 10 (9->9, 12->2: 0 + 10; 9->2, 12->9: 7 + 3),
# so the absolute-error cost does not single out one of them.
hungarian_matching_absolute_cost(
    pred_pKas=np.array([2, 9]),
    exp_pKa_means=np.array([9, 12]),
    exp_pKa_SEMs=np.array([1, 2]),
    exp_pKa_IDs=["pKa1", "pKa2"],
)

Unnamed: 0,pKa ID,pKa SEM,pKa mean,pred pKa
0,pKa1,1.0,9.0,9.0
1,pKa2,2.0,12.0,2.0


In [35]:
# Same inputs as the absolute-cost call above, now with the squared-error cost.
# Pairing 9->2 and 12->9 costs 49 + 9 = 58, which beats
# pairing 9->9 and 12->2 at 0 + 100 = 100, so the optimum is unique.
hungarian_matching(
    pred_pKas=np.array([2, 9]),
    exp_pKa_means=np.array([9, 12]),
    exp_pKa_SEMs=np.array([1, 2]),
    exp_pKa_IDs=["pKa1", "pKa2"],
)

Unnamed: 0,pKa ID,pKa SEM,pKa mean,pred pKa
0,pKa1,1.0,9.0,2.0
1,pKa2,2.0,12.0,9.0


In [49]:
# Experimental pKas [7, 9] vs. predicted pKas [4, 6], absolute-error cost.
# Both possible pairings total 6 (7->4, 9->6: 3 + 3; 7->6, 9->4: 1 + 5),
# so the absolute-error cost does not single out one of them.
hungarian_matching_absolute_cost(
    pred_pKas=np.array([4, 6]),
    exp_pKa_means=np.array([7, 9]),
    exp_pKa_SEMs=np.array([1, 2]),
    exp_pKa_IDs=["pKa1", "pKa2"],
)

Unnamed: 0,pKa ID,pKa SEM,pKa mean,pred pKa
0,pKa1,1.0,7.0,6.0
1,pKa2,2.0,9.0,4.0


In [50]:
# Same inputs as the absolute-cost call above, now with the squared-error cost.
# Pairing 7->4 and 9->6 costs 9 + 9 = 18, which beats
# pairing 7->6 and 9->4 at 1 + 25 = 26, so the optimum is unique.
hungarian_matching(
    pred_pKas=np.array([4, 6]),
    exp_pKa_means=np.array([7, 9]),
    exp_pKa_SEMs=np.array([1, 2]),
    exp_pKa_IDs=["pKa1", "pKa2"],
)

Unnamed: 0,pKa ID,pKa SEM,pKa mean,pred pKa
0,pKa1,1.0,7.0,4.0
1,pKa2,2.0,9.0,6.0
