In [1]:
import pandas as pd
import numpy as np

In [2]:
df_exp = pd.read_csv("experimental_microstates.csv")
df_exp

Unnamed: 0,Microstate ID of A,Microstate ID of HA,Molecule ID,pKa (exp),pKa SEM (exp),pKa ID,Microstate identification source
0,SM07_micro004,SM07_micro006,SM07,6.08,0.01,SM07_pKa1,NMR measurement
1,SM14_micro001,SM14_micro002,SM14,5.3,0.01,SM14_pKa2,NMR measurement
2,SM14_micro002,SM14_micro003,SM14,2.58,0.01,SM14_pKa1,NMR measurement
3,SM02_micro001,SM02_micro004,SM02,5.03,0.01,SM02_pKa1,Estimated based on SM07 NMR measurement
4,SM04_micro003,SM04_micro002,SM04,6.02,0.01,SM04_pKa1,Estimated based on SM07 NMR measurement
5,SM09_micro003,SM09_micro001,SM09,5.37,0.01,SM09_pKa1,Estimated based on SM07 NMR measurement
6,SM12_micro012,SM12_micro006,SM12,5.28,0.01,SM12_pKa1,Estimated based on SM07 NMR measurement
7,SM13_micro005,SM13_micro001,SM13,5.77,0.01,SM13_pKa1,Estimated based on SM07 NMR measurement
8,SM15_micro004,SM15_micro002,SM15,8.94,0.01,SM15_pKa2,Estimated based on SM14 NMR measurement
9,SM15_micro002,SM15_micro003,SM15,4.7,0.01,SM15_pKa1,Estimated based on SM14 NMR measurement


In [35]:
df_pred = pd.read_csv("nb008-976-typeI-epik_microscopic-1-predictions.csv")
df_pred.head(30)

Unnamed: 0,Microstate ID of HA,Microstate ID of A,pKa mean,pKa SEM
0,SM01_micro001,SM01_micro005,-0.48,0.9
1,SM01_micro001,SM01_micro006,9.6,1.12
2,SM01_micro004,SM01_micro002,13.29,2.0
3,SM01_micro005,SM01_micro004,9.12,1.12
4,SM01_micro005,SM01_micro010,12.93,2.0
5,SM01_micro006,SM01_micro004,-0.48,0.9
6,SM01_micro010,SM01_micro002,9.4,1.47
7,SM02_micro002,SM02_micro011,13.21,0.73
8,SM02_micro003,SM02_micro011,5.9,0.86
9,SM02_micro004,SM02_micro002,4.05,1.47


In [29]:
def microstate_matching(pred_pKas, pred_pKa_SEMs, pred_HA_microstate_IDs, pred_A_microstate_IDs,
                        exp_pKas, exp_pKa_SEMs, exp_HA_microstate_IDs, exp_A_microstate_IDs, exp_pKa_IDs):
    """
    Matches experimental and predicted micoscopic pKas based on reported microstate ID pairs.
    Both microstate IDs and the order should match the experiment, for microscopic pKa match.

    :param pred_pKas:
    :param exp_pKa_means:
    :param exp_pKa_SEMs:
    :param exp_pKa_IDs:
    :return:
    """
    matched = pd.DataFrame()
    
    # Create a list of experimental microstate ID pairs (list of tuples)
    exp_ID_pairs = []
    for i in range(len(exp_HA_microstate_IDs)):
        exp_ID_pair = (exp_HA_microstate_IDs[i], exp_A_microstate_IDs[i])
        exp_ID_pairs.append(exp_ID_pair)
        
    # Iterate through predicted ID pairs to see if it matches to any experimental ID pair.
    for i in range(len(pred_HA_microstate_IDs)):
        pred_ID_pair = (pred_HA_microstate_IDs[i], pred_A_microstate_IDs[i])
        
        # Check for matches in experimental microstate ID pair list
        for j, exp_ID_pair in enumerate(exp_ID_pairs):
            if pred_ID_pair == exp_ID_pair:
                match = {"pKa mean(pred)" : pred_pKas[i], "pKa SEM(pred)": pred_pKa_SEMs[i],
                         'microstate ID of HA(pred)': pred_HA_microstate_IDs[i],
                         'microstate ID of A(pred)': pred_A_microstate_IDs[i], 
                         'pKa mean(exp)': exp_pKas[j], 'pKa SEM(exp)': exp_pKa_SEMs[j], 
                         'microstate ID of HA(exp)': exp_HA_microstate_IDs[j],
                         'microstate ID of A(exp)': exp_A_microstate_IDs[j],
                         'pKa ID': exp_pKa_IDs[j]}
    
                matched = matched.append(match, ignore_index=True)
    
    return matched

In [30]:
def add_pKa_IDs_to_matching_predictions_microstate_based_matching(df_pred, df_exp):
    """Add pKa ID column to dataframe of predictions based on
    the minimum error match to experimental pKas.

    Args:
        df_pred: Pandas Dataframe of pKa predictions
        df_exp: Pandas Dataframe of experimental pKas (stacked)

    Returns:
        df_pred_matched: A dataframe of predicted pKa values that gave the best match to experimental values.
        Other predicted pKa values are ignored.
        df_pred_unmatched: A dataframe of predicted pKas that were not matched to experimental pKa values

    """

    # iterate over molecule IDs of the submission
    df_pred["pKa ID"] = np.NaN
    
    # Create a column of molecule IDs
    #df_pred["Molecule ID"] = df_pred.index
    df_pred["Molecule ID"] = np.NaN
    for i, row in enumerate(df_pred.iterrows()):
        mol_id = df_pred.loc[i, "Microstate ID of HA"].split("_")[0]
        df_pred.loc[i, "Molecule ID"] = mol_id

    for mol_id, df_pred_mol in df_pred.groupby("Molecule ID"):
        print("mol_id: ", mol_id)
        print("df_pred_mol:\n", df_pred_mol)
        
        df_exp_mol = df_exp[df_exp["Molecule ID"] == mol_id]
        print("df_exp_mol:\n",df_exp_mol)
        # Create numpy array of predicted pKas and SEMS
        pred_pKas = np.array(df_pred_mol["pKa mean"]) # if there is multiple predicted pKas
        pred_pKa_SEMs = np.array(df_pred_mol["pKa SEM"])
        try:
            len(df_pred_mol["pKa mean"])
        except TypeError:
            pred_pKas = np.array([df_pred_mol["pKa mean"]])  # if there is single predicted pKa
            pred_pKa_SEMs = np.array([df_pred_mol["pKa SEM"]])
            
        # Create numpy arrays of microstates IDs of HA and microstate IDs of A
        pred_HA_microstate_IDs = np.array(df_pred_mol["Microstate ID of HA"]) # if there is multiple predicted pKas
        pred_A_microstate_IDs = np.array(df_pred_mol["Microstate ID of A"]) # if there is multiple predicted pKas
        try:
            len(df_pred_mol["Microstate ID of HA"])
        except TypeError:
            pred_HA_microstate_IDs = np.array([df_pred_mol["Microstate ID of HA"]])  # if there is single predicted pKa
            pred_A_microstate_IDs = np.array([df_pred_mol["Microstate ID of A"]])  # if there is single predicted pKa

        # Create numpy array of experimental pKa means, pKa SEM , pKa_ID, experimental microstate ID of HA and H
        exp_pKa_means = np.array(df_exp_mol.loc[:, "pKa (exp)"].values)
        exp_pKa_SEMs = np.array(df_exp_mol.loc[:, "pKa SEM (exp)"].values)
        exp_pKa_IDs = np.array(df_exp_mol.loc[:, "pKa ID"].values)
        exp_HA_microstate_IDs = np.array(df_exp_mol.loc[:, "Microstate ID of HA"].values)
        exp_A_microstate_IDs = np.array(df_exp_mol.loc[:, "Microstate ID of A"].values)

        # Match predicted pKas to experimental pKa that gives the smallest error
        df_pKa_match = microstate_matching(pred_pKas, pred_pKa_SEMs, pred_HA_microstate_IDs, pred_A_microstate_IDs,
                                           exp_pKa_means, exp_pKa_SEMs, exp_HA_microstate_IDs, exp_A_microstate_IDs,
                                           exp_pKa_IDs)
        df_pKa_match["Molecule ID"] = mol_id
        print("df_pKa_match:\n", df_pKa_match)

        # Add matched pKa IDs to prediction data frame
        for index, row in enumerate(df_pKa_match.iterrows()):
            pred_pKa = row[1]["pKa mean(pred)"]
            pKa_ID = row[1]["pKa ID"]

            # store in the correct position in prediction dataframe

            df_pred.loc[(df_pred["Molecule ID"] == mol_id) & (df_pred["pKa mean"] == pred_pKa), "pKa ID"] = pKa_ID

    # Save unmatched pKas in df_pred_unmatched dataframe
    df_pred_unmatched = df_pred.loc[pd.isnull(df_pred["pKa ID"])]

    # Drop predicted pKas that didn't match to experimental values
    df_pred_matched = df_pred.dropna(subset=["pKa ID"]).reset_index(drop=True)

    # If there are multiple microscopic pKas with the same exact value keep only the first one
    df_pred_matched.drop_duplicates(subset="pKa mean", keep="first", inplace=True)

    return df_pred_matched, df_pred_unmatched

In [31]:
(df_pred_matched, df_pred_unmatched) = add_pKa_IDs_to_matching_predictions_microstate_based_matching(df_pred, df_exp)

mol_id:  SM01
df_pred_mol:
   Microstate ID of HA Microstate ID of A  pKa mean  pKa SEM  pKa ID  \
0       SM01_micro001      SM01_micro005     -0.48     0.90     NaN   
1       SM01_micro001      SM01_micro006      9.60     1.12     NaN   
2       SM01_micro004      SM01_micro002     13.29     2.00     NaN   
3       SM01_micro005      SM01_micro004      9.12     1.12     NaN   
4       SM01_micro005      SM01_micro010     12.93     2.00     NaN   
5       SM01_micro006      SM01_micro004     -0.48     0.90     NaN   
6       SM01_micro010      SM01_micro002      9.40     1.47     NaN   

  Molecule ID  
0        SM01  
1        SM01  
2        SM01  
3        SM01  
4        SM01  
5        SM01  
6        SM01  
df_exp_mol:
 Empty DataFrame
Columns: [Microstate ID of A, Microstate ID of HA, Molecule ID, pKa (exp), pKa SEM (exp), pKa ID, Microstate identification source]
Index: []
df_pKa_match:
 Empty DataFrame
Columns: [Molecule ID]
Index: []
mol_id:  SM02
df_pred_mol:
    Microstat

mol_id:  SM16
df_pred_mol:
     Microstate ID of HA Microstate ID of A  pKa mean  pKa SEM  pKa ID  \
145       SM16_micro002      SM16_micro005      9.85     2.22     NaN   
146       SM16_micro004      SM16_micro002      4.71     2.22     NaN   
147       SM16_micro004      SM16_micro007      9.81     2.22     NaN   
148       SM16_micro006      SM16_micro002     -0.63     0.90     NaN   
149       SM16_micro007      SM16_micro005      7.67     0.94     NaN   

    Molecule ID  
145        SM16  
146        SM16  
147        SM16  
148        SM16  
149        SM16  
df_exp_mol:
 Empty DataFrame
Columns: [Microstate ID of A, Microstate ID of HA, Molecule ID, pKa (exp), pKa SEM (exp), pKa ID, Microstate identification source]
Index: []
df_pKa_match:
 Empty DataFrame
Columns: [Molecule ID]
Index: []
mol_id:  SM17
df_pred_mol:
     Microstate ID of HA Microstate ID of A  pKa mean  pKa SEM  pKa ID  \
150       SM17_micro011      SM17_micro010       4.9     0.75     NaN   

    Molecule ID

In [32]:
df_pred_matched

Unnamed: 0,Microstate ID of HA,Microstate ID of A,pKa mean,pKa SEM,pKa ID,Molecule ID
0,SM04_micro002,SM04_micro003,5.6,1.13,SM04_pKa1,SM04
2,SM09_micro001,SM09_micro003,4.05,1.47,SM09_pKa1,SM09
4,SM13_micro001,SM13_micro005,4.27,2.22,SM13_pKa1,SM13
5,SM14_micro002,SM14_micro001,6.35,2.22,SM14_pKa2,SM14
6,SM14_micro003,SM14_micro002,0.28,0.92,SM14_pKa1,SM14
7,SM15_micro002,SM15_micro004,8.71,1.06,SM15_pKa2,SM15
8,SM15_micro003,SM15_micro002,5.82,2.22,SM15_pKa1,SM15


In [33]:
df_pred_unmatched

Unnamed: 0,Microstate ID of HA,Microstate ID of A,pKa mean,pKa SEM,pKa ID,Molecule ID
0,SM01_micro001,SM01_micro005,-0.48,0.90,,SM01
1,SM01_micro001,SM01_micro006,9.60,1.12,,SM01
2,SM01_micro004,SM01_micro002,13.29,2.00,,SM01
3,SM01_micro005,SM01_micro004,9.12,1.12,,SM01
4,SM01_micro005,SM01_micro010,12.93,2.00,,SM01
5,SM01_micro006,SM01_micro004,-0.48,0.90,,SM01
6,SM01_micro010,SM01_micro002,9.40,1.47,,SM01
7,SM02_micro002,SM02_micro011,13.21,0.73,,SM02
8,SM02_micro003,SM02_micro011,5.90,0.86,,SM02
9,SM02_micro004,SM02_micro002,4.05,1.47,,SM02
