In [47]:
from sampl_pH_0_12 import SAMPL6DataProvider
import pandas as pd
import numpy as np

In [69]:
# molecules with known microstates
mol_IDs = ["SM07", "SM02", "SM04", "SM09", "SM12", "SM13", "SM14", "SM15"]


df_exp_microstates = pd.read_csv("experimental_microstates_with_charge.csv")
df_exp_microstates

Unnamed: 0,Microstate ID of A,Charge of A,Microstate ID of HA,Charge of HA,Molecule ID,pKa (exp),pKa SEM (exp),pKa ID,Microstate identification source
0,SM07_micro004,0,SM07_micro006,1,SM07,6.08,0.01,SM07_pKa1,NMR measurement
1,SM14_micro001,0,SM14_micro002,1,SM14,5.3,0.01,SM14_pKa2,NMR measurement
2,SM14_micro002,1,SM14_micro003,2,SM14,2.58,0.01,SM14_pKa1,NMR measurement
3,SM02_micro002,0,SM02_micro004,1,SM02,5.03,0.01,SM02_pKa1,Estimated based on SM07 NMR measurement
4,SM04_micro003,0,SM04_micro002,1,SM04,6.02,0.01,SM04_pKa1,Estimated based on SM07 NMR measurement
5,SM09_micro003,0,SM09_micro001,1,SM09,5.37,0.01,SM09_pKa1,Estimated based on SM07 NMR measurement
6,SM12_micro012,0,SM12_micro006,1,SM12,5.28,0.01,SM12_pKa1,Estimated based on SM07 NMR measurement
7,SM13_micro005,0,SM13_micro001,1,SM13,5.77,0.01,SM13_pKa1,Estimated based on SM07 NMR measurement
8,SM15_micro004,-1,SM15_micro002,0,SM15,8.94,0.01,SM15_pKa2,Estimated based on SM14 NMR measurement
9,SM15_micro002,0,SM15_micro003,1,SM15,4.7,0.01,SM15_pKa1,Estimated based on SM14 NMR measurement


1. calculate $\Delta G$ of each microstate at pH 0
2. determine lowest energy tautomer of each charge state
3. compare sequence of predicted dominant states to experimental states

In [5]:
jaguartypei = SAMPL6DataProvider("typeI-raw-jaguar.csv", "typei", "Jaguar", bootstrap_options={"n_samples": 1})
exp = SAMPL6DataProvider("SAMPL6_experimental_pkas.csv", "exp", "Experiment", bootstrap_options={"n_samples": 3})
#"experimental_microstates.csv"
jaguartypei.__dict__

{'file_path': '/Users/isikm/lab/SAMPL6-repos/sampl6-physicochemical-properties/analysis_of_pKa_predictions/20191003_typeI_microstate_deltaG_calculation/typeI-raw-jaguar.csv',
 'data_type': 'typei',
 'method_desc': 'Jaguar',
 'label': 'Jaguar',
 'load_opts': {},
 'bootstrap_opts': {'n_samples': 1},
 '_typeiii_charge_file': None,
 'load': <function sampl_pH_0_12.SAMPL6DataProvider.__init__.<locals>.<lambda>(mol_id)>,
 'can_bootstrap': True,
 'bootstrap': <function sampl_pH_0_12.SAMPL6DataProvider.__init__.<locals>.<lambda>(mol_id, n_bootstrap)>}

In [4]:
SM07_exp = exp.load("SM07")
SM07_jag = jaguartypei.load("SM07")

# show properties as example

SM07_jag.__dict__

{'free_energies': array([[ 33.52563895,  33.24932874,  32.97301853, ...,   6.44723826,
           6.17092805,   5.89461784],
        [ 11.90436493,  11.90436493,  11.90436493, ...,  11.90436493,
          11.90436493,  11.90436493],
        [-11.95041663, -11.67410642, -11.39779621, ...,  15.12798406,
          15.40429427,  15.68060448],
        ...,
        [ -5.24989401,  -4.69727359,  -4.14465317, ...,  48.90690738,
          49.4595278 ,  50.01214822],
        [ 14.29905343,  14.85167385,  15.40429427, ...,  68.45585481,
          69.00847524,  69.56109566],
        [ 12.61816631,  13.44709694,  14.27602758, ...,  93.85336839,
          94.68229902,  95.51122966]]),
 'populations': array([[1.59710596e-20, 2.77618750e-20, 4.82542589e-20, ...,
         1.58202052e-03, 2.08445986e-03, 2.74603171e-03],
        [3.92043023e-11, 5.16950302e-11, 6.81609524e-11, ...,
         6.74857552e-06, 6.74517988e-06, 6.74070851e-06],
        [8.98118681e-01, 8.98356664e-01, 8.98536348e-01, ...,
   

In [31]:

microstate_data = []

for mol_ID in mol_IDs:
    jag_1mol = jaguartypei.load(mol_ID)
    microstate_IDs = jag_1mol.state_ids
    charges_of_microstates = jag_1mol.charges
    free_energies_of_microstates_pH0 = jag_1mol.free_energies[:,0]
    
    print(mol_ID)
    print("Microstates:\n", microstate_IDs)
    print("Charges:", charges_of_microstates)
    print("Free energies:\n",free_energies_of_microstates_pH0)
    print()
    
    for i, microstate_ID in enumerate(microstate_IDs):
    
        microstate_data.append({
            'Molecule ID': mol_ID,
            'Microstate ID': microstate_ID,
            'Charge': charges_of_microstates[i],
            '$\Delta$G (pH=0)': free_energies_of_microstates_pH0[i]
        })

df_microstate_data = pd.DataFrame(data=microstate_data)
df_microstate_data

SM07
Microstates:
 ['SM07_micro012', 'SM07_micro002', 'SM07_micro006', 'SM07_micro003', 'SM07_micro007', 'SM07_micro004', 'SM07_micro011', 'SM07_micro013', 'SM07_micro014', 'SM07_micro015', 'SM07_micro016']
Charges: [-1  0  1  0  1  0  1  2  2  2  3]
Free energies:
 [ 33.52563895  11.90436493 -11.95041663   8.40443559  -9.76296079
   0.           6.1018505    6.93078113  -5.24989401  14.29905343
  12.61816631]

SM02
Microstates:
 ['SM02_micro011', 'SM02_micro002', 'SM02_micro004', 'SM02_micro005', 'SM02_micro006', 'SM02_micro003', 'SM02_micro007', 'SM02_micro008', 'SM02_micro012', 'SM02_micro013', 'SM02_micro014']
Charges: [-1  0  1  1  1  0  0  2  2  2  3]
Free energies:
 [26.73301293  0.         -9.18731452 -6.1479022  15.45034597  4.8354287
  6.88472943 15.19706161  3.84531711 30.90069195 41.69981603]

SM04
Microstates:
 ['SM04_micro005', 'SM04_micro003', 'SM04_micro002', 'SM04_micro004', 'SM04_micro013', 'SM04_micro014', 'SM04_micro006', 'SM04_micro008', 'SM04_micro009', 'SM04_micr

Unnamed: 0,$\Delta$G (pH=0),Charge,Microstate ID,Molecule ID
0,33.525639,-1,SM07_micro012,SM07
1,11.904365,0,SM07_micro002,SM07
2,-11.950417,1,SM07_micro006,SM07
3,8.404436,0,SM07_micro003,SM07
4,-9.762961,1,SM07_micro007,SM07
5,0.000000,0,SM07_micro004,SM07
6,6.101850,1,SM07_micro011,SM07
7,6.930781,2,SM07_micro013,SM07
8,-5.249894,2,SM07_micro014,SM07
9,14.299053,2,SM07_micro015,SM07


In [18]:
df_microstate_data_charge0 = df_microstate_data[(df_microstate_data["Molecule ID"] == "SM07") & (df_microstate_data["Charge"] == 0)]
df_microstate_data_charge0

Unnamed: 0,$\Delta$G (pH=0),Charge,Microstate ID,Molecule ID
1,11.904365,0,SM07_micro002,SM07
3,8.404436,0,SM07_micro003,SM07
5,0.0,0,SM07_micro004,SM07


In [22]:
dominant_microstate = df_microstate_data_charge0.loc[df_microstate_data_charge0['$\Delta$G (pH=0)'].idxmin()]["Microstate ID"]
dominant_microstate

'SM07_micro004'

In [34]:
pred_mol_IDs = set(df_microstate_data["Molecule ID"].values)
pred_mol_IDs

{'SM02', 'SM04', 'SM07', 'SM09', 'SM12', 'SM13', 'SM14', 'SM15'}

In [77]:
# Create dataframe to store dominant predicted microstate ID of each charge

submission_ID = "jaguar"

df_dominant_ms = pd.DataFrame(columns=["submission ID", "Molecule ID", "charge -4", "charge -3","charge -2", "charge -1","charge 0",
                                       "charge 1","charge 2", "charge 3", "charge 4"])
for i, mol_ID in enumerate(pred_mol_IDs): 
    df_dominant_ms.loc[i] = [submission_ID, mol_ID, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN ]
df_dominant_ms

Unnamed: 0,submission ID,Molecule ID,charge -4,charge -3,charge -2,charge -1,charge 0,charge 1,charge 2,charge 3,charge 4
0,jaguar,SM09,,,,,,,,,
1,jaguar,SM12,,,,,,,,,
2,jaguar,SM13,,,,,,,,,
3,jaguar,SM04,,,,,,,,,
4,jaguar,SM15,,,,,,,,,
5,jaguar,SM07,,,,,,,,,
6,jaguar,SM02,,,,,,,,,
7,jaguar,SM14,,,,,,,,,


In [78]:
# Populate dominant microstate dataframe

for mol_ID in pred_mol_IDs:
    df_1mol = df_microstate_data[df_microstate_data["Molecule ID"] == mol_ID]
    charges = set(df_1mol["Charge"].values)

    for charge in charges:
        df_1mol_1charge = df_1mol[df_1mol["Charge"] == charge]
        dominant_microstate = df_1mol_1charge.loc[df_1mol_1charge['$\Delta$G (pH=0)'].idxmin()]["Microstate ID"]
        df_dominant_ms.loc[(df_dominant_ms["Molecule ID"] == mol_ID), "charge {}".format(charge)] = dominant_microstate
        #print("Charge {}, {}".format(charge, dominant_microstate))

# Save as CSV
df_dominant_ms.to_csv("typeI_submission_dominant_microstate_collection.csv", index=False)

In [79]:
# Compare predicted dominant microstate collection to experimental dominant microstates
# Read predictied dominant microstate collection
df_pred_dom_ms = pd.read_csv("typeI_submission_dominant_microstate_collection.csv")
df_pred_dom_ms

Unnamed: 0,submission ID,Molecule ID,charge -4,charge -3,charge -2,charge -1,charge 0,charge 1,charge 2,charge 3,charge 4
0,jaguar,SM09,,,,SM09_micro009,SM09_micro003,SM09_micro001,SM09_micro013,SM09_micro015,
1,jaguar,SM12,,,,SM12_micro009,SM12_micro012,SM12_micro006,SM12_micro013,SM12_micro015,
2,jaguar,SM13,,,,SM13_micro004,SM13_micro005,SM13_micro001,SM13_micro013,SM13_micro015,
3,jaguar,SM04,,,,SM04_micro005,SM04_micro003,SM04_micro002,SM04_micro014,SM04_micro016,
4,jaguar,SM15,,,,SM15_micro004,SM15_micro002,SM15_micro003,,,
5,jaguar,SM07,,,,SM07_micro012,SM07_micro004,SM07_micro006,SM07_micro014,SM07_micro016,
6,jaguar,SM02,,,,SM02_micro011,SM02_micro002,SM02_micro004,SM02_micro012,SM02_micro014,
7,jaguar,SM14,,,,SM14_micro006,SM14_micro001,SM14_micro002,SM14_micro003,,


In [82]:
# Read experimental dominant microstates
exp_microstates = pd.read_csv("experimental_microstates_with_charge.csv")
exp_microstates

Unnamed: 0,Microstate ID of A,Charge of A,Microstate ID of HA,Charge of HA,Molecule ID,pKa (exp),pKa SEM (exp),pKa ID,Microstate identification source
0,SM07_micro004,0,SM07_micro006,1,SM07,6.08,0.01,SM07_pKa1,NMR measurement
1,SM14_micro001,0,SM14_micro002,1,SM14,5.3,0.01,SM14_pKa2,NMR measurement
2,SM14_micro002,1,SM14_micro003,2,SM14,2.58,0.01,SM14_pKa1,NMR measurement
3,SM02_micro002,0,SM02_micro004,1,SM02,5.03,0.01,SM02_pKa1,Estimated based on SM07 NMR measurement
4,SM04_micro003,0,SM04_micro002,1,SM04,6.02,0.01,SM04_pKa1,Estimated based on SM07 NMR measurement
5,SM09_micro003,0,SM09_micro001,1,SM09,5.37,0.01,SM09_pKa1,Estimated based on SM07 NMR measurement
6,SM12_micro012,0,SM12_micro006,1,SM12,5.28,0.01,SM12_pKa1,Estimated based on SM07 NMR measurement
7,SM13_micro005,0,SM13_micro001,1,SM13,5.77,0.01,SM13_pKa1,Estimated based on SM07 NMR measurement
8,SM15_micro004,-1,SM15_micro002,0,SM15,8.94,0.01,SM15_pKa2,Estimated based on SM14 NMR measurement
9,SM15_micro002,0,SM15_micro003,1,SM15,4.7,0.01,SM15_pKa1,Estimated based on SM14 NMR measurement


In [81]:
# Organize experimental microstate in dominant microstate collection format

# Create empty dataframe
df_exp_dom_ms = pd.DataFrame(columns=[ "Molecule ID", "charge -4", "charge -3","charge -2", "charge -1","charge 0",
                                       "charge 1","charge 2", "charge 3", "charge 4"])
charges = np.arange(-4,4,1)
exp_mol_IDs = set(exp_microstates["Molecule ID"].values)

for i, mol_ID in enumerate(exp_mol_IDs): 
    df_exp_dom_ms.loc[i] = [mol_ID, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN ]
    
# Populate expermental dominant microstate collection
for i, mol_ID in enumerate(exp_mol_IDs): 
    for charge in charges:
        dominant_exp_microstate = exp_microstates[exp_microstates["Molecule ID"] == mol_ID]...
        df_exp_dom_ms.loc[(df_exp_dom_ms["Molecule ID"] == "mol_ID"), "charge {}".format(charge)] = dominant_exp_microstate

Unnamed: 0,Molecule ID,charge -4,charge -3,charge -2,charge -1,charge 0,charge 1,charge 2,charge 3,charge 4
0,SM09,,,,,,,,,
1,SM12,,,,,,,,,
2,SM13,,,,,,,,,
3,SM04,,,,,,,,,
4,SM15,,,,,,,,,
5,SM07,,,,,,,,,
6,SM02,,,,,,,,,
7,SM14,,,,,,,,,
