In [None]:
from titrato import align_pka, hungarian_pka, closest_pka, TitrationCurve
from titrato.stats import array_rmse, msd
from titrato.sampl import get_experimental_pKa_data, get_typeiii_pka_data, get_typei_pka_data, data_dir
from titrato.reports import plot_correlation_analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from uncertainties import ufloat
from copy import deepcopy
from scipy.stats import pearsonr
import seaborn as sns

%matplotlib inline
sns.set_style("ticks")
font = {'size': 9}

matplotlib.rc('font', **font)


## Epik scan

In [None]:
# Match the experimental pKa values of SM01..SM24 against the Epik sequential
# scan (type III) predictions with three strategies: closest, Hungarian, aligned.
match_columns = ["Molecule", "Experimental", "Experimental SEM", "Predicted", "Predicted SEM", "Cost"]

# Per-molecule results are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every call.
closest_frames, hung_frames, align_frames = [], [], []

for molecule in range(24):
    mol_name = "SM{:02d}".format(molecule+1)
    print(mol_name)
    experimental_pkas = get_experimental_pKa_data(mol_name)
    predicted_pkas = get_typeiii_pka_data(mol_name, "Epik/typeIII-raw-sequential.csv")
    closest_match = closest_pka(experimental_pkas, predicted_pkas, array_rmse)
    # NOTE(review): the Jaguar and Epik microscopic cells pass array_rmse to
    # hungarian_pka -- confirm that msd is the intended cost function here.
    hung_match = hungarian_pka(experimental_pkas, predicted_pkas, msd)
    align_match = align_pka(experimental_pkas, predicted_pkas, array_rmse)
    # Tag each result with its molecule before stacking.
    # Assumes the matchers return DataFrames (pd.concat requires that) -- TODO confirm.
    closest_match["Molecule"] = mol_name
    hung_match["Molecule"] = mol_name
    align_match["Molecule"] = mol_name
    closest_frames.append(closest_match)
    hung_frames.append(hung_match)
    align_frames.append(align_match)

# Stack each strategy's results with a fresh 0..n-1 index, put the standard
# columns first (keeping any extra columns the matchers add), then drop the
# rows that have no experimental value.
df_closest, df_hung, df_align = (
    pd.concat(frames, ignore_index=True)
    .pipe(lambda stacked: stacked.reindex(
        columns=match_columns + [col for col in stacked.columns if col not in match_columns]))
    .dropna(subset=["Experimental"])
    for frames in (closest_frames, hung_frames, align_frames)
)
    

In [None]:
# Correlation analysis of the aligned matches. Element 0 of the returned
# sequence (presumably the figure -- it exposes savefig) is written to PDF.
aligned_analysis = plot_correlation_analysis(df_align, 'Epik scan', 'blue', 's', nsamples=10000)
aligned_analysis[0].savefig("aligned_pka_epik_scan.pdf")


In [None]:
# Correlation analysis of the closest matches. Element 0 of the returned
# sequence (presumably the figure -- it exposes savefig) is written to PDF.
closest_analysis = plot_correlation_analysis(df_closest, 'Epik scan', 'blue', 's', nsamples=10000)
closest_analysis[0].savefig("closest_pka_epik_scan.pdf")

In [None]:
# Draw 12 values from 0..11 with replacement; an integer population is
# sampled as if it were np.arange(12). Unseeded, so the output changes per run.
np.random.choice(12, size=12)

In [None]:
def add_uncertainty(df, pred_name):
    """Add value-with-uncertainty columns to ``df`` in place.

    Two columns of ``ufloat`` objects are written:

    * ``"Experiment +/- SEM"``, built from ``"Experimental"`` and
      ``"Experimental SEM"``
    * ``pred_name``, built from ``"Predicted"`` and ``"Predicted SEM"``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four source columns named above. It is modified
        in place.
    pred_name : str
        Name of the column that receives the predicted values.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the four source columns is missing.
    """
    def _paired(value_column, sem_column):
        # Pair the two columns element-wise instead of using a row-wise
        # ``df.apply``: on an empty frame ``apply(axis=1)`` hands back a
        # DataFrame, which cannot be assigned to a single column.
        pairs = zip(df[value_column], df[sem_column])
        # dtype=object keeps the ufloat instances as-is; index=df.index keeps
        # the new column aligned with the frame's existing row labels.
        return pd.Series([ufloat(value, sem) for value, sem in pairs],
                         index=df.index, dtype=object)

    experiment = _paired("Experimental", "Experimental SEM")
    prediction = _paired("Predicted", "Predicted SEM")
    df["Experiment +/- SEM"] = experiment
    df[pred_name] = prediction

In [None]:
# Attach the "+/- SEM" columns to each of the three match tables, in place.
for match_table, sem_column in (
    (df_closest, "Epik closest +/- SEM"),
    (df_hung, "Epik hungarian +/- SEM"),
    (df_align, "Epik aligned +/- SEM"),
):
    add_uncertainty(match_table, sem_column)

In [None]:
# Summary table: molecule name plus the experimental and closest-match
# predicted values with their uncertainties.
summary_columns = ["Molecule", "Experiment +/- SEM", "Epik closest +/- SEM"]
new_df = df_closest[summary_columns]


In [None]:
# Display the summary table selected from df_closest in the previous cell.
new_df

## Jaguar type I

In [None]:
# Match the experimental pKa values of SM01..SM24 against the Jaguar type I
# predictions with three strategies: closest, Hungarian, aligned.
match_columns = ["Molecule", "Experimental", "Experimental SEM", "Predicted", "Predicted SEM", "Cost"]

# Per-molecule results are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every call.
closest_frames, hung_frames, align_frames = [], [], []

for molecule in range(24):
    mol_name = "SM{:02d}".format(molecule+1)
    print(mol_name)
    experimental_pkas = get_experimental_pKa_data(mol_name)
    predicted_pkas = get_typei_pka_data(mol_name, "Jaguar/typeI-raw.csv")
    closest_match = closest_pka(experimental_pkas, predicted_pkas, array_rmse)
    hung_match = hungarian_pka(experimental_pkas, predicted_pkas, array_rmse)
    align_match = align_pka(experimental_pkas, predicted_pkas, array_rmse)
    # Tag each result with its molecule before stacking.
    # Assumes the matchers return DataFrames (pd.concat requires that) -- TODO confirm.
    closest_match["Molecule"] = mol_name
    hung_match["Molecule"] = mol_name
    align_match["Molecule"] = mol_name
    closest_frames.append(closest_match)
    hung_frames.append(hung_match)
    align_frames.append(align_match)

# Stack each strategy's results with a fresh 0..n-1 index, put the standard
# columns first (keeping any extra columns the matchers add), then drop the
# rows that have no experimental value. df_align is filtered as well, matching
# the Epik scan cell (previously only df_closest and df_hung were filtered here).
df_closest, df_hung, df_align = (
    pd.concat(frames, ignore_index=True)
    .pipe(lambda stacked: stacked.reindex(
        columns=match_columns + [col for col in stacked.columns if col not in match_columns]))
    .dropna(subset=["Experimental"])
    for frames in (closest_frames, hung_frames, align_frames)
)


In [None]:
# Correlation analysis of the closest Jaguar matches. Element 0 of the returned
# sequence (presumably the figure -- it exposes savefig) is written to PDF.
jaguar_analysis = plot_correlation_analysis(df_closest, 'Jaguar pKa', 'green', 's', nsamples=10000)
jaguar_analysis[0].savefig("closest_pka_jaguar.pdf")

## Epik microscopic pKa


In [None]:
# Match the experimental pKa values of SM01..SM24 against the Epik microscopic
# (type I) predictions with three strategies: closest, Hungarian, aligned.
match_columns = ["Molecule", "Experimental", "Experimental SEM", "Predicted", "Predicted SEM", "Cost"]

# Per-molecule results are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every call.
closest_frames, hung_frames, align_frames = [], [], []

for molecule in range(24):
    mol_name = "SM{:02d}".format(molecule+1)
    print(mol_name)
    experimental_pkas = get_experimental_pKa_data(mol_name)
    predicted_pkas = get_typei_pka_data(mol_name, "Epik/typeI-raw-microscopic.csv")
    closest_match = closest_pka(experimental_pkas, predicted_pkas, array_rmse)
    hung_match = hungarian_pka(experimental_pkas, predicted_pkas, array_rmse)
    align_match = align_pka(experimental_pkas, predicted_pkas, array_rmse)
    # Tag each result with its molecule before stacking.
    # Assumes the matchers return DataFrames (pd.concat requires that) -- TODO confirm.
    closest_match["Molecule"] = mol_name
    hung_match["Molecule"] = mol_name
    align_match["Molecule"] = mol_name
    closest_frames.append(closest_match)
    hung_frames.append(hung_match)
    align_frames.append(align_match)

# Stack each strategy's results with a fresh 0..n-1 index, put the standard
# columns first (keeping any extra columns the matchers add), then drop the
# rows that have no experimental value. df_align is filtered as well, matching
# the Epik scan cell (previously only df_closest and df_hung were filtered here).
df_closest, df_hung, df_align = (
    pd.concat(frames, ignore_index=True)
    .pipe(lambda stacked: stacked.reindex(
        columns=match_columns + [col for col in stacked.columns if col not in match_columns]))
    .dropna(subset=["Experimental"])
    for frames in (closest_frames, hung_frames, align_frames)
)


In [None]:
# Correlation analysis of the closest Epik microscopic matches. Element 0 of the
# returned sequence (presumably the figure -- it exposes savefig) is written to PDF.
micropka_analysis = plot_correlation_analysis(df_closest, 'Epik micropKa', 'red', 's', nsamples=10000)
micropka_analysis[0].savefig("closest_pka_epik_micropka.pdf")

In [None]:
# Scratch array holding the integers 0..11.
x = np.arange(12)

In [None]:
# Scalar operand used in the array/scalar subtraction cells that follow.
y = 17

In [None]:
# Element-wise difference: the scalar y is subtracted from every entry of x.
x - y

In [None]:
# Reversed operand order: every entry of x is subtracted from the scalar y.
y - x

In [None]:
# 30th percentile of x, taken along its first axis.
np.percentile(x, q=30, axis=0)

In [None]:
# 30th percentile of the integers 0..10, taken along the first axis.
np.percentile(np.arange(11), q=30, axis=0)
