In [None]:
import numpy as np
import scipy
import pandas as pd
import regex
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import os
import matplotlib.style as style
import matplotlib.cm as mplcm
import matplotlib.colors as colors
from collections import Counter
import csv
import seaborn as sns
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

plt.style.use('seaborn')

def save_obj(obj, name ):
    """Pickle `obj` into ``../obj/<name>.pkl`` using the highest pickle protocol.

    The path is relative to the notebook's working directory.
    """
    target_path = '../obj/' + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object unpickled from ``../obj/<name>.pkl``.

    NOTE: unpickling executes arbitrary code, so only load files you created yourself.
    """
    source_path = '../obj/' + name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# One-letter codes used as substitution candidates, in the order they are tried.
amino_acids = list("ARNDCEQGHILKMFPSTWYV")

def all_possible_mutations(peptides):
    """Return every single-position substitution of every peptide.

    For each peptide, each position is replaced in turn by each entry of
    `amino_acids`. The output is ordered by peptide, then position, then
    amino acid. Substituting a residue with itself is not skipped, so the
    unchanged peptide appears once per position.
    """
    return [
        peptide[:position] + residue + peptide[position + 1:]
        for peptide in peptides
        for position in range(len(peptide))
        for residue in amino_acids
    ]

In [None]:
# Load the combined anchor-validation workbook; the plotting cells below all read from `prefilled_data`.
prefilled_data = pd.read_excel("../Combined Anchor Validation/Combined anchor validation data new version.xlsx")

## HLA-A*68:01

In [None]:
# Bar chart of the "Average MFI Value (100)" column for every row of the HLA-A*68:01 panel.
sns.set(rc={'figure.figsize':(15,3)})

# Label categories that get a fixed group name instead of the mutated position.
fixed_position_labels = {
    "MT": "Reference MT/WT",
    "WT": "Reference MT/WT",
    "Positive Control": "(+)",
}

mfi_value = []
peptide_id = []
mutated_position = []
panel_rows = prefilled_data[prefilled_data['Peptide Category'] == 'HLA-A*68:01']
for _, row in panel_rows.iterrows():
    label = row['Label Category']
    if label in fixed_position_labels:
        pos = fixed_position_labels[label]
    else:
        pos = str(int(row['Mutation Position']))
    # x-axis label: <sequence>-<amino acid change>-<position>
    id_str = "-".join([
        row['Peptide Sequence'],
        str(row['Amino Acid Change']),
        str(int(row["Mutation Position"])),
    ])
    mfi_value.append(row["Average MFI Value (100)"])
    peptide_id.append(id_str)
    mutated_position.append(pos)

data = pd.DataFrame()
data['Average MFI Value'] = mfi_value
data['Peptide'] = peptide_id
data['Mutated Position'] = mutated_position

fig, ax = plt.subplots()
sns.barplot(
    data=data,
    x='Peptide',
    y='Average MFI Value',
    ax=ax,
    capsize=0.05,
    color='grey',
)
def change_width(ax, new_value) :
    """Set every patch on `ax` to width `new_value`, keeping each one centred.

    Each patch's x position is shifted by half of the width it lost (or
    gained) so its centre stays where it was.
    """
    for bar in ax.patches:
        width_delta = bar.get_width() - new_value
        bar.set_width(new_value)
        # re-centre: move the left edge by half of the width change
        bar.set_x(bar.get_x() + width_delta * .5)

# Set every bar drawn above to a width of 0.35 (change_width re-centres each one).
change_width(ax, .35)
# Rotate the x tick labels by 45 degrees, right-aligned.
plt.xticks(rotation=45, ha="right")

# Written to the notebook's working directory.
# NOTE(review): the file name contains '*' and ':', which some filesystems (e.g. Windows/NTFS) reject — confirm target platform.
plt.savefig("HLA-A*68:01_bar_plot.pdf", dpi=200)

## HLA-A*23:01

In [None]:
# Bar chart of the "Average MFI Value (20)" column for every row of the HLA-A*23:01 panel.
sns.set(rc={'figure.figsize':(15,3)})

# Label categories that get a fixed group name instead of the mutated position.
fixed_position_labels = {
    "MT": "Reference MT/WT",
    "WT": "Reference MT/WT",
    "Positive Control": "(+)",
}

mfi_value = []
peptide_id = []
mutated_position = []
panel_rows = prefilled_data[prefilled_data['Peptide Category'] == 'HLA-A*23:01']
for _, row in panel_rows.iterrows():
    label = row['Label Category']
    if label in fixed_position_labels:
        pos = fixed_position_labels[label]
    else:
        pos = str(int(row['Mutation Position']))
    # x-axis label: <sequence>-<amino acid change>-<position>
    id_str = "-".join([
        row['Peptide Sequence'],
        str(row['Amino Acid Change']),
        str(int(row["Mutation Position"])),
    ])
    mfi_value.append(row["Average MFI Value (20)"])
    peptide_id.append(id_str)
    mutated_position.append(pos)

data = pd.DataFrame()
data['Average MFI Value'] = mfi_value
data['Peptide'] = peptide_id
data['Mutated Position'] = mutated_position

fig, ax = plt.subplots()
sns.barplot(
    data=data,
    x='Peptide',
    y='Average MFI Value',
    ax=ax,
    capsize=0.05,
    color='grey',
)
# Re-definition of the helper that also appears in the HLA-A*68:01 cell.
def change_width(ax, new_value) :
    """Set every patch on `ax` to width `new_value`, keeping each one centred.

    Each patch's x position is shifted by half of the width it lost (or
    gained) so its centre stays where it was.
    """
    for bar in ax.patches:
        width_delta = bar.get_width() - new_value
        bar.set_width(new_value)
        # re-centre: move the left edge by half of the width change
        bar.set_x(bar.get_x() + width_delta * .5)

# Set every bar drawn above to a width of 0.35 (change_width re-centres each one).
change_width(ax, .35)
# Rotate the x tick labels by 45 degrees, right-aligned.
plt.xticks(rotation=45, ha="right")

# Written to the notebook's working directory.
# NOTE(review): the file name contains '*' and ':', which some filesystems (e.g. Windows/NTFS) reject — confirm target platform.
plt.savefig("HLA-A*23:01_bar_plot_20nM.pdf", dpi=200)

## Summary Plots

In [None]:
# Keep one row per (peptide sequence, HLA allele) pair before building the summary plots.
# NOTE: this rebinds `prefilled_data`, so re-running the per-allele cells above after this cell uses the de-duplicated table.
prefilled_data = prefilled_data.drop_duplicates(subset=["Peptide Sequence", "HLA Allele"])

In [None]:
## Binding affinity prediction vs log binding affinity measured
#sns.set(rc={'figure.figsize':(11,7)})
# Only rows that have a measured (log) binding affinity take part in this plot.
measured_rows = prefilled_data[prefilled_data["Measured Binding Affinity (log)"].notna()]
pred_binding_affinity = measured_rows['Predicted Binding Affinity (log)'].tolist()
log_binding_affinity = measured_rows['Measured Binding Affinity (log)'].tolist()
hla_allele = measured_rows['HLA Allele'].tolist()

data = pd.DataFrame()
data["Predicted binding affinity"] = pred_binding_affinity
data["Measured Binding Affinity"] = log_binding_affinity
data["HLA Allele"] = hla_allele

# Alleles that get a fitted regression line, paired index-wise with `color`.
hla_allele_list = ["HLA-A*68:01", "HLA-B*07:02", "HLA-B*08:01", "HLA-A*02:01"]

# Fix: the cell used to call plt.gcf().set_size_inches(11, 7) and then draw on a
# brand-new plt.figure(), so the 11x7 request landed on a different (empty)
# figure and the plot kept the default size. Size the plotted figure directly.
fig = plt.figure(figsize=(11, 7))
sns.scatterplot(data=data, x="Predicted binding affinity", y="Measured Binding Affinity", hue="HLA Allele", style='HLA Allele', s=130)
color = ["cornflowerblue", "orange", "green", "red"]
# x positions at which each fitted line is drawn
x_line = np.array([1,2,3,4,5,6]).astype(float)
for n,i in enumerate(hla_allele_list):
    allele_points = data[data["HLA Allele"] == i]
    m, b, r_val, p_val, std_err = scipy.stats.linregress(allele_points["Predicted binding affinity"], allele_points["Measured Binding Affinity"])
    # legend entry shows the allele followed by R^2 of its linear fit
    plt.plot(x_line, m*x_line+b, linewidth=2, color=color[n], label=i+"-"+str(r_val**2))

#plt.plot([1,2,3,4,5,6], [1,2,3,4,5,6], '--', label='y=x guide line', linewidth=1, color="black")
plt.xlabel("Predicted Binding Affinity (log[nM])")
plt.ylabel("Measured Binding Affinity (log[nM])")
plt.title("Predicted vs Measured Binding Affinity ")
plt.legend()
#plt.savefig("Predicted Binding Affinity vs Measured Binding Affinity (log values) without A2402.pdf", dpi=300)

In [None]:
## Binding affinity prediction vs log binding affinity measured
#sns.set(rc={'figure.figsize':(11,7)})
# Variant of the predicted-vs-measured scatter that also includes rows whose
# category is "No Binder"; those are plotted at a fixed placeholder value.
no_binder_log_affinity = 7  # y value used for "No Binder" rows

pred_binding_affinity = []
log_binding_affinity = []
hla_allele = []
for n,i in prefilled_data.iterrows():
    category = i["Measured Binding Category"]
    # Explicit missing-value check. The previous version called np.isnan inside a
    # try block and relied on a bare `except` (np.isnan raising on strings) to
    # recognise rows that do have a category, which would also hide real errors.
    if pd.isna(category):
        continue
    if category == "No Binder":
        measured = no_binder_log_affinity
    else:
        measured = i['Measured Binding Affinity (log)']
    pred_binding_affinity.append(i['Predicted Binding Affinity (log)'])
    log_binding_affinity.append(measured)
    hla_allele.append(i['HLA Allele'])

data = pd.DataFrame()
data["Predicted binding affinity"] = pred_binding_affinity
data["Measured Binding Affinity"] = log_binding_affinity
data["HLA Allele"] = hla_allele

# Alleles that get a fitted regression line, paired index-wise with `color`.
hla_allele_list = ["HLA-A*68:01", "HLA-B*07:02", "HLA-B*08:01", "HLA-A*02:01"]

# Fix: plt.gcf().set_size_inches(11, 7) followed by plt.figure() resized a
# different (empty) figure, so the saved plot kept the default size. Create the
# plotted figure with the intended size instead.
fig = plt.figure(figsize=(11, 7))
sns.scatterplot(data=data, x="Predicted binding affinity", y="Measured Binding Affinity", hue="HLA Allele", style='HLA Allele', s=130)
color = ["cornflowerblue", "orange", "green", "red"]
# x positions at which each fitted line is drawn
x_line = np.array([1,2,3,4,5,6]).astype(float)
for n,i in enumerate(hla_allele_list):
    allele_points = data[data["HLA Allele"] == i]
    m, b, r_val, p_val, std_err = scipy.stats.linregress(allele_points["Predicted binding affinity"], allele_points["Measured Binding Affinity"])
    # legend entry shows the allele followed by R^2 of its linear fit
    plt.plot(x_line, m*x_line+b, linewidth=2, color=color[n], label=i+"-"+str(r_val**2))

#plt.plot([1,2,3,4,5,6], [1,2,3,4,5,6], '--', label='y=x guide line', linewidth=1, color="black")
plt.xlabel("Predicted Binding Affinity (log[nM])")
plt.ylabel("Measured Binding Affinity (log[nM])")
plt.title("Predicted vs Measured Binding Affinity ")
plt.legend()
plt.savefig("Predicted Binding Affinity vs Measured Binding Affinity (log values) with no binders.pdf", dpi=300)

In [None]:
#Predicted Binding vs Binding Category
fig = plt.figure()

# Keep rows that have a measured binding category. This replaces the earlier
# try/np.isnan/bare-except construct, which used the TypeError raised by
# np.isnan on strings as control flow and would also swallow unrelated errors.
categorised_rows = prefilled_data[prefilled_data["Measured Binding Category"].notna()]
pred_binding_affinity = categorised_rows['Predicted Binding Affinity (log)'].tolist()
log_binding_cat = categorised_rows['Measured Binding Category'].tolist()

dataset = pd.DataFrame()
dataset['Predicted Binding Affinity (log[nM])'] = pred_binding_affinity
dataset['Binding Category'] = log_binding_cat

# Left-to-right order of the categories on the x axis; categories not listed are not drawn.
category_order = ["High Affinity", "Medium Affinity", "Low Affinity", "Very Low Affinity", "No Binder"]
ax = sns.boxplot(y="Predicted Binding Affinity (log[nM])", x="Binding Category", data=dataset, order=category_order)
# Overlay the individual points on top of the boxes.
ax = sns.stripplot(y="Predicted Binding Affinity (log[nM])", x="Binding Category", data=dataset, order=category_order, color=".3")

plt.title("Predicted binding affinity (log values) vs Binding Category")
plt.savefig("Predicted binding affinity (log values) vs Binding Category 06-2022.pdf", dpi=300)

In [None]:
## Binding affinity prediction vs corrected MFI average
# Collect rows that have both a predicted affinity and an "Average MFI Value (100)" reading.
pred_binding_affinity = []
MFI_average = []
hla_allele = []
for _, row in prefilled_data.iterrows():
    predicted = row["Predicted Binding Affinity"]
    mfi = row["Average MFI Value (100)"]
    if not (np.isnan(predicted) or np.isnan(mfi)):
        pred_binding_affinity.append(predicted)
        MFI_average.append(mfi)
        hla_allele.append(row['HLA Allele'])

data = pd.DataFrame()
data["Predicted binding affinity"] = pred_binding_affinity
data["MFI Average"] = MFI_average
data["HLA Allele"] = hla_allele

def func(x, a, b, c):
    """Exponential decay with offset: ``a * exp(-b * x) + c``.

    `x` may be a scalar or a NumPy array; the result has the same shape.
    """
    decay = np.exp(-b * x)
    return a * decay + c

# Scatter of predicted affinity against the MFI reading, one colour/marker per allele.
sns.scatterplot(
    data=data,
    x="Predicted binding affinity",
    y="MFI Average",
    hue="HLA Allele",
    style='HLA Allele',
    s=130,
)

# Not used by any statement in this cell.
hla_allele_list = ["HLA-A*23:01", "HLA-A*68:01", "HLA-B*07:02", "HLA-A*02:01", "HLA-A*31:01", "HLA-A*24:02", "HLA-B*18:01"]
color = ["cornflowerblue", "orange", "green", "red", "purple", "brown", "pink"]

# Format the axes seaborn just drew on.
mfi_axes = plt.gca()
mfi_axes.set_xlabel("Predicted Binding Affinity (nM)")
mfi_axes.set_ylabel("MFI Average Value")
mfi_axes.set_xscale("log")
mfi_axes.set_title("Predicted binding affinity vs MFI average value")
# Plain-number tick labels at fixed positions on the log axis.
mfi_axes.set_xticks([100,1000,10000])
mfi_axes.set_xticklabels(["100","1000","10000"])
plt.savefig("Predicted Binding Affinity vs MFI average 06-27-2022.pdf", dpi=300)

In [None]:
## Binding category vs MFI
fig = plt.figure()

# Keep rows that have both an MFI reading and a measured binding category.
# This replaces the earlier try/np.isnan/bare-except construct, which relied on
# np.isnan raising on string categories and would also hide unrelated errors.
complete_rows = prefilled_data.dropna(subset=["Average MFI Value (100)", "Measured Binding Category"])
# +1 keeps zero readings drawable on the log-scaled y axis set below
MFI_average = (complete_rows["Average MFI Value (100)"] + 1).tolist()
log_binding_cat = complete_rows["Measured Binding Category"].tolist()

dataset = pd.DataFrame()
dataset['MFI Average'] = MFI_average
dataset['Binding Category'] = log_binding_cat

# Left-to-right order of the categories on the x axis; categories not listed are not drawn.
category_order = ["High Affinity", "Medium Affinity", "Low Affinity", "Very Low Affinity", "No Binder"]
ax = sns.boxplot(y='MFI Average', x="Binding Category", data=dataset, order=category_order)
# Overlay the individual points on top of the boxes.
ax = sns.stripplot(y='MFI Average', x="Binding Category", data=dataset, order=category_order, color=".3")
plt.yscale("log")
plt.title("MFI Average vs Binding Category")
plt.savefig("MFI Average vs Binding Category 06-27-2022.pdf", dpi=300)

## Comparison across all 8 algorithms

In [None]:
# Stack every ANCHOR all_epitopes table for the alleles present in the validation data:
# one top-level file per allele plus one file per peptide length (8-11) where it exists.
# unique() keeps first-seen order (set() order varies between runs); dropna() avoids
# building a path from a missing allele.
hla_list = list(prefilled_data["HLA Allele"].dropna().unique())

# Collect the tables and concatenate once: DataFrame.append was removed in pandas 2.0
# and re-copies the accumulated frame on every call.
epitope_tables = []
for hla in hla_list:
    epitope_tables.append(pd.read_table("../wet_lab_validation_simulation_output_files/ANCHOR."+hla+".all_epitopes.tsv", delimiter='\t'))
    for length in [8,9,10,11]:
        try:
            epitope_tables.append(pd.read_table("../wet_lab_validation_simulation_output_files/anchor_"+hla+"_"+str(length)+"_output/MHC_Class_I/ANCHOR.all_epitopes.tsv", delimiter='\t'))
        except OSError:
            # A per-length output that is missing or unreadable is reported and skipped;
            # other failures (e.g. parse errors) now surface instead of being swallowed.
            print(length, hla)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
combined_all_epitopes_files = pd.concat(epitope_tables) if epitope_tables else pd.DataFrame()

In [None]:
# Build one row per validation peptide: the measured values plus the score of
# each individual prediction algorithm looked up in the combined epitope tables.
algorithms = ["MHCflurry Score", "MHCnuggetsI Score", "NetMHC Score", "NetMHCcons Score", "NetMHCpan Score", "PickPocket Score", "SMM Score", "SMMPMBEC Score"]

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
score_records = []
for n, i in prefilled_data.iterrows():
    hla = i["HLA Allele"]
    peptide = i["Peptide Sequence"]
    average_MFI = i["Average MFI Value (100)"]
    measured_binding = i["Measured Binding Affinity (log)"]
    row_data = {"HLA Allele":hla, "Peptide Sequence":peptide, "MFI Value": average_MFI, "Measured BA": measured_binding}
    try:
        # first matching prediction row for this peptide/allele pair
        query = combined_all_epitopes_files[(combined_all_epitopes_files["Epitope Seq"] == peptide) & (combined_all_epitopes_files["HLA Allele"] == hla)].iloc[0]
        scores = {j: query[j] for j in algorithms}
    except (IndexError, KeyError):
        # No prediction row (IndexError) or a missing column (KeyError): every
        # algorithm score is recorded as 0, as before. Only these two lookup
        # failures are caught now instead of a bare `except`.
        # NOTE(review): 0 is a placeholder, not a real score; it becomes -inf if log-transformed later.
        scores = dict.fromkeys(algorithms, 0)
    row_data.update(scores)
    score_records.append(row_data)

data = pd.DataFrame(score_records, columns = ["HLA Allele", "Peptide Sequence", "MFI Value", "Measured BA"]+algorithms)

In [None]:
# Optional export of the per-algorithm score table (`data`) built in the previous cell.
# NOTE: the next cell reads this workbook back from disk, so it must already exist; uncomment to regenerate it.
#data.to_excel("Individual Algorithm Scores for all validation data.xlsx")

In [None]:
## Binding affinity prediction vs log binding affinity measured across all algorithms
# One panel per algorithm (4 rows x 2 columns): log10 of the algorithm's score against
# the measured binding affinity, with a linear fit per allele. The Pearson r of each
# fit is stored in corr_dict[algorithm][allele].
dataset = pd.read_excel("Individual Algorithm Scores for all validation data.xlsx")
algorithms = ["MHCflurry Score", "MHCnuggetsI Score", "NetMHC Score", "NetMHCcons Score", "NetMHCpan Score", "PickPocket Score", "SMM Score", "SMMPMBEC Score"]
fig, axes = plt.subplots(4,2, sharex=True, figsize=(20,34))
fig.suptitle('Predicted versus Measured Binding Affinity across 8 different algorithms')
corr_dict = {}

# Alleles that get a fitted regression line, paired index-wise with `color`.
hla_allele_list = ["HLA-A*68:01", "HLA-B*07:02", "HLA-B*08:01", "HLA-A*02:01"]
color = ["cornflowerblue", "orange", "green", "red"]
# x positions at which each fitted line is drawn
x_line = np.array([1,2,3,4,5,6]).astype(float)

# Rows without a measured binding affinity are excluded; this filter is the same
# for every algorithm, so it is computed once instead of once per panel.
measured_rows = dataset[dataset['Measured BA'].notna()]

for subplot, algorithm in enumerate(algorithms):
    corr_dict[algorithm] = {}
    # panels are filled row by row
    row, col = divmod(subplot, 2)
    panel = axes[row, col]
    panel.set_title(algorithm)

    data = pd.DataFrame()
    # NOTE(review): a score of 0 turns into -inf under log10 and makes the fit NaN —
    # confirm the workbook holds no zero placeholder scores.
    data["Predicted binding affinity"] = np.log10(measured_rows[algorithm]).tolist()
    data["Measured Binding Affinity"] = measured_rows['Measured BA'].tolist()
    data["HLA Allele"] = measured_rows['HLA Allele'].tolist()

    sns.scatterplot(ax=panel, data=data, x="Predicted binding affinity", y="Measured Binding Affinity", hue="HLA Allele", style='HLA Allele', s=130)
    for n,i in enumerate(hla_allele_list):
        allele_points = data[data["HLA Allele"] == i]
        pred_data = allele_points["Predicted binding affinity"]
        measured_data = allele_points["Measured Binding Affinity"]
        m, b, r_val, p_val, std_err = scipy.stats.linregress(pred_data, measured_data)
        corr_dict[algorithm][i] = r_val
        # legend entry shows the allele followed by R^2 of its linear fit
        panel.plot(x_line, m*x_line+b, linewidth=2, color=color[n], label=i+"-"+str(r_val**2))
    panel.legend()
    #plt.plot([1,2,3,4,5,6], [1,2,3,4,5,6], '--', label='y=x guide line', linewidth=1, color="black")
    # Fix: plt.xlabel/plt.ylabel act on the *current* axes (the last subplot created),
    # so only that panel used to receive these labels. Label the panel being drawn.
    panel.set_xlabel("Predicted Binding Affinity (log[nM])")
    panel.set_ylabel("Measured Binding Affinity (log[nM])")
    #plt.title("Predicted vs Measured Binding Affinity ")

#plt.savefig("Binding affinity prediction vs log binding affinity measured across all algorithms.pdf", dpi=300)  

In [None]:
# Flatten corr_dict ({algorithm: {allele: value}}) into a long table with one row
# per algorithm/allele pair. explode() yields the allele keys; the values are
# then looked up for each pair.
corr_data = (
    pd.Series(corr_dict, name='HLA Allele')
    .rename_axis('Algorithm')
    .explode()
    .reset_index()
)
corr_data["Correlation Score"] = [
    corr_dict[algorithm_name][allele_name]
    for algorithm_name, allele_name in zip(corr_data["Algorithm"], corr_data["HLA Allele"])
]

In [None]:
# Grouped bar chart of the per-allele correlation score, one bar per algorithm.
summary_fig = plt.gcf()
summary_fig.set_size_inches(11, 7)

summary_ax = sns.barplot(
    data=corr_data,
    x="HLA Allele",
    y="Correlation Score",
    hue="Algorithm",
    palette="deep",
)
# Anchor the legend just outside the right edge of the axes.
summary_ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
summary_fig.savefig("Correlation Score barplot by individual algorithms.pdf", dpi=300, bbox_inches="tight")