In [None]:
import numpy as np
import scipy
import pandas as pd
import regex
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import os
import matplotlib.style as style
import matplotlib.cm as mplcm
import matplotlib.colors as colors
from collections import Counter
import csv
import seaborn as sns
import matplotlib as mpl
import re
import statistics
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is no longer shipped by newer releases. Prefer the new
# name when this Matplotlib provides it and fall back to the legacy name
# otherwise, so the notebook runs on both old and new installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

def save_obj(obj, name):
    """Pickle ``obj`` into ``obj/<name>.pkl`` relative to the working directory.

    Parameters
    ----------
    obj : any picklable object to store.
    name : str
        File stem; the ``.pkl`` extension is appended automatically.

    The ``obj/`` directory must already exist. The highest pickle protocol
    available to the running interpreter is used.

    NOTE(review): this writes below ``obj/`` whereas ``load_obj`` reads from
    ``../obj/`` -- confirm that the two locations are meant to differ.
    """
    target_path = "".join(['obj/', name, '.pkl'])
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``../obj/<name>.pkl``.

    Parameters
    ----------
    name : str
        File stem; the ``.pkl`` extension is appended automatically.

    The path is resolved relative to the current working directory.

    NOTE: ``pickle.load`` can execute arbitrary code, so only load files
    from a trusted source.
    """
    source_path = "".join(['../obj/', name, '.pkl'])
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def lines_that_contain(string, fp):
    """Collect, in order, every line of ``fp`` that has ``string`` as a substring.

    Parameters
    ----------
    string : str
        Substring to look for.
    fp : iterable of str
        Typically an open text file; it is iterated once, line by line.

    Returns
    -------
    list of str
        The matching lines, unmodified (trailing newlines are kept).
    """
    matching_lines = []
    for candidate in fp:
        if string in candidate:
            matching_lines.append(candidate)
    return matching_lines

## NetMHCpan distance metrics

In [None]:
# Location of the Supplementary Table 1 workbook, which holds one sheet per
# peptide length.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# local copy of the workbook before running.
SUPP_TABLE_1_PATH = "/Users/h.xia/Desktop/Griffith_Lab/Anchor Paper/Supp Data/Supp Table 1.xlsx"

# Passing a list of sheet names makes pandas open the workbook once and return
# a {sheet name: DataFrame} dict, instead of re-parsing the file per sheet.
supp_table_1_sheets = pd.read_excel(SUPP_TABLE_1_PATH, sheet_name=["8-mer", "9-mer", "10-mer", "11-mer"])
data_8 = supp_table_1_sheets["8-mer"]
data_9 = supp_table_1_sheets["9-mer"]
data_10 = supp_table_1_sheets["10-mer"]
data_11 = supp_table_1_sheets["11-mer"]

In [None]:
# Unique HLA alleles across the four per-length sheets.
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# list is identical on every kernel restart. list(set(...)) is not: string
# hashing is randomised per process, which would reshuffle the allele groups,
# the row order of the summary table and the order in which plot hue levels
# first appear.
hla_alleles = list(dict.fromkeys(
    list(data_8['HLA allele'])
    + list(data_9['HLA allele'])
    + list(data_10['HLA allele'])
    + list(data_11['HLA allele'])
))

In [None]:
# Split the allele list into a fixed number of comma-terminated strings
# (each allele is followed by ",", and every "*" is removed).
HLA_GROUP_COUNT = 10   # number of group strings produced
HLA_GROUP_SIZE = 40    # maximum number of alleles per group

# Anything beyond GROUP_COUNT * GROUP_SIZE alleles would not fit in any group;
# fail loudly rather than silently dropping alleles.
assert len(hla_alleles) <= HLA_GROUP_COUNT * HLA_GROUP_SIZE, (
    "Too many HLA alleles ({}) for {} groups of {}".format(
        len(hla_alleles), HLA_GROUP_COUNT, HLA_GROUP_SIZE))

grouping = []
for i in range(HLA_GROUP_COUNT):
    # Slicing past the end of the list simply yields a shorter (or empty)
    # chunk, so no exception handling is needed to detect the end.
    group_alleles = hla_alleles[i * HLA_GROUP_SIZE:(i + 1) * HLA_GROUP_SIZE]
    hla_string = "".join(allele + "," for allele in group_alleles)
    hla_string = hla_string.replace("*", "")
    grouping.append(hla_string)

# Number of alleles that were placed into groups.
count = len(hla_alleles)

In [None]:
# Parse the ten per-group log files and collect, for every line mentioning
# "using nearest neighbor", the first field, the last field and field 7.
hla_list, neighbor_list, distance_list = [], [], []

# Matches a run of digits/commas/colons (optionally followed by ".<run>");
# the substitution below puts a "*" in front of each such run.
digit_run_pattern = r"([0-9,:]+(\.[0-9,:]+)?)"

for group_index in range(10):
    log_path = "../netMHCpan-4.0/hla_group_" + str(group_index) + ".log"
    with open(log_path, "r") as fp:
        for line in lines_that_contain("using nearest neighbor", fp):
            # Drop the closing ")" + newline, then split on single spaces.
            fields = line.replace(")\n", "").split(" ")
            original_hla, distance, nearest_neighbor = fields[0], fields[7], fields[-1]
            hla_list.append(re.sub(digit_run_pattern, r"*\1", original_hla).strip())
            neighbor_list.append(re.sub(digit_run_pattern, r"*\1", nearest_neighbor).strip())
            # Kept as text here; converted to float when the table is built.
            distance_list.append(distance)

In [None]:
# One row per parsed log line: allele, its nearest neighbour and the distance
# (the distance strings are converted to floats).
distance_data = pd.DataFrame({
    "Original HLA Allele": hla_list,
    "Nearest Neighbor": neighbor_list,
    "Distance": np.array(distance_list).astype(float),
})
#distance_data.to_excel("NetMHCpan4.0 Distance Metrics for Pan Alleles.xlsx")

In [None]:
# Alleles from the supplementary table that never showed up in the parsed
# nearest-neighbour log lines (order follows hla_alleles).
not_available_alleles = [allele for allele in hla_alleles if allele not in hla_list]

## Variance metrics

In [None]:
# Unpickle the combined epitope table via load_obj, i.e. from
# '../obj/filtered_epitopes_combined_round_4.pkl' relative to the working directory.
# It is indexed by column name in the next cell, so it is presumably a pandas DataFrame.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filtered_epitopes_combined = load_obj('filtered_epitopes_combined_round_4')

In [None]:
# Identifier columns followed by one mutant-peptide score column per algorithm.
identifier_columns = ['MT Epitope Seq', 'HLA Allele']
score_columns = [
    'MHCflurry MT Score',
    'MHCnuggetsI MT Score',
    'NetMHC MT Score',
    'NetMHCcons MT Score',
    'NetMHCpan MT Score',
    'SMM MT Score',
    'SMMPMBEC MT Score',
    'PickPocket MT Score',
]
columns = identifier_columns + score_columns

# Keep only those columns and drop rows that are exact duplicates.
data = filtered_epitopes_combined[columns].drop_duplicates()

In [None]:
# Select the per-algorithm score columns explicitly by their ' MT Score'
# suffix. Selecting "everything except the identifier columns" is only correct
# the first time this cell runs: on a re-run the derived columns added below
# (and by later cells) would be folded into the standard deviation.
mt_score_columns = [name for name in data.columns if name.endswith(' MT Score')]

# Row-wise standard deviation across algorithms, ignoring missing scores.
# (The variable keeps its historical name; the values are standard deviations.)
variance_list = np.nanstd(data[mt_score_columns], axis=1)
data["Standard Deviation"] = variance_list

# Number of algorithms that produced a (non-null) score for each row.
data['Algorithm Count'] = data[mt_score_columns].count(axis=1)

In [None]:
algorithms = ['MHCflurry MT Score', 'MHCnuggetsI MT Score', 'NetMHC MT Score', 'NetMHCcons MT Score',
           'NetMHCpan MT Score', 'SMM MT Score', 'SMMPMBEC MT Score', 'PickPocket MT Score']

# For every row, list the names of the columns whose value is null: the
# boolean mask is "multiplied" with the comma-suffixed column names, which
# concatenates the names of the True columns; the result is then split back
# into a list.
data_null = data.isnull()
joined_null_names = data_null.eq(1).dot(data_null.columns + ',')
data["Algorithm Null List"] = joined_null_names.str.rstrip(',').str.split(',')

# The complement: algorithms that are NOT in the row's null list, kept in the
# order of `algorithms`.
algorithm_list = [
    [name for name in algorithms if name not in null_names]
    for null_names in data["Algorithm Null List"]
]
data["Algorithm List"] = algorithm_list

# Length of the string form of the epitope sequence.
data["Peptide Length"] = [len(str(sequence)) for sequence in data["MT Epitope Seq"]]

In [None]:
# Build one summary row per HLA allele:
#   * mean of the per-peptide "Standard Deviation" values,
#   * nearest neighbour and distance from `distance_data` (if present),
#   * cluster colour code from the 9-mer sheet (if present),
#   * algorithm count / list taken from the first peptide length that has data.

# Peptide lengths are tried in this order; the first one with data wins.
PEPTIDE_LENGTH_PRIORITY = (9, 10, 8, 11)

hla_column = []
variance_mean = []
neighbor_column = []
distance_column = []
cluster_column = []
algorithm_count_column= []
algorithm_list_column = []
algorithm_data_used_column = []
overall_data = pd.DataFrame()
for i in hla_alleles:
    hla_data = data.loc[data["HLA Allele"] == i,:]
    # Mean over an empty selection is NaN (allele without any peptide rows).
    mean = np.mean(hla_data["Standard Deviation"])

    # Nearest-neighbour information: first matching row, or placeholders when
    # the allele was not found in the parsed logs.
    neighbor_rows = distance_data[distance_data["Original HLA Allele"] == i]
    if len(neighbor_rows) > 0:
        neighbor = neighbor_rows["Nearest Neighbor"].iloc[0]
        distance = neighbor_rows["Distance"].iloc[0]
    else:
        neighbor = "Nan"      # placeholder string kept for compatibility with existing outputs
        distance = np.nan

    hla_column.append(i)
    variance_mean.append(mean)
    neighbor_column.append(neighbor)
    distance_column.append(distance)

    # Cluster colour code from the 9-mer sheet, "n/a" when the allele is absent there.
    cluster_rows = data_9[data_9["HLA allele"] == i]
    if len(cluster_rows) > 0:
        cluster_column.append(cluster_rows["Cluster Color Codes"].iloc[0])
    else:
        cluster_column.append("n/a")

    # Algorithm availability, taken from the first peptide length with data.
    for peptide_length in PEPTIDE_LENGTH_PRIORITY:
        print("Trying {}mer".format(peptide_length))
        mer_data = hla_data[hla_data["Peptide Length"] == peptide_length]
        if len(mer_data) == 0:
            print("No {}-mer data available for: ".format(peptide_length), i)
            continue
        algorithm_count_column.append(np.max(mer_data["Algorithm Count"]))
        algorithm_list_column.append(mer_data["Algorithm List"].iloc[0])
        algorithm_data_used_column.append("{}-mer".format(peptide_length))
        print("Success")
        break
    else:
        # No peptide length had data for this allele. Placeholders are still
        # appended so every column keeps exactly one entry per allele;
        # otherwise the column assignments below fail on a length mismatch.
        print("ERROR")
        algorithm_count_column.append(np.nan)
        algorithm_list_column.append([])
        algorithm_data_used_column.append("n/a")

overall_data["HLA allele"] = hla_column
overall_data["Nearest Neighbor"] = neighbor_column
overall_data["Distance"] = np.array(distance_column).astype(float)
overall_data["Standard Deviation Mean"] = variance_mean
overall_data["Cluster"] = cluster_column
overall_data["Algorithm Count"] = algorithm_count_column
overall_data["Algorithm List"] = algorithm_list_column
overall_data["Algorithm by Peptide Length"] = algorithm_data_used_column

In [None]:
#overall_data.to_excel("Supplementary table with HLA summary information.xlsx")

In [None]:
# Distance to nearest neighbour vs. mean standard deviation, coloured by cluster.
# NOTE(review): the palette is a positional list, so which colour lands on
# which Cluster value depends on the order seaborn assigns to the hue levels
# -- confirm the mapping matches the intended cluster colours.
cluster_palette = ["red", "black", "orange", "purple", "grey", "green", "blue"]
scatter_ax = sns.scatterplot(
    x="Distance",
    y="Standard Deviation Mean",
    hue="Cluster",
    palette=cluster_palette,
    size=3,
    alpha=0.8,
    data=overall_data,
)
# Only the first seven legend entries are shown, placed outside the axes.
handles, labels = scatter_ax.get_legend_handles_labels()
plt.legend(handles[:7], labels[:7], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=13)
plt.xticks(rotation=90)
#plt.savefig("HLA distance vs standard deviation mean by hla cluster.pdf", dpi=300)

## NetMHCpan training data analysis

In [None]:
def _read_training_folds(file_suffix, delimiter):
    """Read folds f000..f004 with the given file suffix and return their rows.

    Each line has its surrounding newline characters stripped and is split on
    ``delimiter``; the result is a list with one list of fields per line, in
    fold order.
    """
    rows = []
    for fold in range(5):
        fold_path = "./NetMHCpan4.0 dataset/f00" + str(fold) + file_suffix
        with open(fold_path, "r") as fp:
            rows.extend(line.strip("\n").split(delimiter) for line in fp)
    return rows


# The "_ba" files are split on single spaces, the "_el" files on tabs.
training_data_ba = pd.DataFrame(_read_training_folds("_ba.txt", " "))

training_data_el = pd.DataFrame(_read_training_folds("_el.txt", "\t"))

In [None]:
# Count how many rows of each training table carry each distinct value of column 2.
# NOTE(review): column 2 is presumably the allele name (these counts are later
# looked up by HLA allele with "*" removed) -- confirm against the dataset files.
ba_dict = dict(Counter(training_data_ba[2]))
el_dict = dict(Counter(training_data_el[2]))

In [None]:
# NOTE: this binds a second name to the SAME DataFrame (no copy is made).
# Columns added to supp_data below therefore also appear on overall_data;
# the later "Training Data (BA)" scatter plot reads that column from
# overall_data and relies on this aliasing.
supp_data = overall_data

In [None]:
# Look up, for every allele in the summary table, how many training rows it
# has in the BA and EL datasets. The count dictionaries are keyed by the
# allele name without "*", so that character is removed before the lookup.
# dict.get(..., 0) gives 0 for alleles with no training rows, without the
# bare `except:` that would also hide unrelated errors.
training_keys = [allele.replace("*", "") for allele in supp_data['HLA allele']]
ba_count_list = [ba_dict.get(key, 0) for key in training_keys]
el_count_list = [el_dict.get(key, 0) for key in training_keys]
supp_data['Training Data (BA)'] = ba_count_list
supp_data['Training Data (EL)'] = el_count_list

In [None]:
#supp_data.to_excel("Supplementary table with HLA summary information.xlsx")

In [None]:
# Same scatter plot, restricted to alleles whose algorithm count is exactly 4.
# NOTE(review): positional palette -- the colour/cluster pairing depends on
# the hue-level order within this subset; confirm it is the intended one.
four_algorithm_alleles = overall_data[overall_data["Algorithm Count"] == 4]
cluster_palette = ["orange", "purple","red","black", "blue", "green"]
scatter_ax = sns.scatterplot(
    x="Distance",
    y="Standard Deviation Mean",
    hue="Cluster",
    palette=cluster_palette,
    size=3,
    alpha=0.8,
    data=four_algorithm_alleles,
)
# Only the first six legend entries are shown, placed outside the axes.
handles, labels = scatter_ax.get_legend_handles_labels()
plt.legend(handles[:6], labels[:6], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=13)
plt.xticks(rotation=90)
#plt.savefig("HLA distance vs standard deviation mean by hla cluster (algorithm = 4).pdf", dpi=300)

In [None]:
# Same scatter plot, restricted to alleles whose algorithm count is exactly 8.
# NOTE(review): positional palette -- the colour/cluster pairing depends on
# the hue-level order within this subset; confirm it is the intended one.
eight_algorithm_alleles = overall_data[overall_data["Algorithm Count"] == 8]
cluster_palette = ["orange","purple", "red",  "grey", "green", "blue"]
scatter_ax = sns.scatterplot(
    x="Distance",
    y="Standard Deviation Mean",
    hue="Cluster",
    palette=cluster_palette,
    size=3,
    alpha=0.8,
    data=eight_algorithm_alleles,
)
# Only the first six legend entries are shown, placed outside the axes.
handles, labels = scatter_ax.get_legend_handles_labels()
plt.legend(handles[:6], labels[:6], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=13)
plt.xticks(rotation=90)
#plt.savefig("HLA distance vs standard deviation mean by hla cluster (algorithm = 8).pdf", dpi=300)

In [None]:
# Same scatter plot, restricted to alleles whose algorithm count is above 4.
# NOTE(review): positional palette -- the colour/cluster pairing depends on
# the hue-level order within this subset; confirm it is the intended one.
multi_algorithm_alleles = overall_data[overall_data["Algorithm Count"] > 4]
cluster_palette = ["red", "orange","purple", "grey",  "green", "blue", "black"]
scatter_ax = sns.scatterplot(
    x="Distance",
    y="Standard Deviation Mean",
    hue="Cluster",
    palette=cluster_palette,
    size=3,
    alpha=0.8,
    data=multi_algorithm_alleles,
)
# Only the first seven legend entries are shown, placed outside the axes.
handles, labels = scatter_ax.get_legend_handles_labels()
plt.legend(handles[:7], labels[:7], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=13)
plt.xticks(rotation=90)
#plt.savefig("HLA distance vs standard deviation mean by hla cluster (4 < algorithm <= 8).pdf", dpi=300)

In [None]:
# Distribution of nearest-neighbour distance per cluster: violins with the
# individual alleles overlaid as points on the same axes.
fig, ax = plt.subplots(figsize=(12, 7))
sns.violinplot(x="Cluster", y="Distance", data=overall_data, ax=ax)
ax = sns.scatterplot(x="Cluster", y="Distance", data=overall_data, ax=ax)
#plt.savefig("Violin plot HLA distance by hla cluster.pdf", dpi=300)

In [None]:
# Distribution of the mean standard deviation per cluster: violins with the
# individual alleles overlaid as points on the same axes.
fig, ax = plt.subplots(figsize=(12, 7))
sns.violinplot(x="Cluster", y="Standard Deviation Mean", data=overall_data, ax=ax)
ax = sns.scatterplot(x="Cluster", y="Standard Deviation Mean", data=overall_data, ax=ax)
#plt.savefig("Violin plot standard deviation mean by hla cluster.pdf", dpi=300)

In [None]:
# BA training-row count vs. mean standard deviation, one point per allele.
# The "Training Data (BA)" column reaches overall_data through supp_data,
# which is another name for the same DataFrame.
fig, ax = plt.subplots(figsize=(12, 7))
ax = sns.scatterplot(x="Training Data (BA)", y="Standard Deviation Mean", data=overall_data, ax=ax)
#plt.savefig("Training data (BA) vs Standard deviation mean.pdf", dpi=300)

## Network Graphs

In [None]:
import py4cytoscape as p4c

In [None]:
# Build the node and edge tables for the HLA nearest-neighbour network and
# send them to Cytoscape through py4cytoscape.

# Per-allele lookups (first occurrence wins), built in one pass over the
# table instead of re-scanning the whole DataFrame for every node.
training_ba_by_allele = {}
cluster_by_allele = {}
for allele, ba_count, cluster in zip(supp_data["HLA allele"],
                                     supp_data["Training Data (BA)"],
                                     supp_data["Cluster"]):
    training_ba_by_allele.setdefault(allele, ba_count)
    cluster_by_allele.setdefault(allele, cluster)

# Nodes: every allele plus every nearest neighbour, de-duplicated in
# first-appearance order so the node table is the same on every run.
node_id = list(dict.fromkeys(list(supp_data["HLA allele"]) + list(supp_data["Nearest Neighbor"])))

# Node size is the BA training count + 1; nodes that only appear as a
# neighbour (no row of their own) get size 1.
node_size = [training_ba_by_allele.get(node, 0) + 1 for node in node_id]

# Node colour is the cluster value; "n/a" clusters and nodes without a row
# of their own are drawn white.
node_color = []
for node in node_id:
    cluster = cluster_by_allele.get(node, "n/a")
    node_color.append("white" if cluster == "n/a" else cluster)
nodes = pd.DataFrame(data = {'id' : node_id, 'size' : node_size, 'color': node_color})

# Edges: allele -> nearest neighbour, weighted by distance; rows where the
# allele is its own nearest neighbour are skipped.
# NOTE(review): alleles without nearest-neighbour information carry the
# placeholder neighbour "Nan" and a NaN distance in the summary table, so
# they all get an edge to a shared "Nan" node -- confirm this is intended.
edge_rows = supp_data[supp_data["HLA allele"] != supp_data["Nearest Neighbor"]]
source = list(edge_rows["HLA allele"])
target = list(edge_rows["Nearest Neighbor"])
weight = list(edge_rows["Distance"])
edges = pd.DataFrame(data = {'source' : source, 'target' : target, 'weight' : weight})
p4c.create_network_from_data_frames(nodes, edges, title="HLA Alleles Network")

In [None]:
# Minimal example network: four nodes (A-D) and four edges, created in the
# "Biological Example" collection under the title "simple network".
# NOTE: this overwrites the `nodes` and `edges` tables of the HLA network above.
# NOTE(review): looks like a connectivity/demo check rather than part of the
# analysis, and presumably needs a running Cytoscape session -- confirm.
nodes = pd.DataFrame(data={'id': ["A", "B", "C", "D"]})
edges = pd.DataFrame(data={'source': ["C", "B", "B", "B"], 'target': ["D", "A", "D", "C"]})

p4c.create_network_from_data_frames(nodes, edges, title="simple network", collection="Biological Example")