In [None]:
import numpy as np
import pandas as pd
import regex
import random
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import os
import matplotlib.style as style
import matplotlib.cm as mplcm
import matplotlib.colors as colors
from collections import Counter
import csv
import seaborn as sns
import matplotlib as mpl
import gzip
# Font type 42 embeds TrueType fonts, keeping text editable in exported PDFs.
mpl.rcParams['pdf.fonttype'] = 42

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the old names, so pick whichever this install offers.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl`` relative to the working directory.

    Parameters
    ----------
    obj : any picklable object.
    name : str
        File stem; may contain sub-directories (e.g. ``"run_1/scores"``).

    The target directory is created when missing, so the call also works in a
    fresh checkout where ``obj/`` does not exist yet.
    """
    path = 'obj/' + name + '.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, obj_dir='/Users/h.xia/Desktop/Griffith_Lab/R_shiny_visualization/anchor_analysis/obj/'):
    """Load and return the object pickled at ``<obj_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        File stem, optionally with sub-directories.
    obj_dir : str, optional
        Directory holding the pickles. The default is the original hard-coded
        location; pass another directory (e.g. ``'obj/'``) to run elsewhere.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    # os.path.join(obj_dir, '') guarantees exactly one trailing separator.
    path = os.path.join(obj_dir, '') + name + '.pkl'
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(path, 'rb') as f:
        return pickle.load(f)

# One-letter codes of the 20 residues used for every substitution below.
amino_acids = [
    'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
]
def all_possible_mutations(peptides):
    """Return every single-position substitution of every peptide.

    For each peptide, each position is replaced in turn by each entry of
    ``amino_acids`` (including the residue already present, so the unmodified
    peptide appears once per position). Results are ordered by peptide, then
    position, then residue.
    """
    return [
        peptide[:position] + residue + peptide[position + 1:]
        for peptide in peptides
        for position in range(len(peptide))
        for residue in amino_acids
    ]

def get_random_str(main_str, substr_len):
    """Return a uniformly chosen contiguous slice of ``main_str`` of length ``substr_len``.

    Raises ``ValueError`` (from ``random.randrange``) when ``main_str`` is
    shorter than ``substr_len``.
    """
    last_valid_start = len(main_str) - substr_len
    start = random.randrange(0, last_valid_start + 1)
    return main_str[start:start + substr_len]

## Random Peptides Generation

In [None]:
# One long random residue string (100,000,000 draws with replacement from
# amino_acids); the 9-mers below are cut out of it.
random_peptide_sequence = "".join(random.choices(amino_acids, k=100000000))

In [None]:
# 50,000 random 9-residue windows of the random sequence (repeats are possible).
random_peptide_list = [
    get_random_str(random_peptide_sequence, 9)
    for _ in range(50000)
]

In [None]:
# Write the discovery-phase random 9-mers as FASTA records numbered from 1.
# The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_random_9mer_input_discovery_phase_add.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(random_peptide_list, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

In [None]:
# Load the scored discovery-phase random peptides and keep the sequences of
# the rows whose "Median Score" is at most 500.
all_epitopes_random = pd.read_table("./ANCHOR.all_epitopes.random.discovery.add.tsv", delimiter='\t')
random_is_good_binder = all_epitopes_random["Median Score"] <= 500
good_binding_random_peptides = all_epitopes_random[random_is_good_binder]["Epitope Seq"].tolist()
print(len(good_binding_random_peptides))

In [None]:
# 1,000 good binders drawn without replacement; random.sample raises
# ValueError if fewer than 1,000 candidates are available.
random_peptide_list_1000 = random.sample(population=good_binding_random_peptides, k=1000)

In [None]:
# Expand the 1,000 selected random peptides into all single-position
# substitutions and write them as FASTA records numbered from 1.
all_mutated = all_possible_mutations(random_peptide_list_1000)
# The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_random_9mer_input.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(all_mutated, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

## Reference Proteome Peptide Generation

In [None]:
# Read the gzipped reference proteome FASTA into one string per record.
# Header lines start with ">"; the sequence lines that follow are joined.
protein_seq_list = []
in_record = False  # True once the current record already has an entry
with gzip.open("Homo_sapiens.GRCh38.pep.all.fa.gz", 'rt') as refseq:
    for line in refseq:
        if line[0] == ">":
            in_record = False
            continue
        piece = line.split("\n")[0]
        if in_record:
            protein_seq_list[-1] += piece
        else:
            protein_seq_list.append(piece)
            in_record = True

# Drop every record containing "*". The earlier line-level skip left the
# pop/merge step gluing unrelated records together whenever a "*" line was
# skipped, so the filter is applied to whole records instead.
protein_seq_list = [seq for seq in protein_seq_list if "*" not in seq]

In [None]:
# Only proteins with at least 9 residues can yield a 9-mer window.
protein_seq_list_above_8mer = [seq for seq in protein_seq_list if len(seq) >= 9]

In [None]:
# 30,000 wild-type 9-mers: pick a random protein, then a random window in it.
wt_peptide_list = [
    get_random_str(random.choice(protein_seq_list_above_8mer), 9)
    for _ in range(30000)
]

In [None]:
# Write the discovery-phase wild-type 9-mers as FASTA records numbered from 1.
# The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_wt_9mer_input_discovery_phase.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(wt_peptide_list, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

In [None]:
# Load the scored discovery-phase wild-type peptides and keep the sequences
# of the rows whose "Median Score" is at most 500; report the distinct count.
all_epitopes_wt = pd.read_table("./ANCHOR.all_epitopes.wt.discovery.tsv", delimiter='\t')
wt_is_good_binder = all_epitopes_wt["Median Score"] <= 500
good_binding_wt_peptides = all_epitopes_wt[wt_is_good_binder]["Epitope Seq"].tolist()
print(len(np.unique(good_binding_wt_peptides)))

In [None]:
# 1,000 good binders drawn without replacement by list position; the source
# list is not de-duplicated first, so repeated sequences can be selected.
wt_peptide_list_1000 = random.sample(population=good_binding_wt_peptides, k=1000)

In [None]:
# Expand the 1,000 selected wild-type peptides into all single-position
# substitutions and write them as FASTA records numbered from 1.
all_mutated = all_possible_mutations(wt_peptide_list_1000)
# The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_wt_9mer_input_1000.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(all_mutated, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

## Viral Genome Peptides

In [None]:
# Read the FASTA file into one string per record. Header lines start with
# ">"; the sequence lines that follow are joined together.
covid_seq_list = []
in_record = False  # True once the current record already has an entry
with open("covid19_proteins.fa") as refseq:
    for line in refseq:
        if line[0] == ">":
            in_record = False
            continue
        piece = line.split("\n")[0]
        if in_record:
            covid_seq_list[-1] += piece
        else:
            covid_seq_list.append(piece)
            in_record = True

# Sample 30,000 random 9-mers. Proteins shorter than 9 residues cannot yield
# a window (get_random_str would raise ValueError), so they are excluded.
covid_candidates = [seq for seq in covid_seq_list if len(seq) >= 9]
covid_peptide_list = [
    get_random_str(random.choice(covid_candidates), 9)
    for _ in range(30000)
]
    

In [None]:
# Read the gzipped viral proteome FASTA into one string per record. Header
# lines start with ">"; the sequence lines that follow are joined together.
viral_seq_list = []
in_record = False  # True once the current record already has an entry
with gzip.open("uniprot-compressed_true_download_true_format_fasta_query__28proteome-2022.08.12-16.26.49.28.fasta.gz", 'rt') as refseq:
    for line in refseq:
        if line[0] == ">":
            in_record = False
            continue
        piece = line.split("\n")[0]
        if in_record:
            viral_seq_list[-1] += piece
        else:
            viral_seq_list.append(piece)
            in_record = True
# No explicit close: the with-block has already closed the file.

In [None]:
# Sample 30,000 random viral 9-mers. Proteins shorter than 9 residues cannot
# yield a window (get_random_str would raise ValueError), so they are
# excluded, mirroring the length filter used for the reference proteome.
viral_candidates = [seq for seq in viral_seq_list if len(seq) >= 9]
viral_peptide_list = [
    get_random_str(random.choice(viral_candidates), 9)
    for _ in range(30000)
]

In [None]:
# Draw 50,000 more viral 9-mers and keep those not already present in
# viral_peptide_list. Repeats within the new list itself are not removed.
# Proteins shorter than 9 residues are excluded because get_random_str
# cannot cut a 9-mer from them.
viral_candidates = [seq for seq in viral_seq_list if len(seq) >= 9]
already_sampled = set(viral_peptide_list)  # constant-time membership test
viral_peptide_list_add = []
for _ in range(50000):
    random_9_mer = get_random_str(random.choice(viral_candidates), 9)
    if random_9_mer not in already_sampled:
        viral_peptide_list_add.append(random_9_mer)

In [None]:
# (total drawn, number of distinct sequences)
(len(viral_peptide_list_add), len(np.unique(np.asarray(viral_peptide_list_add))))

In [None]:
# Write the additional discovery-phase viral 9-mers as FASTA records numbered
# from 1. The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_viral_9mer_input_discovery_phase_add.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(viral_peptide_list_add, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

In [None]:
# Combine both scored viral discovery tables, keep the sequences of the rows
# whose "Median Score" is at most 500, and report total vs. distinct counts.
all_epitopes_viral_1 = pd.read_table("./ANCHOR.all_epitopes.viral.discovery.tsv", delimiter='\t')
all_epitopes_viral_2 = pd.read_table("./ANCHOR.all_epitopes.viral.discovery.add.tsv", delimiter='\t')
all_epitopes_viral = pd.concat([all_epitopes_viral_1, all_epitopes_viral_2])
viral_is_good_binder = all_epitopes_viral["Median Score"] <= 500
good_binding_viral_peptides = all_epitopes_viral[viral_is_good_binder]["Epitope Seq"].tolist()
print(len(good_binding_viral_peptides), len(np.unique(good_binding_viral_peptides)))

In [None]:
# 1,000 good binders drawn without replacement by list position; the source
# list is not de-duplicated first, so repeated sequences can be selected.
viral_peptide_list_1000 = random.sample(population=good_binding_viral_peptides, k=1000)

In [None]:
# Expand the 1,000 selected viral peptides into all single-position
# substitutions and write them as FASTA records numbered from 1.
all_mutated = all_possible_mutations(viral_peptide_list_1000)
# The context manager closes the file even if a write fails part-way.
with open("HLA-A0201_anchor_viral_9mer_input.fa", 'w') as fasta_file:
    for record_number, peptide in enumerate(all_mutated, start=1):
        fasta_file.write(">" + str(record_number) + '\n')
        fasta_file.write(peptide + '\n')

## Peptide Results Analysis

In [None]:
### Test whether two sequences are at most one substitution apart
def one_mutation_sequence_match(seq1, seq2):
    """Return True if ``seq1`` occurs in ``seq2`` with at most one substitution.

    Uses the ``regex`` module's fuzzy matching: ``{s<=1}`` allows up to one
    substituted character. ``seq1`` is inserted into the pattern unescaped, so
    it must not contain regex metacharacters.
    """
    fuzzy_pattern = "(" + seq1 + "){s<=1}"
    hits = regex.findall(fuzzy_pattern, seq2, overlapped=True)
    return len(hits) > 0

### For creating the dictionary matching all original epitopes to their mutation and scores
def create_mutation_dictionary(og_epitopes, all_mutated_epitopes, all_mutated_median):
    """Map each original epitope to its single-substitution variants and scores.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original (unmutated) epitope sequences.
    all_mutated_epitopes : sequence of str
        Scored sequences to search through.
    all_mutated_median : sequence of numbers
        Scores aligned position-by-position with ``all_mutated_epitopes``.

    Returns
    -------
    dict
        ``{og_epitope: {mutated_epitope: score}}``. An epitope with no match
        maps to an empty dict. If a mutated sequence occurs more than once,
        the score of its last occurrence is kept.
    """
    # Pair sequences and scores by position. Fetching the score with
    # all_mutated_median[k] is a label lookup on a pandas Series and only
    # agrees with the enumeration position for a default 0..n-1 index.
    scored_mutants = list(zip(all_mutated_epitopes, all_mutated_median))
    mutation_dict = {}
    for og_epitope in og_epitopes:
        matches = {}
        for mutated_epitope, median in scored_mutants:
            if one_mutation_sequence_match(og_epitope, mutated_epitope):
                matches[mutated_epitope] = median
        mutation_dict[og_epitope] = matches
    return mutation_dict

# One-letter codes of the 20 residues substituted at each position.
amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

### for calculating sum differences for each individual epitope for each individual position
def calculate_sum_diff(og_epitopes, all_scores, all_mutant_dict): 
    """Sum, per position, the absolute score change caused by each substitution.

    Parameters
    ----------
    og_epitopes : sequence of str
        Original epitope sequences.
    all_scores : pandas.DataFrame
        Needs 'Epitope Seq' and 'Median Score' columns; the first row matching
        an epitope provides its original score.
    all_mutant_dict : dict
        ``{og_epitope: {mutated_epitope: score}}``.

    Returns
    -------
    dict
        ``{og_epitope: [[position, summed_abs_difference], ...]}`` with 0-based
        positions. Epitopes with no recorded mutants map to ``[]``.

    Raises
    ------
    IndexError
        If an epitope is missing from ``all_scores``.
    KeyError
        If an epitope is missing from ``all_mutant_dict`` or a substitution is
        missing from a non-empty mutant dict.
    """
    # Keep the scores in og_epitopes order. np.unique would sort and
    # de-duplicate them, pairing epitopes with the wrong score and dropping
    # epitopes whenever two scores tie.
    og_scores = [
        all_scores[all_scores['Epitope Seq'] == epitope]['Median Score'].iloc[0]
        for epitope in og_epitopes
    ]
    sum_dict = {}
    for epitope, og_score in zip(og_epitopes, og_scores):
        mutant_scores = all_mutant_dict[epitope]
        sum_dict[epitope] = []
        if len(mutant_scores) == 0:
            continue
        for position in range(len(epitope)):
            sum_score_diff = 0
            for residue in amino_acids:
                query_peptide = epitope[:position] + residue + epitope[position + 1:]
                sum_score_diff += abs(mutant_scores[query_peptide] - og_score)
            sum_dict[epitope].append([position, sum_score_diff])

    return sum_dict

### for calculating the overall sum differences across all original epitopes
def overall_pos_sum_diff(sum_diff_results, pos_length):
    """Add up the per-position scores of every epitope.

    Parameters
    ----------
    sum_diff_results : dict
        ``{epitope: [[position, score], ...]}`` with 0-based positions.
    pos_length : int
        Number of positions in the returned list.

    Returns
    -------
    list
        ``pos_length`` totals, one per position.

    Raises
    ------
    IndexError
        If a recorded position is outside ``range(pos_length)``.
    """
    overall_score = [0]*pos_length
    # Iterate over the recorded entries rather than over len(epitope), so an
    # epitope with an empty entry list is skipped instead of raising.
    for per_position in sum_diff_results.values():
        for pos, score in per_position:
            overall_score[pos] += score
    return overall_score

def calculate_all_diff(og_epitopes, all_scores, all_mutant_dict): 
    """Collect the score of every substitution at every position.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original epitope sequences.
    all_scores : pandas.DataFrame
        Unused; kept so existing calls keep working.
    all_mutant_dict : dict
        ``{og_epitope: {mutated_epitope: score}}``.

    Returns
    -------
    dict
        ``{og_epitope: {position: {residue: score}}}`` with 1-based positions.
        The values are the mutant scores themselves, not differences.

    Raises
    ------
    KeyError
        If an epitope or one of its substitutions is missing from
        ``all_mutant_dict``.
    """
    # Iterate over the epitopes directly. Zipping them with np.unique-reduced
    # scores silently dropped epitopes whenever two original scores were
    # equal, and the score itself was never used.
    all_diff_dict = {}
    for epitope in og_epitopes:
        mutant_scores = all_mutant_dict[epitope]
        all_diff_dict[epitope] = {
            position + 1: {
                residue: mutant_scores[epitope[:position] + residue + epitope[position + 1:]]
                for residue in amino_acids
            }
            for position in range(len(epitope))
        }

    return all_diff_dict

In [None]:
# Scores of the mutated random peptides. Note this rebinds all_epitopes_random,
# which previously held the discovery-phase table.
all_epitopes_random = pd.read_table("./ANCHOR.all_epitopes.random.tsv", delimiter='\t')
# 100 good binders drawn with replacement (repeats possible).
# NOTE(review): the wt/viral cells reuse their *_peptide_list_1000 selection
# here; confirm that sampling from the full discovery list is intended.
good_binders_random = random.choices(population=good_binding_random_peptides, k=100)

In [None]:
# Original epitopes for the wild-type set: the 1,000 peptides selected above.
good_binders_wt = wt_peptide_list_1000
# Scores of the mutated wild-type peptides (rebinds all_epitopes_wt).
all_epitopes_wt = pd.read_table("./ANCHOR.all_epitopes.wt.tsv", delimiter='\t')

In [None]:
# Original epitopes for the viral set: the 1,000 peptides selected above.
good_binders_viral = viral_peptide_list_1000
# Scores of the mutated viral peptides (rebinds all_epitopes_viral).
all_epitopes_viral = pd.read_table("./ANCHOR.all_epitopes.viral.tsv", delimiter='\t')

In [None]:
# For each peptide source: build the mutation dictionary, save it, compute
# the per-epitope position sums, then total them over 9 positions.
overall_pos_scores = {}

# --- Random peptides ---
og_epitopes_random = good_binders_random
print("Here 1")
mutation_dictionary_random = create_mutation_dictionary(
    og_epitopes=og_epitopes_random,
    all_mutated_epitopes=all_epitopes_random['Epitope Seq'],
    all_mutated_median=all_epitopes_random['Median Score'],
)
save_obj(obj=mutation_dictionary_random, name="mutation_dictionary_random_comparison_analysis")
print("Here 2")
sum_diff_pos_random = calculate_sum_diff(
    og_epitopes=og_epitopes_random,
    all_scores=all_epitopes_random,
    all_mutant_dict=mutation_dictionary_random,
)
print("Here 3")
overall_pos_scores["Random Peptides"] = overall_pos_sum_diff(sum_diff_results=sum_diff_pos_random, pos_length=9)

# --- Wild-type peptides ---
og_epitopes_wt = good_binders_wt
print("Here 1")
mutation_dictionary_wt = create_mutation_dictionary(
    og_epitopes=og_epitopes_wt,
    all_mutated_epitopes=all_epitopes_wt['Epitope Seq'],
    all_mutated_median=all_epitopes_wt['Median Score'],
)
save_obj(obj=mutation_dictionary_wt, name="mutation_dictionary_wt_comparison_analysis")
print("Here 2")
sum_diff_pos_wt = calculate_sum_diff(
    og_epitopes=og_epitopes_wt,
    all_scores=all_epitopes_wt,
    all_mutant_dict=mutation_dictionary_wt,
)
print("Here 3")
overall_pos_scores["WildType Peptides"] = overall_pos_sum_diff(sum_diff_results=sum_diff_pos_wt, pos_length=9)

# --- Viral peptides ---
og_epitopes_viral = good_binders_viral
print("Here 1")
mutation_dictionary_viral = create_mutation_dictionary(
    og_epitopes=og_epitopes_viral,
    all_mutated_epitopes=all_epitopes_viral['Epitope Seq'],
    all_mutated_median=all_epitopes_viral['Median Score'],
)
save_obj(obj=mutation_dictionary_viral, name="mutation_dictionary_viral_comparison_analysis")
print("Here 2")
sum_diff_pos_viral = calculate_sum_diff(
    og_epitopes=og_epitopes_viral,
    all_scores=all_epitopes_viral,
    all_mutant_dict=mutation_dictionary_viral,
)
print("Here 3")
overall_pos_scores["Viral Peptides"] = overall_pos_sum_diff(sum_diff_results=sum_diff_pos_viral, pos_length=9)

In [None]:
# Load the previously pickled saturation-analysis results and take the entry
# stored under key 1000.
# NOTE(review): presumably keyed by number of peptides used — confirm.
og_saturation_results = load_obj(
    "saturation_analysis/random_round_1/overall_pos_scores_9_mer_dict"
)
gold_standard_1000 = og_saturation_results[1000]

In [None]:
def _as_fractions(position_scores):
    """Divide every score by the sum of all scores in the list."""
    total = sum(position_scores)
    return [value / total for value in position_scores]


# One row per peptide source; "scores" holds that source's per-position
# totals rescaled to sum to 1 so the sources are directly comparable.
comparison_data = pd.DataFrame(data = {
    "methods": ["Random Peptides", "WildType Peptides", "Viral Peptides", "Tumor mutation derived Peptides"],
    "scores": [
        _as_fractions(overall_pos_scores['Random Peptides']),
        _as_fractions(overall_pos_scores['WildType Peptides']),
        _as_fractions(overall_pos_scores['Viral Peptides']),
        _as_fractions(gold_standard_1000),
    ]}
)

In [None]:
# One line per peptide source: normalized score at each 1-based position.
pos = [1,2,3,4,5,6,7,8,9]
for method, score in zip(comparison_data["methods"], comparison_data["scores"]):
    plt.plot(pos, score, label = method)
plt.title("Comparison of anchor pattern for HLA-A*02:01 using peptides from seed dataset versus other methods")
# Axis labels so the figure can be read on its own.
plt.xlabel("Peptide position")
plt.ylabel("Normalized position score")
legend = plt.legend(title="Peptide Source", shadow=True)
legend.get_frame().set_facecolor('white')
#plt.savefig("Comparison of anchor pattern for HLA-A*02:01 using peptides from seed dataset versus other methods.png", dpi=300)
plt.show()