In [None]:
import numpy as np
import pandas as pd
import sys
import os
import math
import regex
import pickle
import collections
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [None]:
def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl`` (relative to the working directory).

    Parameters
    ----------
    obj : any picklable object
        The object to serialise.
    name : str
        File stem; the target path is ``'obj/' + name + '.pkl'``.

    The target directory is created when missing, so a fresh checkout does
    not fail with ``FileNotFoundError`` on the first save.
    """
    path = 'obj/'+ name + '.pkl'
    # dirname() rather than a hard-coded 'obj' so a name containing a
    # sub-directory (e.g. 'round4/peptides') also gets its folder created.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Read back the object pickled at ``obj/<name>.pkl``.

    Parameters
    ----------
    name : str
        File stem; the file opened is ``'obj/' + name + '.pkl'``.

    Returns
    -------
    The unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

### For finding the sequences that are at most one mutation away
def one_mutation_sequence_match(seq1, seq2):
    """Return True if ``seq1`` occurs in ``seq2`` with at most one substitution.

    Every window of ``seq2`` with the same length as ``seq1`` is compared
    character by character; a window matches when it differs from ``seq1``
    in zero or one position. Insertions and deletions are not allowed.

    This is a literal comparison. Interpolating ``seq1`` into a fuzzy regular
    expression instead would let characters such as ``.``, ``*`` or ``(``
    act as pattern syntax or raise a pattern error.

    Parameters
    ----------
    seq1 : str
        The query sequence.
    seq2 : str
        The sequence searched for a near-match of ``seq1``.

    Returns
    -------
    bool
        True when some window matches; False otherwise (including when
        ``seq2`` is shorter than ``seq1``).
    """
    width = len(seq1)
    for start in range(len(seq2) - width + 1):
        mismatches = 0
        for expected, actual in zip(seq1, seq2[start:start + width]):
            if expected != actual:
                mismatches += 1
                if mismatches > 1:
                    # Second substitution: this window cannot match.
                    break
        else:
            # Loop finished without a break -> at most one substitution.
            return True
    return False

### For creating the dictionary matching all original epitopes to their mutation and scores
def create_mutation_dictionary(og_epitopes, all_mutated_epitopes, all_mutated_median):
    """Map each original epitope to its near-identical mutants and their scores.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original epitope sequences.
    all_mutated_epitopes : iterable of str
        Candidate mutated sequences.
    all_mutated_median : iterable
        Scores aligned position-by-position with ``all_mutated_epitopes``.

    Returns
    -------
    dict
        ``{original: {mutant: score}}`` holding, for every original epitope,
        each candidate accepted by ``one_mutation_sequence_match``. If a
        mutant sequence appears more than once, the last score wins.

    Notes
    -----
    Sequences and scores are paired by position with ``zip``. Indexing the
    scores with the ``enumerate`` counter (``all_mutated_median[k]``) is a
    label lookup on a pandas Series, and only coincides with position when
    the index is the default 0..n-1 range.
    """
    # Materialise once so generators/Series are walked a single time and the
    # pairing can be reused for every original epitope.
    mutant_scores = list(zip(all_mutated_epitopes, all_mutated_median))
    mutation_dict = {}
    for original in og_epitopes:
        mutation_dict[original] = {
            mutant: median
            for mutant, median in mutant_scores
            if one_mutation_sequence_match(original, mutant)
        }
    return mutation_dict

amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

### Per-epitope, per-position sum of absolute score differences
def calculate_sum_diff(og_epitopes, og_scores, all_mutant_dict):
    """Sum ``|mutant score - original score|`` over all residues, per position.

    For every epitope and every position in it, each entry of
    ``amino_acids`` is substituted at that position, the resulting peptide's
    score is looked up, and the absolute differences from the epitope's own
    score are added together.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original epitope sequences.
    og_scores : iterable of numbers
        Score of each original epitope, aligned with ``og_epitopes``.
    all_mutant_dict : dict
        ``{epitope: {peptide: score}}``; must contain every substituted
        peptide (a missing one raises ``KeyError``).

    Returns
    -------
    dict
        ``{epitope: [[position, summed_difference], ...]}`` with zero-based
        positions in ascending order.
    """
    sum_dict = {}
    for epitope, baseline in zip(og_epitopes, og_scores):
        scores_by_peptide = all_mutant_dict[epitope]
        per_position = []
        for pos in range(len(epitope)):
            head, tail = epitope[:pos], epitope[pos+1:]
            total = sum(
                abs(scores_by_peptide[head + residue + tail] - baseline)
                for residue in amino_acids
            )
            per_position.append([pos, total])
        sum_dict[epitope] = per_position
    return sum_dict

### for calculating the overall sum differences across all original epitopes
def overall_pos_sum_diff(sum_diff_results, pos_length):
    """Add up the per-position scores of every epitope.

    Parameters
    ----------
    sum_diff_results : dict
        ``{epitope: [[position, score], ...]}``.
    pos_length : int
        Number of positions; every ``position`` must be a valid index into a
        list of this length.

    Returns
    -------
    list
        ``pos_length`` totals, where element ``p`` is the sum of all scores
        recorded for position ``p``.

    Notes
    -----
    The stored ``[position, score]`` entries are iterated directly. The loop
    therefore does not depend on the key being a string whose length happens
    to equal the number of entries.
    """
    overall_score = [0]*pos_length
    for per_position in sum_diff_results.values():
        for pos, score in per_position:
            overall_score[pos] += score
    return overall_score

def locate_data_by_sequence(all_epitope_data, epitope_seq_list, hla_allele):
    """Select the rows for the given epitope sequences and HLA allele.

    Parameters
    ----------
    all_epitope_data : pandas.DataFrame
        Must contain the columns ``'MT Epitope Seq'``, ``'HLA Allele'`` and
        ``'Median MT Score'``.
    epitope_seq_list : iterable of str
        Sequences to look up; the output follows this order.
    hla_allele : str
        Only rows whose ``'HLA Allele'`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        Matching rows (original index labels preserved), de-duplicated on
        ``['Median MT Score', 'HLA Allele', 'MT Epitope Seq']``. An empty
        frame with the same columns is returned when ``epitope_seq_list`` is
        empty, instead of letting ``pd.concat`` raise on an empty list.
    """
    # The allele filter does not depend on the sequence, so compute it once.
    allele_mask = all_epitope_data['HLA Allele'] == hla_allele
    seq_column = all_epitope_data['MT Epitope Seq']
    output_list = [
        all_epitope_data.loc[(seq_column == seq) & allele_mask]
        for seq in epitope_seq_list
    ]
    if not output_list:
        return all_epitope_data.iloc[0:0].copy()
    output = pd.concat(output_list)
    output = output.drop_duplicates(['Median MT Score','HLA Allele', 'MT Epitope Seq'])
    return output

def roundup(x, n):
    """Return ``ceil(x / n)`` multiplied by ``n``.

    Used to push an axis limit up to the next multiple of ``n``.
    """
    multiples = math.ceil(x / n)
    return int(multiples) * n

def rounddown(x, n):
    """Return ``floor(x / n)`` multiplied by ``n``.

    Used to pull an axis limit down to the previous multiple of ``n``.
    """
    multiples = math.floor(x / n)
    return int(multiples) * n

In [None]:
# Inputs pickled by earlier steps (read from the local obj/ directory).
# The plotting cell indexes og_epitopes as og_epitopes[hla_allele][length].
og_epitopes = load_obj(name='random_peptide_sets_r4')
filtered_epitopes_combined = load_obj(name='filtered_epitopes_combined_round_4')

# Result accumulators, filled by the plotting cell as
# {hla_allele: {length: ...}}.
all_anchor_sum_diff_dict = dict()
all_anchor_overall_pos_score_dict = dict()

In [None]:
## Plotting and saving for length = n
# For every HLA allele: load the mutated-peptide predictions for `length`-mers,
# compute the per-position summed score differences for each original peptide,
# store them in the two accumulator dictionaries, and save one PDF with a
# subplot per peptide plus a final "Overall Score" subplot.
length = 11
for hla_allele in og_epitopes.keys():
    print(hla_allele)
    outdir = "./anchor_analysis_pdfs/"+hla_allele
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(outdir, exist_ok=True)
    if hla_allele not in all_anchor_sum_diff_dict:
        all_anchor_sum_diff_dict[hla_allele] = {}
        all_anchor_overall_pos_score_dict[hla_allele] = {}
    try:
        all_epitopes_n_mer = pd.read_csv("./output_from_cluster_r4/"+hla_allele+"/anchor_"+str(length)+"mer_output/MHC_Class_I/ANCHOR.all_epitopes.tsv", sep='\t')
    except Exception as err:
        # Best-effort: alleles without a readable result file are skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, and the message records why it was skipped.
        print("Skipping "+hla_allele+": "+str(err))
        continue
    og_epitopes_n_mer = locate_data_by_sequence(filtered_epitopes_combined, og_epitopes[hla_allele][length], hla_allele)
    if og_epitopes_n_mer.empty:
        # Nothing to score or plot; max()/min() below would fail on no data.
        print("Skipping "+hla_allele+": no matching original epitopes")
        continue
    # Count the rows actually located: filtering and de-duplication can
    # return fewer rows than the requested peptide list holds.
    num_peptides = len(og_epitopes_n_mer)
    mutation_dictionary_n = create_mutation_dictionary(og_epitopes_n_mer['MT Epitope Seq'], all_epitopes_n_mer['Epitope Seq'], all_epitopes_n_mer['Median Score'])
    sum_diff_pos_n = calculate_sum_diff(og_epitopes_n_mer['MT Epitope Seq'], og_epitopes_n_mer['Median MT Score'], mutation_dictionary_n)
    overall_pos_scores_n = overall_pos_sum_diff(sum_diff_pos_n,length)
    all_anchor_sum_diff_dict[hla_allele][length] = sum_diff_pos_n
    all_anchor_overall_pos_score_dict[hla_allele][length] = overall_pos_scores_n

    # Use only the scores for the shared y-range. The stored entries are
    # [position, score] pairs, and including the position indices (0, 1, ...)
    # would pin the minimum to 0.
    peptide_scores = [row[1] for rows in sum_diff_pos_n.values() for row in rows]
    max_y = roundup(max(peptide_scores), 50000)
    min_y = rounddown(min(peptide_scores), 50000)
    max_overall_y = max(overall_pos_scores_n)
    min_overall_y = min(overall_pos_scores_n)

    positions = range(1,length+1)
    # Four columns, at least three rows; grow the grid when there are more
    # than 11 peptides so the extra "Overall Score" panel still fits.
    n_cols = 4
    n_rows = max(3, math.ceil((num_peptides+1) / n_cols))

    fig = plt.figure()
    fig.subplots_adjust(hspace=.8, wspace=.9)
    for i, peptide in enumerate(og_epitopes_n_mer['MT Epitope Seq'], start=1):
        ax = fig.add_subplot(n_rows, n_cols, i)
        y_data = [row[1] for row in sum_diff_pos_n[peptide]]
        ax.plot(positions,y_data)
        plt.xticks(positions, fontsize=7)
        # Six evenly spaced ticks; linspace (unlike arange with a computed
        # step) also works when min_y == max_y.
        plt.yticks(np.linspace(min_y, max_y, 6), fontsize=5)
        ax.set_xlabel('Peptide: '+str(peptide), fontsize=9)

    # Final panel: scores summed over all peptides, on its own y-range.
    ax = fig.add_subplot(n_rows, n_cols, num_peptides+1)
    ax.plot(positions,overall_pos_scores_n)
    ax.set_xlabel('Overall Score')
    plt.xticks(positions, fontsize=7)
    plt.yticks(np.linspace(min_overall_y, max_overall_y, 6), fontsize=5)

    fig.savefig("anchor_analysis_pdfs/"+hla_allele+"/"+hla_allele+"_anchor_"+str(length)+"_mer_analysis.pdf")
    # Close explicitly so figures do not accumulate across alleles.
    plt.close(fig)

In [None]:
# Ad-hoc reload of a single case (HLA-A*02:01, 9-mers) for inspection.
# This overwrites the all_epitopes_n_mer / og_epitopes_n_mer names that the
# plotting loop also assigns.
all_epitopes_n_mer = pd.read_csv(
    "./output_from_cluster_r4/"+"HLA-A*02:01"+"/anchor_"+str(9)+"mer_output/MHC_Class_I/ANCHOR.all_epitopes.tsv",
    sep='\t',
)
og_epitopes_n_mer = locate_data_by_sequence(
    all_epitope_data=filtered_epitopes_combined,
    epitope_seq_list=og_epitopes['HLA-A*02:01'][9],
    hla_allele='HLA-A*02:01',
)

In [None]:
# Build {original epitope: {mutant: score}} for the frames currently held in
# og_epitopes_n_mer / all_epitopes_n_mer.
mutation_dictionary_n = create_mutation_dictionary(
    og_epitopes=og_epitopes_n_mer['MT Epitope Seq'],
    all_mutated_epitopes=all_epitopes_n_mer['Epitope Seq'],
    all_mutated_median=all_epitopes_n_mer['Median Score'],
)