In [None]:
import numpy as np
import pandas as pd
import regex
import random
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import os
import matplotlib.style as style
import matplotlib.cm as mplcm
import matplotlib.colors as colors
import upsetplot
from collections import Counter
import csv
import seaborn as sns
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

plt.style.use('seaborn')

def save_obj(obj, name ):
    """Pickle ``obj`` into ``obj/<name>.pkl`` with the highest available protocol."""
    target_path = 'obj/'+ name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object stored in ``obj/<name>.pkl``."""
    source_path = 'obj/' + name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files this project wrote.
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# One-letter codes of the 20 amino acids used for every substitution.
amino_acids = list("ARNDCEQGHILKMFPSTWYV")
def all_possible_mutations(peptides):
    """Return every single-residue substitution of every peptide.

    For each peptide, each position is replaced in turn by each entry of
    ``amino_acids`` (so the unchanged peptide appears once per position).
    Output order is peptide, then position, then amino acid.
    """
    return [
        peptide[:position] + residue + peptide[position + 1:]
        for peptide in peptides
        for position in range(len(peptide))
        for residue in amino_acids
    ]

In [None]:
# Load the pickled object obj/filtered_epitopes_combined_round_4.pkl via load_obj;
# it is iterated row-wise with .iterrows() in the following cell.
filtered_epitopes_combined = load_obj('filtered_epitopes_combined_round_4')

In [None]:
# Group (peptide, best score, median score) records by their HLA allele.
hla_allele_all_epitopes = {}
for _, row in filtered_epitopes_combined.iterrows():
    hla = row['HLA Allele']
    record = (row['MT Epitope Seq'], row['Best MT Score'], row['Median MT Score'])
    hla_allele_all_epitopes.setdefault(hla, []).append(record)

In [None]:
# De-duplicate each allele's records (stored back as a set) and keep the
# records whose peptide is exactly nine residues long.
hla_allele_all_epitopes_9_mer = {}
for allele in hla_allele_all_epitopes:
    unique_records = set(hla_allele_all_epitopes[allele])
    hla_allele_all_epitopes[allele] = unique_records
    hla_allele_all_epitopes_9_mer[allele] = [
        record for record in unique_records if len(record[0]) == 9
    ]

In [None]:
# Alleles analysed in this notebook.
hla_list = [
    'HLA-A*02:01', 'HLA-A*24:02', 'HLA-B*07:02', 'HLA-B*18:01', 'HLA-A*03:01',
    'HLA-A*23:01', 'HLA-A*68:01', 'HLA-A*31:01', 'HLA-B*08:01',
]
# For each allele, its 9-mer records ordered by median score (third tuple field).
selected_hla_allele_all_epitopes = {
    allele: sorted(hla_allele_all_epitopes_9_mer[allele], key=lambda record: record[2])
    for allele in hla_list
}

In [None]:
# Keep only the records whose median score (third tuple field) is at most 300.
filtered_selected_hla_allele_all_epitopes = {
    allele: [record for record in records if record[2] <= 300]
    for allele, records in selected_hla_allele_all_epitopes.items()
}

In [None]:
## Randomly select 200 peptides for each HLA / 9-mer combination.
# A dedicated, seeded generator makes the selection identical on every
# "Restart & Run All". The candidate pool is sorted first because it was
# built from a set upstream, whose iteration order can differ between kernels.
PEPTIDES_PER_ALLELE = 200
SAMPLING_SEED = 0
_peptide_sampler = random.Random(SAMPLING_SEED)
selected_peptides = {}
for i in filtered_selected_hla_allele_all_epitopes:
    candidate_pool = sorted(filtered_selected_hla_allele_all_epitopes[i])
    # random.sample raises ValueError if an allele has fewer than 200 candidates.
    selected_peptides[i] = _peptide_sampler.sample(candidate_pool, PEPTIDES_PER_ALLELE)

In [None]:
# Strip the scores: keep just the peptide sequence (first tuple field) per allele.
selected_peptides_only_peptides = {
    allele: [record[0] for record in records]
    for allele, records in selected_peptides.items()
}

In [None]:
# Write one FASTA file per HLA allele containing every single-residue
# substitution of the selected peptides; records are numbered from 1.
for k in selected_peptides_only_peptides.keys():
    all_mutated_peptides = all_possible_mutations(selected_peptides_only_peptides[k])
    fasta_path = "wet_lab_simulation_fasta_files/anchor_"+k+"_"+"9mer"+"_input_corrected.fa"
    # The context manager closes the handle even when a write fails part-way.
    with open(fasta_path, 'w') as fasta_file:
        for i,j in enumerate(all_mutated_peptides):
            fasta_file.write(">"+str(i+1)+'\n')
            fasta_file.write(j+'\n')

In [None]:
## Previously validated peptides provided by the collaborating lab,
## as (HLA allele, peptide sequence, label) triples.
michelle_peptide_hla_combinations = [
    ("HLA-A*68:01","ELAKHACPR", "FLNA-04-HIST1H2BK"),
    ("HLA-A*02:01","GLSLLTNMTV","REM-FLNA02-12 MUT"),
    ("HLA-A*02:01","AQCERQIPV","REM-FLNA02-17 MUT"),
    ("HLA-A*02:01","SLFKNPPENV","REM-FLNA02-27 MUT"),
    ("HLA-A*31:01","KGMRMLVDAR","REM-FLNA02-14 MUT"),
    ("HLA-A*31:01","MLKDMHDSKTR","REM-FLNA02-30 MUT"),
    ("HLA-B*07:02","RPDVKHSKM","REM-FLNA02-19 MUT"),
    ("HLA-B*07:02","APRVKAMII","REM-FLNA02-22 MUT"),
    ("HLA-B*07:02","LPREPRDAW","REM-FLNA02-25 MUT"),
    ("HLA-B*07:02","SPRGRFMTT","REM-FLNA02-29 MUT"),
    ("HLA-A*02:01","LMFTAFLPDI","REM-FLNA02-15 MUT"),
    ("HLA-A*02:01","YLEPYEEYT","REM-FLNA02-23 MUT"),
    ("HLA-A*31:01","LQLEIDLKHR","REM-FLNA02-09 MUT"),
    ("HLA-B*07:02","CTRDFPLTHW","REM-FLNA02-01 MUT")
]
# Bucket the peptides as michelle_dataset[allele][peptide length] -> [peptides].
# Every allele gets the same three length buckets (9, 10, 11), empty or not;
# a peptide of any other length raises KeyError.
michelle_dataset = {}
for i,j,k in michelle_peptide_hla_combinations:
    length_buckets = michelle_dataset.setdefault(i, {9:[],10:[],11:[]})
    length_buckets[len(j)].append(j)

In [None]:
# Write one FASTA file per (allele, peptide length) bucket that has peptides,
# containing every single-residue substitution; records are numbered from 1.
for k in michelle_dataset.keys():
    for m in michelle_dataset[k]:
        if len(michelle_dataset[k][m]) > 0:
            all_mutated_peptides = all_possible_mutations(michelle_dataset[k][m])
            fasta_path = "wet_lab_simulation_fasta_files/anchor_"+k+"_"+str(m)+"_input.fa"
            # The context manager closes the handle even when a write fails part-way.
            with open(fasta_path, 'w') as fasta_file:
                for i,j in enumerate(all_mutated_peptides):
                    fasta_file.write(">"+str(i+1)+'\n')
                    fasta_file.write(j+'\n')

# Positional Analysis

In [None]:
### For finding the sequences that are at most one mutation away
def one_mutation_sequence_match(seq1, seq2):
    """Return True if ``seq1`` occurs somewhere in ``seq2`` with at most one substitution.

    Uses the ``regex`` module's fuzzy matching: ``{s<=1}`` permits at most one
    substituted character and no insertions or deletions.
    """
    # Escape seq1 so any regex metacharacter in a sequence is matched literally.
    pattern = "(" + regex.escape(seq1) + "){s<=1}"
    # Only existence matters, so stop at the first hit instead of listing all matches.
    return regex.search(pattern, seq2) is not None

### For creating the dictionary matching all original epitopes to their mutation and scores
def create_mutation_dictionary(og_epitopes, all_mutated_epitopes, all_mutated_median):
    """Map each original epitope to its near-identical scored peptides.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original (unmutated) epitope sequences.
    all_mutated_epitopes : sequence of str
        Candidate peptide sequences (e.g. a DataFrame column).
    all_mutated_median : sequence
        Scores aligned position-by-position with ``all_mutated_epitopes``.

    Returns
    -------
    dict
        ``{og_epitope: {candidate: score}}`` holding every candidate accepted
        by ``one_mutation_sequence_match(og_epitope, candidate)``. If a
        candidate occurs more than once, its last score wins.
    """
    mutation_dict = {}
    for i in og_epitopes:
        matches = {}
        # zip pairs sequences and scores by position, independent of any index labels.
        for candidate, median in zip(all_mutated_epitopes, all_mutated_median):
            if one_mutation_sequence_match(i, candidate):
                matches[candidate] = median
        mutation_dict[i] = matches
    return mutation_dict

amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

### for calculating sum differences for each individual epitope for each individual position
def calculate_sum_diff(og_epitopes, all_scores, all_mutant_dict): 
    """Sum, per position, the absolute score change over all 20 substitutions.

    Parameters
    ----------
    og_epitopes : sequence of str
        Original epitope sequences.
    all_scores : pandas.DataFrame
        Table with 'Epitope Seq' and 'Median Score' columns; the first row
        matching an epitope supplies its baseline score.
    all_mutant_dict : dict
        ``{og_epitope: {mutant_sequence: score}}``.

    Returns
    -------
    dict
        ``{og_epitope: [[position, summed_abs_difference], ...]}`` with
        0-based positions; an epitope with no mutants maps to ``[]``.

    Raises
    ------
    IndexError
        If an epitope has no row in ``all_scores``.
    KeyError
        If a substitution is missing from a non-empty mutant dictionary.
    """
    sum_dict = {}
    # Keep the baseline scores in the same order as og_epitopes. Sorting or
    # de-duplicating them would pair epitopes with the wrong baseline and drop
    # epitopes whenever two baselines are equal.
    og_scores = [all_scores[all_scores['Epitope Seq'] == i]['Median Score'].iloc[0] for i in og_epitopes]
    for (i,j) in zip(og_epitopes, og_scores):
        mutant_list = all_mutant_dict[i]
        sum_dict[i] = []
        if len(mutant_list) == 0:
            continue
        for n in range(0, len(i)):
            sum_score_diff = 0
            for k in amino_acids:
                query_peptide = i[:n]+k+i[n+1:]
                score = mutant_list[query_peptide]
                sum_score_diff += abs(score-j)
            sum_dict[i].append([n, sum_score_diff])

    return sum_dict

### for calculating the overall sum differences across all original epitopes
def overall_pos_sum_diff(sum_diff_results, pos_length):
    """Add up the per-position sums of every epitope.

    Parameters
    ----------
    sum_diff_results : dict
        ``{epitope: [[position, score], ...]}`` with 0-based positions.
    pos_length : int
        Number of positions in the returned list.

    Returns
    -------
    list
        ``pos_length`` totals, one per position.

    Raises
    ------
    IndexError
        If a stored position is not below ``pos_length``.
    """
    overall_score = [0]*pos_length
    # Walk the stored entries rather than range(len(epitope)), so an epitope
    # with an empty result list is skipped instead of raising IndexError.
    for per_position in sum_diff_results.values():
        for pos, score in per_position:
            overall_score[pos] += score
    return overall_score

def calculate_all_diff(og_epitopes, all_scores, all_mutant_dict): 
    """Collect the score of every single-residue substitution of every epitope.

    Parameters
    ----------
    og_epitopes : iterable of str
        Original epitope sequences.
    all_scores : pandas.DataFrame
        Not used by the computation; kept so existing calls stay valid.
    all_mutant_dict : dict
        ``{og_epitope: {mutant_sequence: score}}``.

    Returns
    -------
    dict
        ``{og_epitope: {position: {amino_acid: score}}}`` with 1-based positions.

    Raises
    ------
    KeyError
        If an epitope or one of its substitutions is missing from
        ``all_mutant_dict``.
    """
    all_diff_dict = {}
    # Iterate the epitopes directly. Zipping them with a de-duplicated score
    # list silently dropped trailing epitopes whenever two scores were equal,
    # and the baseline score was never used in the result.
    for i in og_epitopes:
        mutant_list = all_mutant_dict[i]
        all_diff_dict[i] = {}
        for n in range(0, len(i)):
            all_diff_dict[i][n+1] = {}
            for k in amino_acids:
                query_peptide = i[:n]+k+i[n+1:]
                score = mutant_list[query_peptide]
                all_diff_dict[i][n+1][k] = score

    return all_diff_dict

In [None]:
# For every (allele, peptide length) bucket that has peptides: read the
# ANCHOR.all_epitopes.tsv output, compute each peptide's per-position summed
# score difference, draw one line plot per peptide and save the figure as a PDF.
for hla in michelle_dataset.keys():
    for length in michelle_dataset[hla]:
        if len(michelle_dataset[hla][length]) > 0:
            all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/anchor_"+hla+"_"+str(length)+"_output/MHC_Class_I/ANCHOR.all_epitopes.tsv", delimiter='\t')
            og_epitopes = michelle_dataset[hla][length]
            mutation_dictionary = create_mutation_dictionary(og_epitopes, all_epitopes['Epitope Seq'], all_epitopes['Median Score'])
            sum_diff_pos = calculate_sum_diff(og_epitopes, all_epitopes, mutation_dictionary)
            # Computed here but not read again in this cell.
            overall_pos_scores = overall_pos_sum_diff(sum_diff_pos,length)

            fig = plt.figure()
            fig.subplots_adjust(hspace=0.5, wspace=.9)
            # NOTE(review): the 2x2 grid has room for four peptides per bucket;
            # a fifth peptide would presumably make add_subplot fail - confirm.
            for i in range(1, len(michelle_dataset[hla][length])+1):
                ax = fig.add_subplot(2, 2, i)
                # x = 1-based position, y = summed absolute score difference.
                ax.plot(range(1,length+1),[row[1] for row in sum_diff_pos[og_epitopes[i-1]]])
                plt.xticks(range(1,length+1))
                ax.set_xlabel('Peptide: '+str(og_epitopes[i-1]))

            plt.savefig("wet_lab_validation_simulation_output_files/"+hla+"_"+str(length)+"_Anchor Analysis.pdf")


In [None]:
# Per allele, map every selected peptide to its per-position summed score
# differences ([[position, sum], ...] as returned by calculate_sum_diff).
overall_pos_scores_all_hla = {}
for hla in selected_peptides_only_peptides:
    print(hla)
    all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/ANCHOR."+hla+".all_epitopes.tsv", delimiter='\t')
    og_epitopes = selected_peptides_only_peptides[hla]
    mutation_dictionary = create_mutation_dictionary(og_epitopes, all_epitopes['Epitope Seq'], all_epitopes['Median Score'])
    sum_diff_pos = calculate_sum_diff(og_epitopes, all_epitopes, mutation_dictionary)
    overall_pos_scores = overall_pos_sum_diff(sum_diff_pos,9)
    # Look the results up by peptide. Iterating sum_diff_pos itself yields its
    # keys, which stored each peptide mapped to its own sequence.
    overall_pos_scores_all_hla[hla] = {j: sum_diff_pos[j] for j in og_epitopes}

In [None]:
# Per allele: plot each selected peptide's per-position summed score
# difference, 25 peptides (5x5 grid) per figure, eight figures saved as PDFs.
for hla in selected_peptides_only_peptides.keys():
    print(hla)
    all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/ANCHOR."+hla+".all_epitopes.tsv", delimiter='\t')
    og_epitopes = selected_peptides_only_peptides[hla]
    mutation_dictionary = create_mutation_dictionary(og_epitopes, all_epitopes['Epitope Seq'], all_epitopes['Median Score'])
    sum_diff_pos = calculate_sum_diff(og_epitopes, all_epitopes, mutation_dictionary)
    overall_pos_scores = overall_pos_sum_diff(sum_diff_pos,9)
    for j in range(0,8):
        fig = plt.figure(figsize=(25, 30), dpi=200)
        fig.subplots_adjust(hspace=0.5, wspace=.9)
        for i in range(1, 26):
            # Peptide shown in grid slot i of page j.
            epitope = og_epitopes[j*25 + i - 1]
            position_sums = sum_diff_pos[epitope]
            if not position_sums:
                # Nothing was computed for this peptide; leave its slot empty.
                continue
            ax = fig.add_subplot(5, 5, i)
            ax.plot(range(1,10), [entry[1] for entry in position_sums])
            plt.xticks(range(1,10))
            ax.set_xlabel('Peptide: '+str(epitope), fontsize=10)

        plt.savefig("wet_lab_validation_simulation_output_files/"+hla+"/"+hla+"_9mer_"+str(j+1)+"_Anchor Analysis.pdf")

In [None]:
# Peptide lists keyed by HLA allele; used below as the original epitopes whose
# per-position substitution scores are collected and plotted.
# NOTE(review): the selection criterion is not shown here - presumably a manual
# pick from the sampled peptides; confirm.
# NOTE(review): 'SLAEKASGV' is listed twice under 'HLA-A*02:01'.
good_list = {
    'HLA-A*02:01':['ELMGALTRV','MMSESNLNI','WLWGISLNV','QLWMLPIMV','ILQALASTV','IMVMYFIML','YLGRKAMRV','SMLETFIRL',
                  'SLAEKASGV','MMAKYEHRI','SLAEKASGV','TLPSPQHAV','MLRWKQSSV','MLLFSMLLL','ALVSEDHHV','MLTYLFVLV',
                  'ALPHDLQPV','SLIQTIRHV','MMTEWTPCL','VLCKEMVPV','SLVGHTEGV','KLELLQFQV','ALLQWKTPV','QMMCTTTLL',
                  'ALLALLLPI','GLLMPLLCV','RMSNLQLKV','KLADSSRML','SLDPKLYIL','SLATDPWSL','ILFIIIFIL','MLKCHHMRV',
                  'QLLVLEPPV','IIVGYILLA','RLIDFKETV','VMLENYINL','QLYTPMYFL','MLHLAVIFV','MLGAMKYIV'],
    'HLA-A*24:02':['PYLNPNPHF','PYIMRRPNF','AYQYTHQNF','TWQSAHAVI','LYRESVLTW','RYGGKSIVF','KYWLKDLTF','KYWTEPDKF',
                  'SYFSQLLLF','LYCPCMGCF','LYRENGCMF','RYETNSMDF','WYIIKKDFF','VWLANHCSI','DYIAYLSIF','PYLLGFLTF',
                  'RFLLALQLL','IYAVHLENW','HYIYIENHF','NYFVKVVPF','KYGNVVTSF','SYLQCMETF','SYSGTPELF','AYLKAIILF',
                  'RYCNHWRSF','SYLAIVHPF','EYSYHLQVF','FYTDPSYLF','MYEDFYELF','LYYQNGKAF','TYSLITRNF','HYKGPSVPF',
                  'WYFIAPAKF'],
    'HLA-B*07:02':['LPSASTTFF','SPVGTARML','WPQVLHLGL','TPAPCTPWL','RPDFGGPSV','HPQYSRVHI','TPVDKTSNM','GPQAAHHWL',
                  'HPNKSTTLL','SPLGLHPHL','MPHEPTILL','YPVYLGARL','HPFVSTREV','TPAWHPLAL','YPFQRREQL','SPGTTPEPL',
                  'HPADRVVRL','SPIKQRSGL','NPHMLVDTL','VPSLAQLAM','SPGPVESKL','VPSFKGPWL','VPAETVGHF','GPRGQHGRL',
                  'YPEIKGHFL','SPRCRPPHL','SPATWCINL','APKYGIFHL','APPSRLLAL','KPLQGTLTL','MPNNTQIRL','APPPHLQAL',
                  'PPLPGVQAL','LPYQGHNFF','KPSNMNVFM','GPDQLTVAI','SPPLMSRSM','LPARDKRTL','TPRRRLINL','VPRYLCEGM',
                  'QPPILSGPL','MPGILLTGL','APTRNIWQL','WPCGPSRSL','YPSVMTPKL','SPPIFSSIL','HPAVLGQHL','SPQNCKLCV',
                  'TPVRAWAVM','TPDPHWGAF','PPHSTTGTV','DPALFSGSL','TPPATLPSL','QPLHYMLIM','APGTASESL','GPPQSGEPL',
                  'LPWTCLLIL','VPLGSILCL','SPLLAVLGM'],
    'HLA-B*18:01':['NEYHSVVYY','EENVAYQAY','GELAAVAPF','EEIGPILWY','DELKEPLSF','QEEENLTVY','SEDGVRAPF','LEEKMLAAY',
                  'IEKMVANMF','TESFLQGQF','MECSKLKLF','EEYRSRGVF','SEVDISQHF','KEAPYFYNY','SEASWEPVY','LEVQDDNAF',
                  'IEYLSEGQY','CENLQLRKY','EEAKQKQSF','TEVFTLLFF','MEAYRNACY','LEHVVQCMF','LEKIAFSTY','RECGRGFSY',
                  'TENTPINSF','EERIINEEY','DEEVLNFPY','IELFIPCGF','EEWIELRLF','YEMASTPLY','YENGAFDKF','GEWLVRKLF',
                  'REIYFNSSF','FEARAPAFY','QEAVVLALM','NELFWVPLY','QEEGYDSVF','YEMLKALDY','MEVLEGLDF','LECSTRCWF',
                  'SEGDVHKVY','SECLHIFVY','QEEEAHPKF','DEGVNNSPY','IEAVQNLPY','TEAGLGNTF','NEELHICLM','SEVIRLPYY',
                  'EETIGRHVF','YEDCDDLEF','NESTKYGLF','NEFCDSRLF','NEKNLRSTM','NEEKDNGPF','WEVASQLGY','LECKTAEEY',
                  'HERTFILEY','HEFHSRIKF','LELHARVFY','QEFFAAVFY','AEYLTRPHF','REFRKVVEY','LERILLQLF','LEQAKTISY',
                  'NEIPYPIKY','VEISIQINF','QEQANAAKF','SENGTNFCF','NEELGQIEF','VETDFFIFF','EERGRNPAF'],
    'HLA-A*03:01':['LMWIFRILK','MMNRRGAIK','TMRTQCRLK','LLAMYERAK','LMIREQVLK','KMTRKKLNK','GLLLSRENK','ILSGMETLK',
                  'ILREENSNK','SLATRCLDK','LLRTFCQLK','LLRPVQKEK','ALHRRLGWK','CLGSIMTLK','HLIFVEPRK','VMLAELKSK',
                  'TLTLSFGKK','NLRSSSPSK','LLKQTRLEK','RMKRPLESK','LLETSLSAK','LLKRRKLRK','SVFWWTQLY','ILSEGIKRK',
                  'RLLNRREVK','GMLEYEPAK','YLNCNHLTK','VLHKGMCIK','IMRLVCTWK','VLKHPCTSK','LMTQMCMGK','TMSFSPYKK'],
    'HLA-A*23:01':['QWLQPEAHF','VFSFGDLNF','HYNYMCNSF','FYTPSLFSF','WYHWKVHEF','RWWTLVMEW','LYGPILIIF','KYVDGGELF',
                  'HWGRTNSNF','LYAACAVFF','SFIGELTKF','GYYLGRQMF','IYGASFVPF','QYFSYSGAF','QYKCIKHLF','TYSFGAPKF',
                  'QYKDRVQVF','VFGLYASFF','AFLSSPSFW','FFSFLTHCF','DYYVGMYSF','GYGCCYGSF','LFPRYTSLW','LYCGYPGNF',
                  'KFSHLDPNF','TYYGKTGEF','VYMYIQSRF','YYTTPSHRF','FFSLNPVRF','LFSSWTPVW','YWAVEQAHF','KFSRLDPTF',
                  'VFSIIFTKI','LYCVVSKTF','MYSAEKLNF','FYLLRIGNF','DYNRYISIF','QFWMIPRRF','VYHSIDAYF','RFEEVDLIF',
                  'EFLLRCPLF','PFRSWPVAF'],
    'HLA-A*68:01':['LLTSMAYDR','NLIFLRLKR','NMMGVYAER','HPSVLNHLR','TLCSTLSLR','DQREIISFR','SLFPLGFGR','TVVSFCYLR',
                  'SNFFMNAQR','LVIALLLKR','YLFCVCGTR','TLILNKFLR','HNMPSYILR','CPILPCFWR','ASVEIMSYR','ESVASWAAR',
                  'LNTLIYTLR','AAFVSSAPR','MSIYRSALR','GLVTSHSFR','LNHASSYMR','EMVKAVQLR','TLCICISTR','TANYTDSQR',
                  'CSYIFIIAR','SPFFEIYYK','TSTSSRAER','SSMPSTPLR','FPLWGIHWR','NSYFNMSIR','TGIFYPNVR','GTVSSPSYR'],
    'HLA-A*31:01':['IIKVYFKGR','TQIFYRTCR','VYKNCSPLR','KWFFSKTQR','VAMRKLCSR','SSCTLSCLR','NWSSIFAAR','SGRSKLSRR',
                  'ISFGGFRKR','ALHLVEEYR','SILSIRICR','CVSLYVVFR','AQWRKTEAR','VLYSLPAVR','TLFPNSIAR','TGLLWKFIR',
                  'GGAARHSGR','LLLWLSSAR','MTACSILAR','RTFSYPCRR','ALDFFEYFR','GWHLFRFNR','LIKWELYER','SVSACCAPR',
                  'FQCPVRTLR','RISIFINRR','AIGRLFHYR','LGQEIATYR','SGKVLSAAR','VLLGSKALR','TGRRKICPR','MALKFLEGR',
                  'RNSWRAVAR','RCWLRPTTR','LIYGTMAQR','RVRNWAVWR','LFYLLRTMR','VSWTKRTLR','HLLIKTSLR','MVLVKPDLR',
                  'MSLPLFHGR','LTNKFKNAR'],
    'HLA-B*08:01':['TLFMREHNL','RMHDRAASL','RLFQKSKEL','VIMSRTVCV','FPQVRCYAF','SPAVRALAA','VLYTKVIPM','FLWQRGWKF',
                  'TLNGRRLVF','APHHRLLVL','SLPSRAVAL','CLLVKFGSA','LMWRRYVMA','FMDVRVQSI','TLQKKFGIV','SMVPRGVSM',
                  'AAGTRWYPV','HIQRRYFRF','WLCIRKQAA','YLYQRSQLV','FPYVRNFVM','LLLQRARHA','LPEKRDVAV','NLGLKFRDI',
                  'SLHLRSPQI']
}

In [None]:
# Per allele in good_list: read its ANCHOR output table and collect the score
# of every single-residue substitution of every listed peptide.
hla_diff_pos_score_all = {}
for hla, og_epitopes in good_list.items():
    print(hla)
    all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/ANCHOR."+hla+".all_epitopes.tsv", delimiter='\t')
    mutation_dictionary = create_mutation_dictionary(
        og_epitopes,
        all_epitopes['Epitope Seq'],
        all_epitopes['Median Score'],
    )
    all_diff_pos = calculate_all_diff(og_epitopes, all_epitopes, mutation_dictionary)
    hla_diff_pos_score_all[hla] = all_diff_pos

In [None]:
# (amino acid, score) pairs for position 9 of TLFMREHNL on HLA-B*08:01,
# ordered by ascending score.
data = sorted(
    hla_diff_pos_score_all['HLA-B*08:01']['TLFMREHNL'][9].items(),
    key=lambda pair: pair[1],
)

In [None]:
# For every allele in hla_diff_pos_score_all: flatten the nested
# {peptide: {position: {amino acid: score}}} results into a long table and draw
# one strip-plot facet per peptide (x = position, y = score, hue = amino acid).
sns.set(rc={'figure.figsize':(30,30)})
for hla in hla_diff_pos_score_all:
    df = pd.DataFrame()
    # NOTE(review): rebinds the notebook-level name hla_list; the list is
    # filled below but never read in this cell.
    hla_list = []
    pos = []
    score = []
    aa = []
    peptide = []
    epitope_list = list(hla_diff_pos_score_all[hla].keys())
    for k in epitope_list:
        data = hla_diff_pos_score_all[hla][k]
        # i = position key, j = substituted amino acid, data[i][j] = its score.
        for i in data:
            for j in data[i]:
                pos.append(i)
                aa.append(j)
                score.append(data[i][j])
                hla_list.append(hla)
                peptide.append(k)
    df['Position'] = pos
    df['Score'] = score
    df['Mutation'] = aa
    # Scalar assignment: every row gets the current allele name.
    df['HLA Allele'] = hla
    df['Peptide'] = peptide
    
    print(df)
    
    # Distinct substituted amino acids present in the table
    color_labels = df['Mutation'].unique()

    # List of colors in the color palettes (20 active entries despite the name c25)
    #rgb_values = sns.color_palette("Spectral", 20)
    c25 = ["#1c86ee", "#E31A1C", # red
           "#008b00", "#6A3D9A", # purple
           "#FF7F00", # orange
           "black", "#ffd700",
           "#7ec0ee", "#FB9A99", # lt pink
           "#90ee90","#CAB2D6", # lt purple
          #"#FDBF6F", # lt orange
           "#b3b3b3", "#eee685",
           "maroon", "#ff83fa", "#ff1493", "#0000ff", "#36648b",
          #"darkturquoise", "green1", "yellow4", "yellow3",
          "#8b4500", "brown"]
    
    # Pair each amino acid with one color from c25
    color_map = dict(zip(color_labels, c25))

    # NOTE(review): palette receives a per-row Series of colors rather than
    # color_map itself; presumably seaborn takes colors in row order - confirm.
    grid = sns.FacetGrid(df, col = "Peptide", hue = "Mutation", col_wrap=3, palette=df['Mutation'].map(color_map))
    plt.xticks(range(1,10))
    # Dotted red reference line at score 100 in every facet.
    # NOTE(review): the meaning of the 100 threshold is not documented here.
    grid.map(plt.axhline, y=100, color='r', linestyle='dotted')
    grid.map(sns.stripplot, "Position", "Score", size = 4, jitter=0.4, order=range(1,10))
    

    grid.add_legend()
    plt.show()


## Michelle Validated Peptides

In [None]:
# Per (allele, peptide length) bucket of the validated peptides: read the
# ANCHOR output table and collect the score of every single-residue substitution.
michelle_diff_pos_score_all = {}
for hla in michelle_dataset.keys():
    michelle_diff_pos_score_all[hla] = {}
    for length in michelle_dataset[hla]:
        if len(michelle_dataset[hla][length]) > 0:
            # Path is relative to the notebook's working directory, like the
            # other cells that read this same output, instead of a
            # user-specific absolute location.
            all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/anchor_"+hla+"_"+str(length)+"_output/MHC_Class_I/ANCHOR.all_epitopes.tsv", delimiter='\t')
            og_epitopes = michelle_dataset[hla][length]
            mutation_dictionary = create_mutation_dictionary(og_epitopes, all_epitopes['Epitope Seq'], all_epitopes['Median Score'])
            all_diff_pos = calculate_all_diff(og_epitopes, all_epitopes, mutation_dictionary)
            michelle_diff_pos_score_all[hla][length] = all_diff_pos

In [None]:
# For every (allele, peptide length) bucket that has peptides: read the
# ANCHOR.all_epitopes.tsv output, compute each peptide's per-position summed
# score difference and show one line plot per peptide.
for hla in michelle_dataset.keys():
    for length in michelle_dataset[hla]:
        if len(michelle_dataset[hla][length]) > 0:
            all_epitopes = pd.read_table("wet_lab_validation_simulation_output_files/anchor_"+hla+"_"+str(length)+"_output/MHC_Class_I/ANCHOR.all_epitopes.tsv", delimiter='\t')
            og_epitopes = michelle_dataset[hla][length]
            mutation_dictionary = create_mutation_dictionary(og_epitopes, all_epitopes['Epitope Seq'], all_epitopes['Median Score'])
            sum_diff_pos = calculate_sum_diff(og_epitopes, all_epitopes, mutation_dictionary)
            # Computed here but not read again in this cell.
            overall_pos_scores = overall_pos_sum_diff(sum_diff_pos,length)

            fig = plt.figure()
            fig.subplots_adjust(hspace=0.5, wspace=.9)
            # NOTE(review): the 2x2 grid has room for four peptides per bucket;
            # a fifth peptide would presumably make add_subplot fail - confirm.
            for i in range(1, len(michelle_dataset[hla][length])+1):
                ax = fig.add_subplot(2, 2, i)
                # x = 1-based position, y = summed absolute score difference.
                ax.plot(range(1,length+1),[row[1] for row in sum_diff_pos[og_epitopes[i-1]]])
                plt.xticks(range(1,length+1))
                ax.set_xlabel('Peptide: '+str(og_epitopes[i-1]))
            plt.show()

In [None]:
# For every (allele, peptide length) bucket of the validated peptides: flatten
# the nested {peptide: {position: {amino acid: score}}} results into a long
# table and draw one strip-plot facet per peptide.
sns.set(rc={'figure.figsize':(30,30)})
for hla in michelle_dataset.keys():
    for length in michelle_dataset[hla]:
        if len(michelle_dataset[hla][length]) > 0:
            df = pd.DataFrame()
            # NOTE(review): rebinds the notebook-level name hla_list; the list
            # is filled below but never read in this cell.
            hla_list = []
            pos = []
            score = []
            aa = []
            peptide = []
            epitope_list = list(michelle_diff_pos_score_all[hla][length].keys())
            for k in epitope_list:
                data = michelle_diff_pos_score_all[hla][length][k]
                # i = position key, j = substituted amino acid, data[i][j] = its score.
                for i in data:
                    for j in data[i]:
                        pos.append(i)
                        aa.append(j)
                        score.append(data[i][j])
                        hla_list.append(hla)
                        peptide.append(k)
            df['Position'] = pos
            df['Score'] = score
            df['Mutation'] = aa
            # Scalar assignment: every row gets the current allele name.
            df['HLA Allele'] = hla
            df['Peptide'] = peptide

            # Distinct substituted amino acids present in the table
            color_labels = df['Mutation'].unique()

            # List of colors in the color palettes (20 active entries despite the name c25)
            #rgb_values = sns.color_palette("Spectral", 20)
            c25 = ["#1c86ee", "#E31A1C", # red
                   "#008b00", "#6A3D9A", # purple
                   "#FF7F00", # orange
                   "black", "#ffd700",
                   "#7ec0ee", "#FB9A99", # lt pink
                   "#90ee90","#CAB2D6", # lt purple
                  #"#FDBF6F", # lt orange
                   "#b3b3b3", "#eee685",
                   "maroon", "#ff83fa", "#ff1493", "#0000ff", "#36648b",
                  #"darkturquoise", "green1", "yellow4", "yellow3",
                  "#8b4500", "brown"]

            # Pair each amino acid with one color from c25
            color_map = dict(zip(color_labels, c25))

            # NOTE(review): palette receives a per-row Series of colors rather
            # than color_map itself; presumably seaborn takes colors in row
            # order - confirm.
            grid = sns.FacetGrid(df, col = "Peptide", hue = "Mutation", col_wrap=3, palette=df['Mutation'].map(color_map))
            plt.xticks(range(1,length+1))
            # Dotted red reference line at score 100 in every facet.
            # NOTE(review): the meaning of the 100 threshold is not documented here.
            grid.map(plt.axhline, y=100, color='r', linestyle='dotted')
            grid.map(sns.stripplot, "Position", "Score", size = 4, jitter=0.4, order=range(1,length+1))
            grid.add_legend()
            plt.show()


In [None]:
# Single strip plot for the validated peptide RPDVKHSKM on HLA-B*07:02:
# x = position, y = score, hue = substituted amino acid.
sns.set(rc={'figure.figsize':(10,5)})
for hla in ['HLA-B*07:02']:
    for length in [9]:
        if len(michelle_dataset[hla][length]) > 0:
            df = pd.DataFrame()
            # NOTE(review): rebinds the notebook-level name hla_list; the list
            # is filled below but never read in this cell.
            hla_list = []
            pos = []
            score = []
            aa = []
            peptide = []
            epitope_list = ['RPDVKHSKM']
            for k in epitope_list:
                data = michelle_diff_pos_score_all[hla][length][k]
                # i = position key, j = substituted amino acid, data[i][j] = its score.
                for i in data:
                    for j in data[i]:
                        pos.append(i)
                        aa.append(j)
                        score.append(data[i][j])
                        hla_list.append(hla)
                        peptide.append(k)
            df['Position'] = pos
            df['Score'] = score
            df['Mutation'] = aa
            # Scalar assignment: every row gets the current allele name.
            df['HLA Allele'] = hla
            df['Peptide'] = peptide

            # Distinct substituted amino acids present in the table
            color_labels = df['Mutation'].unique()

            # List of colors in the color palettes (20 active entries despite the name c25)
            #rgb_values = sns.color_palette("Spectral", 20)
            c25 = ["#1c86ee", "#E31A1C", # red
                   "#008b00", "#6A3D9A", # purple
                   "#FF7F00", # orange
                   "black", "#ffd700",
                   "#7ec0ee", "#FB9A99", # lt pink
                   "#90ee90","#CAB2D6", # lt purple
                  #"#FDBF6F", # lt orange
                   "#b3b3b3", "#eee685",
                   "maroon", "#ff83fa", "#ff1493", "#0000ff", "#36648b",
                  #"darkturquoise", "green1", "yellow4", "yellow3",
                  "#8b4500", "brown"]

            # Pair each amino acid with one color from c25
            color_map = dict(zip(color_labels, c25))

            # NOTE(review): palette receives a per-row Series of colors rather
            # than color_map itself; presumably seaborn takes colors in row
            # order - confirm.
            sns.stripplot(data=df, x="Position", y="Score", size = 7, jitter=0.2, order=range(1,length+1), hue='Mutation', palette=df['Mutation'].map(color_map))
            # Dotted red reference line at score 43.
            # NOTE(review): the meaning of the 43 threshold is not documented here.
            plt.axhline(y=43, color='r', linestyle='dotted')
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.1)
            plt.xticks(range(1,length+1))

            plt.show()


In [None]:
# Single strip plot for the peptide TLFMREHNL on HLA-B*08:01:
# x = position, y = score, hue = substituted amino acid.
sns.set(rc={'figure.figsize':(10,5)})
length = 9
for hla in ['HLA-B*08:01']:
    df = pd.DataFrame()
    # NOTE(review): rebinds the notebook-level name hla_list; the list is
    # filled below but never read in this cell.
    hla_list = []
    pos = []
    score = []
    aa = []
    peptide = []
    epitope_list = ['TLFMREHNL']
    for k in epitope_list:
        data = hla_diff_pos_score_all[hla][k]
        # i = position key, j = substituted amino acid, data[i][j] = its score.
        for i in data:
            for j in data[i]:
                pos.append(i)
                aa.append(j)
                score.append(data[i][j])
                hla_list.append(hla)
                peptide.append(k)
    df['Position'] = pos
    df['Score'] = score
    df['Mutation'] = aa
    # Scalar assignment: every row gets the current allele name.
    df['HLA Allele'] = hla
    df['Peptide'] = peptide

    # Distinct substituted amino acids present in the table
    color_labels = df['Mutation'].unique()

    # List of colors in the color palettes (20 active entries despite the name c25)
    #rgb_values = sns.color_palette("Spectral", 20)
    c25 = ["#1c86ee", "#E31A1C", # red
           "#008b00", "#6A3D9A", # purple
           "#FF7F00", # orange
           "black", "#ffd700",
           "#7ec0ee", "#FB9A99", # lt pink
           "#90ee90","#CAB2D6", # lt purple
          #"#FDBF6F", # lt orange
           "#b3b3b3", "#eee685",
           "maroon", "#ff83fa", "#ff1493", "#0000ff", "#36648b",
          #"darkturquoise", "green1", "yellow4", "yellow3",
          "#8b4500", "brown"]

    # Pair each amino acid with one color from c25
    color_map = dict(zip(color_labels, c25))

    # NOTE(review): palette receives a per-row Series of colors rather than
    # color_map itself; presumably seaborn takes colors in row order - confirm.
    sns.stripplot(data=df, x="Position", y="Score", size = 7, jitter=0.2, order=range(1,length+1), hue='Mutation', palette=df['Mutation'].map(color_map))
    # Dotted red reference line at score 43.
    # NOTE(review): the meaning of the 43 threshold is not documented here.
    plt.axhline(y=43, color='r', linestyle='dotted')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.1)
    plt.xticks(range(1,length+1))
    plt.show()
