In [None]:
import pandas as pd
import numpy as np
import pickle
import csv
from collections import Counter
import matplotlib.pyplot as plt
import upsetplot
import seaborn as sns
sns.set_theme()
import pylab as pl
import difflib

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` with the highest pickle protocol.

    The ``obj/`` directory is resolved relative to the current working
    directory and is not created here, so it must already exist.
    """
    target_path = ''.join(('obj/', name, '.pkl'))
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled at ``obj/<name>.pkl``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    written by a trusted source.
    """
    source_path = ''.join(('obj/', name, '.pkl'))
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

## Plotting the frequency of anchor positions for a certain length
def anchor_pos_freq_by_length(filtered_anchor_dict, length):
    """Plot and return how often each peptide position is listed as an anchor.

    Args:
        filtered_anchor_dict: Mapping of allele -> {peptide length -> list of
            1-based anchor positions}.
        length: Peptide length whose anchor lists are counted.

    Returns:
        A list of ``length`` counts; entry ``k`` is the number of times
        position ``k + 1`` appears in the anchor lists for ``length``.

    Alleles without an entry for ``length`` are skipped. Positions outside
    ``1..length`` are ignored, so position 0 cannot wrap around to the last
    bar and an oversized position cannot cut short the rest of an allele's
    list.
    """
    pos = list(range(1, length + 1))
    freq = [0] * length
    for anchors_by_length in filtered_anchor_dict.values():
        try:
            anchor_positions = anchors_by_length[length]
        except (LookupError, TypeError):
            # No anchor list for this length (or no per-length mapping at all).
            continue
        for position in anchor_positions:
            if 1 <= position <= length:
                freq[position - 1] += 1
    plt.bar(pos, freq)
    plt.xlabel("peptide positions")
    plt.ylabel("Frequency in dataset")
    plt.show()
    plt.close()
    return freq

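### Method for determining anchor positions
### Criteria:
### 1. Determine the contribution of each position to the overall score (as a fraction of the total).
### 2. Keep the highest-contributing positions until their cumulative contribution exceeds the cutoff X.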
    
def percentage_cutoff(normalized_list, cutoff):
    """Return the 1-based positions that make up the top share of the total.

    Positions are visited from the largest value to the smallest (ties are
    broken by the lower index). A position is added as long as the running
    total accumulated *before* it does not exceed ``cutoff``, so the position
    that pushes the total past ``cutoff`` is still included.

    Args:
        normalized_list: Sequence of per-position contributions.
        cutoff: Cumulative contribution threshold.

    Returns:
        List of selected positions (index + 1) in the order they were added.
    """
    ranked = sorted(enumerate(normalized_list), key=lambda pair: (-pair[1], pair[0]))
    running_total = 0
    selected = []
    for index, contribution in ranked:
        if running_total > cutoff:
            break
        running_total += contribution
        selected.append(index + 1)
    return selected

def anchor_pos_score_by_contribution(anchor_score_dict, percentage):
    """Normalise per-position scores and pick anchors by cumulative share.

    Args:
        anchor_score_dict: Two-level mapping (outer key -> inner key -> score
            vector). Each score vector is divided by its own sum, so it must
            support division by a scalar (presumably a numpy array -- confirm
            against the pickled input).
        percentage: Cutoff forwarded to ``percentage_cutoff``.

    Returns:
        ``(normalized_dict, filter_contr)``: two mappings with the same keys
        as the input, holding the normalised vectors and the positions
        selected by ``percentage_cutoff`` respectively.
    """
    normalized_dict = {}
    filter_contr = {}
    for outer_key, scores_by_inner_key in anchor_score_dict.items():
        normalized_entry = normalized_dict[outer_key] = {}
        selected_entry = filter_contr[outer_key] = {}
        for inner_key, scores in scores_by_inner_key.items():
            contribution = scores / sum(scores)
            normalized_entry[inner_key] = contribution
            selected_entry[inner_key] = percentage_cutoff(contribution, percentage)

    return normalized_dict, filter_contr

def anchor_pos_score_by_contribution_with_base(anchor_score_dict, percentage):
    """Like ``anchor_pos_score_by_contribution`` but with a baseline removed.

    The minimum of every score vector is subtracted from that vector before
    it is divided by the sum of the shifted values.

    Args:
        anchor_score_dict: Two-level mapping (outer key -> inner key -> score
            vector). Vectors must support element-wise subtraction of and
            division by a scalar (presumably numpy arrays -- confirm).
        percentage: Cutoff forwarded to ``percentage_cutoff``.

    Returns:
        ``(normalized_dict, filter_contr)``: the shifted-and-normalised
        vectors and the positions selected by ``percentage_cutoff``, keyed
        like the input.
    """
    normalized_dict = {}
    filter_contr = {}
    for outer_key, scores_by_inner_key in anchor_score_dict.items():
        normalized_entry = normalized_dict[outer_key] = {}
        selected_entry = filter_contr[outer_key] = {}
        for inner_key, scores in scores_by_inner_key.items():
            shifted = scores - min(scores)
            contribution = shifted / sum(shifted)
            normalized_entry[inner_key] = contribution
            selected_entry[inner_key] = percentage_cutoff(contribution, percentage)

    return normalized_dict, filter_contr

### Shared counting functions
def anchor_category_determine(condensed_info_row, cutoff):
    """Classify one row using its 'Mutation at Anchor' flag.

    Decision table (FC = 'Median Fold Change', WT = 'Median WT Score'):

    * mutation not at anchor, FC > 1   -> ('Accept', '1A')
    * mutation not at anchor, FC <= 1  -> ('Accept', '1B')
    * mutation at anchor, FC <= 1      -> ('Reject', '3A')
    * mutation at anchor, FC > 1, WT <= cutoff -> ('Reject', '3B')
    * mutation at anchor, FC > 1, WT > cutoff  -> ('Accept', '2')

    Args:
        condensed_info_row: Mapping/Series with the keys named above.
        cutoff: Threshold compared against 'Median WT Score'.

    Returns:
        Tuple ``(decision, group)``.
    """
    fold_change_above_one = condensed_info_row['Median Fold Change'] > 1
    mutation_at_anchor = condensed_info_row['Mutation at Anchor'] == 1

    if not mutation_at_anchor:
        return 'Accept', ('1A' if fold_change_above_one else '1B')
    if not fold_change_above_one:
        return 'Reject', '3A'
    if condensed_info_row['Median WT Score'] <= cutoff:
        return 'Reject', '3B'
    return 'Accept', '2'
    
def original_decision(condensed_info_row, binding_cutoff=None):
    """Return the baseline filter decision for one row.

    A row is accepted when its 'Median Fold Change' is greater than 1 and,
    if ``binding_cutoff`` is given, its 'Median MT Score' is strictly below
    that cutoff.

    Args:
        condensed_info_row: Mapping/Series with 'Median Fold Change' and
            'Median MT Score' entries.
        binding_cutoff: Optional exclusive upper bound on 'Median MT Score'.
            With the default ``None`` no binding threshold is applied, so the
            raw score can no longer influence the decision through its
            truthiness (a score of exactly 0 used to be rejected).

    Returns:
        'Accept' or 'Reject'.
    """
    # ``not (x > 1)`` rather than ``x <= 1`` so that NaN fold changes reject.
    if not condensed_info_row['Median Fold Change'] > 1:
        return 'Reject'
    if binding_cutoff is None:
        return 'Accept'
    if condensed_info_row['Median MT Score'] < binding_cutoff:
        return 'Accept'
    return 'Reject'


def naive_anchor_category_determine(condensed_info_row, cutoff):
    """Classify one row assuming anchors at position 2 and the last position.

    Same decision table as the anchor-aware classifier, except that the
    mutation counts as "at anchor" when 'Mutation Position' equals 2 or the
    row's 'Peptide Length':

    * not at anchor, fold change > 1   -> ('Accept', '1A')
    * not at anchor, fold change <= 1  -> ('Accept', '1B')
    * at anchor, fold change <= 1      -> ('Reject', '3A')
    * at anchor, fold change > 1, WT score <= cutoff -> ('Reject', '3B')
    * at anchor, fold change > 1, WT score > cutoff  -> ('Accept', '2')

    Args:
        condensed_info_row: Mapping/Series with 'Median Fold Change',
            'Mutation Position', 'Peptide Length' and 'Median WT Score'.
        cutoff: Threshold compared against 'Median WT Score'.

    Returns:
        Tuple ``(decision, group)``.
    """
    fold_change_above_one = condensed_info_row['Median Fold Change'] > 1
    mutation_position = condensed_info_row['Mutation Position']
    at_conventional_anchor = mutation_position in (2, condensed_info_row['Peptide Length'])

    if not at_conventional_anchor:
        return 'Accept', ('1A' if fold_change_above_one else '1B')
    if not fold_change_above_one:
        return 'Reject', '3A'
    if condensed_info_row['Median WT Score'] <= cutoff:
        return 'Reject', '3B'
    return 'Accept', '2'

def count_set_occurrences(input_table, filter_list):
    """Count how many rows are accepted by each combination of filters.

    For every row, the membership is the ordered list of names in
    ``filter_list`` whose column value equals ``'Accept'``.

    Args:
        input_table: DataFrame with one column per name in ``filter_list``.
        filter_list: Column names to inspect, in the order they should appear
            inside each membership.

    Returns:
        ``(output_table, counts)``: the distinct memberships (each a list of
        filter names, possibly empty) and, at the same index, the number of
        rows with that membership. Memberships are listed in order of first
        appearance in ``input_table``, which keeps the output reproducible
        between runs.
    """
    columns = [input_table[name].tolist() for name in filter_list]
    membership_counts = Counter(
        tuple(name for name, column in zip(filter_list, columns) if column[row] == 'Accept')
        for row in range(len(input_table))
    )
    # Counter preserves insertion order, so keys and values line up.
    output_table = [list(membership) for membership in membership_counts]
    counts = list(membership_counts.values())
    return output_table, counts

def difference_between_peptides(a,b):
    """Return the 1-based positions at which sequences ``a`` and ``b`` differ.

    If the lengths differ a warning is printed and only the overlapping
    prefix (the length of the shorter sequence) is compared.
    """
    if len(a) != len(b):
        print("INPUT LENGTHS NOT EQUAL")
    return [
        position
        for position, (residue_a, residue_b) in enumerate(zip(a, b), start=1)
        if residue_a != residue_b
    ]

def if_anchor_matches_mutation_pos(mutation_type, wt_peptide, mt_peptide, mutation_pos, HLA_allele, anchor_dict):
    """Tell whether every mutated position of a peptide is an anchor position.

    Args:
        mutation_type: 'missense' / 'inframe_del' (positions found by comparing
            WT and MT peptides) or 'FS' / 'inframe_ins' (every position from
            ``mutation_pos`` to the end of the peptide counts as mutated).
            Any other value yields ``False``.
        wt_peptide: Wild-type peptide; may be NaN/None, in which case
            'NULL WT Peptide' is printed and ``False`` is returned.
        mt_peptide: Mutant peptide; its length selects the anchor list.
        mutation_pos: 1-based start of the mutation (used for FS/inframe_ins).
        HLA_allele: Key into ``anchor_dict``.
        anchor_dict: allele -> {peptide length -> list of anchor positions}.

    Returns:
        ``True`` only when there is at least one mutated position and all
        mutated positions are contained in the anchor list.

    Raises:
        KeyError: if ``anchor_dict`` has no entry for the allele/length.
    """
    length = len(mt_peptide)
    anchor_list = anchor_dict[HLA_allele][length]
    if mutation_type in ("missense", "inframe_del"):
        if pd.isna(wt_peptide):
            print('NULL WT Peptide')
            return False
        diff_pos = difference_between_peptides(wt_peptide, mt_peptide)
    elif mutation_type in ('FS', 'inframe_ins'):
        # 0 and -1 are excluded explicitly (presumably "no usable position"
        # sentinels -- confirm). The previous `!= 0 or != -1` test was always
        # true and therefore never excluded anything.
        if mutation_pos in (0, -1):
            return False
        diff_pos = range(mutation_pos, length + 1)
    else:
        return False
    # An empty set is a subset of anything, so identical peptides or a start
    # position past the peptide end must not count as "mutation at anchor".
    return bool(diff_pos) and set(diff_pos).issubset(anchor_list)

In [None]:
## Cutoff at 80% contribution
# Load the pickled position-score dictionary from obj/all_anchor_overall_pos_score_dict_r4.pkl.
all_anchor_overall_pos_score_dict_r4 = load_obj('all_anchor_overall_pos_score_dict_r4')
# Normalise every score vector and select anchor positions with a 0.8 cumulative cutoff.
normalized_anchor_dict_r4, filtered_anchor_dict_r4 = anchor_pos_score_by_contribution(all_anchor_overall_pos_score_dict_r4, 0.8)
# Bar-plot (and return) how often each position is selected as an anchor for length 9.
anchor_pos_freq_by_length(filtered_anchor_dict_r4,9)

In [None]:
# Read the two tab-separated result tables and stack them row-wise.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps each table's original index.
tcga_1000_m2_filter_500b_1 = pd.read_csv('filter_500b_TCGA_1000_m1_pvacseq_analysis_combined.txt', sep='\t')
tcga_1000_m2_filter_500b_2 = pd.read_csv('filter_500b_TCGA_1000_m1_pvacseq_re_analysis_combined.txt', sep='\t')
tcga_1000_m2_filter_500b = pd.concat([tcga_1000_m2_filter_500b_1, tcga_1000_m2_filter_500b_2])

In [None]:
# Derive two stricter subsets of the combined table: rows whose
# "Median MT Score" is below 100 and below 50, respectively.
tcga_1000_m2_filter_100b = tcga_1000_m2_filter_500b[tcga_1000_m2_filter_500b["Median MT Score"] < 100]
tcga_1000_m2_filter_50b = tcga_1000_m2_filter_500b[tcga_1000_m2_filter_500b["Median MT Score"] < 50]

In [None]:
# Number of distinct "HLA Allele" values in the full table and in the <100 and <50 subsets.
len(np.unique(tcga_1000_m2_filter_500b["HLA Allele"])), len(np.unique(tcga_1000_m2_filter_100b["HLA Allele"])), len(np.unique(tcga_1000_m2_filter_50b["HLA Allele"]))

In [None]:
def anchor_impact_analysis(data, anchor_dict, cutoff):
    """Run the three accept/reject filters on every usable row of ``data``.

    For each row the function determines whether the mutation lies on the
    anchor positions that ``anchor_dict`` lists for the row's HLA allele and
    peptide length, and then records three decisions:

    * 'Filter A' -- ``original_decision``
    * 'Filter B' -- ``naive_anchor_category_determine``
    * 'Filter C' -- ``anchor_category_determine``

    Rows are skipped when 'Mutation Position' is missing (a message is
    printed) or when ``anchor_dict`` has no entry for the row's allele and
    peptide length. The number of distinct alleles in ``data`` and the
    per-filter decision counts are printed.

    Args:
        data: DataFrame with the columns 'Mutation Position',
            'WT Epitope Seq', 'MT Epitope Seq', 'Median WT Score',
            'Median MT Score', 'HLA Allele', 'Peptide Length',
            'Median Fold Change' and 'Variant Type'.
        anchor_dict: allele -> {peptide length -> list of anchor positions}.
        cutoff: 'Median WT Score' threshold passed to filters B and C.

    Returns:
        DataFrame with the columns 'Filter A', 'Filter B' and 'Filter C', one
        row per retained input row, each cell 'Accept' or 'Reject'.
    """
    print("Number of HLA alleles included in samples from method 1: ", len(np.unique(data['HLA Allele'])))

    og_decision = []
    naive_anchor_decision = []
    anchor_based_decision = []
    for row_label, row in data.iterrows():
        if pd.isna(row['Mutation Position']):
            print('mutation position is NA', row_label)
            continue
        # Read every field before the anchor lookup so that a missing column
        # still raises instead of being mistaken for a missing anchor entry.
        mut_pos = int(row['Mutation Position'])
        wt_seq = row['WT Epitope Seq']
        mt_seq = row['MT Epitope Seq']
        wt_binding = row['Median WT Score']
        mt_binding = row['Median MT Score']
        hla_allele = row['HLA Allele']
        length = row['Peptide Length']
        fold_change = row['Median Fold Change']
        mutation_type = row['Variant Type']
        try:
            anchor_dict[hla_allele][length]
        except KeyError:
            # No anchor information for this allele/length: row not analysed.
            continue
        if if_anchor_matches_mutation_pos(mutation_type, wt_seq, mt_seq, mut_pos, hla_allele, anchor_dict):
            at_anchor = 1
        else:
            at_anchor = 0

        # Only the fields read by the three decision helpers are passed on.
        condensed_row = {
            'Mutation Position': mut_pos,
            'Median WT Score': wt_binding,
            'Median MT Score': mt_binding,
            'Peptide Length': length,
            'Median Fold Change': fold_change,
            'Mutation at Anchor': at_anchor,
        }
        anchor_decision, _anchor_group = anchor_category_determine(condensed_row, cutoff)
        naive_decision, _naive_group = naive_anchor_category_determine(condensed_row, cutoff)
        og_decision.append(original_decision(condensed_row))
        naive_anchor_decision.append(naive_decision)
        anchor_based_decision.append(anchor_decision)

    print(Counter(og_decision), Counter(naive_anchor_decision), Counter(anchor_based_decision))
    filters_decision_combined = pd.DataFrame(columns=['Filter A', 'Filter B', 'Filter C'])
    filters_decision_combined['Filter A'] = og_decision
    filters_decision_combined['Filter B'] = naive_anchor_decision
    filters_decision_combined['Filter C'] = anchor_based_decision
    return filters_decision_combined

In [None]:
# Run the filter comparison on the full table and on the <100 / <50 subsets,
# passing 500, 100 and 50 respectively as the 'Median WT Score' cutoff.
filters_decision_combined_500b = anchor_impact_analysis(tcga_1000_m2_filter_500b, filtered_anchor_dict_r4, 500)
filters_decision_combined_100b = anchor_impact_analysis(tcga_1000_m2_filter_100b, filtered_anchor_dict_r4, 100)
filters_decision_combined_50b = anchor_impact_analysis(tcga_1000_m2_filter_50b, filtered_anchor_dict_r4, 50)

In [None]:
# Tally, for the 500 cutoff run, how many rows each combination of filters accepts.
upset_table, counts = count_set_occurrences(filters_decision_combined_500b, ['Filter A', 'Filter B', 'Filter C'])

# Print every (accepted-filter combination, row count) pair.
for i in zip(upset_table, counts):
    print(i)
# Draw the UpSet plot of the combinations, ordered by cardinality, with percentages shown.
upsetplot.plot(upsetplot.from_memberships(upset_table, data=counts), sort_by='cardinality', show_percentages = True,)
# NOTE(review): the file name below says "with_base", while the visible cells build the
# anchors with anchor_pos_score_by_contribution (no base subtraction) -- confirm before re-enabling.
#plt.savefig('upset_plot_filters_combined_m2_contribution_80_with_base_500b_filter.png')

In [None]:
# Tally, for the 100 cutoff run, how many rows each combination of filters accepts.
upset_table, counts = count_set_occurrences(filters_decision_combined_100b, ['Filter A', 'Filter B', 'Filter C'])

# Print every (accepted-filter combination, row count) pair.
for i in zip(upset_table, counts):
    print(i)
# Draw the UpSet plot of the combinations, ordered by cardinality, with percentages shown.
upsetplot.plot(upsetplot.from_memberships(upset_table, data=counts), sort_by='cardinality', show_percentages = True,)
# NOTE(review): the file name below says "with_base", while the visible cells build the
# anchors with anchor_pos_score_by_contribution (no base subtraction) -- confirm before re-enabling.
#plt.savefig('upset_plot_filters_combined_m2_contribution_80_with_base_100b_filter.png')

In [None]:
# Tally, for the 50 cutoff run, how many rows each combination of filters accepts.
upset_table, counts = count_set_occurrences(filters_decision_combined_50b, ['Filter A', 'Filter B', 'Filter C'])

# Print every (accepted-filter combination, row count) pair.
for i in zip(upset_table, counts):
    print(i)
# Draw the UpSet plot of the combinations, ordered by cardinality, with percentages shown.
upsetplot.plot(upsetplot.from_memberships(upset_table, data=counts), sort_by='cardinality', show_percentages = True,)
# NOTE(review): the file name below says "with_base", while the visible cells build the
# anchors with anchor_pos_score_by_contribution (no base subtraction) -- confirm before re-enabling.
#plt.savefig('upset_plot_filters_combined_m2_contribution_80_with_base_50b_filter.png')

## Analysis with filtered HLA alleles

In [None]:
## Filter out those equal to 2,9 or 9,2
# Recompute the 80%-contribution anchors, then split each allele's per-length
# anchor lists into two dictionaries:
#   filtered_out_anchor_dict -- lists that are exactly [2, length] or [length, 2]
#   filtered_anchor_dict     -- every other anchor list
# Alleles with nothing in a category are left out of that dictionary.
all_anchor_overall_pos_score_dict_r4 = load_obj('all_anchor_overall_pos_score_dict_r4')
normalized_anchor_dict_r4, filtered_anchor_dict_r4 = anchor_pos_score_by_contribution(all_anchor_overall_pos_score_dict_r4, 0.8)
filtered_anchor_dict = {}
filtered_out_anchor_dict = {}

for allele, anchors_by_length in filtered_anchor_dict_r4.items():
    kept = {}
    filtered_out = {}
    for length in [8,9,10,11]:
        # Explicit membership test instead of a bare `except:` so that only a
        # missing length is skipped and real errors are not hidden.
        if length not in anchors_by_length:
            continue
        anchors = anchors_by_length[length]
        if anchors == [2, length] or anchors == [length, 2]:
            filtered_out[length] = anchors
        else:
            kept[length] = anchors
    if kept:
        filtered_anchor_dict[allele] = kept
    if filtered_out:
        filtered_out_anchor_dict[allele] = filtered_out

In [None]:
## Filter out those in clusters other than red, orange, purple
# Alleles to keep; only entries of the anchor dictionary whose key appears in
# this list are copied into filtered_anchor_dict_cluster.
hla_list = [
    "HLA-A*29:02",
    "HLA-A*30:01",
    "HLA-A*30:02",
    "HLA-A*31:01",
    "HLA-A*33:01",
    "HLA-A*68:01",
    "HLA-A*80:01",
    "HLA-B*08:01",
    "HLA-B*08:09",
    "HLA-B*14:02",
    "HLA-C*04:01",
    "HLA-C*04:03",
    "HLA-C*05:01",
    "HLA-C*05:09",
    "HLA-C*08:02",
    "HLA-C*08:15",
]
# Recompute the 80%-contribution anchors from the pickled scores.
all_anchor_overall_pos_score_dict_r4 = load_obj('all_anchor_overall_pos_score_dict_r4')
normalized_anchor_dict_r4, filtered_anchor_dict_r4 = anchor_pos_score_by_contribution(all_anchor_overall_pos_score_dict_r4, 0.8)
filtered_anchor_dict_cluster = {
    allele: anchors_by_length
    for allele, anchors_by_length in filtered_anchor_dict_r4.items()
    if allele in hla_list
}

In [None]:
# Number of alleles in the kept, filtered-out and cluster-restricted anchor dictionaries.
len(filtered_anchor_dict), len(filtered_out_anchor_dict), len(filtered_anchor_dict_cluster)

In [None]:
# Re-read the two tab-separated result tables and stack them row-wise.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps each table's original index.
tcga_1000_m2_filter_500b_1 = pd.read_csv('filter_500b_TCGA_1000_m1_pvacseq_analysis_combined.txt', sep='\t')
tcga_1000_m2_filter_500b_2 = pd.read_csv('filter_500b_TCGA_1000_m1_pvacseq_re_analysis_combined.txt', sep='\t')
tcga_1000_m2_filter_500b = pd.concat([tcga_1000_m2_filter_500b_1, tcga_1000_m2_filter_500b_2])
# Keep only the rows whose "Peptide Length" is 9.
dataset = tcga_1000_m2_filter_500b[tcga_1000_m2_filter_500b["Peptide Length"] == 9]

In [None]:
# NOTE(review): this notebook never assigns `dataset_filtered`, so the 9-mer
# subset `dataset` built in the previous cell is passed here instead.
# anchor_impact_analysis skips rows whose allele/length is absent from
# `filtered_anchor_dict`, so the anchor-based restriction still applies.
# Confirm that no additional row filter was intended.
filters_decision_combined_filtered_hla = anchor_impact_analysis(dataset, filtered_anchor_dict, 500)

In [None]:
# Tally accepted-filter combinations for the non-conventional-anchor run.
# The input is `filters_decision_combined_filtered_hla`, the name assigned by
# the preceding anchor_impact_analysis cell; `filters_decision_combined_filtered_dict`
# is never defined in this notebook and raised a NameError.
upset_table, counts = count_set_occurrences(filters_decision_combined_filtered_hla, ['Filter A', 'Filter B', 'Filter C'])

# Print every (accepted-filter combination, row count) pair.
for i in zip(upset_table, counts):
    print(i)
ax = upsetplot.plot(upsetplot.from_memberships(upset_table, data=counts), show_percentages = True, sort_by="cardinality")
#plt.savefig('upset_plot_filters_combined_m2_contribution_80_no_base_50b_filter.png')

In [None]:
# Long-form labels for the four analyses (not referenced again in this cell;
# the keys of `percentages` are what end up in the "Cutoffs" column).
cutoffs = ["500 Binding Cutoff", "100 Binding Cutoff", "50 Binding Cutoff", "Filtered for non-conventional anchors"]
metrics = ["Difference between A & (B | C)", "Difference between C & (A | B)"]
# Hand-entered percentages: one [first metric, second metric] pair per analysis.
percentages = {
    "500": [38.3, 5.7],
    "100": [37.4, 5.5],
    "50": [37.3, 3.6],
    "HLA filter": [39.2, 7.4],
}
# Flatten to one (analysis label, metric name, percentage) record per bar.
long_records = [
    (label, metric_name, value)
    for label, values in percentages.items()
    for metric_name, value in zip(metrics, values)
]
cutoff_col = [record[0] for record in long_records]
metrics_col = [record[1] for record in long_records]
percentage_col = [record[2] for record in long_records]
data = pd.DataFrame({
    "Cutoffs": cutoff_col,
    "Metrics": metrics_col,
    "Percentage": percentage_col,
})
data

In [None]:
# Bar chart of the "Difference between A & (B | C)" rows of `data`, one bar per "Cutoffs" label.
sns.barplot(data = data[data["Metrics"] == "Difference between A & (B | C)"], x = "Metrics", y = "Percentage", hue = "Cutoffs")
# Save the current figure as a PDF in the working directory.
plt.savefig("Difference between A & (B | C).pdf", dpi = 300)

In [None]:
# Bar chart of the "Difference between C & (A | B)" rows of `data`, one bar per "Cutoffs" label.
sns.barplot(data = data[data["Metrics"] == "Difference between C & (A | B)"], x = "Metrics", y = "Percentage", hue = "Cutoffs")
# Save the current figure as a PDF in the working directory.
plt.savefig("Difference between C & (A | B).pdf", dpi = 300)