In [None]:
import pandas as pd
import numpy as np
import pickle
import csv
from collections import Counter
import matplotlib.pyplot as plt
import upsetplot
import seaborn as sns
sns.set_theme()
import pylab as pl
import difflib

def save_obj(obj, name ):
    """Pickle ``obj`` into ``obj/<name>.pkl``.

    The path is relative to the current working directory and the ``obj``
    directory must already exist. Note that ``load_obj`` reads from a
    hard-coded absolute directory instead.
    """
    target_path = 'obj/' + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``<name>.pkl`` in the analysis ``obj`` directory.

    The directory is a hard-coded absolute path on the author's machine.
    NOTE(review): ``pickle.load`` can execute arbitrary code; only load
    files produced by this project.
    """
    pickle_path = '/Users/h.xia/Desktop/Griffith_Lab/R_shiny_visualization/anchor_analysis/obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

## Plotting the frequency of anchor positions for a certain length
def anchor_pos_freq_by_length(filtered_anchor_dict, length):
    """Count and plot how often each peptide position is listed as an anchor.

    Parameters
    ----------
    filtered_anchor_dict : mapping
        Outer keys map to an inner mapping of peptide length -> iterable of
        1-based anchor positions.
    length : int
        Peptide length to summarise.

    Returns
    -------
    list of int
        ``freq[k]`` is the number of outer entries listing position ``k + 1``
        as an anchor for ``length``.

    Entries without data for ``length`` are skipped. Positions outside
    ``1..length`` are ignored rather than wrapping around to the end of the
    list (position 0) or aborting the entry part-way through (too large).
    """
    pos = list(range(1, length + 1))
    freq = [0] * length
    for anchors_by_length in filtered_anchor_dict.values():
        try:
            anchor_positions = anchors_by_length[length]
        except KeyError:
            # No anchor information for this peptide length.
            continue
        for position in anchor_positions:
            if 1 <= position <= length:
                freq[position - 1] += 1
    plt.bar(pos, freq)
    plt.xlabel("peptide positions")
    plt.ylabel("Frequency in dataset")
    plt.show()
    plt.close()
    return freq

### Method for determining anchor position
### criteria:
### determine contribution of each position to overall percentage
### Set cutoff at X =?
    
def percentage_cutoff(normalized_list, cutoff):
    """Select the highest-contributing positions until ``cutoff`` is exceeded.

    Positions are visited from largest to smallest value (ties broken by
    lower index). A position is added as long as the running total collected
    so far does not exceed ``cutoff``, so the position that crosses the
    cutoff is included.

    Returns the selected positions as 1-based indices, in visiting order.
    """
    ranked = list(enumerate(normalized_list))
    ranked.sort(key=lambda pair: (-pair[1], pair[0]))

    running_total = 0
    selected_positions = []
    for index, contribution in ranked:
        if running_total > cutoff:
            break
        running_total += contribution
        selected_positions.append(index + 1)
    return selected_positions

def anchor_pos_score_by_contribution(anchor_score_dict, percentage):
    """Normalise per-position scores and pick anchors by cumulative share.

    For every outer key and every inner key of ``anchor_score_dict`` the
    score vector is divided by its sum, and ``percentage_cutoff`` is applied
    to the result with ``percentage`` as the cutoff.

    The division ``scores / total`` requires the score vectors to support
    element-wise division (presumably numpy arrays -- confirm against the
    pickled input).

    Returns ``(normalized_dict, filter_contr)``, both with the same nested
    key structure as the input.
    """
    normalized_dict = {}
    filter_contr = {}
    for outer_key, scores_by_inner_key in anchor_score_dict.items():
        outer_normalized = normalized_dict[outer_key] = {}
        outer_filtered = filter_contr[outer_key] = {}
        for inner_key, scores in scores_by_inner_key.items():
            total = sum(scores)
            shares = scores / total
            outer_normalized[inner_key] = shares
            outer_filtered[inner_key] = percentage_cutoff(shares, percentage)
    return normalized_dict, filter_contr

def anchor_pos_score_by_contribution_with_base(anchor_score_dict, percentage):
    """Like ``anchor_pos_score_by_contribution`` but with a baseline removed.

    Before normalising, the minimum of each score vector is subtracted from
    every element, so the lowest-scoring position contributes zero. The
    shifted vector is then divided by its sum and passed to
    ``percentage_cutoff`` with ``percentage`` as the cutoff.

    The arithmetic requires score vectors that support element-wise
    subtraction and division (presumably numpy arrays -- confirm).

    Returns ``(normalized_dict, filter_contr)`` with the input's nested keys.
    """
    normalized_dict = {}
    filter_contr = {}
    for outer_key, scores_by_inner_key in anchor_score_dict.items():
        outer_normalized = normalized_dict[outer_key] = {}
        outer_filtered = filter_contr[outer_key] = {}
        for inner_key, scores in scores_by_inner_key.items():
            shifted = scores - min(scores)
            total = sum(shifted)
            shares = shifted / total
            outer_normalized[inner_key] = shares
            outer_filtered[inner_key] = percentage_cutoff(shares, percentage)
    return normalized_dict, filter_contr

### Shared counting functions
def anchor_category_determine(condensed_info_row):
    """Classify a candidate using the 'Mutation at Anchor' flag.

    Reads 'Median Fold Change', 'Mutation at Anchor' and, only for anchor
    mutations with fold change > 1, 'Median WT Score'.

    Returns ``(decision, group)``:
      * not at anchor, fold change > 1                -> ('Accept', '1A')
      * not at anchor, fold change <= 1               -> ('Accept', '1B')
      * at anchor, fold change > 1, WT score > 500    -> ('Accept', '2')
      * at anchor, fold change <= 1                   -> ('Reject', '3A')
      * at anchor, fold change > 1, WT score <= 500   -> ('Reject', '3B')
    """
    fold_change_above_one = condensed_info_row['Median Fold Change'] > 1
    at_anchor = condensed_info_row['Mutation at Anchor'] == 1

    if not at_anchor:
        return 'Accept', ('1A' if fold_change_above_one else '1B')
    if not fold_change_above_one:
        return 'Reject', '3A'
    if condensed_info_row['Median WT Score'] <= 500:
        return 'Reject', '3B'
    return 'Accept', '2'
    
def original_decision(condensed_info_row, binding_threshold=500):
    """Apply the basic filter: mutant binds and improves on the wild type.

    A candidate is accepted when 'Median Fold Change' is greater than 1 and
    'Median MT Score' is below ``binding_threshold``. This matches the
    'Binding < 500 and FC > 1' label under which the result is stored.
    Previously the score was only tested for truthiness, so any non-zero
    (or NaN) MT score passed the binding condition.

    Parameters
    ----------
    condensed_info_row : mapping
        Must provide 'Median Fold Change' and 'Median MT Score'.
    binding_threshold : number, optional
        Exclusive upper bound on the MT score (default 500).

    Returns
    -------
    str
        'Accept' or 'Reject'.
    """
    improves_on_wt = condensed_info_row['Median Fold Change'] > 1
    binds = condensed_info_row['Median MT Score'] < binding_threshold
    return 'Accept' if improves_on_wt and binds else 'Reject'


def naive_anchor_category_determine(condensed_info_row):
    """Classify a candidate assuming anchors at position 2 and the last position.

    A mutation counts as an anchor mutation when 'Mutation Position' equals
    2 or 'Peptide Length'. The decision table is otherwise the same as in
    ``anchor_category_determine``:
      * not at anchor, fold change > 1                -> ('Accept', '1A')
      * not at anchor, fold change <= 1               -> ('Accept', '1B')
      * at anchor, fold change > 1, WT score > 500    -> ('Accept', '2')
      * at anchor, fold change <= 1                   -> ('Reject', '3A')
      * at anchor, fold change > 1, WT score <= 500   -> ('Reject', '3B')
    """
    fold_change_above_one = condensed_info_row['Median Fold Change'] > 1
    at_naive_anchor = condensed_info_row['Mutation Position'] in (2, condensed_info_row['Peptide Length'])

    if not at_naive_anchor:
        return 'Accept', ('1A' if fold_change_above_one else '1B')
    if not fold_change_above_one:
        return 'Reject', '3A'
    if condensed_info_row['Median WT Score'] <= 500:
        return 'Reject', '3B'
    return 'Accept', '2'

def count_set_occurrences(input_table, filter_list):
    """Count how many rows are accepted by each combination of filters.

    Parameters
    ----------
    input_table : pandas.DataFrame
        One column per filter, holding decision strings.
    filter_list : list of str
        Column names to inspect, in the order they should appear in each
        membership list.

    Returns
    -------
    (output_table, counts)
        ``output_table[k]`` is the list of filters whose decision was
        'Accept' and ``counts[k]`` is the number of rows with exactly that
        combination. Combinations are reported in order of first appearance
        in ``input_table``, so the output is reproducible between runs
        (iterating a set of string tuples is not).
    """
    membership_counts = Counter(
        tuple(name for name in filter_list if row[name] == 'Accept')
        for _, row in input_table.iterrows()
    )
    output_table = [list(members) for members in membership_counts]
    counts = list(membership_counts.values())
    return output_table, counts

def difference_between_peptides(a,b):
    """Return the 1-based positions at which sequences ``a`` and ``b`` differ.

    If the lengths differ a warning is printed and only the overlapping
    prefix (the shorter length) is compared.
    """
    if len(a) != len(b):
        print("INPUT LENGTHS NOT EQUAL")
    return [
        position
        for position, (residue_a, residue_b) in enumerate(zip(a, b), start=1)
        if residue_a != residue_b
    ]

def if_anchor_matches_mutation_pos(mutation_type, wt_peptide, mt_peptide, mutation_pos, HLA_allele, anchor_dict):
    """Return True when every mutated position of the peptide is an anchor.

    Parameters
    ----------
    mutation_type : str
        'missense' / 'inframe_del' compare WT and MT peptides residue by
        residue; 'FS' / 'inframe_ins' treat every position from
        ``mutation_pos`` to the end of the peptide as mutated. Any other
        value yields False.
    wt_peptide, mt_peptide : str
        Wild-type and mutant peptide sequences (``wt_peptide`` may be NA).
    mutation_pos : int
        1-based position of the mutation; 0 and -1 are treated as
        "no usable position".
    HLA_allele : hashable
        Key into ``anchor_dict``.
    anchor_dict : mapping
        ``anchor_dict[HLA_allele][len(mt_peptide)]`` is the collection of
        1-based anchor positions.

    Raises
    ------
    KeyError
        If ``anchor_dict`` has no entry for the allele / peptide length.

    Changes from the previous behaviour:
      * the sentinel test was ``!= 0 or != -1``, which is always true; it
        now really excludes 0 and -1;
      * an empty set of mutated positions (identical peptides, or
        ``mutation_pos`` beyond the peptide end) no longer counts as a
        match merely because the empty set is a subset of any anchor list.
    """
    length = len(mt_peptide)
    anchor_list = anchor_dict[HLA_allele][length]

    if mutation_type in ("missense", "inframe_del"):
        if pd.isna(wt_peptide):
            print('NULL WT Peptide')
            return False
        diff_pos = difference_between_peptides(wt_peptide, mt_peptide)
    elif mutation_type in ('FS', 'inframe_ins'):
        # TODO(original author's question): should inframe_ins use
        # difference_between_peptides instead of the "mutation to end" rule?
        if mutation_pos in (0, -1):
            return False
        diff_pos = list(range(mutation_pos, length + 1))
    else:
        return False

    return bool(diff_pos) and set(diff_pos).issubset(anchor_list)

In [None]:
## Cutoff at 80% contribution
# Load the pickled position-score dictionary, normalise it and keep the
# highest-scoring positions until their cumulative share exceeds 0.8, then
# plot the anchor-position frequencies for 9-mers.
all_anchor_overall_pos_score_dict_r4 = load_obj('all_anchor_overall_pos_score_dict_r4')
contribution_result_r4 = anchor_pos_score_by_contribution(all_anchor_overall_pos_score_dict_r4, 0.8)
normalized_anchor_dict_r4, filtered_anchor_dict_r4 = contribution_result_r4
anchor_pos_freq_by_length(filtered_anchor_dict_r4, length=9)

In [None]:
## Cutoff at 80% contribution with base cutoff
# Same as the plain 80% cutoff, but each score vector has its minimum
# subtracted before normalisation.
contribution_result_r4_base = anchor_pos_score_by_contribution_with_base(all_anchor_overall_pos_score_dict_r4, 0.8)
normalized_anchor_dict_r4_base, filtered_anchor_dict_r4_base = contribution_result_r4_base
anchor_pos_freq_by_length(filtered_anchor_dict_r4_base, length=9)

In [None]:
# Group HLA alleles by their anchor pattern for peptides of length n and plot
# how many alleles share each pattern.
# NOTE(review): filtered_anchor_dict_r4_m1 is not defined in the cells shown
# here -- confirm it is created elsewhere in the notebook.
hla_anchor_freq = {}
n = 9
for allele, anchors_by_length in filtered_anchor_dict_r4_m1.items():
    try:
        anchor_positions = anchors_by_length[n]
    except KeyError:
        # Allele has no anchor list for this peptide length.
        continue
    # Pattern key, e.g. "2-9" for anchors at positions 2 and 9.
    anchors = "-".join(str(position) for position in anchor_positions)
    hla_anchor_freq.setdefault(anchors, []).append(allele)

x = list(hla_anchor_freq)
y = [len(alleles) for alleles in hla_anchor_freq.values()]
plt.figure(figsize=(10,10))
plt.bar(x, y)
plt.xticks(rotation=90, size=12)
plt.show()
{pattern:len(hla_anchor_freq[pattern]) for pattern in hla_anchor_freq}

## Sampling for balanced HLA alleles

In [None]:
# Load the two pVACseq result tables (analysis and re-analysis) and stack them.
tcga_1000_m2_filter_500b_1 = pd.read_csv('filter_500b_TCGA_1000_m2_pvacseq_analysis_combined.txt', sep='\t')
tcga_1000_m2_filter_500b_2 = pd.read_csv('filter_500b_TCGA_1000_m2_pvacseq_re_analysis_combined.txt', sep='\t')
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the combined table unique row labels; without it
# both halves keep labels 0..k, and later label-based selections
# (.loc[list_of_labels]) would pick up rows from both files.
tcga_1000_m2_filter_500b = pd.concat(
    [tcga_1000_m2_filter_500b_1, tcga_1000_m2_filter_500b_2],
    ignore_index=True,
)

In [None]:
hla_anchor_data = pd.DataFrame()

hla_alleles_for_r4 = np.sort(tcga_1000_m2_filter_500b['HLA Allele'])
print("Number of HLA alleles included in samples from method 1: ", len(np.unique(hla_alleles_for_r4)))

### For each entry, determine if mutation is at anchor position
tcga_1000_m2_condensed_info = []
for n, i in tcga_1000_m2_filter_500b.iterrows():
    if pd.isna(i['Mutation Position']):
        print('mutation position is NA', n)
        continue
    mut_pos = int(i['Mutation Position'])
    wt_seq = i['WT Epitope Seq']
    mt_seq = i['MT Epitope Seq']
    wt_binding = i['Median WT Score']
    mt_binding = i['Median MT Score']
    hla_allele = i['HLA Allele']
    length = i['Peptide Length']
    fold_change = i['Median Fold Change']
    mutation_type = i['Variant Type']
    try:
        anchor_pos_list = filtered_anchor_dict_r4[hla_allele][length]
    except KeyError:
        # No anchor information for this allele / peptide length: the row is
        # left out of the comparison entirely.
        continue
    # Stored as 1/0 because anchor_category_determine compares against 1.
    at_anchor = int(if_anchor_matches_mutation_pos(mutation_type, wt_seq, mt_seq, mut_pos, hla_allele, filtered_anchor_dict_r4))
    tcga_1000_m2_condensed_info.append([mut_pos, mutation_type, wt_seq, mt_seq, wt_binding, mt_binding, hla_allele, length, fold_change, at_anchor, anchor_pos_list])
tcga_1000_m2_condensed_info_df = pd.DataFrame.from_records(tcga_1000_m2_condensed_info, columns=['Mutation Position', 'Variant Type', 'WT Sequence', 'MT Sequence','Median WT Score','Median MT Score', 'HLA Allele', 'Peptide Length', 'Median Fold Change', 'Mutation at Anchor', 'Anchor List'])

# Apply the three filters to every retained row:
#   og_decision           -- basic filter (original_decision)
#   naive_anchor_decision -- anchors assumed at position 2 and the last position
#   anchor_based_decision -- anchors taken from filtered_anchor_dict_r4
anchor_based_decision = []
anchor_based_group = []
og_decision = []
naive_anchor_decision = []
naive_decision_group = []
for n, i in tcga_1000_m2_condensed_info_df.iterrows():
    anchor_decision, anchor_group = anchor_category_determine(i)
    naive_anchor, naive_group = naive_anchor_category_determine(i)
    naive_anchor_decision.append(naive_anchor)
    naive_decision_group.append(naive_group)
    anchor_based_decision.append(anchor_decision)
    anchor_based_group.append(anchor_group)
    og_decision.append(original_decision(i))

# Row-wise agreement between each pair of filters ('YES' when equal).
agreed_decision_1_3 = ['YES' if first == second else 'NO'
                       for first, second in zip(og_decision, anchor_based_decision)]
agreed_decision_1_2 = ['YES' if first == second else 'NO'
                       for first, second in zip(og_decision, naive_anchor_decision)]
agreed_decision_2_3 = ['YES' if first == second else 'NO'
                       for first, second in zip(naive_anchor_decision, anchor_based_decision)]

tcga_1000_m2_condensed_info_df['Binding < 500 and FC > 1'] = og_decision
tcga_1000_m2_condensed_info_df['Naive Anchor Based decision'] = naive_anchor_decision
tcga_1000_m2_condensed_info_df['Naive Anchor Decision Group'] = naive_decision_group
tcga_1000_m2_condensed_info_df['Anchor Based decision'] = anchor_based_decision
tcga_1000_m2_condensed_info_df['Anchor Decision Group'] = anchor_based_group
tcga_1000_m2_condensed_info_df['Do basic & naive decisions agree?'] = agreed_decision_1_2
tcga_1000_m2_condensed_info_df['Do naive & calculated decisions agree?'] = agreed_decision_2_3
tcga_1000_m2_condensed_info_df['Do basic & calculated decisions agree?'] = agreed_decision_1_3

In [None]:
# Write the per-candidate comparison of the three filters as a tab-separated file.
tcga_1000_m2_condensed_info_df.to_csv(
    'TCGA_1000_anchor_evaluation_3_type_data_results_balanced_m2_corrected.tsv',
    sep='\t',
)

In [None]:
# Print Accept/Reject totals per filter, then gather the three decision lists
# into one table (Filter A = basic, Filter B = naive anchor, Filter C =
# calculated anchor).
print(*(Counter(decisions) for decisions in (og_decision, naive_anchor_decision, anchor_based_decision)))
filters_decision_combined = pd.DataFrame({
    'Filter A': og_decision,
    'Filter B': naive_anchor_decision,
    'Filter C': anchor_based_decision,
})

In [None]:
# Repeat of the previous cell: print Accept/Reject totals per filter and
# rebuild the combined decision table (Filter A = basic, Filter B = naive
# anchor, Filter C = calculated anchor).
print(*(Counter(decisions) for decisions in (og_decision, naive_anchor_decision, anchor_based_decision)))
filters_decision_combined = pd.DataFrame({
    'Filter A': og_decision,
    'Filter B': naive_anchor_decision,
    'Filter C': anchor_based_decision,
})

In [None]:
# Count how many candidates each combination of filters accepts, list the
# combinations, and draw them as an UpSet plot sorted by cardinality.
upset_table, counts = count_set_occurrences(
    filters_decision_combined,
    ['Filter A', 'Filter B', 'Filter C'],
)

for membership_and_count in zip(upset_table, counts):
    print(membership_and_count)
membership_series = upsetplot.from_memberships(upset_table, data=counts)
upsetplot.plot(membership_series, sort_by='cardinality')
#plt.savefig('upset_plot_filters_combined_m2_contribution_80_no_base.pdf')

## Summary of frameshift/indel causing mutation anchors

In [None]:
# For tcga_1000_m1_filter_500b, collect per variant type the rows whose mutated
# positions all fall on anchor positions, then report unique
# (MT Epitope Seq, HLA Allele) counts for matched rows and for all rows.
# Row positions (not index labels) are recorded so the selection below stays
# correct even when the frame's index contains duplicate labels.
matched_row_positions = {'missense': [], 'FS': [], 'inframe_ins': [], 'inframe_del': []}
for row_position, (n, i) in enumerate(tcga_1000_m1_filter_500b.iterrows()):
    if pd.isna(i['Mutation Position']):
        print('mutation position is NA', n)
        continue
    mut_pos = int(i['Mutation Position'])
    wt_seq = i['WT Epitope Seq']
    mt_seq = i['MT Epitope Seq']
    hla_allele = i['HLA Allele']
    length = i['Peptide Length']
    mutation_type = i['Variant Type']
    try:
        filtered_anchor_dict_r4[hla_allele][length]
    except KeyError:
        # No anchor information for this allele / peptide length.
        continue
    if (if_anchor_matches_mutation_pos(mutation_type, wt_seq, mt_seq, mut_pos, hla_allele, filtered_anchor_dict_r4)
            and mutation_type in matched_row_positions):
        matched_row_positions[mutation_type].append(row_position)
fs_matched = tcga_1000_m1_filter_500b.iloc[matched_row_positions['FS']]
missense_matched = tcga_1000_m1_filter_500b.iloc[matched_row_positions['missense']]
insert_matched = tcga_1000_m1_filter_500b.iloc[matched_row_positions['inframe_ins']]
del_matched = tcga_1000_m1_filter_500b.iloc[matched_row_positions['inframe_del']]

# Candidates are considered identical when peptide and allele coincide.
unique_candidate_key = ['MT Epitope Seq', 'HLA Allele']
for label, matched_frame in (("Framshift matched: ", fs_matched),
                             ("Missense matched: ", missense_matched),
                             ("Inframe del matched: ", del_matched),
                             ("Inframe ins matched: ", insert_matched)):
    print(label, matched_frame.drop_duplicates(subset=unique_candidate_key, keep='last').shape)
# Hard-coded sum; presumably the four matched counts from an earlier run -- TODO confirm.
print(339+11641+41+1)

print(tcga_1000_m1_filter_500b.drop_duplicates(subset=unique_candidate_key, keep='last').shape)
for label, variant_type in (("all fs: ", 'FS'),
                            ("all missense: ", 'missense'),
                            ("all ins: ", 'inframe_ins'),
                            ("all del: ", 'inframe_del')):
    of_type = tcga_1000_m1_filter_500b[tcga_1000_m1_filter_500b['Variant Type'] == variant_type]
    print(label, of_type.drop_duplicates(subset=unique_candidate_key, keep='last').shape)

In [None]:
# For tcga_1000_m2_filter_500b, collect per variant type the rows whose mutated
# positions all fall on anchor positions, then report unique
# (MT Epitope Seq, HLA Allele) counts for matched rows and for all rows.
# Row positions (not index labels) are recorded: this frame is built by
# stacking two tables, so index labels may repeat and a label-based .loc
# selection would also return unmatched rows that share a label.
matched_row_positions = {'missense': [], 'FS': [], 'inframe_ins': [], 'inframe_del': []}
for row_position, (n, i) in enumerate(tcga_1000_m2_filter_500b.iterrows()):
    if pd.isna(i['Mutation Position']):
        print('mutation position is NA', n)
        continue
    mut_pos = int(i['Mutation Position'])
    wt_seq = i['WT Epitope Seq']
    mt_seq = i['MT Epitope Seq']
    hla_allele = i['HLA Allele']
    length = i['Peptide Length']
    mutation_type = i['Variant Type']
    try:
        filtered_anchor_dict_r4[hla_allele][length]
    except KeyError:
        # No anchor information for this allele / peptide length.
        continue
    if (if_anchor_matches_mutation_pos(mutation_type, wt_seq, mt_seq, mut_pos, hla_allele, filtered_anchor_dict_r4)
            and mutation_type in matched_row_positions):
        matched_row_positions[mutation_type].append(row_position)
fs_matched = tcga_1000_m2_filter_500b.iloc[matched_row_positions['FS']]
missense_matched = tcga_1000_m2_filter_500b.iloc[matched_row_positions['missense']]
insert_matched = tcga_1000_m2_filter_500b.iloc[matched_row_positions['inframe_ins']]
del_matched = tcga_1000_m2_filter_500b.iloc[matched_row_positions['inframe_del']]

# Candidates are considered identical when peptide and allele coincide.
unique_candidate_key = ['MT Epitope Seq', 'HLA Allele']
for label, matched_frame in (("Framshift matched: ", fs_matched),
                             ("Missense matched: ", missense_matched),
                             ("Inframe del matched: ", del_matched),
                             ("Inframe ins matched: ", insert_matched)):
    print(label, matched_frame.drop_duplicates(subset=unique_candidate_key, keep='last').shape)

print(tcga_1000_m2_filter_500b.drop_duplicates(subset=unique_candidate_key, keep='last').shape)
for label, variant_type in (("all fs: ", 'FS'),
                            ("all missense: ", 'missense'),
                            ("all ins: ", 'inframe_ins'),
                            ("all del: ", 'inframe_del')):
    of_type = tcga_1000_m2_filter_500b[tcga_1000_m2_filter_500b['Variant Type'] == variant_type]
    print(label, of_type.drop_duplicates(subset=unique_candidate_key, keep='last').shape)

## Patient Level analysis of impact of anchor considerations

This was done using 100 random TCGA samples for which we had full sets of pVACtools data.

questions to answer:
1. Out of the 100 patients, how many patients had their neoantigens affected? On average how many were affected?
2. For patients with mutation burden less than 100, how does this impact their pool of peptide candidates?
3. Composition in terms of tumor types for our random 100 samples

In [None]:
## Randomly select samples from TCGA data
## First obtain round 1 + round 2 TCGA data list
round_1_list = pd.read_csv('/Users/h.xia/Desktop/Griffith_Lab/R_shiny_visualization/anchor_analysis/TCGA_samples_round1_noUCEC.tsv', sep='\t', header=None)
round_2_list = pd.read_csv('/Users/h.xia/Desktop/Griffith_Lab/R_shiny_visualization/anchor_analysis/TCGA_samples_round2_withUCEC.tsv', sep='\t', header=None)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE: from here on round_1_list holds the combined round 1 + round 2 samples.
round_1_list = pd.concat([round_1_list, round_2_list], ignore_index=True)

## Random selection of 100 patients
## (the two commented lines record how the stored selection was produced)
#random_select = round_1_list.sample(n=100)
#save_obj(random_select, 'random_selection_anchor_patient_impact_100_sub')
random_select = load_obj('random_selection_anchor_patient_impact_100_sub')
# Tag each selected sample with the round it came from: 2 if its ID (column 0)
# is in the round 2 list, otherwise 1. The ID set is built once rather than
# rebuilding a list for every row.
round_2_sample_ids = set(round_2_list[0])
dataset = [2 if sample_id in round_2_sample_ids else 1 for sample_id in random_select[0]]
random_select['dataset'] = dataset
#random_select.to_csv('Random_selection_anchor_patient_impact_100_samples.tsv', sep='\t')

In [None]:
## Moved relevant files from cluster for 100 random_samples
## Now each patient needs to be analyzed for the peptides changed between analysis and how many variants they started with

def _fraction_disagreeing(first_decisions, second_decisions):
    """Return the fraction of paired decisions that differ (0.0 when there are no pairs)."""
    pairs = list(zip(first_decisions, second_decisions))
    if not pairs:
        return 0.0
    return sum(1 for first, second in pairs if first != second) / len(pairs)


def calculate_change(patient_data, variant_data, patient_unique_data):
    """Summarise how the three filters differ for one patient.

    Parameters
    ----------
    patient_data : pandas.DataFrame
        All filtered candidates of the patient; only its row count is used.
    variant_data : pandas.DataFrame
        Variant table; variants are counted after dropping duplicates on
        chromosome_name / start / stop / reference / variant.
    patient_unique_data : pandas.DataFrame
        Top-score candidates. Each row is classified by
        ``original_decision`` (basic), ``naive_anchor_category_determine``
        (naive) and ``anchor_category_determine`` (calculated). Rows with a
        missing 'Mutation Position', or whose allele / length has no entry
        in the module-level ``filtered_anchor_dict_r4``, are skipped.

    Returns
    -------
    tuple
        ``(percent_change_1_2, percent_change_2_3, percent_change_1_3,
        variant_count, candidate_count, all_candidate_count, counter_og,
        counter_naive, counter_anchor)`` where 1 = basic, 2 = naive,
        3 = calculated. Each ``percent_change`` is the fraction (0..1) of
        classified rows on which the two filters disagree, and is 0.0 when
        no row could be classified (this used to raise ZeroDivisionError).
        The counters tally 'Accept'/'Reject' per filter.
    """
    variant_data = variant_data.drop_duplicates(subset=['chromosome_name','start','stop','reference', 'variant'])
    variant_count = len(variant_data.index)
    all_candidate_count = len(patient_data.index)
    candidate_count = len(patient_unique_data.index)

    og_decision = []
    naive_anchor_decision = []
    anchor_based_decision = []
    ### For each entry, determine if mutation is at anchor position
    for n, i in patient_unique_data.iterrows():
        if pd.isna(i['Mutation Position']):
            print('mutation position is NA', n)
            continue
        mut_pos = int(i['Mutation Position'])
        wt_seq = i['WT Epitope Seq']
        mt_seq = i['MT Epitope Seq']
        hla_allele = i['HLA Allele']
        length = i['Peptide Length']
        mutation_type = i['Variant Type']
        try:
            filtered_anchor_dict_r4[hla_allele][length]
        except KeyError:
            # str() keeps the message from failing on a non-string allele (e.g. NaN).
            print("NO INFO for: "+str(hla_allele)+" "+str(length))
            continue
        at_anchor = int(if_anchor_matches_mutation_pos(mutation_type, wt_seq, mt_seq, mut_pos, hla_allele, filtered_anchor_dict_r4))
        # Only the fields read by the three decision functions are needed.
        condensed_row = {
            'Mutation Position': mut_pos,
            'Median WT Score': i['Median WT Score'],
            'Median MT Score': i['Median MT Score'],
            'Peptide Length': length,
            'Median Fold Change': i['Median Fold Change'],
            'Mutation at Anchor': at_anchor,
        }
        anchor_decision, _anchor_group = anchor_category_determine(condensed_row)
        naive_anchor, _naive_group = naive_anchor_category_determine(condensed_row)
        naive_anchor_decision.append(naive_anchor)
        anchor_based_decision.append(anchor_decision)
        og_decision.append(original_decision(condensed_row))

    counter_og = Counter(og_decision)
    counter_naive = Counter(naive_anchor_decision)
    counter_anchor = Counter(anchor_based_decision)

    percent_change_1_3 = _fraction_disagreeing(og_decision, anchor_based_decision)
    percent_change_2_3 = _fraction_disagreeing(naive_anchor_decision, anchor_based_decision)
    percent_change_1_2 = _fraction_disagreeing(og_decision, naive_anchor_decision)

    return percent_change_1_2, percent_change_2_3, percent_change_1_3, variant_count, candidate_count, all_candidate_count, counter_og, counter_naive, counter_anchor

## Question 1

In [None]:
# Run calculate_change on each of the stored random samples and collect one
# summary row per patient. Column 0 of random_samples is the sample ID and
# column 1 the tumor type.
random_samples = load_obj('random_selection_anchor_patient_impact_100_sub')
samples_root = "/Users/h.xia/Desktop/Griffith_Lab/R_shiny_visualization/anchor_analysis/random_100_samples/"
patient_percentage_counts = []
for n, i in random_samples.iterrows():
    sample = i[0]
    tumor_type = i[1]
    sample_dir = samples_root + sample
    patient_unique_data = pd.read_csv(sample_dir + "/filter_500b_topscore.tsv", sep="\t")
    patient_data = pd.read_csv(sample_dir + "/filter_500b.tsv", sep="\t")
    variant_data = pd.read_csv(sample_dir + "/TUMOR_NEW.tsv", sep="\t")
    (percent_12, percent_23, percent_13,
     v_count, c_count, all_c_count,
     counter_og, counter_naive, counter_anchor) = calculate_change(patient_data, variant_data, patient_unique_data)
    patient_percentage_counts.append([
        sample, tumor_type, v_count, c_count, all_c_count,
        counter_og, counter_naive, counter_anchor,
        percent_12, percent_23, percent_13,
        # Absolute differences: fraction times the top-score candidate count, truncated.
        int(c_count*percent_12), int(c_count*percent_23), int(c_count*percent_13),
    ])
summary_columns = [
    'TCGA Sample ID', 'Tumor Type', 'Number of Variants',
    'Number of Neoantigen Candidates (TopScore)', 'Number of Neoantigen Candidates',
    'Counter_og', 'Counter_Naive', 'Counter_Anchor',
    '% Difference between filter A and B', '% Difference between filter B and C',
    '% Difference between filter A and C', '# Difference between filter A and B',
    '# Difference between filter B and C', '# Difference between filter A and C',
]
patient_percentage_counts_df = pd.DataFrame.from_records(patient_percentage_counts, columns=summary_columns)


In [None]:
# Write the per-patient summary table as a tab-separated file.
patient_percentage_counts_df.to_csv(
    "Patient-Level impact analysis with tcga data.tsv",
    sep="\t",
)

In [None]:
## Number of variants vs number of neoantigen candidates
# Log-log scatter of unique variants against top-score candidates, coloured by
# tumor type.
f, ax = plt.subplots(figsize=(10,10))
ax.set(xscale="log", yscale="log")
g = sns.scatterplot(data=patient_percentage_counts_df, x='Number of Variants', y='Number of Neoantigen Candidates (TopScore)', hue='Tumor Type',ax=ax, s=200)
g.set(xlabel='Number of unique variants', ylabel='Number of strong binding neoantigen candidates')
# Only the upper limits are set: a lower limit of 0 is invalid on a log axis
# (matplotlib ignores it with a warning), so the lower bounds stay as
# autoscaled from the data.
plt.xlim(right=100000)
plt.ylim(top=100000)
#plt.savefig("Patient_level_analysis_100_variants_vs_candidates.pdf")
plt.show()

In [None]:
## Number of analysed samples per tumor type, category bar plot
fig, ax = plt.subplots(figsize=(10,10))
g = sns.countplot(x='Tumor Type', data=patient_percentage_counts_df)
g.set(ylabel='Counts', xlabel='Tumor Type')
plt.xticks(rotation=45)
#plt.savefig("Patient_level_analysis_sample_type_bar_plot.pdf")
plt.show()

In [None]:
# One histogram per filter pair and per measure ('#' = count, '%' = fraction)
# of the per-patient difference columns; each figure is also saved as a PDF.
filters = ['A and B', 'B and C', 'A and C']
choices = ['#', '%']
for filter_1 in filters:
    for choice in choices:
        difference_column = f"{choice} Difference between filter {filter_1}"
        f, ax = plt.subplots(figsize=(10,10))
        g = sns.histplot(patient_percentage_counts_df, x=difference_column)
        #g.set_xscale('log')
        plt.savefig(f"Patient_level_analysis_{difference_column}.pdf")
        plt.show()

In [None]:
# Shape of the sub-table of patients with no decision differing between filters A and B.
patient_percentage_counts_df.loc[
    patient_percentage_counts_df['# Difference between filter A and B'] == 0
].shape

In [None]:
# Repeat of the previous cell: shape of the sub-table of patients with no
# decision differing between filters A and B.
patient_percentage_counts_df.loc[
    patient_percentage_counts_df['# Difference between filter A and B'] == 0
].shape

In [None]:
# Median per-patient number of decisions that differ between filters A and B.
np.median(
    patient_percentage_counts_df['# Difference between filter A and B']
)

## Question 2


In [None]:
## Using less than 20 strong binding neoantigens as filter (after top score applied)
# Reduce each patient's summary to the number of candidates accepted by each
# filter, then keep patients with fewer than 20 top-score candidates.
info = [
    [
        row['Tumor Type'],
        row['Number of Variants'],
        row['Number of Neoantigen Candidates (TopScore)'],
        row['Counter_og']['Accept'],
        row['Counter_Naive']['Accept'],
        row['Counter_Anchor']['Accept'],
    ]
    for _, row in patient_percentage_counts_df.iterrows()
]
info_columns = [
    'Tumor Type', 'Number of Unique Variants', 'Number of Neoantigen Candidates (TopScore)',
    'Accepted by Filter A', 'Accepted by Filter B', 'Accepted by Filter C',
]
info_df = pd.DataFrame.from_records(info, columns=info_columns)
few_candidates_mask = info_df['Number of Neoantigen Candidates (TopScore)'] < 20
low_candidates_20 = info_df[few_candidates_mask]

In [None]:
# Among patients with fewer than 20 top-score candidates, count how many gain
# accepted candidates when moving from filter A to B or C, and from B to C.
increase_from_A_to_BC = 0
increase_from_B_to_C = 0
for n, i in low_candidates_20.iterrows():
    if (i['Accepted by Filter A'] < i['Accepted by Filter B']) or (i['Accepted by Filter A'] < i['Accepted by Filter C']):
        increase_from_A_to_BC += 1
    if (i['Accepted by Filter B'] < i['Accepted by Filter C']):
        increase_from_B_to_C += 1
low_candidate_patients = len(low_candidates_20)
print(low_candidate_patients, increase_from_A_to_BC, increase_from_B_to_C)
if low_candidate_patients:
    print(increase_from_A_to_BC/low_candidate_patients, increase_from_B_to_C/low_candidate_patients)
else:
    # Avoid ZeroDivisionError when no patient falls below the threshold.
    print("No patients with fewer than 20 candidates; proportions are undefined")

## Question 3

In [None]:
## Tumor type composition of the combined sample list, category bar plot
# round_1_list is given named columns in place so seaborn can group by tumor type.
round_1_list.columns = ['TCGA Sample ID', 'Tumor Type']
fig, ax = plt.subplots(figsize=(10,10))
g = sns.countplot(x='Tumor Type', data=round_1_list)
g.set(ylabel='Patient Sample Counts', xlabel='Tumor Type')
plt.xticks(rotation=45)
plt.savefig("Patient_level_analysis_all_1356_sample_type_bar_plot.pdf")
plt.show()