In [None]:
import numpy as np
import pandas as pd
import regex
import random
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import os
import matplotlib.style as style
import matplotlib.cm as mplcm
import matplotlib.colors as colors
import upsetplot
from collections import Counter
import csv

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` with the highest pickle protocol.

    The ``obj/`` directory is resolved against the current working directory
    and is not created here, so it must already exist.
    """
    target_path = 'obj/' + name + '.pkl'
    handle = open(target_path, 'wb')
    try:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
    finally:
        handle.close()

def load_obj(name):
    """Return the object unpickled from ``obj/<name>.pkl``.

    NOTE: unpickling can execute arbitrary code stored in the file, so only
    load pickles that this project wrote itself.
    """
    source_path = 'obj/' + name + '.pkl'
    handle = open(source_path, 'rb')
    try:
        restored = pickle.load(handle)
    finally:
        handle.close()
    return restored

def create_peptide_database(hla_alleles, all_epitopes_combined):
    """Group the unique mutant peptides of each HLA allele by peptide length.

    Parameters
    ----------
    hla_alleles : iterable of str
        Allele names to index. Every name must appear only once.
    all_epitopes_combined : pandas.DataFrame
        Epitope table containing at least the ``'HLA Allele'`` and
        ``'MT Epitope Seq'`` columns.

    Returns
    -------
    dict
        ``{allele: {peptide_length: [peptide, ...]}}``. Each peptide list is
        de-duplicated and sorted so the result is identical from run to run.
        An allele with no rows in the table maps to an empty dict.

    Raises
    ------
    ValueError
        If an allele occurs more than once in ``hla_alleles``. (Previously an
        error string was returned in place of the dict, which only failed
        later and less clearly in the caller.)
    """
    peptide_database = {}
    for allele in hla_alleles:
        if allele in peptide_database:
            raise ValueError("HLA_allele input not unique")
        allele_rows = all_epitopes_combined[all_epitopes_combined['HLA Allele'] == allele]
        unique_by_length = {}
        for peptide in allele_rows['MT Epitope Seq']:
            unique_by_length.setdefault(len(peptide), set()).add(peptide)
        # Sorting removes the run-to-run ordering differences of list(set(...)).
        peptide_database[allele] = {
            length: sorted(peptides) for length, peptides in unique_by_length.items()
        }
    return peptide_database
def check_data(data, database, insufficient, prefix='', start=1):
    """List the alleles of each patient that are missing from, or under-represented in, ``database``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per patient. The first column holds the patient ID and the
        columns from position ``start`` onwards hold allele names (strings);
        blank cells are ignored.
    database : mapping
        Peptide database keyed by full allele name (e.g. ``'HLA-A*02:01'``).
    insufficient : container of str
        Full allele names considered to have too little data.
    prefix : str, optional
        Text prepended to an allele before lookup unless the allele already
        starts with it. When empty, ``'HLA-'`` is prepended to any allele whose
        text before the first ``'-'`` is not ``'HLA'``.
    start : int, optional
        Position of the first allele column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Patient ID``, ``HLA_Alleles`` (all allele cells of the row
        joined with commas, blanks included), ``Adding_allele`` (the allele as
        written in ``data``) and ``Reason`` (``'missing'`` or
        ``'insufficient'``). Empty, with those columns, when nothing qualifies.
    """
    data = data.fillna('')
    columns = ["Patient ID", "HLA_Alleles", "Adding_allele", "Reason"]
    records = []
    for _, row in data.iterrows():
        # Positional access: the frames passed in have either integer or named columns.
        patient_id = row.iloc[0]
        alleles = list(row.iloc[start:])
        allele_string = ','.join(alleles)
        for allele in alleles:
            if allele == '':
                continue
            # Decide the prefix per allele. The argument used to be overwritten
            # with 'HLA-' after the first bare allele, which turned every later
            # already-prefixed allele into 'HLA-HLA-...'.
            if prefix:
                allele_prefix = '' if allele.startswith(prefix) else prefix
            elif allele.split('-')[0] != 'HLA':
                allele_prefix = 'HLA-'
            else:
                allele_prefix = ''
            full_name = allele_prefix + allele
            if full_name not in database:
                records.append({
                    "Patient ID": patient_id,
                    "HLA_Alleles": allele_string,
                    "Adding_allele": allele,
                    "Reason": "missing",
                })
            if full_name in insufficient:
                records.append({
                    "Patient ID": patient_id,
                    "HLA_Alleles": allele_string,
                    "Adding_allele": allele,
                    "Reason": "insufficient",
                })
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

## Generating the Combined Peptide Database

In [None]:
## MedImmune Samples: one tab-separated epitope table per sample
_medimmune_tables = [
    "round_1_datasets/H_MT-10109-004/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-005/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-007/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-008/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-009/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-010/TUMOR.all_epitopes.tsv",
    "round_1_datasets/H_MT-10109-011/TUMOR.all_epitopes.tsv",
]
(all_epitopes_004, all_epitopes_005, all_epitopes_007, all_epitopes_008,
 all_epitopes_009, all_epitopes_010, all_epitopes_011) = [
    pd.read_csv(table_path, sep='\t') for table_path in _medimmune_tables
]

## UCLA samples: file names differ between sample directories
_ucla_tables = [
    "round_1_datasets/UCLA_0461/combined_epitopes.txt",
    "round_1_datasets/UCLA_0465/combined_epitopes.tsv",
    "round_1_datasets/UCLA_0476/0476_combined_all_epitopes.tsv",
    "round_1_datasets/UCLA_0482_1/combined_epitopes.tsv",
    "round_1_datasets/UCLA_0482_2/TUMOR.all_epitopes.tsv",
    "round_1_datasets/UCLA_0530/TUMOR.all_epitopes.tsv",
]
(all_epitopes_0461, all_epitopes_0465, all_epitopes_0476,
 all_epitopes_0482_1, all_epitopes_0482_2, all_epitopes_0530) = [
    pd.read_csv(table_path, sep='\t') for table_path in _ucla_tables
]

In [None]:
## Keep only the rows of each UCLA table whose 'Median MT Score' is at most 500.
all_epitopes_0461_filtered = all_epitopes_0461[all_epitopes_0461['Median MT Score']<=500]
all_epitopes_0465_filtered = all_epitopes_0465[all_epitopes_0465['Median MT Score']<=500]
all_epitopes_0476_filtered = all_epitopes_0476[all_epitopes_0476['Median MT Score']<=500]
all_epitopes_0482_1_filtered = all_epitopes_0482_1[all_epitopes_0482_1['Median MT Score']<=500]
all_epitopes_0482_2_filtered = all_epitopes_0482_2[all_epitopes_0482_2['Median MT Score']<=500]
all_epitopes_0530_filtered = all_epitopes_0530[all_epitopes_0530['Median MT Score']<=500]
## Stack the filtered tables. DataFrame.append was removed in pandas 2.0;
## pd.concat with the same options produces the same combined table.
ucla_samples_filtered_epitopes = pd.concat(
    [all_epitopes_0461_filtered, all_epitopes_0465_filtered, all_epitopes_0476_filtered,
     all_epitopes_0482_1_filtered, all_epitopes_0482_2_filtered, all_epitopes_0530_filtered],
    ignore_index=True, sort=True)

In [None]:
## Row/column counts of each filtered UCLA table, then of the combined table
for _filtered_table in (
    all_epitopes_0461_filtered,
    all_epitopes_0476_filtered,
    all_epitopes_0465_filtered,
    all_epitopes_0482_1_filtered,
    all_epitopes_0482_2_filtered,
    all_epitopes_0530_filtered,
    ucla_samples_filtered_epitopes,
):
    print(_filtered_table.shape)

In [None]:
## Keep only the rows of each MedImmune table whose 'Median MT Score' is at most 500.
all_epitopes_004_filtered = all_epitopes_004[all_epitopes_004['Median MT Score']<=500]
all_epitopes_005_filtered = all_epitopes_005[all_epitopes_005['Median MT Score']<=500]
all_epitopes_007_filtered = all_epitopes_007[all_epitopes_007['Median MT Score']<=500]
all_epitopes_008_filtered = all_epitopes_008[all_epitopes_008['Median MT Score']<=500]
all_epitopes_009_filtered = all_epitopes_009[all_epitopes_009['Median MT Score']<=500]
all_epitopes_010_filtered = all_epitopes_010[all_epitopes_010['Median MT Score']<=500]
all_epitopes_011_filtered = all_epitopes_011[all_epitopes_011['Median MT Score']<=500]
## Stack the filtered tables. DataFrame.append was removed in pandas 2.0;
## pd.concat with the same options produces the same combined table.
clinical_samples_filtered_epitopes = pd.concat(
    [all_epitopes_004_filtered, all_epitopes_005_filtered, all_epitopes_007_filtered,
     all_epitopes_008_filtered, all_epitopes_009_filtered, all_epitopes_010_filtered,
     all_epitopes_011_filtered],
    ignore_index=True, sort=True)

In [None]:
## Row/column counts of each filtered MedImmune table (three per line), then of the combined table
print(*(table.shape for table in (all_epitopes_004_filtered, all_epitopes_005_filtered, all_epitopes_007_filtered)))
print(*(table.shape for table in (all_epitopes_008_filtered, all_epitopes_009_filtered, all_epitopes_010_filtered)))
print(all_epitopes_011_filtered.shape)
print(clinical_samples_filtered_epitopes.shape)

In [None]:
## Round 1 table: the TCGA-300 epitopes plus the filtered clinical and UCLA epitopes.
filtered_epitopes_TCGA_300 = pd.read_table("Inputs/TCGA_300_filter1a_epitopes_combined.txt", delimiter='\t')
## DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same order.
filtered_epitopes_combined_round_1 = pd.concat(
    [filtered_epitopes_TCGA_300, clinical_samples_filtered_epitopes, ucla_samples_filtered_epitopes],
    ignore_index=True, sort=True)
unique_hla_alleles_round_1 = filtered_epitopes_combined_round_1['HLA Allele'].unique()
print(len(unique_hla_alleles_round_1))
## Index the unique peptides per allele and peptide length
combined_peptide_database_round_1 = create_peptide_database(unique_hla_alleles_round_1, filtered_epitopes_combined_round_1)
print(filtered_epitopes_combined_round_1.shape, filtered_epitopes_TCGA_300.shape)

In [None]:
print('Total number of HLA-alleles:'+str(len(combined_peptide_database_round_1.keys())))
## Count the (allele, peptide length) entries that hold fewer than 10 unique peptides
count = sum(
    1
    for peptides_by_length in combined_peptide_database_round_1.values()
    for peptides in peptides_by_length.values()
    if len(peptides) < 10
)
print('Insufficient peptide length combos:', count)
print('NOTE: these combos are only counting the ones below 10 but will not count anything that is 0, which has no entry')

## Analyzing potential additional datasets

In [None]:
## Lymphoma Data (tab-separated, with a header row)
lymphoma_data = pd.read_csv('./additional_datasets/FL_Neoepitope_final_HLA_typing_sorted.tsv', sep='\t')
## GBM data: no header row; fields are split on either a tab or a comma,
## which needs the python parsing engine because the separator is a regex
gbm_data_1, gbm_data_2 = [
    pd.read_csv(typing_path, sep='\t|,', header=None, engine='python')
    for typing_path in (
        './additional_datasets/twck_classI_hlatypes.tsv',
        './additional_datasets/htc_classI_hlatypes.tsv',
    )
]
print(lymphoma_data.shape, gbm_data_1.shape, gbm_data_2.shape)

In [None]:
## Alleles with 30 or fewer unique peptides (or none at all) for any of the lengths 8-11.
## A missing length is treated as an empty list; the previous bare `except` fell
## through to a comparison against the `length` left over from the prior iteration
## (or raised NameError when the very first lookup failed).
insufficient_data_round_1 = set()
count = 0  # kept for parity with the original cell; not used below
for allele, peptides_by_length in combined_peptide_database_round_1.items():
    for peptide_length in [8, 9, 10, 11]:
        if len(peptides_by_length.get(peptide_length, [])) <= 30:
            insufficient_data_round_1.add(allele)
print('Round 1: HLA alleles that have insuffcient data in any length: ', len(insufficient_data_round_1))

In [None]:
## Check both GBM typing tables against the round 1 database
gbm_patient_1, gbm_patient_2 = [
    check_data(gbm_table, combined_peptide_database_round_1, insufficient_data_round_1)
    for gbm_table in (gbm_data_1, gbm_data_2)
]
## Patients from either table that contribute a missing or insufficient allele
gbm_patient_ids = set(gbm_patient_1['Patient ID']).union(gbm_patient_2['Patient ID'])

In [None]:
## Lymphoma alleles are looked up with an explicit 'HLA-' prefix
lymphoma_patient = check_data(
    lymphoma_data,
    combined_peptide_database_round_1,
    insufficient_data_round_1,
    prefix='HLA-',
)
lymphoma_patient_ids = set(lymphoma_patient['Patient ID'])

In [None]:
## Number of contributing GBM and lymphoma patients
print(*map(len, (gbm_patient_ids, lymphoma_patient_ids)))

In [None]:
## One output row per lymphoma patient: the ID and its 'HLA-'-prefixed allele list.
## lymphoma_patient has one row per flagged allele, so only the first row of each patient is used.
lymphoma_hla = []
patients_analyzed = []

for _, flagged_row in lymphoma_patient.iterrows():
    patient = flagged_row['Patient ID']
    if patient not in patients_analyzed:
        patients_analyzed.append(patient)
        prefixed_alleles = ['HLA-' + allele for allele in flagged_row['HLA_Alleles'].split(',')]
        lymphoma_hla.append([patient, ','.join(prefixed_alleles)])

with open("lymphoma_58_patients_hla.tsv", "w", newline="") as f:
    csv.writer(f, delimiter='\t').writerows(lymphoma_hla)

In [None]:
## TCGA data: allele columns begin at position 2
tcga_data = pd.read_csv('./additional_datasets/hlaTypesAll.tsv', sep='\t')
tcga_patient = check_data(
    tcga_data,
    combined_peptide_database_round_1,
    insufficient_data_round_1,
    prefix='',
    start=2,
)
tcga_patient_ids = set(tcga_patient['Patient ID'])
print(len(tcga_patient_ids))

In [None]:
## Collect the fully prefixed alleles flagged in each dataset, printing the
## running unique counts after every dataset
missing_allele = []
insufficient_allele = []
for flagged_table in [gbm_patient_1, gbm_patient_2, lymphoma_patient, tcga_patient]:
    for reason, allele in zip(flagged_table['Reason'], flagged_table['Adding_allele']):
        full_name = allele if allele.split('-')[0] == 'HLA' else 'HLA-'+allele
        if reason == 'missing':
            missing_allele.append(full_name)
        elif reason == 'insufficient':
            insufficient_allele.append(full_name)
    print('Insufficient Count: '+str(len(set(insufficient_allele))))
    print('Missing Count: '+str(len(set(missing_allele))))

print(len(insufficient_data_round_1), len(set(missing_allele)), len(set(insufficient_allele)))


## Adding Lymphoma & GBM dataset to current database

In [None]:
## Optional: restore the round 1 table from its pickle instead of rebuilding it
#filtered_epitopes_combined_round_1 = load_obj('filtered_epitopes_combined_round_1')
print(filtered_epitopes_combined_round_1.shape)
## These two tables are added as read, with no score filter applied here
## NOTE(review): the file names suggest they were already filtered at 500 - confirm
all_epitopes_cody = pd.read_csv("additional_datasets/filter_500bind_cody_54_combined.txt", sep='\t')
all_epitopes_megan = pd.read_csv("additional_datasets/filter_500bind_megan_20_combined.txt", sep='\t')
all_epitopes_FLNA_1 = pd.read_csv("additional_datasets/FLNA-01.all_epitopes.tsv", sep='\t')
all_epitopes_FLNA_2 = pd.read_csv("additional_datasets/FLNA-02.all_epitopes.tsv", sep='\t')
all_epitopes_FLNA_3 = pd.read_csv("additional_datasets/FLNA-03.all_epitopes.tsv", sep='\t')
all_epitopes_FLNA_4 = pd.read_csv("additional_datasets/FLNA-04.all_epitopes.tsv", sep='\t')
## Keep only the FLNA rows whose 'Median MT Score' is at most 500
all_epitopes_FLNA_1_filtered = all_epitopes_FLNA_1[all_epitopes_FLNA_1['Median MT Score']<=500]
all_epitopes_FLNA_2_filtered = all_epitopes_FLNA_2[all_epitopes_FLNA_2['Median MT Score']<=500]
all_epitopes_FLNA_3_filtered = all_epitopes_FLNA_3[all_epitopes_FLNA_3['Median MT Score']<=500]
all_epitopes_FLNA_4_filtered = all_epitopes_FLNA_4[all_epitopes_FLNA_4['Median MT Score']<=500]
## DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same order
filtered_epitopes_combined_round_2 = pd.concat(
    [filtered_epitopes_combined_round_1, all_epitopes_cody, all_epitopes_megan,
     all_epitopes_FLNA_1_filtered, all_epitopes_FLNA_2_filtered,
     all_epitopes_FLNA_3_filtered, all_epitopes_FLNA_4_filtered],
    ignore_index=True, sort=True)

print(filtered_epitopes_combined_round_2.shape)

In [None]:
## Rebuild the per-allele peptide index from the round 2 table
unique_hla_alleles_round_2 = filtered_epitopes_combined_round_2['HLA Allele'].unique()
print(len(unique_hla_alleles_round_2))
combined_peptide_database_round_2 = create_peptide_database(
    unique_hla_alleles_round_2,
    filtered_epitopes_combined_round_2,
)
print('Total number of HLA-alleles:'+str(len(combined_peptide_database_round_2.keys())))

In [None]:
## Number of (allele, peptide length) entries with fewer than 10 unique peptides
count = 0
for peptides_by_length in combined_peptide_database_round_2.values():
    for peptides in peptides_by_length.values():
        if len(peptides) < 10:
            count += 1
print(count)
## Alleles with 30 or fewer unique peptides (or none at all) for any of the lengths 8-11.
## A missing length is treated as an empty list; the previous bare `except` fell
## through to a comparison against the `length` left over from the prior iteration.
insufficient_data_round_2 = set()
count = 0  # kept for parity with the original cell; not used below
for allele, peptides_by_length in combined_peptide_database_round_2.items():
    for peptide_length in [8, 9, 10, 11]:
        if len(peptides_by_length.get(peptide_length, [])) <= 30:
            insufficient_data_round_2.add(allele)
print(len(insufficient_data_round_2))

## TCGA DATA ROUND 1 NO UCEC

In [None]:
## TCGA data checked against the round 2 database
tcga_data = pd.read_csv('./additional_datasets/hlaTypesAll.tsv', sep='\t')
tcga_patient = check_data(
    tcga_data,
    combined_peptide_database_round_2,
    insufficient_data_round_2,
    prefix='',
    start=2,
)
tcga_patient_ids = set(tcga_patient['Patient ID'])
print(len(tcga_patient_ids))

In [None]:
## Looking at what we would gain with additional TCGA samples added
missing_allele = []
insufficient_allele = []
for flagged_table in [tcga_patient]:
    for reason, allele in zip(flagged_table['Reason'], flagged_table['Adding_allele']):
        full_name = allele if allele.split('-')[0] == 'HLA' else 'HLA-'+allele
        if reason == 'missing':
            missing_allele.append(full_name)
        elif reason == 'insufficient':
            insufficient_allele.append(full_name)
    print('Insufficient Count: '+str(len(set(insufficient_allele))))
    print('Missing Count: '+str(len(set(missing_allele))))

print(len(insufficient_data_round_2), len(set(missing_allele)), len(set(insufficient_allele)))
## Insufficient alleles that no TCGA patient carries
print(insufficient_data_round_2.difference(insufficient_allele))
total_hla_alleles_tcga_round_2 = set(missing_allele).union(insufficient_allele)
print(len(total_hla_alleles_tcga_round_2))

In [None]:
## Build a dictionary specifying which alleles each sample is adding to our combined dataset:
## patient ID -> {'total': [...], 'missing': [...], 'insufficient': [...]}
tcga_round_3_dict = {}
for _, flagged_row in tcga_patient.iterrows():
    buckets = tcga_round_3_dict.setdefault(
        flagged_row['Patient ID'],
        {'total': [], 'missing': [], 'insufficient': []},
    )
    reason = flagged_row['Reason']
    if reason in ('insufficient', 'missing'):
        buckets[reason].append(flagged_row['Adding_allele'])
        buckets['total'].append(flagged_row['Adding_allele'])
## Uniqify in case one patient has the same HLA allele 2 times
for buckets in tcga_round_3_dict.values():
    for bucket_name, alleles in buckets.items():
        unique_alleles = set(alleles)
        if len(unique_alleles) != len(alleles):
            buckets[bucket_name] = list(unique_alleles)


In [None]:
## For each patient, count the missing / insufficient / total alleles and sort so
## that patients adding the most missing (then insufficient) alleles come first.
## The rows are collected first and the frame is built once: DataFrame.append was
## removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
tcga_round_3_counts = pd.DataFrame(
    [
        {
            'Patient ID': patient,
            'Missing Count': len(buckets['missing']),
            'Insuff Count': len(buckets['insufficient']),
            'Total Count': len(buckets['total']),
        }
        for patient, buckets in tcga_round_3_dict.items()
    ],
    columns=['Patient ID', 'Missing Count', 'Insuff Count', 'Total Count'],
)
tcga_round_3_counts = tcga_round_3_counts.sort_values(by=['Missing Count', 'Insuff Count'], ascending=False)

In [None]:
## Converting TCGA sample names
kelsy_tcga_data_samples = pd.read_csv('./additional_datasets/tcgasamplesons3.tsv', sep='\t', header=None)
## Loading TCGA MetaData in order to know which directories from AWS to pull from
kelsy_tcga_metadata = pd.read_csv('./additional_datasets/TCGAsamples_metadata.tsv',  sep='\t', header=None)

## Creating a newly formatted dataframe with both shortened and original tcga sample name and cohort
reformat = []
original = []
full_datatype = []
datatype = []
cohort = []
for _, sample_row in kelsy_tcga_data_samples.iterrows():
    sample_id = sample_row[0]
    # Look the sample up once; the first matching metadata row supplies both fields.
    sample_metadata = kelsy_tcga_metadata.loc[kelsy_tcga_metadata[0] == sample_id]
    type_of_data = sample_metadata[2].values[0]
    full_datatype.append(type_of_data)
    try:
        # Third space-separated word of the metadata description
        datatype.append(type_of_data.split(' ')[2])
    except (AttributeError, IndexError):
        # Description is not a string (e.g. NaN) or has fewer than three words
        datatype.append('N/A')
    # Shortened name: the first three dash-separated fields of the sample ID
    reformat.append('-'.join(sample_id.split('-')[:3]))
    original.append(sample_id)
    cohort.append(sample_metadata[1].values[0])
kelsy_tcga_data_reformat = pd.DataFrame({
    'reformat': reformat,
    'original': original,
    'full_datatype': full_datatype,
    'datatype': datatype,
    'cohort': cohort,
})

In [None]:
## exclude all normal samples (rows whose datatype is 'N/A' are kept)
_not_normal = kelsy_tcga_data_reformat['datatype'] != 'Normal'
kelsy_tcga_data_reformat_only_tumor = kelsy_tcga_data_reformat[_not_normal]
kelsy_tcga_data_reformat.shape, kelsy_tcga_data_reformat_only_tumor.shape

In [None]:
## total HLA alleles that could be covered by round 2 tcga data: total_hla_alleles_tcga_round_2
def check_if_necessary(covered, adding, reason, limit1=4, limit2=4):
    """Return the alleles in ``adding`` that are still below their coverage limit.

    Parameters
    ----------
    covered : list
        Alleles contributed by the samples selected so far, one entry per
        contributing sample (repeats are what is being counted).
    adding : iterable
        Alleles the candidate sample would contribute.
    reason : str
        ``'insufficient'`` compares against ``limit1``; ``'missing'`` compares
        against ``limit2``. Any other value yields an empty list.
    limit1, limit2 : int, optional
        Number of occurrences in ``covered`` at which an allele stops being needed.

    Returns
    -------
    list
        Alleles from ``adding``, in order, that occur fewer than the applicable
        limit times in ``covered``.
    """
    if reason == 'missing':
        limit = limit2
    elif reason == 'insufficient':
        limit = limit1
    else:
        return []
    return [allele for allele in adding if covered.count(allele) < limit]

## Greedy selection: walk the patients from most to fewest missing alleles and keep
## a patient only while at least one of its alleles is still under the coverage limit.
covered_hla_alleles = []
ids_included = []
ids_excluded = []
full_ids_included = []
full_ids_excluded = []
# Built once instead of on every iteration
tumor_patient_ids = set(kelsy_tcga_data_reformat_only_tumor['reformat'])
for _, counts_row in tcga_round_3_counts.iterrows():
    patient_id = counts_row['Patient ID']
    if patient_id not in tumor_patient_ids:
        continue
    # All tumor samples of this patient; the first one supplies cohort and full ID
    patient_samples = kelsy_tcga_data_reformat_only_tumor.loc[kelsy_tcga_data_reformat_only_tumor['reformat'] == patient_id]
    cancer_type = list(patient_samples['cohort'])[0]
    if cancer_type == 'UCEC':
        # UCEC patients are left out of this round entirely (neither included nor excluded)
        continue
    full_id = patient_samples['original']
    hla_alleles = tcga_round_3_dict[patient_id]
    total = hla_alleles['total']
    insuff = hla_alleles['insufficient']
    missing = hla_alleles['missing']
    still_needed = (check_if_necessary(covered_hla_alleles, insuff, 'insufficient')
                    + check_if_necessary(covered_hla_alleles, missing, 'missing'))
    if len(still_needed) != 0:
        # Every allele of an accepted patient counts towards coverage
        covered_hla_alleles = covered_hla_alleles + total
        full_ids_included.append(list(full_id)[0])
        ids_included.append(patient_id)
    else:
        ids_excluded.append(patient_id)
        full_ids_excluded.append(list(full_id)[0])
print(len(ids_included), len(ids_excluded), len(ids_included)+len(ids_excluded), len(tcga_round_3_counts['Patient ID']))

In [None]:
## confirm what I would gain with subset of additional TCGA samples added
tcga_subset = tcga_patient[tcga_patient['Patient ID'].isin(ids_included)]
missing_allele_tcga = []
insufficient_allele_tcga = []
for flagged_table in [tcga_subset]:
    for reason, allele in zip(flagged_table['Reason'], flagged_table['Adding_allele']):
        full_name = allele if allele.split('-')[0] == 'HLA' else 'HLA-'+allele
        if reason == 'missing':
            missing_allele_tcga.append(full_name)
        elif reason == 'insufficient':
            insufficient_allele_tcga.append(full_name)
    print('Insufficient Count: '+str(len(set(insufficient_allele_tcga))))
    print('Missing Count: '+str(len(set(missing_allele_tcga))))

print(len(insufficient_data_round_2), len(set(missing_allele_tcga)), len(set(insufficient_allele_tcga)))
## Insufficient alleles that the selected subset does not cover
print(insufficient_data_round_2.difference(insufficient_allele_tcga))
total_hla_alleles_tcga_round_2 = set(missing_allele_tcga).union(insufficient_allele_tcga)
print(len(total_hla_alleles_tcga_round_2))
print('Still missing samples for these alleles:', set(missing_allele).difference(missing_allele_tcga))

In [None]:
## Write the selected samples (full sample ID and cohort) as a headerless two-column TSV
_is_selected = kelsy_tcga_data_reformat_only_tumor['original'].isin(full_ids_included)
full_ids_and_cohort = kelsy_tcga_data_reformat_only_tumor[_is_selected]
full_ids_and_cohort.to_csv(
    "TCGA_samples_round1_noUCEC.tsv",
    sep="\t",
    columns=['original', 'cohort'],
    index=None,
    header=False,
)

In [None]:
### Creating the HLA datafile for tcga round 1 data
tcga_round_1_hla = []
patients_analyzed = []

for _, sample in full_ids_and_cohort.iterrows():
    barcode = sample['reformat']
    patients_analyzed.append(barcode)
    ## First typing row for this patient; the values at positions 2-7 are joined as the allele list
    hla_data = tcga_data.loc[tcga_data['patientBarcode'] == barcode].iloc[0]
    typed_alleles = list(hla_data)[2:8]
    hla_string = ",".join(typed_alleles)
    tcga_round_1_hla.append([sample['original'], hla_string])

with open("tcga_round_1_483_patients_hla.tsv", "w", newline="") as f:
    csv.writer(f, delimiter='\t').writerows(tcga_round_1_hla)

## Adding Round 1 TCGA data

In [None]:
## Round 3 table: the round 2 table plus the TCGA round 1 epitopes
print(filtered_epitopes_combined_round_2.shape)
all_epitopes_tcga_r1 = pd.read_csv("./filter_b500_tcga_round1_combined.txt", sep='\t')

## DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same order
filtered_epitopes_combined_round_3 = pd.concat(
    [filtered_epitopes_combined_round_2, all_epitopes_tcga_r1],
    ignore_index=True, sort=True)

print(filtered_epitopes_combined_round_3.shape)

In [None]:
## Rebuild the per-allele peptide index from the round 3 table
unique_hla_alleles_round_3 = filtered_epitopes_combined_round_3['HLA Allele'].unique()
print(len(unique_hla_alleles_round_3))
combined_peptide_database_round_3 = create_peptide_database(
    unique_hla_alleles_round_3,
    filtered_epitopes_combined_round_3,
)
print('Total number of HLA-alleles:'+str(len(combined_peptide_database_round_3.keys())))

In [None]:
## Number of (allele, peptide length) entries with fewer than 10 unique peptides
count = 0
for peptides_by_length in combined_peptide_database_round_3.values():
    for peptides in peptides_by_length.values():
        if len(peptides) < 10:
            count += 1
print(count)
## Alleles with 30 or fewer unique peptides (or none at all) for any of the lengths 8-11.
## A missing length is treated as an empty list; the previous bare `except` fell
## through to a comparison against the `length` left over from the prior iteration.
insufficient_data_round_3 = set()
count = 0  # kept for parity with the original cell; not used below
for allele, peptides_by_length in combined_peptide_database_round_3.items():
    for peptide_length in [8, 9, 10, 11]:
        if len(peptides_by_length.get(peptide_length, [])) <= 30:
            insufficient_data_round_3.add(allele)
print(len(insufficient_data_round_3))

## Determining samples for Round 2 TCGA data

In [None]:
## TCGA data checked against the round 3 database
tcga_data = pd.read_csv('./additional_datasets/hlaTypesAll.tsv', sep='\t')
tcga_patient_r2 = check_data(
    tcga_data,
    combined_peptide_database_round_3,
    insufficient_data_round_3,
    prefix='',
    start=2,
)
tcga_patient_ids_r2 = set(tcga_patient_r2['Patient ID'])
print(len(tcga_patient_ids_r2))

In [None]:
## Build a dictionary specifying which alleles each sample is adding to our combined dataset:
## patient ID -> {'total': [...], 'missing': [...], 'insufficient': [...]}
tcga_round_4_dict = {}
for _, flagged_row in tcga_patient_r2.iterrows():
    buckets = tcga_round_4_dict.setdefault(
        flagged_row['Patient ID'],
        {'total': [], 'missing': [], 'insufficient': []},
    )
    reason = flagged_row['Reason']
    if reason in ('insufficient', 'missing'):
        buckets[reason].append(flagged_row['Adding_allele'])
        buckets['total'].append(flagged_row['Adding_allele'])
## Uniqify in case one patient has the same HLA allele 2 times
for buckets in tcga_round_4_dict.values():
    for bucket_name, alleles in buckets.items():
        unique_alleles = set(alleles)
        if len(unique_alleles) != len(alleles):
            buckets[bucket_name] = list(unique_alleles)

## For each patient, count the missing / insufficient / total alleles and sort so
## that patients adding the most missing (then insufficient) alleles come first.
## The rows are collected first and the frame is built once: DataFrame.append was
## removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
tcga_round_4_counts = pd.DataFrame(
    [
        {
            'Patient ID': patient,
            'Missing Count': len(buckets['missing']),
            'Insuff Count': len(buckets['insufficient']),
            'Total Count': len(buckets['total']),
        }
        for patient, buckets in tcga_round_4_dict.items()
    ],
    columns=['Patient ID', 'Missing Count', 'Insuff Count', 'Total Count'],
)
tcga_round_4_counts = tcga_round_4_counts.sort_values(by=['Missing Count', 'Insuff Count'], ascending=False)


In [None]:
## Converting Kelsy's sample names
kelsy_tcga_data_samples = pd.read_csv('./additional_datasets/tcgasamplesons3.tsv', sep='\t', header=None)
## Loading TCGA MetaData in order to know which directories from AWS to pull from
kelsy_tcga_metadata = pd.read_csv('./additional_datasets/TCGAsamples_metadata.tsv',  sep='\t', header=None)

## Creating a newly formatted dataframe with both shortened and original tcga sample name and cohort
reformat = []
original = []
full_datatype = []
datatype = []
cohort = []
for _, sample_row in kelsy_tcga_data_samples.iterrows():
    sample_id = sample_row[0]
    # Look the sample up once; the first matching metadata row supplies both fields.
    sample_metadata = kelsy_tcga_metadata.loc[kelsy_tcga_metadata[0] == sample_id]
    type_of_data = sample_metadata[2].values[0]
    full_datatype.append(type_of_data)
    try:
        # Third space-separated word of the metadata description
        datatype.append(type_of_data.split(' ')[2])
    except (AttributeError, IndexError):
        # Description is not a string (e.g. NaN) or has fewer than three words
        datatype.append('N/A')
    # Shortened name: the first three dash-separated fields of the sample ID
    reformat.append('-'.join(sample_id.split('-')[:3]))
    original.append(sample_id)
    cohort.append(sample_metadata[1].values[0])
kelsy_tcga_data_reformat = pd.DataFrame({
    'reformat': reformat,
    'original': original,
    'full_datatype': full_datatype,
    'datatype': datatype,
    'cohort': cohort,
})

## exclude all normal samples (rows whose datatype is 'N/A' are kept)
kelsy_tcga_data_reformat_only_tumor = kelsy_tcga_data_reformat[kelsy_tcga_data_reformat['datatype'] != 'Normal']
kelsy_tcga_data_reformat.shape, kelsy_tcga_data_reformat_only_tumor.shape

## for round 4 overall (round 2 with TCGA data) the coverage limits are raised
def check_if_necessary(covered, adding, reason, limit1=20, limit2=20):
    """Return the alleles in ``adding`` that are still below their coverage limit.

    This redefines the function of the same name from an earlier cell with
    higher default limits.

    NOTE(review): the comment that used to sit above this definition mentioned
    limits of 10 (insufficient) and 15 (missing); the defaults in effect are
    20 for both - confirm which is intended.

    Parameters
    ----------
    covered : list
        Alleles contributed by the samples selected so far, one entry per
        contributing sample (repeats are what is being counted).
    adding : iterable
        Alleles the candidate sample would contribute.
    reason : str
        ``'insufficient'`` compares against ``limit1``; ``'missing'`` compares
        against ``limit2``. Any other value yields an empty list.
    limit1, limit2 : int, optional
        Number of occurrences in ``covered`` at which an allele stops being needed.

    Returns
    -------
    list
        Alleles from ``adding``, in order, that occur fewer than the applicable
        limit times in ``covered``.
    """
    if reason == 'missing':
        limit = limit2
    elif reason == 'insufficient':
        limit = limit1
    else:
        return []
    return [allele for allele in adding if covered.count(allele) < limit]

## Greedy selection for TCGA round 2: walk the patients from most to fewest missing
## alleles, skip those already chosen in round 1, and keep a patient only while at
## least one of its alleles is still under the coverage limit.
covered_hla_alleles = []
ids_included = []
ids_excluded = []
full_ids_included = []
full_ids_excluded = []
UCEC_ids_to_include = []

round_1_samples = pd.read_csv('TCGA_samples_round1_noUCEC.tsv', sep='\t', header=None)
## Patient-level ID of each round 1 sample: the full ID without its last dash-separated field
## NOTE(review): elsewhere the patient ID is the first three fields - confirm both agree for these IDs
r1_sample_ids = ["-".join(i.split('-')[:-1]) for i in list(set(round_1_samples[0]))]

# Built once instead of on every iteration
tumor_patient_ids = set(kelsy_tcga_data_reformat_only_tumor['reformat'])
round_1_patient_ids = set(r1_sample_ids)
for _, counts_row in tcga_round_4_counts.iterrows():
    patient_id = counts_row['Patient ID']
    if patient_id not in tumor_patient_ids:
        continue
    if patient_id in round_1_patient_ids:
        continue
    # All tumor samples of this patient; the first one supplies cohort and full ID
    patient_samples = kelsy_tcga_data_reformat_only_tumor.loc[kelsy_tcga_data_reformat_only_tumor['reformat'] == patient_id]
    cancer_type = list(patient_samples['cohort'])[0]
    full_id = patient_samples['original']
    hla_alleles = tcga_round_4_dict[patient_id]
    total = hla_alleles['total']
    insuff = hla_alleles['insufficient']
    missing = hla_alleles['missing']
    still_needed = (check_if_necessary(covered_hla_alleles, insuff, 'insufficient')
                    + check_if_necessary(covered_hla_alleles, missing, 'missing'))
    if len(still_needed) != 0:
        # Every allele of an accepted patient counts towards coverage
        covered_hla_alleles = covered_hla_alleles + total
        full_ids_included.append(list(full_id)[0])
        ids_included.append(patient_id)
    else:
        ids_excluded.append(patient_id)
        full_ids_excluded.append(list(full_id)[0])
print(len(ids_included), len(ids_excluded), len(ids_included)+len(ids_excluded), len(tcga_round_4_counts['Patient ID']))

In [None]:
## confirm what I would gain with subset of additional TCGA samples added
tcga_subset_r2 = tcga_patient_r2[tcga_patient_r2['Patient ID'].isin(ids_included)]
missing_allele_tcga_r2 = []
insufficient_allele_tcga_r2 = []
for flagged_table in [tcga_subset_r2]:
    for reason, allele in zip(flagged_table['Reason'], flagged_table['Adding_allele']):
        full_name = allele if allele.split('-')[0] == 'HLA' else 'HLA-'+allele
        if reason == 'missing':
            missing_allele_tcga_r2.append(full_name)
        elif reason == 'insufficient':
            insufficient_allele_tcga_r2.append(full_name)
    print('Insufficient Count: '+str(len(set(insufficient_allele_tcga_r2))))
    print('Missing Count: '+str(len(set(missing_allele_tcga_r2))))

print(len(insufficient_data_round_3), len(set(missing_allele_tcga_r2)), len(set(insufficient_allele_tcga_r2)))
## Insufficient alleles that the selected subset does not cover
print(insufficient_data_round_3.difference(insufficient_allele_tcga_r2))
total_hla_alleles_tcga_round_3 = set(missing_allele_tcga_r2).union(insufficient_allele_tcga_r2)
print(len(total_hla_alleles_tcga_round_3))

In [None]:
## Write the selected samples (full sample ID and cohort) as a headerless two-column TSV
_is_selected = kelsy_tcga_data_reformat_only_tumor['original'].isin(full_ids_included)
full_ids_and_cohort = kelsy_tcga_data_reformat_only_tumor[_is_selected]
full_ids_and_cohort.to_csv(
    "TCGA_samples_round2_withUCEC.tsv",
    sep="\t",
    columns=['original', 'cohort'],
    index=None,
    header=False,
)

In [None]:
### Creating the HLA datafile for tcga round 2 data
tcga_round_2_hla = []
patients_analyzed = []

for _, sample in full_ids_and_cohort.iterrows():
    barcode = sample['reformat']
    patients_analyzed.append(barcode)
    ## First typing row for this patient; the values at positions 2-7 are joined as the allele list
    hla_data = tcga_data.loc[tcga_data['patientBarcode'] == barcode].iloc[0]
    typed_alleles = list(hla_data)[2:8]
    hla_string = ",".join(typed_alleles)
    tcga_round_2_hla.append([sample['original'], hla_string])

with open("tcga_round_2_873_patients_hla.tsv", "w", newline="") as f:
    csv.writer(f, delimiter='\t').writerows(tcga_round_2_hla)

In [None]:
## Number of round 2 sample IDs that were not already written out for round 1
len(set(full_ids_included).difference(round_1_samples[0]))

## Adding data from round 2 TCGA

In [None]:
## Round 4 table: the round 3 table plus the TCGA round 2 epitopes
print(filtered_epitopes_combined_round_3.shape)
all_epitopes_tcga_r2 = pd.read_csv("./filter_b500_tcga_round2_combined.txt", sep='\t', low_memory=False)

## DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames in the same order
filtered_epitopes_combined_round_4 = pd.concat(
    [filtered_epitopes_combined_round_3, all_epitopes_tcga_r2],
    ignore_index=True, sort=True)

print(filtered_epitopes_combined_round_4.shape)

In [None]:
## Rebuild the per-allele peptide index from the round 4 table
unique_hla_alleles_round_4 = filtered_epitopes_combined_round_4['HLA Allele'].unique()
print(len(unique_hla_alleles_round_4))
combined_peptide_database_round_4 = create_peptide_database(
    unique_hla_alleles_round_4,
    filtered_epitopes_combined_round_4,
)
print('Total number of HLA-alleles:'+str(len(combined_peptide_database_round_4.keys())))

In [None]:
## Number of (allele, peptide length) entries with fewer than 10 unique peptides
count = 0
for peptides_by_length in combined_peptide_database_round_4.values():
    for peptides in peptides_by_length.values():
        if len(peptides) < 10:
            count += 1
print(count)
## Alleles with 30 or fewer unique peptides (or none at all) for any of the lengths 8-11.
## A missing length is treated as an empty list; the previous bare `except` fell
## through to a comparison against the `length` left over from the prior iteration.
insufficient_data_round_4 = set()
count = 0  # kept for parity with the original cell; not used below
for allele, peptides_by_length in combined_peptide_database_round_4.items():
    for peptide_length in [8, 9, 10, 11]:
        if len(peptides_by_length.get(peptide_length, [])) <= 30:
            insufficient_data_round_4.add(allele)
print(len(insufficient_data_round_4))

In [None]:
## Size of the combined epitope table after each round
for _round_table in (
    filtered_epitopes_combined_round_1,
    filtered_epitopes_combined_round_2,
    filtered_epitopes_combined_round_3,
    filtered_epitopes_combined_round_4,
):
    print(_round_table.shape)

### Analyzing peptide database for bias towards certain algorithms

In [None]:
## Best MT score count
class_I_algorithms = ["MHCnuggetsI", "MHCflurry", "NetMHCcons", "NetMHCpan", "NetMHC", "PickPocket", "SMM", "SMMPMBEC"]
def count_occurrences(combined_epitopes, algorithm_list):
    """Tally, per algorithm, best-method wins and how its scores split at 500.

    Args:
        combined_epitopes: DataFrame with a 'Best MT Score Method' column and
            one '<algorithm> MT Score' column per entry of algorithm_list.
        algorithm_list: algorithm names to report on.

    Returns:
        Four parallel lists ordered like algorithm_list: the algorithm names,
        the number of rows naming each algorithm as best method, the number of
        rows scoring <= 500 ('Below 500') and the number scoring > 500.

    Scores that are missing or cannot be parsed as numbers are counted in
    neither bucket; unparseable values are printed as (index, column, value).
    Best-method names outside algorithm_list are ignored rather than raising.
    """
    count_dict = {
        name: {'Best': 0, 'Below 500': 0, 'Above 500': 0}
        for name in algorithm_list
    }

    best_method_counts = combined_epitopes['Best MT Score Method'].value_counts()
    for name, tallies in count_dict.items():
        tallies['Best'] = int(best_method_counts.get(name, 0))

        query_str = name + " MT Score"
        raw_scores = combined_epitopes[query_str]
        scores = pd.to_numeric(raw_scores, errors='coerce')

        # Report values that were present but not numeric; previously these
        # were printed and then tallied using the previous row's score.
        unparseable = raw_scores[scores.isna() & raw_scores.notna()]
        for index, value in unparseable.items():
            print(index, query_str, value)

        # NaN compares False on both sides, so missing scores land in neither
        # bucket instead of silently inflating 'Below 500'.
        tallies['Above 500'] = int((scores > 500).sum())
        tallies['Below 500'] = int((scores <= 500).sum())

    x_axis_algorithms = list(count_dict.keys())
    counts_best = [tallies['Best'] for tallies in count_dict.values()]
    counts_below = [tallies['Below 500'] for tallies in count_dict.values()]
    counts_above = [tallies['Above 500'] for tallies in count_dict.values()]
    return x_axis_algorithms, counts_best, counts_below, counts_above

def count_set_occurrences(combined_epitopes, algorithm_list):
    """Count how often each combination of algorithms scores a row below 500.

    Args:
        combined_epitopes: DataFrame with one '<algorithm> MT Score' column
            per entry of algorithm_list.
        algorithm_list: algorithm names to test, in membership order.

    Returns:
        (output_table, counts): output_table is a list of algorithm-name
        lists (one per distinct combination, the empty list meaning no
        algorithm scored below 500) and counts holds the number of rows with
        that combination. Combinations appear in order of first occurrence,
        so the result is the same on every run.

    Missing or non-numeric scores never count as below 500.
    """
    algorithm_list = list(algorithm_list)
    # One boolean list per algorithm: does the row score strictly below 500?
    below_flags = [
        (pd.to_numeric(combined_epitopes[name + " MT Score"], errors='coerce') < 500).tolist()
        for name in algorithm_list
    ]

    # Counter keeps insertion order, which replaces the former set() pass whose
    # iteration order varied with string hashing.
    membership_counts = Counter(
        tuple(name for name, flags in zip(algorithm_list, below_flags) if flags[row])
        for row in range(len(combined_epitopes))
    )

    output_table = [list(members) for members in membership_counts]
    counts = list(membership_counts.values())
    return output_table, counts

In [None]:
def autolabel(rects):
    """Write each bar's height as a text label just above the bar (uses the cell's `ax`)."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            '{}'.format(bar_height),
            xy=(bar_center, bar_height),
            xytext=(0, 3),  # nudge the label 3 points upward
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


x_axis_algorithms, counts_best, counts_below, counts_above = count_occurrences(
    filtered_epitopes_combined_round_4, class_I_algorithms
)

labels = x_axis_algorithms
x = np.arange(len(labels))  # one slot per algorithm on the x axis
width = 0.2  # bar width

# Two bars per algorithm, placed side by side around each tick.
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, counts_best, width, label='Best Algorithm')
rects2 = ax.bar(x + width/2, counts_below, width, label='Below 500nM')

# Axis label, title, tick labels and legend.
ax.set_ylabel('Counts')
ax.set_title('Distribution of best predicting binding algorithms across peptide database', fontsize=11)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

for bar_group in (rects1, rects2):
    autolabel(bar_group)

ax.set_ylim(0, 2000000)
for tick in ax.get_xticklabels():
    tick.set_rotation(45)
fig.tight_layout()

plt.show()

In [None]:
# Count the below-500 algorithm combinations, draw them as an UpSet plot
# ordered by cardinality, and save the figure.
upset_table, counts = count_set_occurrences(filtered_epitopes_combined_round_4, class_I_algorithms)
membership_counts = upsetplot.from_memberships(upset_table, data=counts)
upsetplot.plot(membership_counts, sort_by='cardinality')
plt.savefig('upset_plot_combined_peptides_updated_r4.png')

## Generating Random Peptide Database

In [None]:
def three_mutation_sequence_match(seq1, seq2):
    """Return True when seq1 occurs in seq2 with at most two substitutions.

    The fuzzy constraint ``{s<=2}`` lets up to two characters of seq1 be
    substituted, so a True result means the sequences differ by fewer than
    three substitutions somewhere along seq2.

    Args:
        seq1: sequence used as the fuzzy pattern.
        seq2: sequence that is searched.
    """
    # Escape seq1 so it is always matched literally, and stop at the first hit:
    # only the existence of a match matters, not the list of all of them.
    pattern = "(" + regex.escape(seq1) + "){s<=2}"
    return regex.search(pattern, seq2) is not None

def random_choice_3_mutations(input_data, total, max_tries=100):
    """Randomly pick up to `total` peptides that do not fuzzy-match each other.

    Peptides are drawn one at a time without replacement. A draw is kept only
    if three_mutation_sequence_match reports no match against any peptide
    already kept. When `max_tries` draws have been made, or every peptide has
    been drawn, without reaching `total`, the result is topped up at random
    without the similarity check: first from peptides never drawn, then from
    the rejected ones.

    Args:
        input_data: list of peptide strings; it is not modified.
        total: number of peptides wanted.
        max_tries: maximum number of draws before falling back to the top-up.

    Returns:
        A list of at most `total` peptides (fewer only when input_data itself
        holds fewer distinct entries than `total`).
    """
    candidates = list(input_data)
    selected = []
    rejected = []
    tries = 0
    # Stop on an empty pool as well: random.choice([]) raises IndexError.
    while len(selected) < total and candidates and tries < max_tries:
        peptide = random.choice(candidates)
        candidates.remove(peptide)
        if any(three_mutation_sequence_match(kept, peptide) for kept in selected):
            rejected.append(peptide)
        else:
            selected.append(peptide)
        tries += 1

    missing = total - len(selected)
    if missing > 0:
        if tries >= max_tries:
            print('Exceeded Maximum tries, length of peptide list is: '+str(len(selected)))
        # The old fallback called np.random.choices, which does not exist;
        # random.sample draws without replacement and is capped at pool size.
        for pool in (candidates, rejected):
            take = min(missing, len(pool))
            selected += random.sample(pool, take)
            missing -= take
    return selected

In [None]:
# For every allele and peptide length keep at most 10 peptides: buckets with
# more than 10 entries are subsampled, smaller buckets are stored unchanged.
# `count` records how many buckets were subsampled.
count = 0
random_peptide_sets = {}
for allele, peptides_by_length in combined_peptide_database.items():
    allele_sets = random_peptide_sets[allele] = {}
    for peptide_length, peptides in peptides_by_length.items():
        if len(peptides) <= 10:
            allele_sets[peptide_length] = peptides
            continue
        count += 1
        allele_sets[peptide_length] = random_choice_3_mutations(peptides, total=10)

In [None]:
# For each allele and each peptide length 8-11, classify the random set as
# exactly 10 peptides, some other size, or absent altogether.
count_10 = 0
count_under = 0
count_0 = 0
for allele_sets in random_peptide_sets.values():
    for peptide_length in [8, 9, 10, 11]:
        # Explicit membership test instead of a bare except, which would also
        # have hidden unrelated errors and kernel interrupts.
        if peptide_length not in allele_sets:
            count_0 += 1
        elif len(allele_sets[peptide_length]) == 10:
            count_10 += 1
        else:
            count_under += 1
print(count_10, count_under, count_0)

In [None]:
def create_seq_logo(peptide_list, output_dir):
    """Draw a sequence logo for a list of peptides and save it to a file.

    Every amino-acid glyph is coloured by the letter's index in a fixed
    20-letter list, mapped through the 'viridis' colormap.

    Args:
        peptide_list: non-empty list of peptide strings; the length of the
            first entry sets how many positions are styled.
        output_dir: path handed to Figure.savefig (a file path, despite the name).

    Returns:
        None.

    NOTE(review): `lm` is not defined in this part of the notebook; presumably
    logomaker imported as `lm` elsewhere. Confirm.
    """
    amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    counts_mat = lm.alignment_to_matrix(peptide_list)
    cm = plt.get_cmap('viridis')
    cNorm = colors.Normalize(vmin=0, vmax=20)
    scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)
    logo = lm.Logo(counts_mat)
    for position in range(len(peptide_list[0])):
        for color_index, amino_acid in enumerate(amino_acids):
            try:
                logo.style_single_glyph(position, amino_acid, color=scalarMap.to_rgba(color_index))
            except Exception:
                # Best effort: glyphs that cannot be styled are left as drawn.
                # Exception (not a bare except) keeps kernel interrupts working.
                continue
    logo.fig.savefig(output_dir)
    # Close this logo's own figure so figures do not pile up across calls.
    plt.close(logo.fig)

# CREATING SEQ LOGOS

In [None]:
### CREATING SEQLOGOS for all the data we have available 
# One folder per allele, one logo file per peptide length.
for allele, peptides_by_length in combined_peptide_database.items():
    directory = "./HLA_seq_logos/" + allele
    if not os.path.exists(directory):
        os.makedirs(directory)
    for peptide_length, peptides in peptides_by_length.items():
        logo_path = './HLA_seq_logos/' + allele + '/Length_' + str(peptide_length)
        create_seq_logo(peptides, logo_path)
    print("Done: " + allele)

In [None]:
#color_choices = ['salmon', 'darkorange', 'forestgreen', 'mediumturquoise', 'palevioletred', 'steelblue',
#                'chocolate', 'purple', 'cornflowerblue', 'darkslategrey']

### Nonpolar Amino Acids

Ala: Alanine           Gly: Glycine          Ile: Isoleucine           Leu: Leucine
Met: Methionine  Trp: Tryptophan    Phe: Phenylalanine    Pro: Proline
Val: Valine

### Polar Amino Acids

Cys: Cysteine         Ser: Serine           Thr: Threonine
Tyr: Tyrosine       Asn: Asparagine Gln: Glutamine

### Polar Basic Amino Acids (Positively Charged)

His: Histidine      Lys: Lysine           Arg: Arginine

### Polar Acidic Amino Acids (Negatively Charged)

Asp: Aspartate   Glu: Glutamate


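### Amino acid abbreviations

| Amino acid | 3-letter abbreviation | 1-letter abbreviation |
|---|---|---|
| Alanine | Ala | A |
| Arginine | Arg | R |
| Asparagine | Asn | N |
| Aspartic acid | Asp | D |
| Cysteine | Cys | C |
| Glutamic acid | Glu | E |
| Glutamine | Gln | Q |
| Glycine | Gly | G |
| Histidine | His | H |
| Isoleucine | Ile | I |
| Leucine | Leu | L |
| Lysine | Lys | K |
| Methionine | Met | M |
| Phenylalanine | Phe | F |
| Proline | Pro | P |
| Serine | Ser | S |
| Threonine | Thr | T |
| Tryptophan | Trp | W |
| Tyrosine | Tyr | Y |
| Valine | Val | V |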

In [None]:
### Create Seq logos based on hydrophobicity
def create_seq_logo_with_grouping(peptide_list, output_dir):
    """Draw a sequence logo coloured by amino-acid group and save it to a file.

    Glyphs are coloured by four groups (nonpolar, polar, polar basic, polar
    acidic) using colormap values 1-4 of 'Dark2' normalised over 0-10.

    Args:
        peptide_list: non-empty list of peptide strings; the length of the
            first entry sets how many positions are styled.
        output_dir: path handed to Figure.savefig (a file path, despite the name).

    Returns:
        None.

    NOTE(review): `lm` is not defined in this part of the notebook; presumably
    logomaker imported as `lm` elsewhere. Confirm.
    """
    # (letters, colour value) per group; replaces four copy-pasted loops.
    grouped_amino_acids = (
        (['A', 'G', 'I', 'L', 'M', 'W', 'F', 'P', 'V'], 1),  # nonpolar
        (['C', 'S', 'Q', 'T', 'Y', 'N'], 2),                 # polar
        (['R', 'H', 'K'], 3),                                # polar basic
        (['D', 'E'], 4),                                     # polar acidic
    )

    counts_mat = lm.alignment_to_matrix(peptide_list)
    cm = plt.get_cmap('Dark2')
    cNorm = colors.Normalize(vmin=0, vmax=10)
    scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)
    logo = lm.Logo(counts_mat)
    for position in range(len(peptide_list[0])):
        for letters, color_value in grouped_amino_acids:
            group_color = scalarMap.to_rgba(color_value)
            for amino_acid in letters:
                try:
                    logo.style_single_glyph(position, amino_acid, color=group_color)
                except Exception:
                    # Best effort: glyphs that cannot be styled are left as drawn.
                    # Exception (not a bare except) keeps kernel interrupts working.
                    continue
    logo.fig.savefig(output_dir)
    # Close this logo's own figure so figures do not pile up across calls.
    plt.close(logo.fig)


In [None]:
# Group-coloured logos: one folder per allele, one file per peptide length.
for allele, peptides_by_length in combined_peptide_database.items():
    directory = "./HLA_Seqlogo_with_grouping/" + allele
    if not os.path.exists(directory):
        os.makedirs(directory)
    for peptide_length, peptides in peptides_by_length.items():
        logo_path = './HLA_Seqlogo_with_grouping/' + allele + '/Length_' + str(peptide_length)
        create_seq_logo_with_grouping(peptides, logo_path)
    print("Done: " + allele)