In [1]:
import collections
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Ask the inline matplotlib backend to render figures in the 'retina' format.
%config InlineBackend.figure_format='retina'

In [7]:
# Read the epitope feature table and expose the `epitope` column as `sequence`.
orig_data = (
    pd.read_csv('../AllEpitopeFeatures.csv')
    .rename(columns={'epitope': 'sequence'})
)
# Length of each sequence string, computed from the string itself.
orig_data['sequence_length'] = orig_data['sequence'].map(len)
orig_data

Unnamed: 0,protein,sequence,start_pos,epi_len,entropy,perc_mutated,glyco_probs,crosses_cleavage,sequence_length
0,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8
1,E,YSFVSEET,1,8,0.008297,0.000640,0.0,0,8
2,E,SFVSEETG,2,8,0.008297,0.000640,0.0,0,8
3,E,FVSEETGT,3,8,0.008297,0.000640,0.0,0,8
4,E,VSEETGTL,4,8,0.008297,0.000640,0.0,0,8
...,...,...,...,...,...,...,...,...,...
164781,S2,KGCCSCGSCCKFDEDDSEPVLKGVK,1244,25,0.078112,0.007464,0.0,0,25
164782,S2,GCCSCGSCCKFDEDDSEPVLKGVKL,1245,25,0.078112,0.007464,0.0,0,25
164783,S2,CCSCGSCCKFDEDDSEPVLKGVKLH,1246,25,0.078112,0.007464,0.0,0,25
164784,S2,CSCGSCCKFDEDDSEPVLKGVKLHY,1247,25,0.075207,0.007251,0.0,0,25


In [12]:
# Number of distinct sequences among rows whose `epi_len` is 8, 9 or 10.
is_8_to_10 = orig_data['epi_len'].isin([8, 9, 10])
len(set(orig_data.loc[is_8_to_10, 'sequence']))

29406

In [9]:
# Load the final set of HLA alleles: a headerless file with one allele per row,
# read into a single column named `allele`.
hla_alleles = pd.read_csv(
    'MHC1_allele_mary_cleaned.txt',
    header=None,
    names=['allele'],
)
hla_alleles

Unnamed: 0,allele
0,HLA-B44:04
1,HLA-B44:05
2,HLA-B44:07
3,HLA-A30:10
4,HLA-B44:02
...,...
225,HLA-B55:02
226,HLA-B67:01
227,HLA-A24:10
228,HLA-B15:32


In [28]:
# Keep only the epitopes whose sequence length is 8, 9 or 10 (the MHC-I set).
has_mhc1_length = orig_data['sequence_length'].isin([8, 9, 10])
mhc1_data = orig_data.loc[has_mhc1_length]
mhc1_data

Unnamed: 0,protein,sequence,start_pos,epi_len,entropy,perc_mutated,glyco_probs,crosses_cleavage,sequence_length
0,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8
1,E,YSFVSEET,1,8,0.008297,0.000640,0.0,0,8
2,E,SFVSEETG,2,8,0.008297,0.000640,0.0,0,8
3,E,FVSEETGT,3,8,0.008297,0.000640,0.0,0,8
4,E,VSEETGTL,4,8,0.008297,0.000640,0.0,0,8
...,...,...,...,...,...,...,...,...,...
147203,S2,DSEPVLKGVK,1259,10,0.056306,0.005758,0.0,0,10
147204,S2,SEPVLKGVKL,1260,10,0.053400,0.005545,0.0,0,10
147205,S2,EPVLKGVKLH,1261,10,0.053400,0.005545,0.0,0,10
147206,S2,PVLKGVKLHY,1262,10,0.053400,0.005545,0.0,0,10


In [10]:
# Peptide list for NetMHCpan: only the `sequence` column, one peptide per line,
# with no index and no header (peptides are not paired with an MHC allele here).
mhc1_data.to_csv(
    'peptides_8-10.pep',
    columns=['sequence'],
    index=False,
    header=False,
)

# Create commands for running NetMHCpan4.0

In [76]:
# Argument line for one NetMHCpan run; the placeholders are filled per allele.
cmd_template = '-BA -p peptides_8-10.pep -a {allele} -xls -xlsfile {allele_file}'

# Pair each allele with the prediction file its run writes (':' stripped from the name).
allele_outputs = [
    (allele, 'netmhc_preds/%s_preds.xls' % (allele.replace(':', '')))
    for allele in hla_alleles['allele'].values
]
# Alleles whose prediction file already exists on disk are skipped.
cmds = [
    cmd_template.format(allele=allele, allele_file=allele_file)
    for allele, allele_file in allele_outputs
    if not os.path.exists(allele_file)
]
print('# Commands: ', len(cmds))

# One argument line per pending allele.
with open('netmhc_args.txt', 'w') as f:
    f.writelines(cmd + '\n' for cmd in cmds)

# Commands:  35


In [36]:
# For NetMHCpan4.1:

# Argument line for one NetMHCpan run; the placeholders are filled per allele.
cmd_template = '-BA -p peptides_8-10.pep -a {allele} -xls -xlsfile {allele_file}'

# Pair each allele with the prediction file its run writes (':' stripped from the name).
allele_outputs = [
    (allele, 'netmhc-4.1_preds/%s_preds.xls' % (allele.replace(':', '')))
    for allele in hla_alleles['allele'].values
]
# Alleles whose prediction file already exists on disk are skipped.
cmds = [
    cmd_template.format(allele=allele, allele_file=allele_file)
    for allele, allele_file in allele_outputs
    if not os.path.exists(allele_file)
]
print('# Commands: ', len(cmds))

# One argument line per pending allele.
with open('netmhc-4.1_args.txt', 'w') as f:
    f.writelines(cmd + '\n' for cmd in cmds)

# Commands:  230


Command to run:    
```
cat netmhc_args.txt | xargs -P 40 -d '\n' -n 1 ./netMHCpan-4.0/netMHCpan

cat netmhc-4.1_args.txt | xargs -P 35 -d '\n' -n 1 ./netMHCpan-4.1/netMHCpan
```

# Load NetMHCpan4.0 predictions

In [7]:
# One tab-separated prediction table per allele: skip the first file line, tag
# every row with the allele it was predicted for, and drop the listed columns.
dfs = [
    pd.read_csv(
        './netmhc_preds/%s_preds.xls' % allele.replace(':', ''),
        delimiter='\t',
        skiprows=[0],
    )
    .assign(genotype=allele)
    .drop(columns=['Pos', 'ID', 'core', 'icore', 'Rank', 'Ave', 'NB'])
    for allele in hla_alleles['allele'].values
]
# Stack all alleles; each table keeps its own row index, so index labels repeat.
netmhc1_data = pd.concat(dfs)
netmhc1_data['sequence_length'] = netmhc1_data['Peptide'].map(len).values
# The first five characters of the allele name (e.g. 'HLA-B') identify the locus.
netmhc1_data['loci'] = netmhc1_data['genotype'].str[:5].values
netmhc1_data

Unnamed: 0,Peptide,1-log50k,nM,genotype,sequence_length,loci
0,MYSFVSEE,0.0187,40836.4258,HLA-B44:04,8,HLA-B
1,YSFVSEET,0.0136,43144.7188,HLA-B44:04,8,HLA-B
2,SFVSEETG,0.0114,44191.7070,HLA-B44:04,8,HLA-B
3,FVSEETGT,0.0075,46105.8516,HLA-B44:04,8,HLA-B
4,VSEETGTL,0.0146,42674.8945,HLA-B44:04,8,HLA-B
...,...,...,...,...,...,...
29398,DSEPVLKGVK,0.0252,38063.9297,HLA-B56:10,10,HLA-B
29399,SEPVLKGVKL,0.3105,1738.0863,HLA-B56:10,10,HLA-B
29400,EPVLKGVKLH,0.1051,16034.5420,HLA-B56:10,10,HLA-B
29401,PVLKGVKLHY,0.0223,39286.5586,HLA-B56:10,10,HLA-B


In [88]:
# For each allele-name prefix, average the numeric prediction columns over all
# alleles whose name contains that prefix (per peptide) and label the result as
# a pseudo-genotype named after the prefix.
allele_groups = (('HLA-A74', 'HLA-A'), ('HLA-C17', 'HLA-C'), ('HLA-C18', 'HLA-C'))
group_means = []
for prefix, locus in allele_groups:
    members = netmhc1_data.loc[netmhc1_data['genotype'].str.contains(prefix)]
    # numeric_only=True: the string columns (`genotype`, `loci`) cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them.
    per_peptide = members.groupby('Peptide').mean(numeric_only=True).reset_index()
    per_peptide['loci'] = locus
    per_peptide['genotype'] = prefix
    group_means.append(per_peptide)

a74, c17, c18 = group_means

In [89]:
# Preview: per-allele predictions with the three per-prefix mean frames appended.
frames_to_stack = [netmhc1_data, a74, c17, c18]
pd.concat(frames_to_stack, sort=False)

Unnamed: 0,Peptide,1-log50k,nM,genotype,sequence_length,loci
0,MYSFVSEE,0.018700,40836.425800,HLA-B44:04,8,HLA-B
1,YSFVSEET,0.013600,43144.718800,HLA-B44:04,8,HLA-B
2,SFVSEETG,0.011400,44191.707000,HLA-B44:04,8,HLA-B
3,FVSEETGT,0.007500,46105.851600,HLA-B44:04,8,HLA-B
4,VSEETGTL,0.014600,42674.894500,HLA-B44:04,8,HLA-B
...,...,...,...,...,...,...
29398,YYVGYLQPR,0.049400,29468.162767,HLA-C18,9,HLA-C
29399,YYVGYLQPRT,0.039567,32691.988933,HLA-C18,10,HLA-C
29400,YYVWKSYV,0.089833,18992.130200,HLA-C18,8,HLA-C
29401,YYVWKSYVH,0.045900,30434.857433,HLA-C18,9,HLA-C


In [90]:
# Peptide x (loci, genotype) matrix of the `1-log50k` score, built from the
# per-allele predictions plus the three per-prefix mean frames.
pivot_input = pd.concat([netmhc1_data, a74, c17, c18], sort=False)
data_pivot = pd.pivot_table(
    pivot_input,
    index='Peptide',
    columns=['loci', 'genotype'],
    values='1-log50k',
)
data_pivot.to_pickle('mhc1_haplotype_netmhc_pred_affinity_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.1054,0.1506,0.0860,0.1054,0.1103,0.0291,0.0483,0.0614,0.0209,0.0601,...,0.0291,0.0291,0.0291,0.0291,0.0291,0.0240,0.015933,0.0166,0.0166,0.0146
AAAYYVGYL,0.0739,0.0925,0.0669,0.0739,0.0789,0.2276,0.4786,0.4725,0.1623,0.5121,...,0.3401,0.3401,0.3401,0.3401,0.3401,0.2976,0.079467,0.0681,0.0681,0.1022
AAAYYVGYLQ,0.0547,0.0909,0.0431,0.0547,0.0508,0.0655,0.1775,0.1745,0.0445,0.1913,...,0.0507,0.0507,0.0507,0.0507,0.0507,0.0429,0.020367,0.0187,0.0187,0.0237
AACCHLAK,0.0345,0.0760,0.0265,0.0345,0.0342,0.0286,0.0299,0.0342,0.0229,0.0394,...,0.0168,0.0168,0.0168,0.0168,0.0168,0.0127,0.016200,0.0166,0.0166,0.0154
AACCHLAKA,0.0460,0.0753,0.0377,0.0460,0.0474,0.1655,0.2493,0.3349,0.0895,0.3377,...,0.0675,0.0675,0.0675,0.0675,0.0675,0.0423,0.029500,0.0276,0.0276,0.0333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.0289,0.0812,0.0259,0.0289,0.0367,0.0993,0.1255,0.1105,0.0972,0.1698,...,0.0293,0.0293,0.0293,0.0293,0.0293,0.0237,0.049400,0.0424,0.0424,0.0634
YYVGYLQPRT,0.0303,0.0544,0.0252,0.0303,0.0325,0.1111,0.1629,0.1631,0.0923,0.2180,...,0.0270,0.0270,0.0270,0.0270,0.0270,0.0197,0.039567,0.0343,0.0343,0.0501
YYVWKSYV,0.0345,0.0552,0.0313,0.0345,0.0367,0.0909,0.1175,0.1459,0.0811,0.1130,...,0.0513,0.0513,0.0513,0.0513,0.0513,0.0456,0.089833,0.0838,0.0838,0.1019
YYVWKSYVH,0.0480,0.1032,0.0413,0.0480,0.0535,0.0274,0.0373,0.0272,0.0254,0.0467,...,0.0180,0.0180,0.0180,0.0180,0.0180,0.0185,0.045900,0.0467,0.0467,0.0443


# Load NetMHCpan4.1 Predictions

In [45]:
# One tab-separated prediction table per allele: skip the first file line, tag
# every row with the allele it was predicted for, and drop the listed columns.
dfs = [
    pd.read_csv(
        './netmhc-4.1_preds/%s_preds.xls' % allele.replace(':', ''),
        delimiter='\t',
        skiprows=[0],
    )
    .assign(genotype=allele)
    .drop(columns=['Pos', 'ID', 'core', 'icore', 'Ave', 'NB'])
    for allele in hla_alleles['allele'].values
]
# Stack all alleles; each table keeps its own row index, so index labels repeat.
netmhc41_data = pd.concat(dfs)
netmhc41_data['sequence_length'] = netmhc41_data['Peptide'].map(len).values
# The first five characters of the allele name (e.g. 'HLA-B') identify the locus.
netmhc41_data['loci'] = netmhc41_data['genotype'].str[:5].values
# Invert score = 1 - log_50000(nM) to get an affinity value back from `BA-score`.
ba_exponent = 1 - netmhc41_data['BA-score']
netmhc41_data['BA_nM'] = 50000 ** ba_exponent
netmhc41_data

Unnamed: 0,Peptide,EL-score,EL_Rank,BA-score,BA_Rank,genotype,sequence_length,loci,BA_nM
0,MYSFVSEE,0.0000,72.5000,0.0238,62.7906,HLA-B44:04,8,HLA-B,38648.666877
1,YSFVSEET,0.0000,72.5000,0.0172,81.8807,HLA-B44:04,8,HLA-B,41509.520847
2,SFVSEETG,0.0000,72.5000,0.0158,85.7518,HLA-B44:04,8,HLA-B,42143.080553
3,FVSEETGT,0.0000,70.0000,0.0100,95.7597,HLA-B44:04,8,HLA-B,44872.503932
4,VSEETGTL,0.0001,31.8889,0.0174,81.5398,HLA-B44:04,8,HLA-B,41419.793203
...,...,...,...,...,...,...,...,...,...
29398,DSEPVLKGVK,0.0001,54.4000,0.0164,71.7671,HLA-B56:10,10,HLA-B,41870.379408
29399,SEPVLKGVKL,0.4757,0.4963,0.3035,1.8215,HLA-B56:10,10,HLA-B,1874.271960
29400,EPVLKGVKLH,0.0212,5.8240,0.0892,14.2037,HLA-B56:10,10,HLA-B,19046.793791
29401,PVLKGVKLHY,0.0001,49.6667,0.0162,72.3585,HLA-B56:10,10,HLA-B,41961.083157


In [46]:
# For each allele-name prefix, average the numeric prediction columns over all
# alleles whose name contains that prefix (per peptide) and label the result as
# a pseudo-genotype named after the prefix.
allele_groups = (('HLA-A74', 'HLA-A'), ('HLA-C17', 'HLA-C'), ('HLA-C18', 'HLA-C'))
group_means = []
for prefix, locus in allele_groups:
    members = netmhc41_data.loc[netmhc41_data['genotype'].str.contains(prefix)]
    # numeric_only=True: the string columns (`genotype`, `loci`) cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them.
    per_peptide = members.groupby('Peptide').mean(numeric_only=True).reset_index()
    per_peptide['loci'] = locus
    per_peptide['genotype'] = prefix
    group_means.append(per_peptide)

a74, c17, c18 = group_means

In [53]:
# Peptide x (loci, genotype) matrix of the `BA-score` column, built from the
# per-allele predictions plus the three per-prefix mean frames.
pivot_input = pd.concat([netmhc41_data, a74, c17, c18], sort=False)
data_pivot = pd.pivot_table(
    pivot_input,
    index='Peptide',
    columns=['loci', 'genotype'],
    values='BA-score',
)
data_pivot.to_pickle('mhc1_haplotype_netmhc-4.1_pred_affinity_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.0967,0.1644,0.0733,0.0967,0.0864,0.0278,0.0401,0.0417,0.0199,0.0495,...,0.0394,0.0394,0.0394,0.0394,0.0394,0.0303,0.020300,0.0217,0.0217,0.0175
AAAYYVGYL,0.0879,0.1206,0.0863,0.0879,0.0815,0.2249,0.4798,0.4420,0.2050,0.5578,...,0.5393,0.5393,0.5393,0.5393,0.5393,0.4357,0.110533,0.0931,0.0931,0.1454
AAAYYVGYLQ,0.0527,0.0837,0.0451,0.0527,0.0465,0.0738,0.1754,0.1566,0.0542,0.2006,...,0.0808,0.0808,0.0808,0.0808,0.0808,0.0553,0.028433,0.0266,0.0266,0.0321
AACCHLAK,0.0420,0.0789,0.0354,0.0420,0.0333,0.0287,0.0302,0.0286,0.0236,0.0363,...,0.0167,0.0167,0.0167,0.0167,0.0167,0.0126,0.015900,0.0184,0.0184,0.0109
AACCHLAKA,0.0535,0.0787,0.0447,0.0535,0.0498,0.1479,0.2237,0.3273,0.0916,0.3419,...,0.0691,0.0691,0.0691,0.0691,0.0691,0.0460,0.038067,0.0366,0.0366,0.0410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.0376,0.1285,0.0346,0.0376,0.0438,0.1105,0.1362,0.1120,0.1013,0.1849,...,0.0390,0.0390,0.0390,0.0390,0.0390,0.0276,0.057833,0.0550,0.0550,0.0635
YYVGYLQPRT,0.0351,0.0759,0.0322,0.0351,0.0361,0.1011,0.1517,0.1558,0.0877,0.2004,...,0.0266,0.0266,0.0266,0.0266,0.0266,0.0191,0.047067,0.0413,0.0413,0.0586
YYVWKSYV,0.0316,0.0701,0.0306,0.0316,0.0310,0.0748,0.1008,0.1146,0.0704,0.1010,...,0.0753,0.0753,0.0753,0.0753,0.0753,0.0576,0.117933,0.1067,0.1067,0.1404
YYVWKSYVH,0.0479,0.1382,0.0443,0.0479,0.0526,0.0411,0.0492,0.0324,0.0332,0.0569,...,0.0415,0.0415,0.0415,0.0415,0.0415,0.0372,0.053200,0.0515,0.0515,0.0566


# Create data for MHCflurry

In [33]:
# Create dataframe with all MHC/peptide pairs (a cross join).
# Both frames get the same constant `key`, so merging on it pairs every peptide
# row with every allele row. The merge uses the columns the two frames share;
# this assumes `key` is the only one.
a = mhc1_data.assign(key=0)
b = hla_alleles.assign(key=0)
pmhc_pairs = a.merge(b, how='outer').drop(columns=['key'])
pmhc_pairs

Unnamed: 0,protein,sequence,start_pos,epi_len,entropy,perc_mutated,glyco_probs,crosses_cleavage,sequence_length,allele
0,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:04
1,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:05
2,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:07
3,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-A30:10
4,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:02
...,...,...,...,...,...,...,...,...,...,...
6763375,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B55:02
6763376,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B67:01
6763377,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-A24:10
6763378,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B15:32


In [78]:
# Allele/peptide pairs, written without index and without a header row.
pmhc_pairs.to_csv(
    'mhc1_8-10_haplotype_pairs.csv',
    columns=['allele', 'sequence'],
    index=False,
    header=False,
)

In [79]:
# Same pairs with a header row; the sequence column is written as `peptide`.
pairs_with_header = pmhc_pairs[['allele', 'sequence']].rename(columns={'sequence': 'peptide'})
pairs_with_header.to_csv(
    'mhc1_8-10_haplotype_pairs_withheader.csv',
    index=False,
    header=True,
)

Command to run MHCflurry:
```
mhcflurry-predict \
    --out mhc1_haplotype_preds_mhcflurry.csv \
    mhc1_8-10_haplotype_pairs_withheader.csv

mhcflurry-predict \
    --out mhc1_haplotype_preds_mhcflurry2.0.csv \
    mhc1_8-10_haplotype_pairs_withheader.csv
```

# Load MHCflurry Predictions

In [8]:
def transform_affinity(x, max_nM=50000):
    """Map a binding affinity onto the ``1 - log(x) / log(max_nM)`` scale.

    Values above ``max_nM`` are capped at ``max_nM`` first, so they map to 0.
    No lower bound is applied: ``x == 1`` maps to 1, values below 1 map above
    1, and non-positive values give ``inf``/``nan`` from the logarithm.

    Args:
        x: Affinity value(s); a scalar or anything ``np.clip`` accepts
            (list, ndarray). The notebook passes nM values.
        max_nM: Upper cap and logarithm base. Defaults to 50000, which was
            the previously hard-coded value.

    Returns:
        The transformed score, as a scalar for scalar input and an array
        for array-like input.
    """
    capped = np.clip(x, a_min=None, a_max=max_nM)
    return 1 - np.log(capped) / np.log(max_nM)

# print(transform_affinity(500))

In [9]:
# MHCflurry predictions for every allele/peptide pair.
mhcflurry_preds = pd.read_csv('mhc1_haplotype_preds_mhcflurry.csv')

mhcflurry_preds = mhcflurry_preds.rename(columns={'peptide': 'sequence'})
mhcflurry_preds['sequence_length'] = mhcflurry_preds['sequence'].map(len)

# Log-transformed binding affinity (1 - log(nM)/log(50000), capped at 50000 nM).
# transform_affinity accepts arrays, so the whole column is transformed in one
# vectorized call instead of one Python call per row.
mhcflurry_preds['transformed_aff'] = transform_affinity(mhcflurry_preds['mhcflurry_affinity'].values)

mhcflurry_preds

Unnamed: 0,allele,sequence,mhcflurry_affinity,mhcflurry_affinity_percentile,mhcflurry_processing_score,mhcflurry_presentation_score,sequence_length,transformed_aff
0,HLA-B44:04,MYSFVSEE,37829.751440,76.370250,0.213669,0.007210,8,0.025779
1,HLA-B44:05,MYSFVSEE,38198.075124,84.530375,0.213669,0.007148,8,0.024884
2,HLA-B44:07,MYSFVSEE,38064.733103,73.786875,0.213669,0.007170,8,0.025207
3,HLA-A30:10,MYSFVSEE,34529.712035,57.241500,0.213669,0.007814,8,0.034215
4,HLA-B44:02,MYSFVSEE,38191.475075,83.348375,0.213669,0.007149,8,0.024900
...,...,...,...,...,...,...,...,...
6762685,HLA-B55:02,VLKGVKLHYT,29249.898280,38.170875,0.126139,0.006637,10,0.049552
6762686,HLA-B67:01,VLKGVKLHYT,34574.906172,41.093875,0.126139,0.005726,10,0.034094
6762687,HLA-A24:10,VLKGVKLHYT,35345.127043,37.469125,0.126139,0.005615,10,0.032058
6762688,HLA-B15:32,VLKGVKLHYT,32874.896891,28.702875,0.126139,0.005986,10,0.038754


In [82]:
# Reshape the MHCflurry frame to the Peptide/genotype/loci layout used for pivoting.
df = (
    mhcflurry_preds
    # Locus = first five characters of the allele name (e.g. 'HLA-B').
    .assign(loci=mhcflurry_preds['allele'].str[:5])
    .drop(columns=[
        'sequence_length',
        'mhcflurry_affinity_percentile',
        'mhcflurry_processing_score',
        'mhcflurry_presentation_score',
    ])
    .rename(columns={'sequence': 'Peptide', 'allele': 'genotype'})
)
df

Unnamed: 0,genotype,Peptide,mhcflurry_affinity,transformed_aff,loci
0,HLA-B44:04,MYSFVSEE,37829.751440,0.025779,HLA-B
1,HLA-B44:05,MYSFVSEE,38198.075124,0.024884,HLA-B
2,HLA-B44:07,MYSFVSEE,38064.733103,0.025207,HLA-B
3,HLA-A30:10,MYSFVSEE,34529.712035,0.034215,HLA-A
4,HLA-B44:02,MYSFVSEE,38191.475075,0.024900,HLA-B
...,...,...,...,...,...
6762685,HLA-B55:02,VLKGVKLHYT,29249.898280,0.049552,HLA-B
6762686,HLA-B67:01,VLKGVKLHYT,34574.906172,0.034094,HLA-B
6762687,HLA-A24:10,VLKGVKLHYT,35345.127043,0.032058,HLA-A
6762688,HLA-B15:32,VLKGVKLHYT,32874.896891,0.038754,HLA-B


In [83]:
# For each allele-name prefix, average the numeric prediction columns over all
# alleles whose name contains that prefix (per peptide) and label the result as
# a pseudo-genotype named after the prefix.
allele_groups = (('HLA-A74', 'HLA-A'), ('HLA-C17', 'HLA-C'), ('HLA-C18', 'HLA-C'))
group_means = []
for prefix, locus in allele_groups:
    members = df.loc[df['genotype'].str.contains(prefix)]
    # numeric_only=True: the string columns (`genotype`, `loci`) cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them.
    per_peptide = members.groupby('Peptide').mean(numeric_only=True).reset_index()
    per_peptide['loci'] = locus
    per_peptide['genotype'] = prefix
    group_means.append(per_peptide)

a74, c17, c18 = group_means

In [84]:
# Preview: per-allele predictions with the three per-prefix mean frames appended.
frames_to_stack = [df, a74, c17, c18]
pd.concat(frames_to_stack, sort=False)

Unnamed: 0,genotype,Peptide,mhcflurry_affinity,transformed_aff,loci
0,HLA-B44:04,MYSFVSEE,37829.751440,0.025779,HLA-B
1,HLA-B44:05,MYSFVSEE,38198.075124,0.024884,HLA-B
2,HLA-B44:07,MYSFVSEE,38064.733103,0.025207,HLA-B
3,HLA-A30:10,MYSFVSEE,34529.712035,0.034215,HLA-A
4,HLA-B44:02,MYSFVSEE,38191.475075,0.024900,HLA-B
...,...,...,...,...,...
29398,HLA-C18,YYVGYLQPR,20470.598345,0.082537,HLA-C
29399,HLA-C18,YYVGYLQPRT,33421.974532,0.037235,HLA-C
29400,HLA-C18,YYVWKSYV,22235.680609,0.077440,HLA-C
29401,HLA-C18,YYVWKSYVH,30913.271511,0.044532,HLA-C


In [86]:
# Peptide x (loci, genotype) matrix of `transformed_aff`, built from the
# per-allele predictions plus the three per-prefix mean frames.
pivot_input = pd.concat([df, a74, c17, c18], sort=False)
data_pivot = pd.pivot_table(
    pivot_input,
    index='Peptide',
    columns=['loci', 'genotype'],
    values='transformed_aff',
)
data_pivot.to_pickle('mhc1_haplotype_mhcflurry_pred_affinity_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.094719,0.096633,0.065167,0.094719,0.096370,0.032908,0.033360,0.031451,0.038903,0.036177,...,0.058567,0.058567,0.058567,0.058567,0.050586,0.045115,0.032813,0.032292,0.032292,0.033854
AAAYYVGYL,0.051921,0.079042,0.045578,0.051921,0.052842,0.249069,0.398928,0.321426,0.505718,0.547036,...,0.662885,0.662885,0.662885,0.662885,0.635438,0.532602,0.123593,0.099414,0.099414,0.171951
AAAYYVGYLQ,0.034828,0.045169,0.032080,0.034828,0.034498,0.035324,0.036939,0.033438,0.038991,0.040745,...,0.038188,0.038188,0.038188,0.038188,0.034922,0.034588,0.032520,0.031588,0.031588,0.034383
AACCHLAK,0.037984,0.044418,0.033238,0.037984,0.038412,0.040850,0.039012,0.035140,0.048772,0.041690,...,0.086629,0.086629,0.086629,0.086629,0.072791,0.092736,0.045756,0.044588,0.044588,0.048091
AACCHLAKA,0.046057,0.063561,0.039850,0.046057,0.045247,0.405164,0.515754,0.503002,0.495733,0.639563,...,0.304727,0.304727,0.304727,0.304727,0.279128,0.230489,0.071056,0.059713,0.059713,0.093743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.053433,0.101891,0.045845,0.053433,0.054487,0.087990,0.085834,0.075096,0.142467,0.085585,...,0.062016,0.062016,0.062016,0.062016,0.056933,0.067355,0.082537,0.082538,0.082538,0.082535
YYVGYLQPRT,0.026303,0.034723,0.024996,0.026303,0.026396,0.036421,0.042080,0.035949,0.052389,0.043779,...,0.028586,0.028586,0.028586,0.028586,0.027950,0.028314,0.037235,0.037964,0.037964,0.035777
YYVWKSYV,0.028913,0.037161,0.026876,0.028913,0.028558,0.039297,0.038931,0.034029,0.050133,0.039407,...,0.039761,0.039761,0.039761,0.039761,0.037265,0.034266,0.077440,0.092437,0.092437,0.047445
YYVWKSYVH,0.044654,0.075975,0.037013,0.044654,0.044827,0.036534,0.032578,0.030818,0.042809,0.032589,...,0.032421,0.032421,0.032421,0.032421,0.031183,0.031946,0.044532,0.047430,0.047430,0.038737


In [None]:
# Rebuild the Peptide/genotype/loci frame from the MHCflurry predictions.
df = (
    mhcflurry_preds
    .assign(loci=mhcflurry_preds['allele'].str[:5])
    .drop(columns=[
        'sequence_length',
        'mhcflurry_affinity_percentile',
        'mhcflurry_processing_score',
        'mhcflurry_presentation_score',
    ])
    .rename(columns={'sequence': 'Peptide', 'allele': 'genotype'})
)

# Disabled variant: add one 'unknown' genotype row per (Peptide, loci) with
# affinity 0 before pivoting.
# df2 = df.groupby(['Peptide', 'loci']).count().reset_index()[['Peptide', 'loci']]
# df2['genotype'] = 'unknown'
# df2['transformed_aff'] = 0.
# df2['mhcflurry_affinity'] = 0.
# df_with_unknown = pd.concat([df, df2], sort=False)

# Pivot the per-allele predictions only (no per-prefix mean frames here).
data_pivot = pd.pivot_table(
    df,
    index='Peptide',
    columns=['loci', 'genotype'],
    values='transformed_aff',
)
# data_pivot.to_pickle('mhc1_haplotype_mhcflurry_pred_affinity_pivot.pkl.gz', protocol=2)
data_pivot

# Load MHCflurry 2.0 Predictions

In [41]:
# NOTE: this rebinds `mhcflurry_preds` to the MHCflurry 2.0 predictions, which
# replaces the frame loaded from 'mhc1_haplotype_preds_mhcflurry.csv'. Cells
# that need those earlier predictions must not rely on this name from here on.
mhcflurry_preds = pd.read_csv('mhc1_haplotype_preds_mhcflurry2.0.csv')

# Log-transformed binding affinity, computed with the same helper (and the same
# 50000 nM cap) as the other MHCflurry predictions so both are on one scale.
mhcflurry_preds['transformed_aff'] = transform_affinity(mhcflurry_preds['mhcflurry_affinity'].values)

# Locus = first five characters of the allele name (e.g. 'HLA-B').
mhcflurry_preds['loci'] = mhcflurry_preds['allele'].str[:5]

# Drop the unused score columns and switch to the Peptide/genotype naming
# used for pivoting.
mhcflurry_preds = (
    mhcflurry_preds
    .drop(columns=[
        'mhcflurry_affinity_percentile',
        'mhcflurry_processing_score',
        'mhcflurry_presentation_score',
    ])
    .rename(columns={'allele': 'genotype', 'peptide': 'Peptide'})
)

mhcflurry_preds

Unnamed: 0,genotype,Peptide,mhcflurry_affinity,mhcflurry_presentation_percentile,transformed_aff,loci
0,HLA-B44:04,MYSFVSEE,33181.568128,37.359049,0.037896,HLA-B
1,HLA-B44:05,MYSFVSEE,33603.431308,37.359049,0.036729,HLA-B
2,HLA-B44:07,MYSFVSEE,33443.140537,37.359049,0.037170,HLA-B
3,HLA-A30:10,MYSFVSEE,30462.139109,37.359049,0.045799,HLA-A
4,HLA-B44:02,MYSFVSEE,33494.479955,37.359049,0.037029,HLA-B
...,...,...,...,...,...,...
6762685,HLA-B55:02,VLKGVKLHYT,27082.660337,46.224674,0.056667,HLA-B
6762686,HLA-B67:01,VLKGVKLHYT,32158.622657,62.744674,0.040790,HLA-B
6762687,HLA-A24:10,VLKGVKLHYT,31610.775717,62.744674,0.042378,HLA-A
6762688,HLA-B15:32,VLKGVKLHYT,30062.941186,46.224674,0.047019,HLA-B


In [43]:
df = mhcflurry_preds

# For each allele-name prefix, average the numeric prediction columns over all
# alleles whose name contains that prefix (per peptide) and label the result as
# a pseudo-genotype named after the prefix.
allele_groups = (('HLA-A74', 'HLA-A'), ('HLA-C17', 'HLA-C'), ('HLA-C18', 'HLA-C'))
group_means = []
for prefix, locus in allele_groups:
    members = df.loc[df['genotype'].str.contains(prefix)]
    # numeric_only=True: the string columns (`genotype`, `loci`) cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them.
    per_peptide = members.groupby('Peptide').mean(numeric_only=True).reset_index()
    per_peptide['loci'] = locus
    per_peptide['genotype'] = prefix
    group_means.append(per_peptide)

a74, c17, c18 = group_means

# Peptide x (loci, genotype) matrix of `transformed_aff`, including the three
# per-prefix mean columns.
data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(
    index='Peptide',
    columns=['loci', 'genotype'],
    values='transformed_aff',
)
data_pivot.to_pickle('mhc1_haplotype_mhcflurry2.0_pred_affinity_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.105609,0.136948,0.078258,0.105609,0.099377,0.045679,0.044721,0.043267,0.052031,0.047437,...,0.082833,0.082833,0.082833,0.082833,0.076228,0.069838,0.043940,0.042713,0.042713,0.046394
AAAYYVGYL,0.059259,0.106627,0.055272,0.059259,0.059694,0.261589,0.432255,0.372506,0.509595,0.575560,...,0.602506,0.602506,0.602506,0.602506,0.588590,0.515765,0.143171,0.099117,0.099117,0.231281
AAAYYVGYLQ,0.047088,0.058239,0.043649,0.047088,0.045955,0.046259,0.048152,0.043510,0.056553,0.056510,...,0.053779,0.053779,0.053779,0.053779,0.050793,0.051782,0.042645,0.040981,0.040981,0.045974
AACCHLAK,0.051374,0.067382,0.045865,0.051374,0.049864,0.050148,0.050080,0.047409,0.062404,0.053535,...,0.109689,0.109689,0.109689,0.109689,0.098246,0.118048,0.055409,0.053855,0.053855,0.058517
AACCHLAKA,0.055832,0.079984,0.050194,0.055832,0.053780,0.340709,0.476701,0.502855,0.411340,0.621088,...,0.302690,0.302690,0.302690,0.302690,0.295129,0.251292,0.081719,0.068141,0.068141,0.108877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.070006,0.147469,0.060451,0.070006,0.073079,0.103479,0.099143,0.092059,0.159730,0.100631,...,0.083839,0.083839,0.083839,0.083839,0.078158,0.089626,0.099271,0.102135,0.102135,0.093543
YYVGYLQPRT,0.038383,0.048676,0.037177,0.038383,0.038131,0.049290,0.050717,0.045845,0.062303,0.051598,...,0.040252,0.040252,0.040252,0.040252,0.040166,0.040705,0.047682,0.047887,0.047887,0.047271
YYVWKSYV,0.042219,0.052309,0.039432,0.042219,0.041000,0.047750,0.047634,0.043581,0.062600,0.050109,...,0.052355,0.052355,0.052355,0.052355,0.052012,0.051413,0.085485,0.098521,0.098521,0.059411
YYVWKSYVH,0.062425,0.135287,0.054883,0.062425,0.062163,0.048823,0.046424,0.043007,0.058425,0.045571,...,0.044454,0.044454,0.044454,0.044454,0.043531,0.045259,0.054809,0.058303,0.058303,0.047823


# Mean Ensemble from NetMHC and MHCflurry

In [10]:
# NetMHCpan side of the ensemble: keep Peptide, genotype and the nM affinity,
# with the latter renamed so it stays distinguishable after merging.
ens_netmhc1 = (
    netmhc1_data
    .drop(columns=['sequence_length', 'loci', '1-log50k'])
    .rename(columns={'nM': 'netmhc_nM'})
)
ens_netmhc1

Unnamed: 0,Peptide,netmhc_nM,genotype
0,MYSFVSEE,40836.4258,HLA-B44:04
1,YSFVSEET,43144.7188,HLA-B44:04
2,SFVSEETG,44191.7070,HLA-B44:04
3,FVSEETGT,46105.8516,HLA-B44:04
4,VSEETGTL,42674.8945,HLA-B44:04
...,...,...,...
29398,DSEPVLKGVK,38063.9297,HLA-B56:10
29399,SEPVLKGVKL,1738.0863,HLA-B56:10
29400,EPVLKGVKLH,16034.5420,HLA-B56:10
29401,PVLKGVKLHY,39286.5586,HLA-B56:10


In [11]:
# MHCflurry side of the ensemble. The three needed columns are read straight
# from the prediction file rather than taken from `mhcflurry_preds`: that name
# is rebound to the MHCflurry 2.0 frame (different columns) earlier in the
# notebook, so relying on it breaks when the cells are run top to bottom.
ens_mhcflurry = pd.read_csv(
    'mhc1_haplotype_preds_mhcflurry.csv',
    usecols=['allele', 'peptide', 'mhcflurry_affinity'],
)
ens_mhcflurry = ens_mhcflurry.rename(
    columns={'allele': 'genotype', 'peptide': 'Peptide', 'mhcflurry_affinity': 'mhcflurry_nM'})
# Fix the column order explicitly instead of depending on the file's layout.
ens_mhcflurry = ens_mhcflurry[['genotype', 'Peptide', 'mhcflurry_nM']]
ens_mhcflurry

Unnamed: 0,genotype,Peptide,mhcflurry_nM
0,HLA-B44:04,MYSFVSEE,37829.751440
1,HLA-B44:05,MYSFVSEE,38198.075124
2,HLA-B44:07,MYSFVSEE,38064.733103
3,HLA-A30:10,MYSFVSEE,34529.712035
4,HLA-B44:02,MYSFVSEE,38191.475075
...,...,...,...
6762685,HLA-B55:02,VLKGVKLHYT,29249.898280
6762686,HLA-B67:01,VLKGVKLHYT,34574.906172
6762687,HLA-A24:10,VLKGVKLHYT,35345.127043
6762688,HLA-B15:32,VLKGVKLHYT,32874.896891


In [20]:
# Keep only the allele/peptide pairs predicted by both tools.
ens_combined = ens_netmhc1.merge(ens_mhcflurry, on=['genotype', 'Peptide'], how='inner')
# Locus = first five characters of the allele name (e.g. 'HLA-B').
ens_combined['loci'] = ens_combined['genotype'].str[:5]
# Two ensembles on the nM scale: the mean of the two predictions and the
# larger of the two.
ens_combined['mean_nM'] = (ens_combined['netmhc_nM'] + ens_combined['mhcflurry_nM']) / 2
ens_combined['max_nM'] = ens_combined[['netmhc_nM', 'mhcflurry_nM']].max(axis=1)
# transform_affinity accepts arrays, so each column is transformed in a single
# vectorized call rather than one Python call per row.
ens_combined['mean_transformed'] = transform_affinity(ens_combined['mean_nM'].values)
ens_combined['max_transformed'] = transform_affinity(ens_combined['max_nM'].values)
ens_combined

Unnamed: 0,Peptide,netmhc_nM,genotype,mhcflurry_nM,mean_nM,max_nM,mean_transformed,max_transformed
0,MYSFVSEE,40836.4258,HLA-B44:04,37829.751440,39333.088620,40836.425800,0.022178,0.018711
1,YSFVSEET,43144.7188,HLA-B44:04,35954.932789,39549.825795,43144.718800,0.021670,0.013629
2,SFVSEETG,44191.7070,HLA-B44:04,38013.397725,41102.552363,44191.707000,0.018111,0.011413
3,FVSEETGT,46105.8516,HLA-B44:04,37585.110542,41845.481071,46105.851600,0.016455,0.007494
4,VSEETGTL,42674.8945,HLA-B44:04,32961.628791,37818.261646,42674.894500,0.025807,0.014641
...,...,...,...,...,...,...,...,...
6762685,DSEPVLKGVK,38063.9297,HLA-B56:10,11957.159303,25010.544502,38063.929700,0.064024,0.025209
6762686,SEPVLKGVKL,1738.0863,HLA-B56:10,2608.759444,2173.422872,2608.759444,0.289814,0.272940
6762687,EPVLKGVKLH,16034.5420,HLA-B56:10,12895.423658,14464.982829,16034.542000,0.114632,0.105111
6762688,PVLKGVKLHY,39286.5586,HLA-B56:10,25858.490129,32572.524364,39286.558600,0.039608,0.022287


In [33]:
print('Total pMHC pairs: ', len(ens_combined))
# Number of pairs at or below 50 nM in each affinity column.
for message, column in (
    ('# NetMHC binders: ', 'netmhc_nM'),
    ('# MHCflurry binders: ', 'mhcflurry_nM'),
    ('# Mean Ens. binders: ', 'mean_nM'),
    ('# Max Ens. binders: ', 'max_nM'),
):
    print(message, (ens_combined[column].values <= 50).sum())

Total pMHC pairs:  6762690
# NetMHC binders:  12497
# MHCflurry binders:  38845
# Mean Ens. binders:  10036
# Max Ens. binders:  7528


In [21]:
# Transformed score that corresponds to an affinity of 50.
affinity_to_check = 50
print(transform_affinity(affinity_to_check))

0.6384377847127609


In [27]:
# Transformed score that corresponds to an affinity of 31838.
transform_affinity(x=31838)

0.04171642418454635

In [34]:
df = ens_combined

# For each allele-name prefix, average the numeric columns over all alleles
# whose name contains that prefix (per peptide) and label the result as a
# pseudo-genotype named after the prefix.
allele_groups = (('HLA-A74', 'HLA-A'), ('HLA-C17', 'HLA-C'), ('HLA-C18', 'HLA-C'))
group_means = []
for prefix, locus in allele_groups:
    members = df.loc[df['genotype'].str.contains(prefix)]
    # numeric_only=True: the string columns (`genotype`, `loci`) cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them.
    per_peptide = members.groupby('Peptide').mean(numeric_only=True).reset_index()
    per_peptide['loci'] = locus
    per_peptide['genotype'] = prefix
    group_means.append(per_peptide)

a74, c17, c18 = group_means

# Peptide x (loci, genotype) matrix of the mean-ensemble transformed score,
# including the three per-prefix mean columns.
data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(
    index='Peptide',
    columns=['loci', 'genotype'],
    values='mean_transformed',
)
data_pivot.to_pickle('mhc1_haplotype_mean-ensemble_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.099904,0.119730,0.075010,0.099904,0.103053,0.030960,0.040544,0.045232,0.029476,0.047374,...,0.042643,0.042643,0.042643,0.042643,0.039200,0.033960,0.023998,0.024134,0.024134,0.023725
AAAYYVGYL,0.062267,0.085513,0.055643,0.062267,0.064940,0.237727,0.430435,0.369027,0.224186,0.527945,...,0.401438,0.401438,0.401438,0.401438,0.400499,0.354651,0.098521,0.082455,0.082455,0.130655
AAAYYVGYLQ,0.044230,0.065236,0.037418,0.044230,0.042310,0.049180,0.082729,0.079332,0.041714,0.088255,...,0.044212,0.044212,0.044212,0.044212,0.042455,0.038649,0.026250,0.024935,0.024935,0.028880
AACCHLAK,0.036249,0.058857,0.029827,0.036249,0.036305,0.034517,0.034367,0.034648,0.034952,0.040558,...,0.045251,0.045251,0.045251,0.045251,0.040600,0.044300,0.029786,0.029516,0.029516,0.030327
AACCHLAKA,0.046042,0.069239,0.038753,0.046042,0.046332,0.222861,0.308355,0.385027,0.152427,0.398331,...,0.124742,0.124742,0.124742,0.124742,0.122662,0.095026,0.047754,0.042288,0.042288,0.058685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.040328,0.090958,0.035315,0.040328,0.045142,0.093471,0.103548,0.091099,0.117097,0.118416,...,0.044245,0.044245,0.044245,0.044245,0.042114,0.042946,0.064351,0.060302,0.060302,0.072448
YYVGYLQPRT,0.028263,0.044036,0.025075,0.028263,0.029381,0.066425,0.084018,0.079199,0.070191,0.094782,...,0.027799,0.027799,0.027799,0.027799,0.027483,0.023921,0.038295,0.036115,0.036115,0.042655
YYVWKSYV,0.031677,0.045735,0.029042,0.031677,0.032530,0.061548,0.070117,0.073983,0.064317,0.069071,...,0.045366,0.045366,0.045366,0.045366,0.044032,0.039757,0.082259,0.088021,0.088021,0.070733
YYVWKSYVH,0.046323,0.088608,0.039156,0.046323,0.049080,0.031857,0.034921,0.028971,0.033717,0.039375,...,0.024933,0.024933,0.024933,0.024933,0.024360,0.024976,0.045196,0.047047,0.047047,0.041493


In [35]:
# Build and save the Peptide x (loci, genotype) table of max-ensemble scores.
#
# Besides the per-allele rows already in `ens_combined`, three group-level
# genotypes (HLA-A74, HLA-C17, HLA-C18) are added. Each one is the per-peptide
# mean over every allele whose genotype string contains the group name.
# NOTE(review): this cell rebinds `data_pivot`, replacing whatever an earlier
# cell stored under that name.
df = ens_combined

# `numeric_only=True` keeps the string columns (e.g. `genotype`) out of the mean.
# Older pandas dropped them silently; pandas >= 2.0 raises a TypeError instead.
# `regex=False` makes the group name a literal substring match.
a74 = (
    df.loc[df['genotype'].str.contains('HLA-A74', regex=False)]
    .groupby('Peptide')
    .mean(numeric_only=True)
    .reset_index()
    .assign(loci='HLA-A', genotype='HLA-A74')
)

c17 = (
    df.loc[df['genotype'].str.contains('HLA-C17', regex=False)]
    .groupby('Peptide')
    .mean(numeric_only=True)
    .reset_index()
    .assign(loci='HLA-C', genotype='HLA-C17')
)

c18 = (
    df.loc[df['genotype'].str.contains('HLA-C18', regex=False)]
    .groupby('Peptide')
    .mean(numeric_only=True)
    .reset_index()
    .assign(loci='HLA-C', genotype='HLA-C18')
)

# One row per peptide, one column per (loci, genotype); cells hold `max_transformed`.
data_pivot = pd.concat([df, a74, c17, c18], sort=False).pivot_table(
    index='Peptide',
    columns=['loci', 'genotype'],
    values='max_transformed',
)
# protocol=2 is kept as in the original (it is the newest pickle protocol Python 2 can read).
data_pivot.to_pickle('mhc1_haplotype_max-ensemble_pivot.pkl.gz', protocol=2)
data_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.094719,0.096633,0.065167,0.094719,0.096370,0.029052,0.033360,0.031451,0.020923,0.036177,...,0.029064,0.029064,0.029064,0.029064,0.029064,0.024008,0.015958,0.016639,0.016639,0.014597
AAAYYVGYL,0.051921,0.079042,0.045578,0.051921,0.052842,0.227627,0.398928,0.321426,0.162347,0.512131,...,0.340147,0.340147,0.340147,0.340147,0.340147,0.297585,0.079491,0.068130,0.068130,0.102213
AAAYYVGYLQ,0.034828,0.045169,0.032080,0.034828,0.034498,0.035324,0.036939,0.033438,0.038991,0.040745,...,0.038188,0.038188,0.038188,0.038188,0.034922,0.034588,0.020381,0.018729,0.018729,0.023686
AACCHLAK,0.034546,0.044418,0.026537,0.034546,0.034245,0.028591,0.029944,0.034158,0.022931,0.039439,...,0.016770,0.016770,0.016770,0.016770,0.016770,0.012682,0.016185,0.016560,0.016560,0.015434
AACCHLAKA,0.046027,0.063561,0.037668,0.046027,0.045247,0.165461,0.249327,0.334851,0.089497,0.337730,...,0.067518,0.067518,0.067518,0.067518,0.067518,0.042302,0.029532,0.027634,0.027634,0.033329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.028852,0.081183,0.025862,0.028852,0.036655,0.087990,0.085834,0.075096,0.097212,0.085585,...,0.029346,0.029346,0.029346,0.029346,0.029346,0.023655,0.049380,0.042392,0.042392,0.063355
YYVGYLQPRT,0.026303,0.034723,0.024996,0.026303,0.026396,0.036421,0.042080,0.035949,0.052389,0.043779,...,0.027019,0.027019,0.027019,0.027019,0.027019,0.019727,0.034794,0.034302,0.034302,0.035777
YYVWKSYV,0.028913,0.037161,0.026876,0.028913,0.028558,0.039297,0.038931,0.034029,0.050133,0.039407,...,0.039761,0.039761,0.039761,0.039761,0.037265,0.034266,0.071686,0.083807,0.083807,0.047445
YYVWKSYVH,0.044654,0.075975,0.037013,0.044654,0.044827,0.027405,0.032578,0.027160,0.025439,0.032589,...,0.018007,0.018007,0.018007,0.018007,0.018007,0.018495,0.044023,0.046666,0.046666,0.038737


# Mean ensemble of NetMHCpan4.0 and MHCflurry2.0

In [48]:
# Load the per-predictor pivot tables (peptides as rows, alleles as columns).
netmhc_pivot = pd.read_pickle('mhc1_haplotype_netmhc_pred_affinity_pivot.pkl.gz')
mhcflurry_pivot = pd.read_pickle('mhc1_haplotype_mhcflurry2.0_pred_affinity_pivot.pkl.gz')

# Both tables must cover exactly the same peptides and alleles before combining.
assert netmhc_pivot.shape == mhcflurry_pivot.shape
assert set(netmhc_pivot.columns) == set(mhcflurry_pivot.columns)
assert set(netmhc_pivot.index) == set(mhcflurry_pivot.index)

# Map each score s back through 50000 ** (1 - s), the inverse of the
# 1 - log(x) / log(50000) transform applied again below.
netmhc_pivot_nm, mhcflurry_pivot_nm = (
    50000 ** (1 - pivot) for pivot in (netmhc_pivot, mhcflurry_pivot)
)

# Average the two predictors on the untransformed scale, then transform back.
ens_pivot_nm = netmhc_pivot_nm.add(mhcflurry_pivot_nm).div(2)
ens_pivot = 1 - np.log(ens_pivot_nm) / np.log(50000)

ens_pivot.to_pickle('mhc1_netmhcpan4.0_mhcflurry2.0_ensemble_affinity.pkl.gz', protocol=2)
ens_pivot

loci,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,HLA-A,...,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C,HLA-C
genotype,HLA-A01:01,HLA-A01:02,HLA-A01:03,HLA-A01:09,HLA-A01:23,HLA-A02:01,HLA-A02:02,HLA-A02:03,HLA-A02:04,HLA-A02:05,...,HLA-C17:02,HLA-C17:03,HLA-C17:04,HLA-C17:05,HLA-C17:06,HLA-C17:07,HLA-C18,HLA-C18:01,HLA-C18:02,HLA-C18:03
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGY,0.105505,0.143522,0.082048,0.105505,0.104677,0.037018,0.046493,0.051890,0.035161,0.053552,...,0.052115,0.052115,0.052115,0.052115,0.049692,0.044106,0.028880,0.028737,0.028737,0.029137
AAAYYVGYL,0.066290,0.099294,0.060903,0.066290,0.068799,0.243041,0.452552,0.409592,0.224231,0.538487,...,0.398911,0.398911,0.398911,0.398911,0.398085,0.353328,0.105936,0.082313,0.082313,0.145829
AAAYYVGYLQ,0.050816,0.073134,0.043374,0.050816,0.048346,0.055380,0.091835,0.087515,0.050330,0.101244,...,0.052227,0.052227,0.052227,0.052227,0.050746,0.047234,0.030836,0.029171,0.029171,0.034168
AACCHLAK,0.042553,0.071591,0.035676,0.042553,0.041701,0.038747,0.039441,0.040569,0.040557,0.046198,...,0.052035,0.052035,0.052035,0.052035,0.048827,0.051112,0.033741,0.033363,0.033363,0.034467
AACCHLAKA,0.050785,0.077612,0.043736,0.050785,0.050535,0.216629,0.305789,0.385049,0.150765,0.397554,...,0.124579,0.124579,0.124579,0.124579,0.124007,0.097200,0.051970,0.045665,0.045665,0.063569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVGYLQPR,0.047186,0.108518,0.041570,0.047186,0.053111,0.101366,0.111385,0.100820,0.123275,0.128891,...,0.052604,0.052604,0.052604,0.052604,0.050537,0.050905,0.071012,0.067523,0.067523,0.077248
YYVGYLQPRT,0.034253,0.051494,0.030995,0.034253,0.035273,0.075122,0.090740,0.087005,0.076090,0.101528,...,0.033389,0.033389,0.033389,0.033389,0.033349,0.029607,0.043535,0.040844,0.040844,0.048675
YYVWKSYV,0.038279,0.053743,0.035277,0.038279,0.038825,0.066829,0.076117,0.081250,0.071388,0.076305,...,0.051826,0.051826,0.051826,0.051826,0.051655,0.048461,0.087633,0.090868,0.090868,0.078235
YYVWKSYVH,0.054932,0.117858,0.047842,0.054932,0.057730,0.037492,0.041749,0.034766,0.040445,0.046134,...,0.030284,0.030284,0.030284,0.030284,0.029886,0.030914,0.050247,0.052319,0.052319,0.046045


# PUFFIN

In [29]:
# Create dataframe with all MHC/peptide pairs (a cross join).
# Each frame gets a constant `key` column; per the frames shown earlier it is the
# only column name they share, so the merge pairs every peptide row with every allele.
a = mhc1_data.assign(key=0)
b = hla_alleles.assign(key=0)
pmhc_pairs = a.merge(b, how='outer').drop(columns=['key'])
pmhc_pairs

Unnamed: 0,protein,sequence,start_pos,epi_len,entropy,perc_mutated,glyco_probs,crosses_cleavage,sequence_length,allele
0,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:04
1,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:05
2,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:07
3,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-A30:10
4,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:02
...,...,...,...,...,...,...,...,...,...,...
6763375,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B55:02
6763376,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B67:01
6763377,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-A24:10
6763378,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B15:32


In [30]:
# Add a constant dummy column; it is exported as the middle column of the
# PUFFIN input file written by the next code cell.
# NOTE(review): presumably PUFFIN's input format expects a measurement column in
# that position and -1 is a stand-in — confirm against PUFFIN's documentation.
# Caution: this modifies `pmhc_pairs` in place.
pmhc_pairs['placeholder'] = -1
pmhc_pairs

Unnamed: 0,protein,sequence,start_pos,epi_len,entropy,perc_mutated,glyco_probs,crosses_cleavage,sequence_length,allele,placeholder
0,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:04,-1
1,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:05,-1
2,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:07,-1
3,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-A30:10,-1
4,E,MYSFVSEE,0,8,0.002908,0.000213,0.0,0,8,HLA-B44:02,-1
...,...,...,...,...,...,...,...,...,...,...,...
6763375,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B55:02,-1
6763376,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B67:01,-1
6763377,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-A24:10,-1
6763378,S2,VLKGVKLHYT,1263,10,0.011631,0.000853,0.0,0,10,HLA-B15:32,-1


In [32]:
# Export (sequence, placeholder, allele) as a tab-separated file with no header
# row and no index column; this is the input file for the PUFFIN run.
pmhc_pairs.to_csv(
    'puffin_mhc1_preds/datafile.tsv',
    sep='\t',
    columns=['sequence', 'placeholder', 'allele'],
    index=False,
    header=False,
)

Shell commands to run PUFFIN on the exported `datafile.tsv` (run from the PUFFIN checkout, outside this notebook):
```
cd /path/to/PUFFIN

python preprocess.py \
    -i puffin/puffin_mhc1_preds/datafile.tsv \
    -o puffin/puffin_mhc1_preds/puffin_outdir \
    -c 1
    
python score.py \
    -o puffin/puffin_mhc1_preds/puffin_outdir \
    -c 1 \
    -g 0

```