In [1]:
import collections
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Render inline matplotlib figures in the 'retina' (high-DPI) format.
%config InlineBackend.figure_format='retina'

In [3]:
# Load the epitope feature table and expose its 'Epitope' column under the name 'sequence'.
orig_data = pd.read_csv('../AllEpitopeFeatures.csv').rename(columns={'Epitope': 'sequence'})
# Length of each peptide string, in characters.
orig_data['sequence_length'] = orig_data['sequence'].map(len)
orig_data

Unnamed: 0,protein,sequence,entropy_sum,epi_start_pos,epi_len,glyco_probs,crosses_cleavage,sequence_length
0,E,MYSFVSEE,0.002395,0,8,0.0,0,8
1,E,YSFVSEET,0.002395,1,8,0.0,0,8
2,E,SFVSEETG,0.002395,2,8,0.0,0,8
3,E,FVSEETGT,0.002395,3,8,0.0,0,8
4,E,VSEETGTL,0.002395,4,8,0.0,0,8
...,...,...,...,...,...,...,...,...
174523,S2,KGCCSCGSCCKFDEDDSEPVLKGVK,0.095264,1244,25,0.0,0,25
174524,S2,GCCSCGSCCKFDEDDSEPVLKGVKL,0.092869,1245,25,0.0,0,25
174525,S2,CCSCGSCCKFDEDDSEPVLKGVKLH,0.092869,1246,25,0.0,0,25
174526,S2,CSCGSCCKFDEDDSEPVLKGVKLHY,0.086502,1247,25,0.0,0,25


In [4]:
# Final set of HLA alleles: a headerless text file read into a single 'allele' column.
hla_alleles = pd.read_csv(
    'MHC2_allele_marry.txt',
    header=None,
    names=['allele'],
)
hla_alleles

Unnamed: 0,allele
0,HLA-DPA10301-DPB11301
1,HLA-DPA10201-DPB155801
2,DRB1_0701
3,HLA-DPA10207-DPB116201
4,HLA-DPA10301-DPB15501
...,...
313,HLA-DQA10401-DQB10301
314,HLA-DPA10103-DPB112601
315,HLA-DPA10301-DPB16501
316,HLA-DPA10202-DPB11301


In [8]:
# Keep only the epitopes whose length is between 13 and 25 residues (both bounds inclusive)
# for the MHC-II predictors.
mhc2_data = orig_data[orig_data['sequence_length'].between(13, 25)]
mhc2_data

Unnamed: 0,protein,sequence,entropy_sum,epi_start_pos,epi_len,glyco_probs,crosses_cleavage,sequence_length
330,E,MYSFVSEETGTLI,0.002395,0,13,0.0,0,13
331,E,YSFVSEETGTLIV,0.002395,1,13,0.0,0,13
332,E,SFVSEETGTLIVN,0.002395,2,13,0.0,0,13
333,E,FVSEETGTLIVNS,0.002395,3,13,0.0,0,13
334,E,VSEETGTLIVNSV,0.004789,4,13,0.0,0,13
...,...,...,...,...,...,...,...,...
174523,S2,KGCCSCGSCCKFDEDDSEPVLKGVK,0.095264,1244,25,0.0,0,25
174524,S2,GCCSCGSCCKFDEDDSEPVLKGVKL,0.092869,1245,25,0.0,0,25
174525,S2,CCSCGSCCKFDEDDSEPVLKGVKLH,0.092869,1246,25,0.0,0,25
174526,S2,CSCGSCCKFDEDDSEPVLKGVKLHY,0.086502,1247,25,0.0,0,25


In [7]:
# Dump the bare peptide sequences, one per line (no index, no header, not paired with any
# MHC allele); this is the `-f` input file named in the NetMHCIIpan argument lines.
mhc2_data.to_csv('peptides_13-25.pep', columns=['sequence'], index=False, header=False)

# Create commands for running NetMHCIIpan-3.2 and NetMHCIIpan-4.0

In [9]:
# Build one NetMHCIIpan-3.2 (MHC-II) argument line per allele and save them to a text file
# (one line per invocation).
cmd_template = '-inptype 1 -f peptides_13-25.pep -a {allele} -xls -xlsfile {allele_file}'
cmds = [
    cmd_template.format(
        allele=safe_name,
        allele_file='netmhcii-3.2_preds/%s_preds.xls' % safe_name,
    )
    # Allele names are sanitised once ('*' -> '_', ':' removed) and reused for both the
    # -a argument and the output file name.
    for safe_name in (
        name.replace('*', '_').replace(':', '')
        for name in hla_alleles['allele'].values
    )
]
print('# cmds: ', len(cmds))
with open('netmhc_class2_args.txt', 'w') as args_file:
    args_file.writelines(line + '\n' for line in cmds)

# cmds:  318


In [10]:
# Build one NetMHCIIpan-4.0 (MHC-II) argument line per allele and save them to a text file
# (one line per invocation). Unlike the 3.2 template, this one also passes -BA.
cmd_template = '-inptype 1 -f peptides_13-25.pep -a {allele} -BA -xls -xlsfile {allele_file}'
cmds = [
    cmd_template.format(
        allele=safe_name,
        allele_file='netmhcii-4.0_preds/%s_preds.xls' % safe_name,
    )
    # Allele names are sanitised once ('*' -> '_', ':' removed) and reused for both the
    # -a argument and the output file name.
    for safe_name in (
        name.replace('*', '_').replace(':', '')
        for name in hla_alleles['allele'].values
    )
]
print('# cmds: ', len(cmds))
with open('netmhc4.0_class2_args.txt', 'w') as args_file:
    args_file.writelines(line + '\n' for line in cmds)

# cmds:  318


Command to run:

```
cat netmhc_class2_args.txt | xargs -P 70 -d '\n' -n 1 ./netMHCIIpan-3.2/netMHCIIpan

cat netmhc4.0_class2_args.txt | xargs -P 70 -d '\n' -n 1 ./netMHCIIpan-4.0/netMHCIIpan
```

# Load NetMHCIIpan4.0 predictions

In [17]:
# Load the per-allele NetMHCIIpan-4.0 prediction tables and stack them into one long frame.
dfs = []
skipped_alleles = []
for allele in hla_alleles['allele'].values:
    # Sanitise the allele name exactly as the command-generation cell does ('*' -> '_',
    # ':' removed) so the path read here always matches the -xlsfile path that was requested.
    preds_path = './netmhcii-4.0_preds/%s_preds.xls' % allele.replace('*', '_').replace(':', '')
    try:
        # Files are tab-delimited text despite the .xls extension; the first line is
        # skipped so that the following line supplies the column names.
        df = pd.read_csv(preds_path, delimiter='\t', skiprows=[0])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Best effort: an allele without a (non-empty) prediction file is skipped, but it
        # is recorded so the omission is visible. Any other error is allowed to surface.
        skipped_alleles.append(allele)
        continue
    df['genotype'] = allele
    df = df.drop(columns=['Pos', 'ID', 'Ave', 'NB'])
    dfs.append(df)
print('Loaded %d alleles' % len(dfs))
if skipped_alleles:
    print('Skipped %d alleles without predictions: %s' % (len(skipped_alleles), ', '.join(skipped_alleles)))
netmhc2_data = pd.concat(dfs)
netmhc2_data['sequence_length'] = [len(x) for x in netmhc2_data['Peptide'].values]
# Locus label: the first 4 characters for names starting with 'DRB' (e.g. 'DRB1'),
# otherwise the first 6 characters (e.g. 'HLA-DP').
netmhc2_data['loci'] = [x[:4] if x[:3] == 'DRB' else x[:6] for x in netmhc2_data['genotype'].values]
# netmhc2_data['1-log50k'] = 1 - np.log(netmhc2_data['nM']) / np.log(50000)
netmhc2_data

Loaded 280 alleles


Unnamed: 0,Peptide,Target,Score,Rank,Score_BA,nM,Rank_BA,genotype,sequence_length,loci
0,MYSFVSEETGTLI,-99.999,0.041539,35.55,0.246930,3456.633552,50.21,HLA-DPA10301-DPB11301,13,HLA-DP
1,YSFVSEETGTLIV,-99.999,0.038426,39.10,0.240716,3697.028008,52.48,HLA-DPA10301-DPB11301,13,HLA-DP
2,SFVSEETGTLIVN,-99.999,0.028666,53.15,0.212631,5009.833741,62.80,HLA-DPA10301-DPB11301,13,HLA-DP
3,FVSEETGTLIVNS,-99.999,0.036350,41.72,0.206312,5364.337635,65.16,HLA-DPA10301-DPB11301,13,HLA-DP
4,VSEETGTLIVNSV,-99.999,0.043983,32.96,0.200067,5739.329682,67.36,HLA-DPA10301-DPB11301,13,HLA-DP
...,...,...,...,...,...,...,...,...,...,...
125588,KGCCSCGSCCKFDEDDSEPVLKGVK,-99.999,0.002639,91.75,0.168159,8105.800438,79.10,HLA-DPA10103-DPB14101,25,HLA-DP
125589,GCCSCGSCCKFDEDDSEPVLKGVKL,-99.999,0.003358,88.46,0.173627,7640.150953,77.25,HLA-DPA10103-DPB14101,25,HLA-DP
125590,CCSCGSCCKFDEDDSEPVLKGVKLH,-99.999,0.003350,88.49,0.174621,7558.422479,76.91,HLA-DPA10103-DPB14101,25,HLA-DP
125591,CSCGSCCKFDEDDSEPVLKGVKLHY,-99.999,0.003275,88.85,0.214361,4916.930790,62.79,HLA-DPA10103-DPB14101,25,HLA-DP


In [7]:
# Pivot the NetMHCIIpan-4.0 binding-affinity score (Score_BA) into a
# peptide x (loci, genotype) table and pickle it.

df = netmhc2_data

# One placeholder row per (Peptide, loci) pair present in the predictions, assigned to a
# synthetic 'unknown' genotype whose scores are all zero. After pivoting this yields an
# all-zero 'unknown' column under every locus.
df2 = (
    df.groupby(['Peptide', 'loci'])
    .count()
    .reset_index()[['Peptide', 'loci']]
    .assign(genotype='unknown', Score=0., Rank=0., Score_BA=0., Rank_BA=0., nM=0.)
)

data_pivot = pd.pivot_table(
    pd.concat([df, df2], sort=False),
    values='Score_BA',
    index='Peptide',
    columns=['loci', 'genotype'],
)
data_pivot.to_pickle('mhc2_haplotype_netmhcii-4.0_pred_affinity_pivot_v1v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.262275,0.210513,0.162170,0.138032,0.064328,0.200257,0.182535,0.152988,0.196000,0.237120,...,0.249702,0.249702,0.297920,0.264890,0.205538,0.227213,0.249702,0.249702,0.219885,0.0
AAAYYVGYLQPRTF,0.435325,0.357479,0.232249,0.188956,0.081843,0.289715,0.248274,0.208763,0.302934,0.341891,...,0.333322,0.333322,0.435867,0.339701,0.268764,0.341743,0.333322,0.333322,0.294761,0.0
AAAYYVGYLQPRTFL,0.620743,0.533288,0.350694,0.281340,0.119930,0.388640,0.326797,0.272157,0.420422,0.449729,...,0.383470,0.383470,0.512765,0.384206,0.308803,0.387634,0.383470,0.383470,0.326042,0.0
AAAYYVGYLQPRTFLL,0.603934,0.471715,0.317015,0.240276,0.107944,0.385170,0.287178,0.258092,0.384697,0.441300,...,0.374134,0.374134,0.490339,0.372928,0.299992,0.376569,0.374134,0.374134,0.327447,0.0
AAAYYVGYLQPRTFLLK,0.669816,0.519657,0.374913,0.280090,0.120530,0.433591,0.309963,0.284692,0.428667,0.472981,...,0.354784,0.354784,0.482520,0.379043,0.295066,0.355889,0.354784,0.354784,0.307027,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.399611,0.273032,0.203689,0.191331,0.094848,0.360642,0.217532,0.232871,0.344261,0.421699,...,0.285683,0.285683,0.307903,0.269684,0.211645,0.291034,0.285683,0.285683,0.277248,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.400962,0.277270,0.206981,0.195065,0.095416,0.363595,0.220804,0.233858,0.347686,0.422885,...,0.292342,0.292342,0.311268,0.270227,0.212058,0.295712,0.292342,0.292342,0.281802,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.402886,0.282106,0.209602,0.198191,0.096931,0.365776,0.222661,0.234946,0.350836,0.424360,...,0.306004,0.306004,0.318901,0.272231,0.215326,0.307148,0.306004,0.306004,0.296308,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.405321,0.287790,0.211981,0.201682,0.097617,0.369570,0.224174,0.236694,0.353541,0.425438,...,0.319624,0.319624,0.327946,0.275950,0.218148,0.319855,0.319624,0.319624,0.306854,0.0


In [9]:
# Pivot the NetMHCIIpan-4.0 EL %rank ('Rank' column) into a peptide x (loci, genotype)
# table and pickle it.

df = netmhc2_data

# The pivoted rank is flipped to 100 - rank so that larger is better: a cut-off of
# rank <= 0.5 becomes >= 99.5 on the stored scale.
data_pivot = pd.pivot_table(
    df,
    values='Rank',
    index='Peptide',
    columns=['loci', 'genotype'],
).rsub(100)
data_pivot.to_pickle('mhc2_haplotype_netmhcii-4.0_el_rank_pivot_v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10302,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,49.55,42.30,58.67,63.13,73.96,47.28,53.43,47.67,30.49,45.41,...,40.69,84.33,84.33,72.33,54.93,81.42,58.50,84.33,84.33,78.62
AAAYYVGYLQPRTF,56.91,59.24,69.37,62.94,75.26,48.72,68.74,50.66,38.28,55.17,...,50.49,82.80,82.80,78.55,63.17,82.94,62.70,82.80,82.80,78.40
AAAYYVGYLQPRTFL,58.92,73.44,71.54,58.67,73.21,40.21,65.65,42.11,29.50,47.17,...,41.01,75.16,75.16,68.35,62.58,73.86,50.36,75.16,75.16,70.55
AAAYYVGYLQPRTFLL,51.01,67.06,64.39,50.77,59.00,27.82,53.27,30.22,21.77,37.50,...,35.00,64.62,64.62,52.23,45.77,53.66,39.54,64.62,64.62,58.08
AAAYYVGYLQPRTFLLK,69.09,74.68,74.51,41.10,49.39,42.60,46.21,30.74,23.76,33.08,...,31.49,52.17,52.17,39.41,41.95,36.83,33.72,52.17,52.17,48.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,5.35,5.00,10.74,5.24,8.20,41.29,17.81,25.87,24.18,56.19,...,6.48,5.07,5.07,5.00,5.00,5.81,10.04,5.07,5.07,5.16
YYVWKSYVHVVDGCNSSTCMMC,5.78,5.00,11.53,5.13,8.50,42.73,18.43,26.91,25.46,57.03,...,6.38,5.03,5.03,5.00,5.00,5.18,9.29,5.03,5.03,5.02
YYVWKSYVHVVDGCNSSTCMMCY,6.27,5.42,12.38,5.42,8.80,43.94,19.29,27.82,26.36,57.68,...,6.83,5.23,5.23,5.00,5.00,5.35,9.49,5.23,5.23,5.35
YYVWKSYVHVVDGCNSSTCMMCYK,6.73,5.82,13.23,5.68,9.00,44.96,20.01,28.60,27.12,58.19,...,7.11,5.89,5.89,5.00,5.00,5.51,9.78,5.89,5.89,5.76


In [9]:
# Pivot the NetMHCIIpan-4.0 BA %rank ('Rank_BA' column) into a peptide x (loci, genotype)
# table and pickle it.

df = netmhc2_data

# The pivoted rank is flipped to 100 - rank so that larger is better: a cut-off of
# rank <= 0.5 becomes >= 99.5 on the stored scale.
data_pivot = pd.pivot_table(
    df,
    values='Rank_BA',
    index='Peptide',
    columns=['loci', 'genotype'],
).rsub(100)
data_pivot.to_pickle('mhc2_haplotype_netmhcii-4.0_ba_rank_pivot_v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10302,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,31.27,27.39,36.80,28.28,34.18,30.15,42.73,30.35,27.78,40.20,...,38.97,44.30,44.30,51.11,59.32,66.54,30.60,44.30,44.30,35.11
AAAYYVGYLQPRTF,67.76,69.94,67.74,49.59,56.03,58.28,69.87,57.24,57.25,69.57,...,74.25,69.05,69.05,89.15,89.76,93.88,69.34,69.05,69.05,62.72
AAAYYVGYLQPRTFL,92.52,95.35,92.74,78.44,84.58,82.62,89.66,81.19,83.07,89.92,...,81.16,79.14,79.14,97.49,97.50,98.68,81.28,79.14,79.14,71.59
AAAYYVGYLQPRTFLL,91.12,90.22,88.57,67.90,78.45,81.96,81.51,76.86,76.52,88.86,...,80.93,77.50,77.50,95.89,96.21,98.11,78.73,77.50,77.50,71.97
AAAYYVGYLQPRTFLLK,95.70,94.47,94.79,78.17,84.84,89.30,86.61,84.49,84.41,92.48,...,71.93,73.72,73.72,95.21,96.97,97.66,73.39,73.72,73.72,66.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,60.90,46.33,56.44,50.53,69.03,76.94,58.06,67.65,67.61,85.89,...,43.98,56.32,56.32,54.63,61.76,70.44,52.95,56.32,56.32,57.15
YYVWKSYVHVVDGCNSSTCMMC,61.17,47.64,57.82,52.02,69.52,77.59,59.38,68.04,68.39,86.08,...,45.22,58.26,58.26,55.79,62.03,70.70,54.57,58.26,58.26,58.64
YYVWKSYVHVVDGCNSSTCMMCY,61.55,49.13,58.92,53.27,70.78,78.06,60.10,68.46,69.08,86.32,...,52.55,62.05,62.05,58.27,63.01,72.65,58.44,62.05,62.05,63.18
YYVWKSYVHVVDGCNSSTCMMCYK,62.02,50.81,59.90,54.57,71.32,78.88,60.71,69.14,69.75,86.49,...,53.74,65.68,65.68,61.34,64.81,74.32,62.60,65.68,65.68,66.32


# Load NetMHCIIpan3.2 predictions

In [6]:
# Load the per-allele NetMHCIIpan-3.2 prediction tables and stack them into one long frame.
# NOTE: this rebinds `netmhc2_data`, replacing the 4.0 predictions loaded earlier.
dfs = []
skipped_alleles = []
for allele in hla_alleles['allele'].values:
    # Sanitise the allele name exactly as the command-generation cell does ('*' -> '_',
    # ':' removed) so the path read here always matches the -xlsfile path that was requested.
    preds_path = './netmhcii-3.2_preds/%s_preds.xls' % allele.replace('*', '_').replace(':', '')
    try:
        # Files are tab-delimited text despite the .xls extension; the first line is
        # skipped so that the following line supplies the column names.
        df = pd.read_csv(preds_path, delimiter='\t', skiprows=[0])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Best effort: an allele without a (non-empty) prediction file is skipped, but it
        # is recorded so the omission is visible. Any other error is allowed to surface.
        skipped_alleles.append(allele)
        continue
    df['genotype'] = allele
    df = df.drop(columns=['Pos', 'ID', 'Ave', 'NB'])
    dfs.append(df)
print('Loaded %d alleles' % len(dfs))
if skipped_alleles:
    print('Skipped %d alleles without predictions: %s' % (len(skipped_alleles), ', '.join(skipped_alleles)))
netmhc2_data = pd.concat(dfs)
netmhc2_data['sequence_length'] = [len(x) for x in netmhc2_data['Peptide'].values]
# Locus label: the first 4 characters for names starting with 'DRB' (e.g. 'DRB1'),
# otherwise the first 6 characters (e.g. 'HLA-DP').
netmhc2_data['loci'] = [x[:4] if x[:3] == 'DRB' else x[:6] for x in netmhc2_data['genotype'].values]
# netmhc2_data['1-log50k'] = 1 - np.log(netmhc2_data['nM']) / np.log(50000)
netmhc2_data

Loaded 280 alleles


Unnamed: 0,Peptide,1-log50k,nM,Rank,genotype,sequence_length,loci
0,MYSFVSEETGTLI,0.209,5188.97,46.0,HLA-DPA10301-DPB11301,13,HLA-DP
1,YSFVSEETGTLIV,0.211,5106.81,45.0,HLA-DPA10301-DPB11301,13,HLA-DP
2,SFVSEETGTLIVN,0.189,6500.21,55.0,HLA-DPA10301-DPB11301,13,HLA-DP
3,FVSEETGTLIVNS,0.167,8166.10,65.0,HLA-DPA10301-DPB11301,13,HLA-DP
4,VSEETGTLIVNSV,0.151,9707.22,70.0,HLA-DPA10301-DPB11301,13,HLA-DP
...,...,...,...,...,...,...,...
125588,KGCCSCGSCCKFDEDDSEPVLKGVK,0.218,4718.05,95.0,HLA-DPA10103-DPB14101,25,HLA-DP
125589,GCCSCGSCCKFDEDDSEPVLKGVKL,0.232,4066.30,95.0,HLA-DPA10103-DPB14101,25,HLA-DP
125590,CCSCGSCCKFDEDDSEPVLKGVKLH,0.248,3417.66,90.0,HLA-DPA10103-DPB14101,25,HLA-DP
125591,CSCGSCCKFDEDDSEPVLKGVKLHY,0.316,1644.14,75.0,HLA-DPA10103-DPB14101,25,HLA-DP


In [11]:
# Pivot the NetMHCIIpan-3.2 transformed affinity ('1-log50k' column) into a
# peptide x (loci, genotype) table and pickle it.

df = netmhc2_data

# One placeholder row per (Peptide, loci) pair present in the predictions, assigned to a
# synthetic 'unknown' genotype whose scores are all zero. After pivoting this yields an
# all-zero 'unknown' column under every locus.
df2 = (
    df.groupby(['Peptide', 'loci'])
    .count()
    .reset_index()[['Peptide', 'loci']]
    # '1-log50k' is not a valid keyword name, hence the dict unpacking.
    .assign(**{'genotype': 'unknown', 'Rank': 0., '1-log50k': 0., 'nM': 0.})
)

data_pivot = pd.pivot_table(
    pd.concat([df, df2], sort=False),
    values='1-log50k',
    index='Peptide',
    columns=['loci', 'genotype'],
)
data_pivot.to_pickle('mhc2_haplotype_netmhcii-3.2_pred_affinity_pivot_v1v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.554,0.364,0.266,0.199,0.091,0.403,0.295,0.265,0.364,0.424,...,0.327,0.327,0.428,0.467,0.367,0.336,0.327,0.327,0.259,0.0
AAAYYVGYLQPRTF,0.642,0.456,0.327,0.265,0.108,0.458,0.370,0.327,0.443,0.489,...,0.364,0.364,0.504,0.524,0.415,0.385,0.364,0.364,0.292,0.0
AAAYYVGYLQPRTFL,0.659,0.486,0.346,0.297,0.112,0.466,0.397,0.357,0.470,0.506,...,0.388,0.388,0.537,0.559,0.438,0.406,0.388,0.388,0.307,0.0
AAAYYVGYLQPRTFLL,0.647,0.460,0.343,0.285,0.113,0.467,0.387,0.348,0.457,0.500,...,0.396,0.396,0.551,0.566,0.441,0.404,0.396,0.396,0.309,0.0
AAAYYVGYLQPRTFLLK,0.622,0.432,0.321,0.277,0.109,0.445,0.371,0.332,0.442,0.478,...,0.394,0.394,0.548,0.558,0.430,0.391,0.394,0.394,0.302,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.396,0.260,0.201,0.243,0.102,0.332,0.259,0.267,0.346,0.372,...,0.392,0.392,0.427,0.472,0.335,0.353,0.392,0.392,0.333,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.397,0.261,0.202,0.246,0.102,0.334,0.260,0.269,0.348,0.373,...,0.397,0.397,0.429,0.473,0.338,0.355,0.397,0.397,0.339,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.398,0.264,0.202,0.248,0.103,0.336,0.261,0.271,0.350,0.375,...,0.408,0.408,0.433,0.476,0.339,0.361,0.408,0.408,0.351,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.399,0.265,0.203,0.251,0.104,0.338,0.263,0.273,0.352,0.376,...,0.413,0.413,0.437,0.478,0.340,0.365,0.413,0.413,0.357,0.0


In [12]:
# Number of pivot cells with a value of at least 0.638.
# NOTE(review): 0.638 is presumably the 50 nM cut-off on the 1 - log(nM)/log(50000) scale
# (1 - ln 50 / ln 50000 ~= 0.638) -- confirm.
(data_pivot >= 0.638).values.sum()

174296

In [13]:
# Pivot the NetMHCIIpan-3.2 %rank (BA) instead of the binding affinity into a
# peptide x (loci, genotype) table and pickle it.
# The pivoted rank is flipped to 100 - rank so that larger is better: binders at
# rank <= 2 become >= 98 on the stored scale.
data_pivot = pd.pivot_table(
    netmhc2_data,
    values='Rank',
    index='Peptide',
    columns=['loci', 'genotype'],
).rsub(100)
data_pivot.to_pickle('mhc2_haplotype_netmhcii-3.2_ba_rank_pivot_v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10302,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,79.0,77.0,79.0,40.0,58.0,81.0,69.0,63.0,66.0,85.0,...,83.0,76.0,76.0,90.0,94.0,98.6,75.0,76.0,76.0,70.0
AAAYYVGYLQPRTF,90.0,91.0,88.0,56.0,65.0,87.0,82.0,74.0,78.0,91.5,...,87.0,78.0,78.0,96.0,97.5,99.5,80.0,78.0,78.0,72.0
AAAYYVGYLQPRTFL,90.5,93.0,89.0,58.0,60.0,85.0,84.0,75.0,79.0,91.0,...,85.0,76.0,76.0,96.5,98.7,99.6,80.0,76.0,76.0,69.0
AAAYYVGYLQPRTFLL,90.0,91.0,90.0,52.0,59.0,86.0,82.0,72.0,76.0,90.5,...,81.0,75.0,75.0,97.0,98.6,99.6,77.0,75.0,75.0,66.0
AAAYYVGYLQPRTFLLK,87.0,88.0,87.0,45.0,53.0,82.0,79.0,65.0,72.0,87.0,...,72.0,70.0,70.0,96.0,97.0,99.0,71.0,70.0,70.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,30.0,30.0,45.0,25.0,40.0,40.0,35.0,30.0,35.0,54.0,...,40.0,62.0,62.0,63.0,58.0,68.0,53.0,62.0,62.0,61.0
YYVWKSYVHVVDGCNSSTCMMC,30.0,30.0,45.0,25.0,40.0,45.0,35.0,30.0,35.0,54.0,...,45.0,63.0,63.0,64.0,58.0,69.0,54.0,63.0,63.0,62.0
YYVWKSYVHVVDGCNSSTCMMCY,30.0,30.0,45.0,30.0,40.0,45.0,35.0,35.0,35.0,55.0,...,51.0,66.0,66.0,65.0,60.0,70.0,56.0,66.0,66.0,66.0
YYVWKSYVHVVDGCNSSTCMMCYK,30.0,30.0,45.0,30.0,40.0,45.0,35.0,35.0,35.0,56.0,...,52.0,67.0,67.0,67.0,61.0,71.0,57.0,67.0,67.0,67.0


In [14]:
# Number of pivot cells with a value of at least 98.
# NOTE(review): assumes data_pivot is still the inverted (100 - rank) NetMHCIIpan-3.2 pivot
# from the preceding cell, i.e. this counts pairs with %rank <= 2.
(data_pivot >= 98).values.sum()

626482

In [18]:
# Number of pivot cells with a value of at least 90.
# NOTE(review): assumes data_pivot is still the inverted (100 - rank) NetMHCIIpan-3.2 pivot,
# i.e. this counts pairs with %rank <= 10.
(data_pivot >= 90).values.sum()

3463122

In [17]:
# Percentage of all pivot cells (peptide x allele pairs) whose value is at least 98.
# Computed from data_pivot itself rather than from numbers pasted out of earlier outputs,
# so it stays correct when the peptide or allele set changes. The denominator is the full
# table size; cells holding NaN count as "below threshold".
# NOTE(review): assumes data_pivot is still the inverted (100 - rank) NetMHCIIpan-3.2 pivot.
(data_pivot >= 98).values.mean() * 100.

1.781497148953934

# PUFFIN MHC Class II

In [7]:
# Load the combined PUFFIN class II predictions.
puffin_mhc2_preds = pd.read_csv('PUFFIN/all_preds_combined_classII.csv.gz')

# Peptide length in characters, used to restrict the table to lengths [13, 25].
puffin_mhc2_preds['sequence_length'] = puffin_mhc2_preds['epitope'].map(len)

# Filter to the length window, then rename the columns to the names used for the
# NetMHCIIpan tables ('Peptide', 'genotype').
puffin_mhc2_preds = (
    puffin_mhc2_preds[puffin_mhc2_preds['sequence_length'].between(13, 25)]
    .rename(columns={'epitope': 'Peptide', 'mhc': 'genotype'})
)

# puffin_mhc2_preds['loci'] = [x[:5] for x in puffin_mhc2_preds['genotype'].values]

puffin_mhc2_preds

Unnamed: 0,mean_pred,epistemic,aleatoric,binding_likelihood,avg_suff_stat_1,avg_suff_stat_2,genotype,Peptide,sequence_length
0,0.289376,0.003492,0.035966,0.236246,0.289376,0.189648,HLA-DPA10103-DPB10101,MDLFMRIFTIGTVTLKQGEIK,21
1,0.364605,0.006347,0.038386,0.377729,0.364605,0.195923,HLA-DPA10103-DPB10201,MDLFMRIFTIGTVTLKQGEIK,21
2,0.337425,0.006072,0.039994,0.329593,0.337425,0.199984,HLA-DPA10103-DPB10202,MDLFMRIFTIGTVTLKQGEIK,21
3,0.367785,0.004348,0.056473,0.403850,0.367785,0.237641,HLA-DPA10103-DPB10301,MDLFMRIFTIGTVTLKQGEIK,21
4,0.320184,0.005176,0.032980,0.280750,0.320184,0.181603,HLA-DPA10103-DPB10401,MDLFMRIFTIGTVTLKQGEIK,21
...,...,...,...,...,...,...,...,...,...
98967279,0.211966,0.006626,0.028748,0.103811,0.211966,0.169553,HLA-DQA10601-DQB10630,TKLATTEELPDEFVVVTVK,19
98967280,0.230003,0.006200,0.028258,0.122268,0.230003,0.168100,HLA-DQA10601-DQB10632,TKLATTEELPDEFVVVTVK,19
98967281,0.273412,0.004453,0.029170,0.186404,0.273412,0.170792,HLA-DQA10601-DQB10633,TKLATTEELPDEFVVVTVK,19
98967282,0.294119,0.003196,0.028181,0.216706,0.294119,0.167873,HLA-DQA10601-DQB10637,TKLATTEELPDEFVVVTVK,19


In [8]:
# Load the additional PUFFIN class II predictions.
puffin_mhc2_new_preds = pd.read_csv('PUFFIN/extra_MHC2_predictions.csv')

# Peptide length in characters, used to restrict the table to lengths [13, 25].
puffin_mhc2_new_preds['sequence_length'] = puffin_mhc2_new_preds['epitope'].map(len)

# Filter to the length window, then rename the columns to the names used for the
# NetMHCIIpan tables ('Peptide', 'genotype').
puffin_mhc2_new_preds = (
    puffin_mhc2_new_preds[puffin_mhc2_new_preds['sequence_length'].between(13, 25)]
    .rename(columns={'epitope': 'Peptide', 'mhc': 'genotype'})
)

# puffin_mhc2_new_preds['loci'] = [x[:5] for x in puffin_mhc2_new_preds['genotype'].values]

puffin_mhc2_new_preds

Unnamed: 0,mean_pred,epistemic,aleatoric,binding_likelihood,avg_suff_stat_1,avg_suff_stat_2,genotype,Peptide,sequence_length
0,0.545357,0.006401,0.046334,0.710976,0.545357,0.215254,DRB1_0101,MDLFMRIFTIGTVT,14
1,0.486121,0.004832,0.044232,0.613191,0.486121,0.210314,DRB1_0102,MDLFMRIFTIGTVT,14
2,0.449011,0.008343,0.042273,0.545279,0.449011,0.205604,DRB1_0103,MDLFMRIFTIGTVT,14
3,0.137176,0.001960,0.018459,0.016875,0.137176,0.135865,DRB1_0301,MDLFMRIFTIGTVT,14
4,0.174542,0.001935,0.023181,0.049561,0.174542,0.152252,DRB1_0302,MDLFMRIFTIGTVT,14
...,...,...,...,...,...,...,...,...,...
20597247,0.370838,0.013723,0.037287,0.388309,0.370838,0.193099,HLA-DQA10505-DQB10501,PIAVQMTKLATTEELPDEFVVVTVK,25
20597248,0.182281,0.003463,0.028081,0.073230,0.182281,0.167575,HLA-DQA10505-DQB10502,PIAVQMTKLATTEELPDEFVVVTVK,25
20597249,0.369109,0.006680,0.022698,0.353784,0.369109,0.150659,HLA-DQA10506-DQB10303,PIAVQMTKLATTEELPDEFVVVTVK,25
20597250,0.244158,0.002368,0.031147,0.151921,0.244158,0.176485,HLA-DQA10508-DQB10301,PIAVQMTKLATTEELPDEFVVVTVK,25


In [9]:
# Stack the two PUFFIN prediction tables row-wise. Each table keeps its own row labels,
# so the combined index is not unique.
puffin_mhc2_all = pd.concat(
    (puffin_mhc2_preds, puffin_mhc2_new_preds),
    sort=False,
)
puffin_mhc2_all

Unnamed: 0,mean_pred,epistemic,aleatoric,binding_likelihood,avg_suff_stat_1,avg_suff_stat_2,genotype,Peptide,sequence_length
0,0.289376,0.003492,0.035966,0.236246,0.289376,0.189648,HLA-DPA10103-DPB10101,MDLFMRIFTIGTVTLKQGEIK,21
1,0.364605,0.006347,0.038386,0.377729,0.364605,0.195923,HLA-DPA10103-DPB10201,MDLFMRIFTIGTVTLKQGEIK,21
2,0.337425,0.006072,0.039994,0.329593,0.337425,0.199984,HLA-DPA10103-DPB10202,MDLFMRIFTIGTVTLKQGEIK,21
3,0.367785,0.004348,0.056473,0.403850,0.367785,0.237641,HLA-DPA10103-DPB10301,MDLFMRIFTIGTVTLKQGEIK,21
4,0.320184,0.005176,0.032980,0.280750,0.320184,0.181603,HLA-DPA10103-DPB10401,MDLFMRIFTIGTVTLKQGEIK,21
...,...,...,...,...,...,...,...,...,...
20597247,0.370838,0.013723,0.037287,0.388309,0.370838,0.193099,HLA-DQA10505-DQB10501,PIAVQMTKLATTEELPDEFVVVTVK,25
20597248,0.182281,0.003463,0.028081,0.073230,0.182281,0.167575,HLA-DQA10505-DQB10502,PIAVQMTKLATTEELPDEFVVVTVK,25
20597249,0.369109,0.006680,0.022698,0.353784,0.369109,0.150659,HLA-DQA10506-DQB10303,PIAVQMTKLATTEELPDEFVVVTVK,25
20597250,0.244158,0.002368,0.031147,0.151921,0.244158,0.176485,HLA-DQA10508-DQB10301,PIAVQMTKLATTEELPDEFVVVTVK,25


In [None]:
# Restrict the PUFFIN predictions to the final allele set and tag each row with its locus.
# The 'loci' column is added with .assign(), which returns a new DataFrame, instead of
# setting a column on the .loc-filtered slice (that pattern triggers pandas'
# SettingWithCopyWarning).
puffin_mhc2_filtered = puffin_mhc2_all.loc[
    puffin_mhc2_all['genotype'].isin(set(hla_alleles['allele'].values))
].assign(
    # Locus label: the first 4 characters for names starting with 'DRB' (e.g. 'DRB1'),
    # otherwise the first 6 characters (e.g. 'HLA-DP').
    loci=lambda preds: [x[:4] if x[:3] == 'DRB' else x[:6] for x in preds['genotype'].values]
)
puffin_mhc2_filtered

In [18]:
# Pivot the PUFFIN mean prediction ('mean_pred') into a peptide x (loci, genotype) table
# and pickle it.
df = puffin_mhc2_filtered

# One placeholder row per (Peptide, loci) pair present in the predictions, assigned to a
# synthetic 'unknown' genotype whose scores are all zero. After pivoting this yields an
# all-zero 'unknown' column under every locus.
df2 = (
    df.groupby(['Peptide', 'loci'])
    .count()
    .reset_index()[['Peptide', 'loci']]
    .assign(genotype='unknown', mean_pred=0., binding_likelihood=0.)
)

data_pivot = pd.pivot_table(
    pd.concat([df, df2], sort=False),
    values='mean_pred',
    index='Peptide',
    columns=['loci', 'genotype'],
)
data_pivot.to_pickle('mhc2_puffin_pred_affinity_pivot_v1v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.400601,0.312500,0.305490,0.091306,0.135300,0.289606,0.310504,0.227737,0.257072,0.384428,...,0.406164,0.406164,0.584529,0.456067,0.495886,0.386837,0.406164,0.406164,0.315752,0.0
AAAYYVGYLQPRTF,0.580028,0.513865,0.536929,0.160847,0.209960,0.403355,0.465695,0.351060,0.383100,0.463590,...,0.437686,0.437686,0.655237,0.542353,0.569029,0.442531,0.437686,0.437686,0.350472,0.0
AAAYYVGYLQPRTFL,0.648609,0.569664,0.614583,0.189893,0.233327,0.406949,0.483555,0.369354,0.404608,0.482794,...,0.419085,0.419085,0.672601,0.551296,0.566071,0.435325,0.419085,0.419085,0.347583,0.0
AAAYYVGYLQPRTFLL,0.691830,0.570180,0.659630,0.214472,0.273981,0.443015,0.508008,0.388828,0.416008,0.506840,...,0.397634,0.397634,0.675484,0.540213,0.549265,0.425213,0.397634,0.397634,0.331075,0.0
AAAYYVGYLQPRTFLLK,0.728757,0.602119,0.679677,0.255770,0.328044,0.472579,0.542195,0.418011,0.444002,0.515598,...,0.389671,0.389671,0.729996,0.531725,0.520856,0.410215,0.389671,0.389671,0.321716,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.486435,0.325522,0.467445,0.178505,0.250803,0.409197,0.355279,0.390921,0.414456,0.483765,...,0.374938,0.374938,0.562576,0.472952,0.351609,0.460095,0.374938,0.374938,0.348333,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.483641,0.320667,0.466266,0.179284,0.253753,0.399223,0.350970,0.385805,0.406049,0.472025,...,0.373139,0.373139,0.563148,0.467725,0.341265,0.461041,0.373139,0.373139,0.361561,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.468505,0.317547,0.456332,0.174601,0.253481,0.381635,0.344578,0.378381,0.396111,0.440015,...,0.350295,0.350295,0.550179,0.457353,0.319373,0.450740,0.350295,0.350295,0.355496,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.460330,0.309825,0.443855,0.183948,0.260090,0.378605,0.343438,0.376068,0.395613,0.442305,...,0.338088,0.338088,0.533762,0.441113,0.279456,0.419088,0.338088,0.338088,0.340112,0.0


In [19]:
# Pivot the PUFFIN 'binding_likelihood' column into a peptide x (loci, genotype) table and
# pickle it. Relies on `df` and `df2` still being bound from the preceding cell.
data_pivot = pd.pivot_table(
    pd.concat([df, df2], sort=False),
    values='binding_likelihood',
    index='Peptide',
    columns=['loci', 'genotype'],
)
data_pivot.to_pickle('mhc2_puffin_binding_likelihood_pivot_v1v2.pkl.gz', protocol=2)
data_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.449154,0.275837,0.251057,0.004402,0.019182,0.209804,0.273397,0.121780,0.160569,0.397723,...,0.456711,0.456711,0.911465,0.568704,0.675733,0.380722,0.456711,0.456711,0.286662,0.0
AAAYYVGYLQPRTF,0.763526,0.662420,0.696835,0.043372,0.097140,0.453153,0.579785,0.343221,0.408718,0.589835,...,0.526885,0.526885,0.977296,0.753945,0.826413,0.553813,0.526885,0.526885,0.353530,0.0
AAAYYVGYLQPRTFL,0.830495,0.739159,0.790134,0.095286,0.153303,0.463464,0.607626,0.387753,0.457538,0.624485,...,0.485483,0.485483,0.983811,0.765804,0.816345,0.529432,0.485483,0.485483,0.350396,0.0
AAAYYVGYLQPRTFLL,0.866045,0.723269,0.830479,0.162656,0.245546,0.529735,0.641095,0.433302,0.482508,0.657231,...,0.439450,0.439450,0.984019,0.734214,0.772907,0.498833,0.439450,0.439450,0.325055,0.0
AAAYYVGYLQPRTFLLK,0.899770,0.759244,0.852191,0.232364,0.338678,0.576275,0.690256,0.486627,0.532303,0.670455,...,0.423755,0.423755,0.997090,0.715366,0.712002,0.457734,0.423755,0.423755,0.312253,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.589478,0.342606,0.561539,0.087292,0.197640,0.473397,0.381563,0.440111,0.480615,0.612662,...,0.398580,0.398580,0.842378,0.604732,0.335097,0.612148,0.398580,0.398580,0.364732,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.584971,0.336226,0.559315,0.093154,0.206702,0.457544,0.376589,0.432581,0.466811,0.588546,...,0.397415,0.397415,0.838706,0.592264,0.315956,0.613799,0.397415,0.397415,0.389500,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.562452,0.333869,0.544634,0.096750,0.213904,0.430278,0.370461,0.422776,0.451707,0.526737,...,0.357532,0.357532,0.811466,0.567773,0.276164,0.579128,0.357532,0.357532,0.382995,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.550762,0.322279,0.526595,0.115894,0.223661,0.424311,0.368985,0.417780,0.450152,0.532259,...,0.336787,0.336787,0.772785,0.532789,0.208163,0.479738,0.336787,0.336787,0.359745,0.0


# Compute ensembles of NetMHCIIpan3.2 and 4.0

In [10]:
# Reload the pickled NetMHCIIpan pivot tables.

# Affinity pivots (3.2 and 4.0).
netmhc32_aff_pivot = pd.read_pickle('mhc2_haplotype_netmhcii-3.2_pred_affinity_pivot_v1v2.pkl.gz')
netmhc40_aff_pivot = pd.read_pickle('mhc2_haplotype_netmhcii-4.0_pred_affinity_pivot_v1v2.pkl.gz')

# Rank pivots (3.2 BA, 4.0 EL, 4.0 BA).
netmhc32_ba_rank_pivot = pd.read_pickle('mhc2_haplotype_netmhcii-3.2_ba_rank_pivot_v2.pkl.gz')
netmhc40_el_rank_pivot = pd.read_pickle('mhc2_haplotype_netmhcii-4.0_el_rank_pivot_v2.pkl.gz')
netmhc40_ba_rank_pivot = pd.read_pickle('mhc2_haplotype_netmhcii-4.0_ba_rank_pivot_v2.pkl.gz')

In [13]:
# Ensemble: element-wise mean of the NetMHCIIpan 3.2 BA %rank and the 4.0 EL %rank.
# The two frames are aligned on their row/column labels before being summed, so a
# label present in only one of them produces NaN in the result.
ens_32ba_40el = netmhc32_ba_rank_pivot.add(netmhc40_el_rank_pivot).div(2)

# Persist the ensemble for later use, then show it as the cell output.
ens_32ba_40el.to_pickle('mhc2_haplotype_ens_netmhcii-3.2_ba_rank_netmhcii-4.0_el_rank.pkl.gz')
ens_32ba_40el

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10302,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,64.275,59.650,68.835,51.565,65.980,64.140,61.215,55.335,48.245,65.205,...,61.845,80.165,80.165,81.165,74.465,90.010,66.750,80.165,80.165,74.310
AAAYYVGYLQPRTF,73.455,75.120,78.685,59.470,70.130,67.860,75.370,62.330,58.140,73.335,...,68.745,80.400,80.400,87.275,80.335,91.220,71.350,80.400,80.400,75.200
AAAYYVGYLQPRTFL,74.710,83.220,80.270,58.335,66.605,62.605,74.825,58.555,54.250,69.085,...,63.005,75.580,75.580,82.425,80.640,86.730,65.180,75.580,75.580,69.775
AAAYYVGYLQPRTFLL,70.505,79.030,77.195,51.385,59.000,56.910,67.635,51.110,48.885,64.000,...,58.000,69.810,69.810,74.615,72.185,76.630,58.270,69.810,69.810,62.040
AAAYYVGYLQPRTFLLK,78.045,81.340,80.755,43.050,51.195,62.300,62.605,47.870,47.880,60.040,...,51.745,61.085,61.085,67.705,69.475,67.915,52.360,61.085,61.085,54.115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,17.675,17.500,27.870,15.120,24.100,40.645,26.405,27.935,29.590,55.095,...,23.240,33.535,33.535,34.000,31.500,36.905,31.520,33.535,33.535,33.080
YYVWKSYVHVVDGCNSSTCMMC,17.890,17.500,28.265,15.065,24.250,43.865,26.715,28.455,30.230,55.515,...,25.690,34.015,34.015,34.500,31.500,37.090,31.645,34.015,34.015,33.510
YYVWKSYVHVVDGCNSSTCMMCY,18.135,17.710,28.690,17.710,24.400,44.470,27.145,31.410,30.680,56.340,...,28.915,35.615,35.615,35.000,32.500,37.675,32.745,35.615,35.615,35.675
YYVWKSYVHVVDGCNSSTCMMCYK,18.365,17.910,29.115,17.840,24.500,44.980,27.505,31.800,31.060,57.095,...,29.555,36.445,36.445,36.000,33.000,38.255,33.390,36.445,36.445,36.380


In [14]:
# Ensemble: element-wise mean of the NetMHCIIpan 4.0 BA %rank and the 4.0 EL %rank.
# The two frames are aligned on their row/column labels before being summed, so a
# label present in only one of them produces NaN in the result.
ens_40ba_40el = netmhc40_ba_rank_pivot.add(netmhc40_el_rank_pivot).div(2)

# Persist the ensemble for later use, then show it as the cell output.
ens_40ba_40el.to_pickle('mhc2_haplotype_ens_netmhcii-4.0_ba_rank_netmhcii-4.0_el_rank.pkl.gz')
ens_40ba_40el

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10302,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,40.410,34.845,47.735,45.705,54.070,38.715,48.080,39.010,29.135,42.805,...,39.830,64.315,64.315,61.720,57.125,73.980,44.550,64.315,64.315,56.865
AAAYYVGYLQPRTF,62.335,64.590,68.555,56.265,65.645,53.500,69.305,53.950,47.765,62.370,...,62.370,75.925,75.925,83.850,76.465,88.410,66.020,75.925,75.925,70.560
AAAYYVGYLQPRTFL,75.720,84.395,82.140,68.555,78.895,61.415,77.655,61.650,56.285,68.545,...,61.085,77.150,77.150,82.920,80.040,86.270,65.820,77.150,77.150,71.070
AAAYYVGYLQPRTFLL,71.065,78.640,76.480,59.335,68.725,54.890,67.390,53.540,49.145,63.180,...,57.965,71.060,71.060,74.060,70.990,75.885,59.135,71.060,71.060,65.025
AAAYYVGYLQPRTFLLK,82.395,84.575,84.650,59.635,67.115,65.950,66.410,57.615,54.085,62.780,...,51.710,62.945,62.945,67.310,69.460,67.245,53.555,62.945,62.945,57.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,33.125,25.665,33.590,27.885,38.615,59.115,37.935,46.760,45.895,71.040,...,25.230,30.695,30.695,29.815,33.380,38.125,31.495,30.695,30.695,31.155
YYVWKSYVHVVDGCNSSTCMMC,33.475,26.320,34.675,28.575,39.010,60.160,38.905,47.475,46.925,71.555,...,25.800,31.645,31.645,30.395,33.515,37.940,31.930,31.645,31.645,31.830
YYVWKSYVHVVDGCNSSTCMMCY,33.910,27.275,35.650,29.345,39.790,61.000,39.695,48.140,47.720,72.000,...,29.690,33.640,33.640,31.635,34.005,39.000,33.965,33.640,33.640,34.265
YYVWKSYVHVVDGCNSSTCMMCYK,34.375,28.315,36.565,30.125,40.160,61.920,40.360,48.870,48.435,72.340,...,30.425,35.785,35.785,33.170,34.905,39.915,36.190,35.785,35.785,36.040


In [15]:
def _mean_affinity_score(score_a, score_b):
    """Combine two affinity-score frames by averaging on the un-logged scale.

    Each score ``s`` is mapped back through ``50000 ** (1 - s)``, the two
    results are averaged arithmetically, and the mean is mapped forward again
    with ``1 - log(x) / log(50000)``. The result is therefore NOT the plain
    mean of the two scores.

    NOTE(review): 50000 is presumably the nM ceiling of NetMHCIIpan's
    log-transformed affinity score -- confirm against the predictor docs.
    """
    mean_unlogged = (50000 ** (1 - score_a) + 50000 ** (1 - score_b)) / 2
    return 1 - np.log(mean_unlogged) / np.log(50000)


# Ensemble of the NetMHCIIpan 3.2 and 4.0 predicted affinities.
ens_32aff_40aff = _mean_affinity_score(netmhc32_aff_pivot, netmhc40_aff_pivot)

# Persist the ensemble for later use, then show it as the cell output.
ens_32aff_40aff.to_pickle('mhc2_haplotype_ens_netmhcii-3.2_aff_netmhcii-4.0_aff.pkl.gz')
ens_32aff_40aff

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.322484,0.258498,0.200212,0.163577,0.076705,0.254549,0.222623,0.192972,0.246155,0.289692,...,0.280495,0.280495,0.341747,0.319118,0.254751,0.266447,0.280495,0.280495,0.237389,0.0
AAAYYVGYLQPRTF,0.490004,0.394190,0.267980,0.219368,0.093999,0.339910,0.290397,0.250138,0.348641,0.388828,...,0.347394,0.347394,0.463792,0.391967,0.315552,0.360864,0.347394,0.347394,0.293370,0.0
AAAYYVGYLQPRTFL,0.637906,0.506652,0.348317,0.288839,0.115880,0.419452,0.355387,0.305167,0.441926,0.473647,...,0.385707,0.385707,0.524090,0.435281,0.352456,0.396362,0.385707,0.385707,0.316031,0.0
AAAYYVGYLQPRTFLL,0.622981,0.465672,0.329097,0.259959,0.110437,0.417310,0.324221,0.292519,0.413951,0.466066,...,0.384422,0.384422,0.515780,0.426203,0.345868,0.389271,0.384422,0.384422,0.317764,0.0
AAAYYVGYLQPRTFLLK,0.642850,0.465804,0.344080,0.278532,0.114585,0.439120,0.335532,0.305352,0.435093,0.475456,...,0.372327,0.372327,0.509578,0.430653,0.339827,0.371787,0.372327,0.372327,0.304479,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.397788,0.266286,0.202335,0.213601,0.098355,0.345216,0.235960,0.248369,0.345126,0.393548,...,0.324329,0.324329,0.349465,0.323933,0.254110,0.316918,0.324329,0.324329,0.300982,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.398960,0.268777,0.204457,0.217067,0.098649,0.347618,0.238340,0.249769,0.347843,0.394617,...,0.330587,0.330587,0.352533,0.324522,0.255056,0.320681,0.330587,0.330587,0.306045,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.400411,0.272610,0.205723,0.219780,0.099916,0.349694,0.239857,0.251226,0.350417,0.396423,...,0.343593,0.343593,0.359359,0.326625,0.257858,0.330206,0.343593,0.343593,0.319666,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.402106,0.275694,0.207381,0.223090,0.100753,0.352444,0.241563,0.253076,0.352767,0.397452,...,0.354990,0.354990,0.367243,0.330172,0.260298,0.339698,0.354990,0.354990,0.328567,0.0


In [16]:
# Display the NetMHCIIpan 3.2 predicted-affinity pivot (one input of the
# affinity ensemble) for inspection. The rendered columns end with an
# 'unknown' genotype whose visible values are all 0.0.
netmhc32_aff_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.554,0.364,0.266,0.199,0.091,0.403,0.295,0.265,0.364,0.424,...,0.327,0.327,0.428,0.467,0.367,0.336,0.327,0.327,0.259,0.0
AAAYYVGYLQPRTF,0.642,0.456,0.327,0.265,0.108,0.458,0.370,0.327,0.443,0.489,...,0.364,0.364,0.504,0.524,0.415,0.385,0.364,0.364,0.292,0.0
AAAYYVGYLQPRTFL,0.659,0.486,0.346,0.297,0.112,0.466,0.397,0.357,0.470,0.506,...,0.388,0.388,0.537,0.559,0.438,0.406,0.388,0.388,0.307,0.0
AAAYYVGYLQPRTFLL,0.647,0.460,0.343,0.285,0.113,0.467,0.387,0.348,0.457,0.500,...,0.396,0.396,0.551,0.566,0.441,0.404,0.396,0.396,0.309,0.0
AAAYYVGYLQPRTFLLK,0.622,0.432,0.321,0.277,0.109,0.445,0.371,0.332,0.442,0.478,...,0.394,0.394,0.548,0.558,0.430,0.391,0.394,0.394,0.302,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.396,0.260,0.201,0.243,0.102,0.332,0.259,0.267,0.346,0.372,...,0.392,0.392,0.427,0.472,0.335,0.353,0.392,0.392,0.333,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.397,0.261,0.202,0.246,0.102,0.334,0.260,0.269,0.348,0.373,...,0.397,0.397,0.429,0.473,0.338,0.355,0.397,0.397,0.339,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.398,0.264,0.202,0.248,0.103,0.336,0.261,0.271,0.350,0.375,...,0.408,0.408,0.433,0.476,0.339,0.361,0.408,0.408,0.351,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.399,0.265,0.203,0.251,0.104,0.338,0.263,0.273,0.352,0.376,...,0.413,0.413,0.437,0.478,0.340,0.365,0.413,0.413,0.357,0.0


In [17]:
# Display the NetMHCIIpan 4.0 predicted-affinity pivot (the other input of
# the affinity ensemble) for inspection. The rendered columns end with an
# 'unknown' genotype whose visible values are all 0.0.
netmhc40_aff_pivot

loci,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,DRB1,...,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ,HLA-DQ
genotype,DRB1_0101,DRB1_0102,DRB1_0103,DRB1_0301,DRB1_0302,DRB1_0401,DRB1_0402,DRB1_0403,DRB1_0404,DRB1_0405,...,HLA-DQA10505-DQB10309,HLA-DQA10505-DQB10319,HLA-DQA10505-DQB10402,HLA-DQA10505-DQB10501,HLA-DQA10505-DQB10502,HLA-DQA10506-DQB10303,HLA-DQA10508-DQB10301,HLA-DQA10509-DQB10301,HLA-DQA10601-DQB10301,unknown
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AAAYYVGYLQPRT,0.262275,0.210513,0.162170,0.138032,0.064328,0.200257,0.182535,0.152988,0.196000,0.237120,...,0.249702,0.249702,0.297920,0.264890,0.205538,0.227213,0.249702,0.249702,0.219885,0.0
AAAYYVGYLQPRTF,0.435325,0.357479,0.232249,0.188956,0.081843,0.289715,0.248274,0.208763,0.302934,0.341891,...,0.333322,0.333322,0.435867,0.339701,0.268764,0.341743,0.333322,0.333322,0.294761,0.0
AAAYYVGYLQPRTFL,0.620743,0.533288,0.350694,0.281340,0.119930,0.388640,0.326797,0.272157,0.420422,0.449729,...,0.383470,0.383470,0.512765,0.384206,0.308803,0.387634,0.383470,0.383470,0.326042,0.0
AAAYYVGYLQPRTFLL,0.603934,0.471715,0.317015,0.240276,0.107944,0.385170,0.287178,0.258092,0.384697,0.441300,...,0.374134,0.374134,0.490339,0.372928,0.299992,0.376569,0.374134,0.374134,0.327447,0.0
AAAYYVGYLQPRTFLLK,0.669816,0.519657,0.374913,0.280090,0.120530,0.433591,0.309963,0.284692,0.428667,0.472981,...,0.354784,0.354784,0.482520,0.379043,0.295066,0.355889,0.354784,0.354784,0.307027,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YYVWKSYVHVVDGCNSSTCMM,0.399611,0.273032,0.203689,0.191331,0.094848,0.360642,0.217532,0.232871,0.344261,0.421699,...,0.285683,0.285683,0.307903,0.269684,0.211645,0.291034,0.285683,0.285683,0.277248,0.0
YYVWKSYVHVVDGCNSSTCMMC,0.400962,0.277270,0.206981,0.195065,0.095416,0.363595,0.220804,0.233858,0.347686,0.422885,...,0.292342,0.292342,0.311268,0.270227,0.212058,0.295712,0.292342,0.292342,0.281802,0.0
YYVWKSYVHVVDGCNSSTCMMCY,0.402886,0.282106,0.209602,0.198191,0.096931,0.365776,0.222661,0.234946,0.350836,0.424360,...,0.306004,0.306004,0.318901,0.272231,0.215326,0.307148,0.306004,0.306004,0.296308,0.0
YYVWKSYVHVVDGCNSSTCMMCYK,0.405321,0.287790,0.211981,0.201682,0.097617,0.369570,0.224174,0.236694,0.353541,0.425438,...,0.319624,0.319624,0.327946,0.275950,0.218148,0.319855,0.319624,0.319624,0.306854,0.0
