# MHC Epitope Prediction

Reference: https://github.com/GfellerLab/MixMHC2pred

In [1]:
import os
import warnings

In [2]:
warnings.simplefilter('ignore', FutureWarning)

# S. aureus analysis

Use hihgly expressedd proteins in `sa_highly_expressed_genes.fasta`

In [3]:
pid = 'sa_highly_expressed_genes'

In [4]:
import glob
import os
import pandas as pd

In [5]:
from Bio import SeqIO

### Prepare inputs for MixMHC2pred

In [6]:
fasta_sequences = SeqIO.parse(open(f'{pid}.fasta'),'fasta')
w = 15
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    sequence = '------' + sequence + '------'
    name = name.split('|')[-1]
    with open(f'data/{name}.kmers', 'w') as fo:
        for i in range(6, len(sequence) - (w-1) - 12):
            context = sequence[i-6:i] + sequence[i+w:i+w+6]
            fo.write(sequence[i:i+w] + '\t' + context + '\n')

In [7]:
alleles = (
    'HLA-DRB1*04:01 HLA-DRB1*04:02 HLA-DRB1*15:01 HLA-DRB1*12:01'.
    replace('HLA-', '').
    replace('*', '_').
    replace(':', '_')
)
alleles

'DRB1_04_01 DRB1_04_02 DRB1_15_01 DRB1_12_01'

### Run MixMHC2pred

In [8]:
%%time
ref = 'MixMHC2pred-2.0.2/PWMdef_Human'

outputs = []
infiles = glob.glob('data/*kmers')
for fi in infiles:
    fo = fi.replace('kmers', 'out')
    cmd = f'MixMHC2pred-2.0.2/MixMHC2pred -i {fi} -o {fo} -a {alleles} -f {ref}'
    outputs.append(os.popen(cmd).read())

CPU times: user 5.7 ms, sys: 46.5 ms, total: 52.2 ms
Wall time: 10.7 s


In [9]:
print(outputs[0])

Runing MixMHC2pred (v2.0.2) for peptide file: data/A0A0H2XFL0_STAA3.kmers
Imported 295 peptides. Computing now the PWM-based scores from each peptide.
Computing now the full scores from each peptide.
Saving the results in the output file.
Finished the computations.



### Assemble DataFrame from MixMHC2pred outputs

In [10]:
dfs = []
for fi in infiles:
    protein = fi.split('/')[-1].rstrip('.kmers')
    fo = fi.replace('kmers', 'out')
    df_ = pd.read_table(fo, comment='#')
    df_['protein'] = protein
    dfs.append(df_)
df = pd.concat(dfs)
df = df[df.columns[-1:].tolist() + df.columns[:-1].tolist()]

In [11]:
df.sample(10)

Unnamed: 0,protein,Peptide,Context,BestAllele,%Rank_best,Core_best,CoreP1_best,SubSpec_best,%Rank_DRB1_04_01,CoreP1_DRB1_04_01,SubSpec_DRB1_04_01,%Rank_DRB1_04_02,CoreP1_DRB1_04_02,SubSpec_DRB1_04_02,%Rank_DRB1_15_01,CoreP1_DRB1_15_01,SubSpec_DRB1_15_01,%Rank_DRB1_12_01,CoreP1_DRB1_12_01,SubSpec_DRB1_12_01
68,ISDC_STAA3,QITVNHSHWITGMSI,NGKLYVEGHKEN,DRB1_12_01,23.7,ITVNHSHWI,2,1,32.7,2,1,33.4,2,1,25.3,2,1,23.7,2,1
98,A0A0H2XIG1_STAA3,LFFPTGHILLKLVFA,GILIALVICSIC,DRB1_12_01,82.2,GHILLKLVF,6,1,92.8,2,1,88.6,4,1,92.0,1,1,82.2,6,1
260,GUAC_STAA3,QKGEHKNVEGKKMFV,GSASEFEHKGSL,DRB1_15_01,17.9,HKNVEGKKM,5,1,30.8,2,1,34.0,5,1,17.9,5,1,25.5,5,1
128,A0A0H2XEL7_STAA3,LGASYSTSSNNVQVT,SYRTGGTTMAPS,DRB1_04_01,0.303,YSTSSNNVQ,5,1,0.303,5,1,36.1,5,1,55.1,4,1,55.5,5,1
122,A0A0H2XDN8_STAA3,IFKIGHFSIYFILLI,LILFQGGVLLGT,DRB1_12_01,18.4,IGHFSIYFI,4,1,57.6,4,1,48.3,4,1,19.4,4,1,18.4,4,1
287,GUAC_STAA3,MQQDLQSSISYAGGK,MDTLKEDLKSLR,DRB1_04_02,2.13,LQSSISYAG,5,1,3.3,5,1,2.13,5,1,6.99,5,1,20.0,5,1
190,ISDE_STAA3,LVKIAGGENVIKVKD,KSYIGDRQYISS,DRB1_04_02,4.63,IAGGENVIK,4,1,8.43,4,1,4.63,4,1,13.0,2,1,17.7,2,1
35,ISDE_STAA3,IVPTTVALTMTLDKL,KSGEFRDLPIVG,DRB1_12_01,37.7,TVALTMTLD,5,1,38.6,1,1,50.8,6,1,42.1,4,1,37.7,5,1
134,A0A0H2XDN8_STAA3,LLIGVLLGTFFRSIT,FSIYFIGFIQLI,DRB1_12_01,76.1,IGVLLGTFF,3,1,96.3,5,1,94.9,5,1,93.8,5,1,76.1,3,1
89,A0A0H2XH75_STAA3,MNYVRSSNKSHGKQN,EKLKLPQIEGAK,DRB1_04_01,22.2,YVRSSNKSH,3,1,22.2,3,1,30.4,4,1,37.5,4,1,39.5,4,1
