In [1]:
import glob
import os
import pandas as pd

In [2]:
from Bio import SeqIO

### Prepare inputs for MARIA

The input file is a plain tab-delimited text file with a header row and 5 columns. Columns 1 and 2 are the HLA-DR or DQ alleles of the cells (see Supported Alleles for details). Column 3 is the gene symbol (e.g. CTSK) of the gene encoding the peptide of interest. Column 4 is the peptide sequence in single-letter format (all capitalized, no spaces). Column 5 holds gene expression values (in TPM); the column must be present but its values are optional and may be left empty, as in the example below. Users can fill it in if they want to provide a specific gene expression value for the antigen gene. Otherwise gene expression values will be estimated from external RNA-Seq references (e.g. TCGA), and genes with unknown gene expression will be assigned a TPM of 5.


In [3]:
# Preview the first 5 lines of the example ligand file under MARIA/ to see the expected column layout.
! head -5 MARIA/Example1_K562_ligands.txt

Allele 1	Allele 2	Genes	Sequences	TPM	Notes
HLA-DRB1*01:01	HLA-DRB1*01:01	hCG_40889	HGRKFVQGKSIDVACHPG		HLA-DRB1*01:01 ligands
HLA-DRB1*01:01	HLA-DRB1*01:01	hCG_40889	EPDREYHFGQAVRFV		HLA-DRB1*01:01 ligands
HLA-DRB1*01:01	HLA-DRB1*01:01	B4DJQ5	KKLIELQAGKKSLED		HLA-DRB1*01:01 ligands
HLA-DRB1*01:01	HLA-DRB1*01:01	COX8	IHSLPPEGKLGIMELAVGLTS		HLA-DRB1*01:01 ligands


In [4]:
# Base name of the input FASTA file; the sequences are read from f'{pid}.fasta'.
pid = 'sa_highly_expressed_genes'

In [5]:
# Alleles to score, written as one whitespace-separated string and split into a list.
alleles = (
    'HLA-DRB1*04:01 HLA-DRB1*04:02 HLA-DRB1*15:01 HLA-DRB1*12:01'
).split()
alleles

['HLA-DRB1*04:01', 'HLA-DRB1*04:02', 'HLA-DRB1*15:01', 'HLA-DRB1*12:01']

In [6]:
# Length (in residues) of the sliding window used to cut each protein into peptides.
w = 15

# SeqIO.parse returns a one-shot iterator. Materialise it into a list so the
# records can be walked once per allele; iterating the bare iterator inside the
# allele loop would exhaust it on the first allele and silently produce no rows
# for the remaining ones. The `with` block also closes the FASTA file handle.
with open(f'{pid}.fasta') as fasta_handle:
    fasta_sequences = list(SeqIO.parse(fasta_handle, 'fasta'))

# One row per (allele, protein, window start). The allele is repeated in both
# allele columns, and TPM is left empty (None).
rows = []
for allele in alleles:
    for fasta in fasta_sequences:
        # Keep only the last '|'-separated field of the record id as the gene name.
        name = fasta.id.split('|')[-1]
        sequence = str(fasta.seq)
        # Sequences shorter than w give an empty range and therefore no rows.
        for i in range(len(sequence) - (w - 1)):
            rows.append([allele, allele, name, sequence[i:i + w], None])

df = pd.DataFrame(rows, columns=['Allele 1', 'Allele 2', 'Genes', 'Sequences', 'TPM'])

In [7]:
# Display the first three rows of the generated peptide table as a quick sanity check.
df.head(3)

Unnamed: 0,Allele 1,Allele 2,Genes,Sequences,TPM
0,HLA-DRB1*04:01,HLA-DRB1*04:01,GUAC_STAA3,MKIFDYEDIQLIPNK,
1,HLA-DRB1*04:01,HLA-DRB1*04:01,GUAC_STAA3,KIFDYEDIQLIPNKC,
2,HLA-DRB1*04:01,HLA-DRB1*04:01,GUAC_STAA3,IFDYEDIQLIPNKCI,


In [8]:
# Make sure the output directory exists so this cell also works on a fresh checkout.
os.makedirs('data', exist_ok=True)
# Write the table as tab-separated text with a header row and no index column.
df.to_csv('data/maria_input.txt', sep='\t', index=False)

### Run in shell with environment `conda activate maria`

```bash
(maria) ccc14@BSTAT-FT236YNJ hla % python2.7 MARIA/maria.py data/maria_input.txt -cut_off 90
Using Theano backend.
Ddeveloped with keras version 2.0.3, and current keras version:  2.0.3
Loading data from /Users/ccc14/learning/learn-immune-ds/hla/MARIA/supporting_file/
Each DR allele presented by a 19-AA long pseudosequence
The maximum length allowed for MARIA MHC-DR is 25
Running recurrent neural network for HLA-DR ligand prediction
MARIA run was successful
The output was saved to data/maria_input.txt.output.txt
```

In [9]:
# Preview the first 10 lines (header plus 9 predictions) of the MARIA output file.
! head -10 data/maria_input.txt.output.txt

Allele 1	Allele 2	Genes	Sequences	TPM	TPM estimated	MARIA raw scores	MARIA percentile scores	15mer core	Positive presenters
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	MKIFDYEDIQLIPNK		5	0.1312	63.716	MKIFDYEDIQLIPNK	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	KIFDYEDIQLIPNKC		5	0.0825	51.131	KIFDYEDIQLIPNKC	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	IFDYEDIQLIPNKCI		5	0.0347	14.928	IFDYEDIQLIPNKCI	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	FDYEDIQLIPNKCIV		5	0.0501	29.225	FDYEDIQLIPNKCIV	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	DYEDIQLIPNKCIVE		5	0.4124	81.007	DYEDIQLIPNKCIVE	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	YEDIQLIPNKCIVES		5	0.1337	64.197	YEDIQLIPNKCIVES	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	EDIQLIPNKCIVESR		5	0.1356	64.477	EDIQLIPNKCIVESR	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	DIQLIPNKCIVESRS		5	0.0748	47.597	DIQLIPNKCIVESRS	0
HLA-DRB1*04:01	HLA-DRB1*04:01	GUAC_STAA3	IQLIPNKCIVESRSE		5	0.0615	39.117	IQLIPNKCIVESRSE	0
