In [2]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

In [5]:
def pull_iedb_assay_data(table):
    '''Extracts T cell and B cell positive assay data from the IEDB.'''

    # first get the total number of assays as first request to loop through API
    url = 'https://query-api.iedb.org/%s_search' % table
    params = {'order': 'structure_id',
              'qualitative_measure': 'neq.Negative'} # select positive assays only
    r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
    pages = int(r.headers['Content-Range'].split('/')[-1])
    
    # loop through IEDB API pages using requests - read into pandas DataFrame and concat
    df = pd.DataFrame()
#     for i in range(pages // 10000 + 1): # API limit is 10,000 entries
#         params['offset'] = i*10000

#         # request API call returning csv formatting using parameters in params
#         s = requests.get(url, params=params, headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
#         df = pd.concat([df, pd.read_csv(io.StringIO(s.content.decode('utf-8')))])
    s = requests.get(url, params=params, headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
    df = pd.concat([df, pd.read_csv(io.StringIO(s.content.decode('utf-8')))])
    return df

In [6]:
df = pull_iedb_assay_data('tcell')

In [7]:
df

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri_search,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type
0,4372,IEDB_ASSAY:4372,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
1,5785,IEDB_ASSAY:5785,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
2,1848919,IEDB_ASSAY:1848919,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
3,6013202,IEDB_ASSAY:6013202,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAK26323.1,""MHC class I chain-related protein...",1035582,IEDB_REFERENCE:1035582,...,,,,,,,,,,
4,1328560,IEDB_ASSAY:1328560,11,IEDB_EPITOPE:11,AAAAALDKKQRNFDKILA,Linear peptide,AAAAALDKKQRNFDKILA,"(P12883.5,Myosin-7,UNIPROT:P12883.5,1437,1454,...",1002200,IEDB_REFERENCE:1002200,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1686217,IEDB_ASSAY:1686217,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9996,1686218,IEDB_ASSAY:1686218,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9997,1686228,IEDB_ASSAY:1686228,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9998,1689978,IEDB_ASSAY:1689978,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
