In [1]:
import requests
import pandas as pd
from io import StringIO

# swagger docs link: https://query-api.iedb.org/docs/swagger/#/

## Example #1: Pull T cell Assay Data

In [None]:
url = f'https://query-api.iedb.org/tcell_search'
params = {
    'select': 'structure_id, structure_description, parent_source_antigen_iri'
}
response = requests.get(url, params=params)
response.raise_for_status() # checks for any HTTP errors
response.text

In [None]:
df = pd.read_json(StringIO(response.text)) # read_json doesn't like strings - it treats the string like a file object
df

### Paginate API Since it Only Returns 10,000 Entries at a time

In [None]:
params = {
    'order': 'structure_id', # we want the pages to be in order by structure ID
    'select': 'structure_id, structure_description, parent_source_antigen_iri',
    'offset': 0              # offset starting at 0 - we will add 10,000 per API call
}

In [None]:
df = pd.DataFrame()
while(True):
    response = requests.get(url, params=params)
    response.raise_for_status()
    
    df = pd.concat([df, pd.read_json(StringIO(response.text))])
    print(len(df))
    
    params['offset'] += 10000
    if response.text == '[]' or len(df) == 100000:
        break

df.reset_index(inplace=True, drop=True)
df

## Example #2: Pull All Relevant Epitopes for UniProt Bridge

In [5]:
def pull_data_from_iedb(table: str) -> pd.DataFrame:
    """Read data from the IEDB API into a pandas dataframe.
    
    Args:
        table (str): Table search name ('tcell', 'bcell', or 'mhc')."""
    
    url = f'https://query-api.iedb.org/{table}_search'
    params = {
        'order': 'structure_id',                         # we want the pages to be in order by structure ID
        'select': 'structure_id, linear_sequence, structure_description, curated_source_antigen, ' \
          'source_organism_iri, pubmed_id, parent_source_antigen_iri, parent_source_antigen_name, ' \
          'parent_source_antigen_source_org_iri, e_modification, disease_iris, disease_names',
        'offset': 0,                                     # offset starting at 0 - we will add 10,000 per API call
        'qualitative_measure': 'neq.Negative',           # select positive assays only
        'structure_type': 'eq.Linear peptide',           # select linear peptides only
        'epitope_structure_defined': 'eq.Exact Epitope', # only exact epitopes
        'reference_type': 'eq.Literature'                # select only data from literature
    }

    # https://ontobee.org/ontology/OBI?iri=http://purl.obolibrary.org/obo/OBI_1110125
    if table == 'mhc': params['assay_iri_search'] = 'cs.{"OBI:1110125"}' # include ligand elution data

    df = iterate_api(url, params)
    
    if table == 'tcell':
        df['assay_type'] = 'T cell'
    elif table == 'bcell':
        df['assay_type'] = 'B cell'
    else:
        df['assay_type'] = 'MHC'
    
    return df

In [6]:
def iterate_api(url: str, params: dict) -> pd.DataFrame:
    """IEDB API only allows 10,000 entries per request. We use this function to loop through
    all requested pages using the URL and desired parameters until we receive no more data.
    
    Args:
        url (str):IEDB API URL with the search table name.
        params (dict): Parameters to include for the call."""
    
    df = pd.DataFrame()
    while(True):

        response = requests.get(url, params=params)
        response.raise_for_status()
        
        df = pd.concat([df, pd.read_json(StringIO(response.text))])
        params['offset'] += 10000
        if response.text == '[]' or len(df) == 100000:
            break
        
    return df

In [7]:
tcell_df = pull_data_from_iedb('tcell')
bcell_df = pull_data_from_iedb('bcell')
mhc_df   = pull_data_from_iedb('mhc')

total_df = pd.concat([tcell_df, bcell_df, mhc_df])
total_df.reset_index(inplace=True, drop=True)
total_df

Unnamed: 0,structure_id,linear_sequence,structure_description,curated_source_antigen,source_organism_iri,pubmed_id,parent_source_antigen_iri,parent_source_antigen_name,parent_source_antigen_source_org_iri,e_modification,disease_iris,disease_names,assay_type
0,10,AAAAAIFVI,AAAAAIFVI,"{'accession': 'AAU95382.1', 'name': 'MHC class...",NCBITaxon:9606,15529349,UNIPROT:Q29983,MHC class I polypeptide-related sequence A (Un...,NCBITaxon:9606,,[DOID:13241],[Behcet's disease],T cell
1,10,AAAAAIFVI,AAAAAIFVI,"{'accession': 'AAU95382.1', 'name': 'MHC class...",NCBITaxon:9606,15529349,UNIPROT:Q29983,MHC class I polypeptide-related sequence A (Un...,NCBITaxon:9606,,[DOID:13241],[Behcet's disease],T cell
2,10,AAAAAIFVI,AAAAAIFVI,"{'accession': 'AAU95382.1', 'name': 'MHC class...",NCBITaxon:9606,15529349,UNIPROT:Q29983,MHC class I polypeptide-related sequence A (Un...,NCBITaxon:9606,,[DOID:13241],[Behcet's disease],T cell
3,10,AAAAAIFVI,AAAAAIFVI,"{'accession': 'AAK26323.1', 'name': 'MHC class...",NCBITaxon:9606,31513650,UNIPROT:Q29983,MHC class I polypeptide-related sequence A (Un...,NCBITaxon:9606,,[DOID:13241],[Behcet's disease],T cell
4,46,AAALEQLLGQTADVA,AAALEQLLGQTADVA,"{'accession': 'NP_301777.1', 'name': 'hypothet...",NCBITaxon:272631,14699084,UNIPROT:Q7AQA0,Uncharacterized protein (UniProt:Q7AQA0),NCBITaxon:1769,,,,T cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245680,176054,KSFDFHFGF,KSFDFHFGF,"{'accession': 'Q13432.1', 'name': 'Protein unc...",NCBITaxon:9606,22645359,UNIPROT:Q13432,Protein unc-119 homolog A,NCBITaxon:9606,,,,MHC
245681,176054,KSFDFHFGF,KSFDFHFGF,"{'accession': 'Q13432.1', 'name': 'Protein unc...",NCBITaxon:9606,26154972,UNIPROT:Q13432,Protein unc-119 homolog A,NCBITaxon:9606,,,,MHC
245682,176054,KSFDFHFGF,KSFDFHFGF,"{'accession': 'NP_005139.1', 'name': 'protein ...",NCBITaxon:9606,28218747,UNIPROT:Q13432,Protein unc-119 homolog A,NCBITaxon:9606,,,,MHC
245683,176054,KSFDFHFGF,KSFDFHFGF,"{'accession': 'NP_005139.1', 'name': 'protein ...",NCBITaxon:9606,28228285,UNIPROT:Q13432,Protein unc-119 homolog A,NCBITaxon:9606,,,,MHC


## Example #3: Pull Autoimmune Epitopes Example

In [8]:
def pull_data_from_iedb(table: str, autoimmune: bool = False) -> pd.DataFrame:
    """Read data from the IEDB API into a pandas dataframe.
    
    Args:
        table (str): Table search name ('tcell', 'bcell', or 'mhc').
        autoimmune (bool): Pull autoimmune epitopes only."""
    
    url = f'https://query-api.iedb.org/{table}_search'
    params = {
        'order': 'structure_id',                         # we want the pages to be in order by structure ID
        'select': 'structure_id, linear_sequence, structure_description, curated_source_antigen, ' \
          'source_organism_iri, pubmed_id, parent_source_antigen_iri, parent_source_antigen_name, ' \
          'parent_source_antigen_source_org_iri, e_modification, disease_iris, disease_names',
        'offset': 0,                                     # offset starting at 0 - we will add 10,000 per API call
        'qualitative_measure': 'neq.Negative',           # select positive assays only
        'structure_type': 'eq.Linear peptide',           # select linear peptides only
        'epitope_structure_defined': 'eq.Exact Epitope', # only exact epitopes
        'reference_type': 'eq.Literature'                # select only data from literature
    }

    # https://ontobee.org/ontology/OBI?iri=http://purl.obolibrary.org/obo/OBI_1110125
    if table == 'mhc': params['assay_iri_search'] = 'cs.{"OBI:1110125"}' # include ligand elution data
    
    if autoimmune:
        with open('autoimmune_diseases.json') as f:
            diseases = json.load(f)
        
        df = pd.DataFrame()
        for doid in diseases.keys():                           # loop through autoimmune diseases
            params['offset'] = 0                               # reset offset
            params['disease_iris'] = f'cs.{{{"DOID:"+doid}}}'  # add disease ID as parameter to check
            df = pd.concat([df, iterate_api(url, params)])
        
        df = df[df['source_organism_iri'] == 'NCBITaxon:9606'] # select human autoimmune epitopes only
    
    else:
        df = iterate_api(url, params)
    
    if table == 'tcell':
        df['assay_type'] = 'T cell'
    elif table == 'bcell':
        df['assay_type'] = 'B cell'
    else:
        df['assay_type'] = 'MHC'
    
    return df

In [9]:
bcell_df = pull_data_from_iedb('bcell')

In [10]:
bcell_df

Unnamed: 0,structure_id,linear_sequence,structure_description,curated_source_antigen,source_organism_iri,pubmed_id,parent_source_antigen_iri,parent_source_antigen_name,parent_source_antigen_source_org_iri,e_modification,disease_iris,disease_names,assay_type
0,1,AA,"AA + MCM(A1,A2)","{'accession': 'AAB20743.1', 'name': 'streptoki...",,6754610,UNIPROT:P10520,Streptokinase A,NCBITaxon:1314,Main chain modification,,,B cell
1,34,AAAGDK,AAAGDK,"{'accession': 'AAP88022.1', 'name': 'B13 antig...",NCBITaxon:5693,7536937,taxon_protein:5693-other,Other Trypanosoma cruzi protein,NCBITaxon:5693,,[DOID:12140],[Chagas disease],B cell
2,34,AAAGDK,AAAGDK,"{'accession': 'AAP88022.1', 'name': 'B13 antig...",NCBITaxon:5693,15183869,taxon_protein:5693-other,Other Trypanosoma cruzi protein,NCBITaxon:5693,,[DOID:12140],[Chagas disease],B cell
3,49,AAALPGKCGV,AAALPGKCGV,"{'accession': 'CAB96876.2', 'name': 'pru p 1',...",NCBITaxon:3760,13679821,UNIPROT:Q9LED1,Pru p 3,NCBITaxon:3760,,[DOID:1205],[allergic disease],B cell
4,49,AAALPGKCGV,AAALPGKCGV,"{'accession': 'P81402.1', 'name': 'Non-specifi...",NCBITaxon:3760,19846220,UNIPROT:Q9LED1,Pru p 3,NCBITaxon:3760,,[DOID:0060510],[peach allergy],B cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9848,2246968,PLPDHVSIVEPKDEILPTTPISEQKGGK,PLPDHVSIVEPKDEILPTTPISEQKGGK,"{'accession': 'NP_001243731.1', 'name': '40S r...",NCBITaxon:9606,34745438,UNIPROT:P23396,Small ribosomal subunit protein uS3,NCBITaxon:9606,,,,B cell
9849,2246968,PLPDHVSIVEPKDEILPTTPISEQKGGK,PLPDHVSIVEPKDEILPTTPISEQKGGK,"{'accession': 'NP_001243731.1', 'name': '40S r...",NCBITaxon:9606,34745438,UNIPROT:P23396,Small ribosomal subunit protein uS3,NCBITaxon:9606,,,,B cell
9850,2246968,PLPDHVSIVEPKDEILPTTPISEQKGGK,PLPDHVSIVEPKDEILPTTPISEQKGGK,"{'accession': 'NP_001243731.1', 'name': '40S r...",NCBITaxon:9606,34745438,UNIPROT:P23396,Small ribosomal subunit protein uS3,NCBITaxon:9606,,,,B cell
9851,2248188,VEPKDEILPTTPISE,VEPKDEILPTTPISE,"{'accession': 'NP_001243731.1', 'name': '40S r...",NCBITaxon:9606,34745438,UNIPROT:P23396,Small ribosomal subunit protein uS3,NCBITaxon:9606,,,,B cell
