In [9]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

# Load the disease table used by the rest of the notebook; its keys are the
# disease IDs that pull_iedb_assay_data() iterates over.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform default (which is not UTF-8 on every system).
with open('../autoimmune_diseases.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

In [77]:
def pull_iedb_assay_data(table):
    '''
    Download positive T cell or B cell assay records from the IEDB query API.

    One query is issued per key of the module-level ``diseases`` mapping
    (each key is sent as ``DOID:<key>``), and every page of every query is
    combined into a single DataFrame.

    Parameters
    ----------
    table : str
        'tcell' or 'bcell'; selects the ``<table>_search`` endpoint.

    Returns
    -------
    pandas.DataFrame
        All rows returned by the API, in request order. Each page keeps the
        row index assigned while parsing it, so index labels repeat across
        pages. Empty when no query returned any rows.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with an error status.
    '''
    page_size = 10000  # API limit is 10,000 entries per response
    url = 'https://query-api.iedb.org/%s_search' % table

    frames = []
    for doid in diseases.keys():
        # NOTE(review): offset paging over a sort key that is presumably not
        # unique per assay row could repeat or skip rows between pages -
        # confirm whether a tie-breaking column should be added to 'order'.
        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative',  # select positive assays only
                  'disease_iris': 'cs.{DOID:' + doid + '}'}

        # first request only asks for the exact row count, which the API
        # reports after the '/' in the Content-Range header
        r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
        # fail loudly on HTTP errors instead of reading a missing header
        r.raise_for_status()
        total = int(r.headers['Content-Range'].split('/')[-1])

        # one request per page; no request at all when there are no rows
        for offset in range(0, total, page_size):
            # the count is already known, so the page requests do not ask for it again
            s = requests.get(url,
                             params={**params, 'offset': offset},
                             headers={'accept': 'text/csv'})
            # without this check an error body would be parsed as CSV and
            # silently mixed into the result
            s.raise_for_status()
            try:
                frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
            except pd.errors.EmptyDataError:
                # an empty page contributes nothing
                continue

    # concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously collected rows on every iteration
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [103]:
# Download the positive assay records for both assay tables.
# Each call queries the IEDB API once per disease, so this cell needs network access.
tcell = pull_iedb_assay_data(table='tcell')
bcell = pull_iedb_assay_data(table='bcell')

In [104]:
def _host_matches_source(assays):
    '''
    Return the rows of `assays` whose host organism IRI equals the parent
    source antigen's organism IRI or the related object's source organism IRI.
    '''
    host = assays['host_organism_iri']
    same_as_parent_antigen = host == assays['parent_source_antigen_source_org_iri']
    same_as_related_object = host == assays['r_object_source_organism_iri']
    return assays[same_as_parent_antigen | same_as_related_object]


# select epitopes where host and source organism are identical
tdf = _host_matches_source(tcell)
bdf = _host_matches_source(bcell)

In [106]:
# Scratch export: T cell assays whose host organism IRI equals the related
# object's source organism IRI. Writes to 'test.csv', a path that another
# scratch cell in this notebook also writes to.
tcell.loc[
    lambda frame: frame['host_organism_iri'] == frame['r_object_source_organism_iri']
].to_csv('test.csv')

In [108]:
# Save the filtered T cell assays (index column included) to the working directory.
tdf.to_csv(path_or_buf='tcell.csv')

In [121]:
# Scratch export: filtered T cell assays with neither a parent source antigen
# IRI nor a related-object source molecule IRI (the second column is used as
# a fallback for the first; rows still missing a value are kept).
# Writes to 'test.csv', a path that another scratch cell in this notebook also writes to.
tdf.loc[
    lambda frame: frame['parent_source_antigen_iri']
    .fillna(frame['r_object_source_molecule_iri'])
    .isna()
].to_csv('test.csv')
