In [9]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

# Load the autoimmune disease table; its keys are used later as DOID numbers.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('../autoimmune_diseases.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

In [26]:
def pull_iedb_assay_data(table, doids=None):
    '''Download non-negative assay rows from the IEDB query API for a set of diseases.

    For every disease ID, a first request reads the total number of matching
    rows from the ``Content-Range`` header. The rows are then downloaded as CSV
    in pages of at most 10,000 and combined into a single DataFrame.

    Parameters
    ----------
    table : str
        Prefix of the API endpoint; requests go to
        ``https://query-api.iedb.org/<table>_search`` (e.g. ``'tcell'``).
    doids : iterable of str, optional
        Disease Ontology numbers without the ``DOID:`` prefix. Defaults to the
        keys of the module-level ``diseases`` mapping.

    Returns
    -------
    pandas.DataFrame
        All downloaded rows with a fresh 0..n-1 index, or an empty DataFrame
        when no disease had any matching row.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with a 4xx/5xx status.
    '''
    page_size = 10000  # API limit is 10,000 entries per response
    timeout = 300      # seconds; fail instead of hanging forever on a stalled connection
    url = 'https://query-api.iedb.org/%s_search' % table

    if doids is None:
        doids = diseases.keys()

    frames = []
    for doid in doids:
        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative',  # select positive assays only
                  'disease_iris': f'cs.{{DOID:{doid}}}'}

        # First request: ask for an exact count, reported after the '/' in Content-Range.
        count_response = requests.get(url, params=params,
                                      headers={'Prefer': 'count=exact'},
                                      timeout=timeout)
        count_response.raise_for_status()
        total_rows = int(count_response.headers['Content-Range'].split('/')[-1])

        # Step through the result set one page at a time. range() yields no offsets
        # when total_rows is 0 and no trailing empty page when total_rows is an
        # exact multiple of page_size.
        for offset in range(0, total_rows, page_size):
            page_response = requests.get(url, params={**params, 'offset': offset},
                                         headers={'accept': 'text/csv', 'Prefer': 'count=exact'},
                                         timeout=timeout)
            page_response.raise_for_status()
            frames.append(pd.read_csv(io.StringIO(page_response.content.decode('utf-8'))))

    if not frames:
        return pd.DataFrame()

    # Concatenate once (not once per page) and renumber rows so that index
    # labels are unique across pages and diseases.
    return pd.concat(frames, ignore_index=True)

In [29]:
# Download the T cell assay rows for every disease in `diseases`.
df = pull_iedb_assay_data(table='tcell')

417 1
437 454
676 69
718 166
986 37
2048 162
2377 2449
2988 16
3492 76
4313 0
5213 30
6196 120
7147 77
7148 796
7188 118
8506 123
8577 9
8781 97
8857 0
8869 120
8893 252
8924 67
9008 6
9074 322
9182 48
9744 2729
9808 90
10608 1374
11656 0
12132 58
12236 122
12297 61
12306 34
12361 243
12842 24
12894 11
13241 45
14040 0
14482 3
0040087 0
0040088 148
0040089 1
0040092 6
0040094 36
0040096 7
0040097 15
0050167 0
0050169 0
0050214 0
0060004 46
0060030 4
0060032 2
0060039 20
0060050 2
0060051 0
0060499 4


In [7]:
# Show the assay table returned by pull_iedb_assay_data (notebook rich display).
df

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri_search,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type
0,4372,IEDB_ASSAY:4372,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
1,5785,IEDB_ASSAY:5785,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
2,1848919,IEDB_ASSAY:1848919,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAU95382.1,""MHC class I related protein A"",GE...",541,IEDB_REFERENCE:541,...,,,,,,,,,,
3,6013202,IEDB_ASSAY:6013202,10,IEDB_EPITOPE:10,AAAAAIFVI,Linear peptide,AAAAAIFVI,"(AAK26323.1,""MHC class I chain-related protein...",1035582,IEDB_REFERENCE:1035582,...,,,,,,,,,,
4,1328560,IEDB_ASSAY:1328560,11,IEDB_EPITOPE:11,AAAAALDKKQRNFDKILA,Linear peptide,AAAAALDKKQRNFDKILA,"(P12883.5,Myosin-7,UNIPROT:P12883.5,1437,1454,...",1002200,IEDB_REFERENCE:1002200,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1686217,IEDB_ASSAY:1686217,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9996,1686218,IEDB_ASSAY:1686218,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9997,1686228,IEDB_ASSAY:1686228,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,
9998,1689978,IEDB_ASSAY:1689978,13572,IEDB_EPITOPE:13572,ENPVVHFFKNIVTPR,Linear peptide,ENPVVHFFKNIVTPR,"(AAH08749.3,""MBP protein"",GENPEPT:AAH08749.3,8...",315040,IEDB_REFERENCE:315040,...,,,,,,,,,,


In [14]:
# List each disease key with its DOID: prefix.
for doid in diseases:
    print(f'DOID:{doid}')

DOID:417
DOID:437
DOID:676
DOID:718
DOID:986
DOID:2048
DOID:2377
DOID:2988
DOID:3492
DOID:4313
DOID:5213
DOID:6196
DOID:7147
DOID:7148
DOID:7188
DOID:8506
DOID:8577
DOID:8781
DOID:8857
DOID:8869
DOID:8893
DOID:8924
DOID:9008
DOID:9074
DOID:9182
DOID:9744
DOID:9808
DOID:10608
DOID:11656
DOID:12132
DOID:12236
DOID:12297
DOID:12306
DOID:12361
DOID:12842
DOID:12894
DOID:13241
DOID:14040
DOID:14482
DOID:0040087
DOID:0040088
DOID:0040089
DOID:0040092
DOID:0040094
DOID:0040096
DOID:0040097
DOID:0050167
DOID:0050169
DOID:0050214
DOID:0060004
DOID:0060030
DOID:0060032
DOID:0060039
DOID:0060050
DOID:0060051
DOID:0060499
