In [9]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

with open('../autoimmune_diseases.json' , 'r') as f:
    diseases = json.load(f)

In [38]:
def pull_iedb_assay_data(table):
    '''
    Extracts T cell and B cell positive assay data from the IEDB.
    Parameter table = 'tcell' or 'bcell' to specify. 
    '''
    
    df = pd.DataFrame()
    for doid in diseases.keys(): 

        # first get the total number of assays as first request to loop through API
        url = 'https://query-api.iedb.org/%s_search' % table
        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative', # select positive assays only
                  'disease_iris': f'cs.{{{"DOID:"+doid}}}'} 
        r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
        pages = int(r.headers['Content-Range'].split('/')[-1])
      
      # loop through IEDB API pages using requests - read into pandas DataFrame and concat
        for i in range(pages // 10000 + 1): # API limit is 10,000 entries
            params['offset'] = i*10000

            # request API call returning csv formatting using parameters in params
            s = requests.get(url, params=params, headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
            try:
                df = pd.concat([df, pd.read_csv(io.StringIO(s.content.decode('utf-8')))])
            except pd.errors.EmptyDataError:
                continue

    return df

In [39]:
df = pull_iedb_assay_data('tcell')

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [44]:
df['host_organism_iri'].value_counts()

NCBITaxon:9606     9552
NCBITaxon:10090       2
NCBITaxon:9823        1
Name: host_organism_iri, dtype: int64

In [55]:
df[['parent_source_antigen_source_org_iri', 'r_object_source_organism_iri']].value_counts(dropna=False)

parent_source_antigen_source_org_iri  r_object_source_organism_iri
NCBITaxon:9606                        NaN                             6567
NaN                                   NaN                             1127
NCBITaxon:10090                       NaN                              673
NaN                                   NCBITaxon:9606                   489
NCBITaxon:10376                       NaN                              159
                                                                      ... 
NCBITaxon:2024894                     NaN                                1
NCBITaxon:202950                      NaN                                1
NCBITaxon:450                         NaN                                1
NCBITaxon:2066                        NaN                                1
NCBITaxon:1005962                     NaN                                1
Length: 215, dtype: int64

In [62]:
df['combined_source_organism_iri'] = df['parent_source_antigen_source_org_iri'].fillna(df['r_object_source_organism_iri'])

In [69]:
df[df['host_organism_iri'] == df['combined_source_organism_iri']]['curated_source_antigen'].value_counts()

(Q7LDG7.1,"RAS guanyl-releasing protein 2",UNIPROT:Q7LDG7.1,78,87,"Homo sapiens (human)",NCBITaxon:9606)                                                                                70
(NP_000198.1,"proinsulin precursor",GENPEPT:NP_000198.1,34,42,"Homo sapiens (human)",NCBITaxon:9606)                                                                                    60
(AAH08749.3,"MBP protein",GENPEPT:AAH08749.3,84,98,"Homo sapiens (human)",NCBITaxon:9606)                                                                                               58
(EAW67174.1,"dihydrolipoamide S-acetyltransferase (E2 component of pyruvate dehydrogenase complex), isoform CRA_a",GENPEPT:EAW67174.1,249,262,"Homo sapiens (human)",NCBITaxon:9606)    52
(SRC279945,"Myelin basic protein",ONTIE:0002546,,,"Homo sapiens (human)",NCBITaxon:9606)                                                                                                42
                                                                 

In [75]:
df.dropna(subset=['r_object_source_organism_iri'])

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,combined_source_organism_iri
7,1773329,IEDB_ASSAY:1773329,131861,IEDB_EPITOPE:131861,ELAVLWVVKSTPASK,Linear peptide,ELAVLWVVKSTPASK,,1017528,IEDB_REFERENCE:1017528,...,,,"{BFO:0000040,PR:000000001,taxon_protein:10088,...",UNIPROT:P04919,Band 3 anion transport protein,"{NCBITaxon:1,NCBITaxon:10088,NCBITaxon:10090,N...",NCBITaxon:10090,Mus musculus (mouse),analog,NCBITaxon:10090
29,2854862,IEDB_ASSAY:2854862,549118,IEDB_EPITOPE:549118,ELAGIGILT,Linear peptide,ELAGIGILT,,1030572,IEDB_REFERENCE:1030572,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:Q16655,Melanoma antigen recognized by T-cells 1,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
21,2885449,IEDB_ASSAY:2885449,13563,IEDB_EPITOPE:13563,ENPVVAFFKNIVTPR,Linear peptide,ENPVVAFFKNIVTPR,,1015435,IEDB_REFERENCE:1015435,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:J3QL64,Myelin basic protein (UniProt:J3QL64),"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
22,2868334,IEDB_ASSAY:2868334,13563,IEDB_EPITOPE:13563,ENPVVAFFKNIVTPR,Linear peptide,ENPVVAFFKNIVTPR,,1015435,IEDB_REFERENCE:1015435,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:J3QL64,Myelin basic protein (UniProt:J3QL64),"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
23,1661594,IEDB_ASSAY:1661594,13563,IEDB_EPITOPE:13563,ENPVVAFFKNIVTPR,Linear peptide,ENPVVAFFKNIVTPR,,200028,IEDB_REFERENCE:200028,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P02686,Myelin basic protein (UniProt:P02686),"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,12761153,IEDB_ASSAY:12761153,27469,IEDB_EPITOPE:27469,IMDQVPFSV,Linear peptide,IMDQVPFSV,,1038026,IEDB_REFERENCE:1038026,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P40967,Melanocyte protein PMEL,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
19,4583496,IEDB_ASSAY:4583496,74641,IEDB_EPITOPE:74641,YLEPGPVTV,Linear peptide,YLEPGPVTV,,1033693,IEDB_REFERENCE:1033693,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P40967,Melanocyte protein PMEL,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
20,4583492,IEDB_ASSAY:4583492,75043,IEDB_EPITOPE:75043,YMDGTMSQV,Linear peptide,YMDGTMSQV,,1033693,IEDB_REFERENCE:1033693,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P14679,Tyrosinase,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
33,12761160,IEDB_ASSAY:12761160,1336938,IEDB_EPITOPE:1336938,ILDQVPFSV,Linear peptide,ILDQVPFSV + NLeu(L2),,1038026,IEDB_REFERENCE:1038026,...,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P40967,Melanocyte protein PMEL,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),analog,NCBITaxon:9606
