In [1]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

# Load the autoimmune disease mapping; pull_iedb_assay_data iterates over its keys.
# JSON is read explicitly as UTF-8 so decoding does not depend on the platform locale.
with open('../autoimmune_diseases.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

In [2]:
def pull_iedb_assay_data(table):
    '''
    Download the positive T cell or B cell assays of every disease in the
    module-level ``diseases`` mapping from the IEDB query API.

    Parameters
    ----------
    table : str
        'tcell' or 'bcell'; selects the ``<table>_search`` endpoint.

    Returns
    -------
    pandas.DataFrame
        Every CSV page of every disease concatenated in request order (the
        per-page row index produced by ``read_csv`` is kept as is). An empty
        DataFrame is returned when no page contained data.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with a 4xx/5xx status.

    Notes
    -----
    Reads the global ``diseases``; each key is sent as ``DOID:<key>``.
    '''
    page_size = 10000  # API limit is 10,000 entries per request
    url = 'https://query-api.iedb.org/%s_search' % table

    frames = []
    for doid in diseases.keys():
        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative',  # select positive assays only
                  'disease_iris': f'cs.{{DOID:{doid}}}'}

        # first request only serves to learn the exact number of matching rows
        r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
        r.raise_for_status()
        total = int(r.headers['Content-Range'].split('/')[-1])

        # step through the result set one page at a time; stepping by offset
        # (instead of total // page_size + 1) never asks for a page past the end
        for offset in range(0, total, page_size):
            s = requests.get(url,
                             params={**params, 'offset': offset},
                             headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
            # fail loudly instead of parsing an error body as if it were CSV
            s.raise_for_status()
            try:
                frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
            except pd.errors.EmptyDataError:
                continue

    # concatenate once at the end rather than re-copying the frame on every page
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [3]:
# Fetch both assay tables from the IEDB API (T cell first, then B cell).
tcell = pull_iedb_assay_data(table='tcell')
bcell = pull_iedb_assay_data(table='bcell')

In [4]:
# select epitopes where host and source organism are identical
tdf = tcell[(tcell['host_organism_iri'] == tcell['parent_source_antigen_source_org_iri']) | (tcell['host_organism_iri'] == tcell['r_object_source_organism_iri'])]
bdf = bcell[(bcell['host_organism_iri'] == bcell['parent_source_antigen_source_org_iri']) | (bcell['host_organism_iri'] == bcell['r_object_source_organism_iri'])]

In [62]:
# Each entry is (column to write, preferred column, fallback used where the preferred one is null).
_source_fallbacks = (
    ('source_antigen_iri', 'parent_source_antigen_iri', 'r_object_source_molecule_iri'),
    ('source_antigen_name', 'parent_source_antigen_name', 'r_object_source_molecule_name'),
    ('source_organism_name', 'source_organism_name', 'r_object_source_organism_name'),
)

# Apply the same null-filling to the T cell and the B cell frames.
for _frame in (tdf, bdf):
    for _target, _preferred, _fallback in _source_fallbacks:
        _frame[_target] = _frame[_preferred].fillna(_frame[_fallback])

In [63]:
# Tally how many B cell rows have / lack a source organism name after the fill.
pd.isna(bdf['source_organism_name']).value_counts()

False    31218
Name: source_organism_name, dtype: int64

In [42]:
 
# Print, for each source antigen IRI, the parent antigen names of its T cell rows.
for _antigen_iri, antigen_rows in tdf.groupby('source_antigen_iri'):
    parent_names = list(antigen_rows['parent_source_antigen_name'])
    print(parent_names)
    

['T cell receptor beta, variable 10 (Fragment)']
['Immunoglobulin lambda variable 4-69', 'Immunoglobulin lambda variable 4-69']
['T cell receptor beta variable 19']
['T cell receptor beta variable 20-1', 'T cell receptor beta variable 20-1']
['T cell receptor beta variable 25-1']
['T cell receptor alpha variable 6']
['T cell receptor alpha variable 23/delta variable 6']
['T cell receptor alpha variable 30']
['T cell receptor beta variable 15 (Fragment)']
['Probable non-functional T cell receptor beta variable 23-1']
['T cell receptor alpha variable 16']
['T cell receptor alpha variable 19']
['T cell receptor beta variable 9']
['Immunoglobulin heavy variable 6-1']
['Immunoglobulin heavy variable 3-74']
['T cell receptor alpha variable 13-2', 'T cell receptor alpha variable 13-2']
['T cell receptor alpha variable 8-2']
['T cell receptor alpha variable 13-1', 'T cell receptor alpha variable 13-1']
['T cell receptor alpha variable 5']
['T cell receptor alpha variable 8-6']
['T cell recepto

In [74]:
def _first_non_null(values):
    '''Return the first non-null entry of a Series, or None when every entry is null.'''
    present = values.dropna()
    return present.iloc[0] if len(present) else None


# One summary row per source antigen:
# [antigen IRI, antigen name, diseases, #references, #epitopes, #assays, organism name]
a_b_cell_counts = []
for antigen_iri, assays in bdf.groupby('source_antigen_iri'):
    # Kept in a local name on purpose: binding this to `diseases` would replace
    # the module-level `diseases` mapping that pull_iedb_assay_data reads.
    # Nulls are dropped so the join cannot hit a non-string, and the names are
    # sorted so the joined text is the same on every run.
    disease_labels = sorted(set(assays['disease_names'].dropna()))

    a_b_cell_counts.append([
        antigen_iri,
        _first_non_null(assays['source_antigen_name']),
        ', '.join(disease_labels),
        assays['reference_id'].nunique(dropna=False),   # distinct references
        assays['structure_id'].nunique(dropna=False),   # distinct epitopes
        len(assays),                                    # assay rows
        _first_non_null(assays['source_organism_name']),
    ])

In [75]:
# Wrap the per-antigen rows in a DataFrame with labelled columns.
_summary_columns = [
    'Protein ID',
    'Protein Name',
    'Diseases',
    'Reference Count',
    'Epitope Count',
    'Assay Count',
    'Parent Species',
]
a_b_cell_counts = pd.DataFrame(data=a_b_cell_counts, columns=_summary_columns)

In [77]:
# Keep only antigens backed by more than one reference.
_has_multiple_references = a_b_cell_counts['Reference Count'].gt(1)
a_b_cell_counts = a_b_cell_counts.loc[_has_multiple_references]

In [106]:
# Write the T cell rows whose host organism equals the related object's source organism to test.csv.
_related_object_is_host = tcell['host_organism_iri'].eq(tcell['r_object_source_organism_iri'])
tcell.loc[_related_object_is_host].to_csv('test.csv')

In [108]:
# Save the filtered T cell frame next to the notebook.
tdf.to_csv(path_or_buf='tcell.csv')

In [121]:
# Write the T cell rows that have neither a parent source antigen IRI nor a
# related-object source molecule IRI to test.csv.
_no_parent_iri = tdf['parent_source_antigen_iri'].isna()
_no_molecule_iri = tdf['r_object_source_molecule_iri'].isna()
tdf.loc[_no_parent_iri & _no_molecule_iri].to_csv('test.csv')


In [39]:
def pull_iedb_assay_data(table):
    '''
    Download positive, cancer-related T cell or B cell assays from the IEDB
    query API.

    Rows are requested where ``qualitative_measure`` is not ``Negative`` and
    either ``e_related_object_type`` equals ``neo-epitope`` or
    ``immunization_description`` matches the phrase "Occurrence of cancer"
    (``plfts`` operator).

    NOTE: this redefines the same-named function from an earlier cell, which
    queried per disease; after this cell runs, only this version is callable.

    Parameters
    ----------
    table : str
        'tcell' or 'bcell'; selects the ``<table>_search`` endpoint.

    Returns
    -------
    pandas.DataFrame
        Every CSV page concatenated in request order (the per-page row index
        produced by ``read_csv`` is kept as is). An empty DataFrame is
        returned when no page contained data.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with a 4xx/5xx status.
    '''
    page_size = 10000  # API limit is 10,000 entries per request
    url = 'https://query-api.iedb.org/%s_search' % table
    params = {'order': 'structure_id',
              'qualitative_measure': 'neq.Negative',
              'or': '(e_related_object_type.eq.neo-epitope, immunization_description.plfts."Occurrence of cancer")'}

    # first request only serves to learn the exact number of matching rows
    r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
    r.raise_for_status()
    total = int(r.headers['Content-Range'].split('/')[-1])

    # step through the result set one page at a time; stepping by offset
    # (instead of total // page_size + 1) never asks for a page past the end
    frames = []
    for offset in range(0, total, page_size):
        s = requests.get(url,
                         params={**params, 'offset': offset},
                         headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
        # fail loudly instead of parsing an error body as if it were CSV
        s.raise_for_status()
        try:
            frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
        except pd.errors.EmptyDataError:
            continue

    # concatenate once at the end rather than re-copying the frame on every page
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [41]:
# Pull both assay tables again with the function defined in the cell above;
# the print marks the point between the T cell and the B cell download.
tcell = pull_iedb_assay_data(table='tcell')
print('hi')
bcell = pull_iedb_assay_data(table='bcell')

hi


In [42]:
# Show the `tcell` DataFrame as this cell's output.
tcell

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri_search,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type
0,4958,IEDB_ASSAY:4958,155,IEDB_EPITOPE:155,AAGIGILTV,Linear peptide,AAGIGILTV,"(Q16655.1,""Melanoma antigen recognized by T-ce...",573,IEDB_REFERENCE:573,...,,,,,,,,,,
1,1243434,IEDB_ASSAY:1243434,155,IEDB_EPITOPE:155,AAGIGILTV,Linear peptide,AAGIGILTV,"(Q16655.1,""Melanoma antigen recognized by T-ce...",1000806,IEDB_REFERENCE:1000806,...,,,,,,,,,,
2,1775498,IEDB_ASSAY:1775498,155,IEDB_EPITOPE:155,AAGIGILTV,Linear peptide,AAGIGILTV,"(Q16655.1,""Melanoma antigen recognized by T-ce...",1013533,IEDB_REFERENCE:1013533,...,,,,,,,,,,
3,1779719,IEDB_ASSAY:1779719,155,IEDB_EPITOPE:155,AAGIGILTV,Linear peptide,AAGIGILTV,"(Q16655.1,""Melanoma antigen recognized by T-ce...",1013533,IEDB_REFERENCE:1013533,...,,,,,,,,,,
4,1907776,IEDB_ASSAY:1907776,155,IEDB_EPITOPE:155,AAGIGILTV,Linear peptide,AAGIGILTV,"(Q16655.1,""Melanoma antigen recognized by T-ce...",1013616,IEDB_REFERENCE:1013616,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7301,21117517,IEDB_ASSAY:21117517,2135599,IEDB_EPITOPE:2135599,YIPPHSALCL,Linear peptide,YIPPHSALCL,,1040922,IEDB_REFERENCE:1040922,...,,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:M0R2K9,Peptidylprolyl isomerase (Fragment) (UniProt:M...,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),neo-epitope
7302,21117665,IEDB_ASSAY:21117665,2135601,IEDB_EPITOPE:2135601,YPYLMEAYW,Linear peptide,YPYLMEAYW,,1040922,IEDB_REFERENCE:1040922,...,,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P82251,"b(0,+)-type amino acid transporter 1","{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),neo-epitope
7303,21117666,IEDB_ASSAY:21117666,2135601,IEDB_EPITOPE:2135601,YPYLMEAYW,Linear peptide,YPYLMEAYW,,1040922,IEDB_REFERENCE:1040922,...,,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:P82251,"b(0,+)-type amino acid transporter 1","{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),neo-epitope
7304,21117582,IEDB_ASSAY:21117582,2135602,IEDB_EPITOPE:2135602,YSKDVLPRL,Linear peptide,YSKDVLPRL,,1040922,IEDB_REFERENCE:1040922,...,,,,"{BFO:0000040,PR:000000001,taxon_protein:2759,t...",UNIPROT:Q9NVM6,DnaJ homolog subfamily C member 17,"{NCBITaxon:1,NCBITaxon:2759,NCBITaxon:314295,N...",NCBITaxon:9606,Homo sapiens (human),neo-epitope
