In [1]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

# Disease lookup table; its keys are used by pull_iedb_assay_data to build
# the "DOID:<key>" filter for each IEDB query.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('../autoimmune_diseases.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

In [2]:
def pull_iedb_assay_data(table):
    '''
    Extracts T cell and B cell positive assay data from the IEDB.
    Parameter table = 'tcell' or 'bcell' to specify.

    One query is issued per key of the module-level ``diseases`` mapping
    (each key is sent as ``DOID:<key>``), and every query is paged through
    in chunks of 10,000 rows.

    Returns a single DataFrame holding all pages of all diseases (the
    per-page row index is kept as returned by ``pd.read_csv``), or an empty
    DataFrame when nothing was found.

    Raises ``requests.HTTPError`` when the API answers with an error status,
    instead of parsing the error body as if it were CSV data.
    '''

    page_size = 10000     # API limit is 10,000 entries per request
    request_timeout = 120  # seconds; keeps a stalled connection from hanging the notebook
    url = 'https://query-api.iedb.org/%s_search' % table

    # Collect the pages and concatenate once at the end: growing a DataFrame
    # with pd.concat inside the loop re-copies all previous rows every time.
    frames = []
    for doid in diseases.keys():

        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative', # select positive assays only
                  'disease_iris': f'cs.{{DOID:{doid}}}'}

        # first get the total number of assays as first request to loop through API
        r = requests.get(url, params=params, headers={'Prefer': 'count=exact'},
                         timeout=request_timeout)
        r.raise_for_status()
        total = int(r.headers['Content-Range'].split('/')[-1])

        # loop through IEDB API pages using requests - read into pandas DataFrame.
        # Stepping by page_size up to total never requests an offset past the
        # last row (the previous `total // 10000 + 1` did so for zero rows and
        # for exact multiples of the page size).
        for offset in range(0, total, page_size):
            page_params = {**params, 'offset': offset}

            # request API call returning csv formatting using parameters in page_params
            s = requests.get(url, params=page_params,
                             headers={'accept': 'text/csv', 'Prefer': 'count=exact'},
                             timeout=request_timeout)
            s.raise_for_status()
            try:
                frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
            except pd.errors.EmptyDataError:
                # an empty body just means this page has no rows
                continue

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [3]:
# Download the positive assays for each assay table (network calls to the IEDB API).
tcell = pull_iedb_assay_data(table='tcell')
bcell = pull_iedb_assay_data(table='bcell')

In [4]:
def _host_is_source_organism(assays):
    '''
    Return the rows of ``assays`` whose host organism IRI equals either the
    parent source antigen's organism IRI or the related object's source
    organism IRI.

    The result is an explicit copy, so columns can be added to it later
    without writing into a slice of the downloaded frame.
    '''
    host = assays['host_organism_iri']
    matches_parent = host == assays['parent_source_antigen_source_org_iri']
    matches_r_object = host == assays['r_object_source_organism_iri']
    return assays[matches_parent | matches_r_object].copy()


# select epitopes where host and source organism are identical
tdf = _host_is_source_organism(tcell)
bdf = _host_is_source_organism(bcell)

In [62]:
def _coalesce_source_columns(assays):
    '''
    Return a new frame with the source antigen / organism columns filled in.

    - ``source_antigen_iri``: parent source antigen IRI, falling back to the
      related object's source molecule IRI.
    - ``source_antigen_name``: parent source antigen name, falling back to the
      related object's source molecule name.
    - ``source_organism_name``: existing value, falling back to the related
      object's source organism name.

    ``assign`` builds a new DataFrame, so nothing is written into the filtered
    slice that ``assays`` may be.
    '''
    return assays.assign(
        source_antigen_iri=assays['parent_source_antigen_iri'].fillna(assays['r_object_source_molecule_iri']),
        source_antigen_name=assays['parent_source_antigen_name'].fillna(assays['r_object_source_molecule_name']),
        source_organism_name=assays['source_organism_name'].fillna(assays['r_object_source_organism_name']),
    )


tdf = _coalesce_source_columns(tdf)
bdf = _coalesce_source_columns(bdf)

In [63]:
# Sanity check: how many B cell rows have / lack a source organism name.
pd.isna(bdf['source_organism_name']).value_counts()

False    31218
Name: source_organism_name, dtype: int64

In [42]:
 
# Debug view: for every source antigen IRI, print the parent antigen names of its assay rows.
for _antigen_iri, antigen_assays in tdf.groupby('source_antigen_iri'):
    parent_names = antigen_assays['parent_source_antigen_name'].tolist()
    print(parent_names)
    

['T cell receptor beta, variable 10 (Fragment)']
['Immunoglobulin lambda variable 4-69', 'Immunoglobulin lambda variable 4-69']
['T cell receptor beta variable 19']
['T cell receptor beta variable 20-1', 'T cell receptor beta variable 20-1']
['T cell receptor beta variable 25-1']
['T cell receptor alpha variable 6']
['T cell receptor alpha variable 23/delta variable 6']
['T cell receptor alpha variable 30']
['T cell receptor beta variable 15 (Fragment)']
['Probable non-functional T cell receptor beta variable 23-1']
['T cell receptor alpha variable 16']
['T cell receptor alpha variable 19']
['T cell receptor beta variable 9']
['Immunoglobulin heavy variable 6-1']
['Immunoglobulin heavy variable 3-74']
['T cell receptor alpha variable 13-2', 'T cell receptor alpha variable 13-2']
['T cell receptor alpha variable 8-2']
['T cell receptor alpha variable 13-1', 'T cell receptor alpha variable 13-1']
['T cell receptor alpha variable 5']
['T cell receptor alpha variable 8-6']
['T cell recepto

In [64]:
def _first_non_null(values):
    '''Return the first non-null entry of ``values``, or None when every entry is null.'''
    present = values.dropna()
    return present.iloc[0] if len(present) else None


# One summary row per source antigen:
# [antigen IRI, disease names, antigen name, distinct references,
#  distinct structures, assay rows, source organism name]
a_t_cell_counts = []
for antigen_iri, antigen_assays in tdf.groupby('source_antigen_iri'):
    # Deliberately NOT called `diseases`: that name is the module-level lookup
    # loaded from autoimmune_diseases.json and read by pull_iedb_assay_data;
    # rebinding it here would break any later re-run of the download.
    # sorted() makes the joined text independent of set iteration order.
    disease_labels = sorted(set(antigen_assays['disease_names'].dropna()))

    a_t_cell_counts.append([
        antigen_iri,
        ', '.join(disease_labels),
        _first_non_null(antigen_assays['source_antigen_name']),
        antigen_assays['reference_id'].nunique(dropna=False),
        antigen_assays['structure_id'].nunique(dropna=False),
        len(antigen_assays),
        _first_non_null(antigen_assays['source_organism_name']),
    ])

In [65]:
# Show the per-antigen summary rows; each entry is
# [antigen IRI, disease names, antigen name, distinct references,
#  distinct structures, assay rows, source organism name].
a_t_cell_counts

[['UNIPROT:A0A075B5I2',
  '{"rheumatoid arthritis"}',
  'T cell receptor beta, variable 10 (Fragment)',
  1,
  1,
  1,
  'Mus musculus (mouse)'],
 ['UNIPROT:A0A075B6H9',
  '{"multiple sclerosis"}',
  'Immunoglobulin lambda variable 4-69',
  1,
  2,
  2,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A075B6N1',
  '{"rheumatoid arthritis"}',
  'T cell receptor beta variable 19',
  1,
  1,
  1,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A075B6N2',
  '{"multiple sclerosis"}',
  'T cell receptor beta variable 20-1',
  1,
  2,
  2,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A075B6N4',
  '{"multiple sclerosis"}',
  'T cell receptor beta variable 25-1',
  1,
  1,
  1,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A075B6T7',
  '{"multiple sclerosis"}',
  'T cell receptor alpha variable 6',
  1,
  1,
  1,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A075B6W5',
  '{"multiple sclerosis"}',
  'T cell receptor alpha variable 23/delta variable 6',
  1,
  1,
  1,
  'Homo sapiens (human)'],
 ['UNIPROT:A0A087WSZ9',
  '{"multip

In [106]:
# Debug export: T cell rows whose host organism equals the related object's source organism.
host_matches_r_object = tcell['host_organism_iri'] == tcell['r_object_source_organism_iri']
tcell.loc[host_matches_r_object].to_csv('test.csv')

In [108]:
# Persist the filtered T cell assay table next to the notebook.
tdf.to_csv(path_or_buf='tcell.csv')

In [121]:
# Debug export: T cell rows with neither a parent source antigen IRI nor a
# related-object source molecule IRI.
# NOTE(review): this writes to the same 'test.csv' as the earlier debug export
# of `tcell` and replaces it - confirm that is intended.
coalesced_antigen_iri = tdf['parent_source_antigen_iri'].fillna(tdf['r_object_source_molecule_iri'])
tdf[coalesced_antigen_iri.isna()].to_csv('test.csv')
