In [53]:
import warnings
warnings.filterwarnings("ignore")

import io
import requests
import json
import gzip
import numpy as np
import pandas as pd

from pepmatch import Preprocessor, Matcher
from collections import Counter
from Bio import SeqIO

# Load the autoimmune disease table. Its keys are the disease IDs that
# pull_autoimmune_data turns into `DOID:<key>` filters.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on the
# platform default.
with open('../autoimmune_diseases.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

In [7]:
def pull_autoimmune_data(table, disease_ids=None):
    '''
    Extracts positive T cell or B cell assay data from the IEDB for a set of
    autoimmune diseases.

    Parameters
    ----------
    table : str
        'tcell' or 'bcell'; selects the `<table>_search` endpoint.
    disease_ids : iterable of str, optional
        Disease IDs without the 'DOID:' prefix. Defaults to the keys of the
        notebook-level `diseases` mapping, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        Every page of every disease concatenated (each page keeps its own row
        index). An empty DataFrame if no page could be read.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with an error status.
    '''
    page_size = 10000  # API limit is 10,000 entries per request
    url = 'https://query-api.iedb.org/%s_search' % table

    if disease_ids is None:
        disease_ids = diseases.keys()

    # Collect pages in a list and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows each time.
    frames = []
    for doid in disease_ids:
        params = {'order': 'structure_id',
                  'qualitative_measure': 'neq.Negative',  # select positive assays only
                  'disease_iris': 'cs.{DOID:%s}' % doid}

        # first request: only used to read the exact total from Content-Range
        r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
        r.raise_for_status()
        total = int(r.headers['Content-Range'].split('/')[-1])

        # Ceiling division so an exact multiple of page_size does not trigger
        # an extra out-of-range request; always make at least one request.
        n_pages = max(1, -(-total // page_size))
        for page in range(n_pages):
            params['offset'] = page * page_size

            # request API call returning csv formatting using parameters in params
            s = requests.get(url, params=params, headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
            # Fail loudly instead of parsing an error payload as CSV data.
            s.raise_for_status()
            try:
                frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
            except pd.errors.EmptyDataError:
                # page came back with an empty body: nothing to add
                continue

    return pd.concat(frames) if frames else pd.DataFrame()

In [3]:
# Pull positive T cell and B cell assays for the autoimmune disease list.
# `pull_iedb_assay_data` is not defined anywhere in this notebook; the function
# that performs this pull is pull_autoimmune_data.
tcell = pull_autoimmune_data('tcell')
bcell = pull_autoimmune_data('bcell')

In [4]:
# select epitopes where host and source organism are identical: the host IRI
# must equal either the parent source antigen organism IRI or the related
# object source organism IRI.
# .copy() makes tdf/bdf independent frames, so columns added to them later are
# not written to a slice of tcell/bcell.
tdf = tcell[(tcell['host_organism_iri'] == tcell['parent_source_antigen_source_org_iri']) |
            (tcell['host_organism_iri'] == tcell['r_object_source_organism_iri'])].copy()
bdf = bcell[(bcell['host_organism_iri'] == bcell['parent_source_antigen_source_org_iri']) |
            (bcell['host_organism_iri'] == bcell['r_object_source_organism_iri'])].copy()

In [62]:
# Combine epitope-level and related-object-level source fields: take the
# epitope's value and fall back to the related object's value where it is missing.
# .assign returns a new frame instead of writing columns into a filtered slice,
# so the result does not depend on whether tdf/bdf are views of tcell/bcell.
tdf = tdf.assign(
    source_antigen_iri=tdf['parent_source_antigen_iri'].fillna(tdf['r_object_source_molecule_iri']),
    source_antigen_name=tdf['parent_source_antigen_name'].fillna(tdf['r_object_source_molecule_name']),
    source_organism_name=tdf['source_organism_name'].fillna(tdf['r_object_source_organism_name']),
)
bdf = bdf.assign(
    source_antigen_iri=bdf['parent_source_antigen_iri'].fillna(bdf['r_object_source_molecule_iri']),
    source_antigen_name=bdf['parent_source_antigen_name'].fillna(bdf['r_object_source_molecule_name']),
    source_organism_name=bdf['source_organism_name'].fillna(bdf['r_object_source_organism_name']),
)

In [63]:
# Sanity check: how many B cell rows have / lack a source organism name?
organism_missing = bdf['source_organism_name'].isna()
organism_missing.value_counts()

False    31218
Name: source_organism_name, dtype: int64

In [42]:
 
# Print the parent source antigen names found under each combined antigen IRI.
for _antigen_iri, antigen_rows in tdf.groupby('source_antigen_iri'):
    parent_names = list(antigen_rows['parent_source_antigen_name'])
    print(parent_names)
    

['T cell receptor beta, variable 10 (Fragment)']
['Immunoglobulin lambda variable 4-69', 'Immunoglobulin lambda variable 4-69']
['T cell receptor beta variable 19']
['T cell receptor beta variable 20-1', 'T cell receptor beta variable 20-1']
['T cell receptor beta variable 25-1']
['T cell receptor alpha variable 6']
['T cell receptor alpha variable 23/delta variable 6']
['T cell receptor alpha variable 30']
['T cell receptor beta variable 15 (Fragment)']
['Probable non-functional T cell receptor beta variable 23-1']
['T cell receptor alpha variable 16']
['T cell receptor alpha variable 19']
['T cell receptor beta variable 9']
['Immunoglobulin heavy variable 6-1']
['Immunoglobulin heavy variable 3-74']
['T cell receptor alpha variable 13-2', 'T cell receptor alpha variable 13-2']
['T cell receptor alpha variable 8-2']
['T cell receptor alpha variable 13-1', 'T cell receptor alpha variable 13-1']
['T cell receptor alpha variable 5']
['T cell receptor alpha variable 8-6']
['T cell recepto

In [74]:
# Build one summary row per source antigen from the B cell assays:
# [antigen IRI, antigen name, diseases, #references, #epitopes, #assays, organism].
# The loop variables deliberately avoid the name `diseases`: that name holds the
# disease mapping read by pull_autoimmune_data and must not be overwritten here.
a_b_cell_counts = []
for antigen_iri, antigen_rows in bdf.groupby('source_antigen_iri'):
    antigen_names = antigen_rows['source_antigen_name'].dropna()
    organism_names = antigen_rows['source_organism_name'].dropna()
    # sorted() keeps the joined string identical between runs (set order is not);
    # dropna() keeps a missing disease name from breaking the join.
    disease_list = sorted(set(antigen_rows['disease_names'].dropna()))

    a_b_cell_counts.append([
        antigen_iri,
        # first available name, or None when the whole group has no name
        antigen_names.iloc[0] if len(antigen_names) else None,
        ', '.join(disease_list),
        len(antigen_rows['reference_id'].unique()),
        len(antigen_rows['structure_id'].unique()),
        len(antigen_rows),
        organism_names.iloc[0] if len(organism_names) else None,
    ])

In [75]:
# Turn the per-antigen summary rows into a labelled table.
count_columns = [
    'Protein ID',
    'Protein Name',
    'Diseases',
    'Reference Count',
    'Epitope Count',
    'Assay Count',
    'Parent Species',
]
a_b_cell_counts = pd.DataFrame(a_b_cell_counts, columns=count_columns)

In [77]:
# Keep only antigens backed by more than one reference.
has_multiple_references = a_b_cell_counts['Reference Count'].gt(1)
a_b_cell_counts = a_b_cell_counts.loc[has_multiple_references]

In [106]:
# Debug export: T cell assays whose host organism IRI equals the related
# object's source organism IRI.
host_matches_related_object = tcell['host_organism_iri'] == tcell['r_object_source_organism_iri']
tcell[host_matches_related_object].to_csv('test.csv')

In [108]:
# Write the filtered T cell assays to disk for inspection.
tdf.to_csv(path_or_buf='tcell.csv')

In [121]:
# Debug export: filtered T cell assays with no antigen IRI on either the epitope
# or the related object. Writes to 'test.csv', replacing any earlier file of that name.
combined_antigen_iri = tdf['parent_source_antigen_iri'].fillna(tdf['r_object_source_molecule_iri'])
rows_without_antigen = tdf[combined_antigen_iri.isna()]
rows_without_antigen.to_csv('test.csv')


In [5]:
def pull_cancer_data(table):
    '''
    Extracts positive T cell or B cell assay data from the IEDB for cancer:
    rows whose related object type is 'neo-epitope' or whose immunization
    description matches "Occurrence of cancer".

    Parameters
    ----------
    table : str
        'tcell' or 'bcell'; selects the `<table>_search` endpoint.

    Returns
    -------
    pandas.DataFrame
        Every page concatenated (each page keeps its own row index).
        An empty DataFrame if no page could be read.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with an error status.
    '''
    page_size = 10000  # API limit is 10,000 entries per request

    # first get the total number of assays as first request to loop through API
    url = 'https://query-api.iedb.org/%s_search' % table
    params = {'order': 'structure_id',
              'qualitative_measure': 'neq.Negative',  # select positive assays only
              'or': '(e_related_object_type.eq.neo-epitope, immunization_description.plfts."Occurrence of cancer")'}
    r = requests.get(url, params=params, headers={'Prefer': 'count=exact'})
    r.raise_for_status()
    total = int(r.headers['Content-Range'].split('/')[-1])

    # Ceiling division so an exact multiple of page_size does not trigger an
    # extra out-of-range request; always make at least one request.
    n_pages = max(1, -(-total // page_size))

    # Collect pages in a list and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows each time.
    frames = []
    for page in range(n_pages):
        params['offset'] = page * page_size

        # request API call returning csv formatting using parameters in params
        s = requests.get(url, params=params, headers={'accept': 'text/csv', 'Prefer': 'count=exact'})
        # Fail loudly instead of parsing an error payload as CSV data.
        s.raise_for_status()
        try:
            frames.append(pd.read_csv(io.StringIO(s.content.decode('utf-8'))))
        except pd.errors.EmptyDataError:
            # page came back with an empty body: nothing to add
            continue

    return pd.concat(frames) if frames else pd.DataFrame()

In [6]:
# Download the cancer and autoimmune assay sets, T cell first and then B cell.
assay_tables = ('tcell', 'bcell')

tcell_cancer, bcell_cancer = (pull_cancer_data(table) for table in assay_tables)
tcell_autoimmune, bcell_autoimmune = (pull_autoimmune_data(table) for table in assay_tables)

In [16]:
def _select_self_antigen_assays(assays):
    '''Keep assays whose host organism IRI equals the parent source antigen
    organism IRI or the related object source organism IRI, and add the
    combined source antigen / organism columns.'''
    host = assays['host_organism_iri']
    is_self = ((host == assays['parent_source_antigen_source_org_iri']) |  # OR
               (host == assays['r_object_source_organism_iri']))
    selected = assays[is_self]

    # .assign returns a new frame, so nothing is written into a filtered slice.
    # Each combined column takes the epitope's value and falls back to the
    # related object's value where the epitope's is missing.
    return selected.assign(
        source_antigen_iri=selected['parent_source_antigen_iri'].fillna(selected['r_object_source_molecule_iri']),
        source_antigen_name=selected['parent_source_antigen_name'].fillna(selected['r_object_source_molecule_name']),
        source_organism_name=selected['source_organism_name'].fillna(selected['r_object_source_organism_name']),
    )


def _count_by_antigen(assays):
    '''Return one row per source antigen IRI with its name, diseases,
    reference / epitope / assay counts and source organism.'''
    records = []
    for antigen_iri, antigen_rows in assays.groupby('source_antigen_iri'):
        antigen_names = antigen_rows['source_antigen_name'].dropna()
        organism_names = antigen_rows['source_organism_name'].dropna()
        # sorted() keeps the joined string identical between runs (set order is not);
        # dropna() keeps a missing disease name from breaking the join.
        disease_list = sorted(set(antigen_rows['disease_names'].dropna()))

        records.append([
            antigen_iri,
            # first available name, or None when the whole group has no name
            antigen_names.iloc[0] if len(antigen_names) else None,
            ', '.join(disease_list),
            len(antigen_rows['reference_id'].unique()),
            len(antigen_rows['structure_id'].unique()),
            len(antigen_rows),
            organism_names.iloc[0] if len(organism_names) else None,
        ])

    # put counts into pandas DataFrame
    return pd.DataFrame(records, columns=['Protein ID', 'Protein Name', 'Diseases',
                                          'Reference Count', 'Epitope Count', 'Assay Count', 'Parent Species'])


# select self-antigen assays and add the combined source columns
tcell_autoimmune = _select_self_antigen_assays(tcell_autoimmune)
bcell_autoimmune = _select_self_antigen_assays(bcell_autoimmune)

# get reference, unique epitope, and counts for reference, epitope and assay by unique antigen.
# The helpers never bind the name `diseases`, so the disease mapping read by
# pull_autoimmune_data stays intact after this cell runs.
tcell_counts = _count_by_antigen(tcell_autoimmune)
bcell_counts = _count_by_antigen(bcell_autoimmune)

# keep only antigens backed by more than one reference
tcell_counts = tcell_counts[tcell_counts['Reference Count'] > 1]
bcell_counts = bcell_counts[bcell_counts['Reference Count'] > 1]

In [31]:
# Collect the protein IDs from both count tables and keep the text that follows
# the 'UNIPROT:' marker; IDs without that marker are skipped.
ids = [*tcell_counts['Protein ID'], *bcell_counts['Protein ID']]

l = [protein_id.split('UNIPROT:')[1] for protein_id in ids if 'UNIPROT:' in protein_id]

In [43]:
# IDs present in the master antigen table but absent from the IDs extracted above.
# `master` is read here because this cell comes before the cell that otherwise
# loads it, so a top-to-bottom run would fail on an undefined name.
master = pd.read_csv('../data/autoimmune_antigens.csv', index_col=0)
set(master['id']) - set(l)

{'A0A087WWR8',
 'A0A0A0MS08',
 'A0A0G2JPR0',
 'A0A1B0GTN7',
 'A0A1W2PNV4',
 'A0A286YEY1',
 'A0A499FJK2',
 'A0A5S8K7D6',
 'A0A669KA35',
 'B3KWE1',
 'E7ENL6',
 'E7ETN3',
 'H0Y7L5',
 'H3BRY3',
 'K7EK07',
 'O00330',
 'P01889',
 'P01903',
 'P01911',
 'P02745',
 'P12956',
 'P18124',
 'P19338',
 'P20671',
 'P23229',
 'P25189',
 'P50281',
 'P60709',
 'P78508',
 'Q14767',
 'Q30154',
 'Q5T7C4',
 'Q5TAL2',
 'Q8IUK5',
 'Q8N6G6',
 'Q9BZR6',
 'Q9GZM7'}

In [39]:
# Load the master antigen table and report how many distinct IDs it holds.
master = pd.read_csv('../data/autoimmune_antigens.csv', index_col=0)
unique_master_ids = set(master['id'])
len(unique_master_ids)

372

In [47]:
# Look up a single B cell assay by its ID.
# Compare on the string form of the column: an integer-typed bcell_id never
# equals the string '1708210', which would make this lookup silently empty.
# NOTE(review): the dtype of bcell_id is not visible here - confirm.
bcell_autoimmune[bcell_autoimmune['bcell_id'].astype(str) == '1708210']

Unnamed: 0,bcell_id,bcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,source_antigen_iri,source_antigen_name


In [52]:
# Show the disease IDs loaded from autoimmune_diseases.json; pull_autoimmune_data
# turns each key into a `DOID:<key>` filter.
diseases.keys()

dict_keys(['417', '437', '676', '718', '986', '2048', '2377', '2988', '3492', '4313', '5213', '6196', '7147', '7148', '7188', '8506', '8577', '8781', '8857', '8869', '8893', '8924', '9008', '9074', '9182', '9744', '9808', '10608', '11656', '12132', '12236', '12297', '12306', '12361', '12842', '12894', '13241', '14040', '14482', '0040087', '0040088', '0040089', '0040092', '0040094', '0040096', '0040097', '0050167', '0050169', '0050214', '0060004', '0060030', '0060032', '0060039', '0060050', '0060051', '0060499'])