## This notebook fetches data from the UniProt and IEDB live APIs. If these servers are experiencing maintenance, downtime or are busy, the cells below may fail or time out. Pre-downloaded datasets are provided in the SEQUENCES-COLLECTION/ folder to ensure reproducibility without relying on live network calls.

### Import Packages

In [1]:
import pandas as pd
import requests
import time
from io import StringIO
from Bio import SeqIO
from itertools import islice

### Get Positive B-cell Data From IEDB

###### Objective: Fetch B-cell assay data from the Immune Epitope Database (IEDB).
###### Search Parameters:
###### -> Source: Bacteria (NCBITaxon:2)
###### -> Host: Homo sapiens (NCBITaxon:9606)
###### -> Disease: Infection (DOID:0050117)
###### -> Filter: Exclude negative results (neq.Negative)
###### -> Batch limit of 1000 and pagination (offset) were used to retrieve the full dataset without timing out the API.

In [2]:
bcell_url = "https://query-api.iedb.org/bcell_search"
bcell_search_params = {
    'source_organism_iri_search': 'cs.{"NCBITaxon:2"}',
    'host_organism_iri_search': 'cs.{"NCBITaxon:9606"}',
    'disease_iri_search': 'cs.{"DOID:0050117"}',
    'qualitative_measure': 'neq.Negative',
    'limit': 1000,
    'offset': 0,
    'order': 'bcell_iri'
}

bcell_results = requests.get(bcell_url, params=bcell_search_params)
bcell_search_params['offset'] += len(bcell_results.json())
print(bcell_search_params['offset'])
bcell_data = bcell_results.json()

while (bcell_results.json() != []):
    time.sleep(2)
    bcell_results = requests.get(bcell_url, params=bcell_search_params)
    if (bcell_results.json() != []):
        bcell_search_params['offset'] += len(bcell_results.json())
        print(bcell_search_params['offset'])
        bcell_data.extend(bcell_results.json())
    else:
        break

1000
2000
2808


### Convert The Retrieved Data Into A Dataframe

In [3]:
bcell_df = pd.DataFrame(bcell_data)
bcell_df

Unnamed: 0,bcell_id,bcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,host_mhc_types_present
0,1041040,IEDB_ASSAY:1041040,41948,IEDB_EPITOPE:41948,MLGNAPSVVPNTTLGM,Linear peptide,MLGNAPSVVPNTTLGM,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
1,1041055,IEDB_ASSAY:1041055,66693,IEDB_EPITOPE:66693,TTLGMHCGSFGSAPSNG,Linear peptide,TTLGMHCGSFGSAPSNG,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
2,1041060,IEDB_ASSAY:1041060,72741,IEDB_EPITOPE:72741,WLKLGLVEFGGVAKLNAEVMS,Linear peptide,WLKLGLVEFGGVAKLNAEVMS,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
3,1041067,IEDB_ASSAY:1041067,41952,IEDB_EPITOPE:41952,MLGTGTPNRARINFNC,Linear peptide,MLGTGTPNRARINFNC,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
4,1041069,IEDB_ASSAY:1041069,27635,IEDB_EPITOPE:27635,INFNCEVWSNVSETISGPRLY,Linear peptide,INFNCEVWSNVSETISGPRLY,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2803,9067,IEDB_ASSAY:9067,41071,IEDB_EPITOPE:41071,MAKVNIKPLEDKILV,Linear peptide,MAKVNIKPLEDKILV,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2804,9080,IEDB_ASSAY:9080,72294,IEDB_EPITOPE:72294,WDEDGEKRIPLDVAE,Linear peptide,WDEDGEKRIPLDVAE,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2805,9082,IEDB_ASSAY:9082,35316,IEDB_EPITOPE:35316,LDVAEGDTVIYSKYG,Linear peptide,LDVAEGDTVIYSKYG,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2806,9176063,IEDB_ASSAY:9176063,1125229,IEDB_EPITOPE:1125229,SEKQMPSVVNENAVTPEK,Linear peptide,SEKQMPSVVNENAVTPEK,"{'accession': 'WP_106459654.1', 'name': 'LPXTG...",1037215,IEDB_REFERENCE:1037215,...,,,,,,,,,,


### Get Positive T-cell Data From IEDB

###### Objective: Fetch T-cell assay data from the Immune Epitope Database (IEDB).
###### Search Parameters:
###### -> Source: Bacteria (NCBITaxon:2)
###### -> Host: Homo sapiens (NCBITaxon:9606)
###### -> Disease: Infection (DOID:0050117)
###### -> Filter: Exclude negative results (neq.Negative)
###### -> Batch limit of 1000 and pagination (offset) were used to retrieve the full dataset without timing out the API.

In [4]:
tcell_url = "https://query-api.iedb.org/tcell_search"
tcell_search_params = {
    'source_organism_iri_search': 'cs.{"NCBITaxon:2"}',
    'host_organism_iri_search': 'cs.{"NCBITaxon:9606"}',
    'disease_iri_search': 'cs.{"DOID:0050117"}',
    'qualitative_measure': 'neq.Negative',
    'limit':1000,
    'offset': 0,
    'order': 'tcell_iri'
}
tcell_data=[]
tcell_results = requests.get(tcell_url, params=tcell_search_params)
tcell_search_params['offset'] += len(tcell_results.json())
print(tcell_search_params['offset'])
tcell_data = tcell_results.json()

while (tcell_results.json() != []):
    time.sleep(2)
    tcell_results = requests.get(tcell_url, params=tcell_search_params)
    if (tcell_results.json() != []):
        tcell_search_params['offset'] += len(tcell_results.json())
        print(tcell_search_params['offset'])
        tcell_data.extend(tcell_results.json())
    else:
        break

1000
2000
3000
4000
4898


### Convert The Retrieved Data Into A Dataframe

In [5]:
tcell_df = pd.DataFrame(tcell_data)
tcell_df

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,host_mhc_types_present
0,10035,IEDB_ASSAY:10035,55139,IEDB_EPITOPE:55139,RPEAVLQHARTLAKI,Linear peptide,RPEAVLQHARTLAKI,"{'accession': 'P45487.1', 'name': '8-amino-7-o...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
1,10037,IEDB_ASSAY:10037,41499,IEDB_EPITOPE:41499,MFGCMNYSTRVTLAD,Linear peptide,MFGCMNYSTRVTLAD,"{'accession': 'P35901.1', 'name': 'Protein rec...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
2,10039,IEDB_ASSAY:10039,41499,IEDB_EPITOPE:41499,MFGCMNYSTRVTLAD,Linear peptide,MFGCMNYSTRVTLAD,"{'accession': 'P35901.1', 'name': 'Protein rec...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
3,10041,IEDB_ASSAY:10041,20811,IEDB_EPITOPE:20811,GLDSIISSASASLLT,Linear peptide,GLDSIISSASASLLT,"{'accession': 'Q07297.1', 'name': 'Serine-rich...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
4,10043,IEDB_ASSAY:10043,20811,IEDB_EPITOPE:20811,GLDSIISSASASLLT,Linear peptide,GLDSIISSASASLLT,"{'accession': 'Q07297.1', 'name': 'Serine-rich...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,9815,IEDB_ASSAY:9815,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4894,9897,IEDB_ASSAY:9897,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4895,9898,IEDB_ASSAY:9898,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4896,9913,IEDB_ASSAY:9913,69569,IEDB_EPITOPE:69569,VLKNFTLEGKVANDK,Linear peptide,VLKNFTLEGKVANDK,"{'accession': 'P0A3N7.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,


### Extract Peptide-Only Entries Into A New Dataframe

In [6]:
peptide_bcell_df = bcell_df[bcell_df['structure_type'].str.contains("peptide")].reset_index(drop=True)
peptide_tcell_df = tcell_df[tcell_df['structure_type'].str.contains("peptide")].reset_index(drop=True)
peptide_df = pd.concat([peptide_bcell_df, peptide_tcell_df], axis=0).reset_index(drop=True)

### Extract Entries With non-null Values In 'parent_source_antigen_iri' Column
### Extract Uniprot IDs From 'parent_source_antigen_iri' Column
### Write The IDs Into A File 

In [7]:
peptide_df = peptide_df[~peptide_df['parent_source_antigen_iri'].isna()].reset_index(drop=True)
antigen_ids = list(set(peptide_df['parent_source_antigen_iri'].tolist()))

for i in range(len(antigen_ids)):
    antigen_ids[i] = antigen_ids[i].split(':')[1]

print("Total no.of IDs:", len(antigen_ids))
antigen_ids = ",".join(antigen_ids)
with open("antigen_ids.txt", "w") as file:
    file.write(antigen_ids)

print(antigen_ids)

Total no.of IDs: 900
O51381,P9WNE5,Q05868,P9WKX1,Q9CD33,P9WQB1,P9WIG3,P38056,P9WJI7,Q8YGS6,P9WNG7,O84534,P9WN39,Q83AD7,I6YA32,P9WLH9,Q6MWV0,O06807,A0A143QSL7,P9WID7,P55980,A0PSI5,Q5NEY9,P38006,Q7APV8,Q59191,Q7MXK0,D0CB10,Q8F2W1,O53781,P96872,P9WQ87,Q2FZK7,A0A0H3MU32,P9WQJ9,P9WJD7,O06625,O84818,P9WG11,O07738,O53697,P9WG67,Q7AQ56,Q2FYP2,O05578,Q83BU7,P9WHY5,O07420,O33183,I3NID5,P9WMM9,P95087,Q79FP3,Q5NHJ0,P96221,O06311,H7C7K8,Q79FV6,O51477,Q83BT6,Q8YC41,P9WQN7,O05877,P9WIN9,P9WI43,O53332,P9WLQ1,P9WI37,O33192,P9WF43,P0CL66,Q9CCX9,O50391,P9WHX7,P9WL75,P0C922,P17835,L7N680,L7N695,P9WNB1,Q83AF7,O53611,Q50700,Q2FUX4,P9WIF1,P14738,Q44767,P96901,P9WFK1,A0AAQ3CNY5,P9WNK3,Q83F57,P18584,Q9CBG6,Q8EXJ2,P9WPV3,Q49769,A0A481YLK4,P0C0Z7,O84616,Q6MWX9,P9WIE7,P9WPD9,P16055,O07236,P9WFL9,A0A3A9MB66,P9WG25,P9WIR7,O69661,Q9RPX7,P19421,Q9A155,Q45010,P9WG41,Q79FS8,P9WI09,P9WP81,P9WI03,Q5NH51,P9WK55,P9WID3,O05311,P71540,Q79FH3,944-other,Q6MX26,P9WMY9,Q63RA7,P9WL67,P9WPD5,Q5NHA8,P9WQN9,Q9ZH99,O84385,P07643,P134

### Using UniProtKB ID-Mapping To Map antigen_ids



In [8]:
ID_mapping_post_url = "https://rest.uniprot.org/idmapping/run"
ID_mapping_post_params = {
    "from": "UniProtKB_AC-ID",
    "to": "UniProtKB",
    "ids": antigen_ids
}

jobId = requests.post(ID_mapping_post_url, data=ID_mapping_post_params).json()['jobId']
jobId

'57BOFhQhdi'

### Checks The Status Of The Submitted Job

In [9]:
ID_mapping_status_url = f"https://rest.uniprot.org/idmapping/status/{jobId}"
status = requests.get(ID_mapping_status_url)

while status.json()['jobStatus'] in ["RUNNING", "NEW", "QUEUED"]:
    print("Fetching...")
    time.sleep(5)
    status = requests.get(ID_mapping_status_url)
    if 'jobStatus' not in status.json():
        print("Done")
        break

Fetching...
Done


### Download The Mapped IDs FASTA Sequences And Write Into A File

In [10]:
ID_mapping_results_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/stream/{jobId}"
ID_mapping_results_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "fasta"
}

result = requests.get(ID_mapping_results_url, params=ID_mapping_results_params) 

with open("antigenic_sequences.fasta", "w") as file:
    file.write(result.text)

count=0
with open("antigenic_sequences.fasta", "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        count+=1
print("Total antigenic sequences:", count)

Total antigenic sequences: 496


### Print First Ten FASTA Sequences

In [11]:
with open("antigenic_sequences.fasta", "r") as file:
    first_10 = list(islice(SeqIO.parse(file, "fasta"), 10))

for record in first_10:
    print(record)

ID: sp|A0PSI5|MASZ_MYCUA
Name: sp|A0PSI5|MASZ_MYCUA
Description: sp|A0PSI5|MASZ_MYCUA Malate synthase G OS=Mycobacterium ulcerans (strain Agy99) OX=362242 GN=glcB PE=3 SV=1
Number of features: 0
Seq('MTDRVSAGNLRVARVLYDFVNNEALPGTDIDQDSFWAGVDKVVTDLTPQNQDLL...AGA')
ID: sp|A1JIP3|CH60_YERE8
Name: sp|A1JIP3|CH60_YERE8
Description: sp|A1JIP3|CH60_YERE8 Chaperonin GroEL OS=Yersinia enterocolitica serotype O:8 / biotype 1B (strain NCTC 13174 / 8081) OX=393305 GN=groEL PE=3 SV=1
Number of features: 0
Seq('MAAKDVKFGNDARIKMLRGVNILADAVKVTLGPKGRNVVLDKSFGSPTITKDGV...GMM')
ID: sp|A1JNN2|CLPP_YERE8
Name: sp|A1JNN2|CLPP_YERE8
Description: sp|A1JNN2|CLPP_YERE8 ATP-dependent Clp protease proteolytic subunit OS=Yersinia enterocolitica serotype O:8 / biotype 1B (strain NCTC 13174 / 8081) OX=393305 GN=clpP PE=3 SV=1
Number of features: 0
Seq('MSYSGERDQFAPNMALVPMVVEQTSRGERSYDIFSRLLKERIIFLTGQVEDHMA...RRD')
ID: sp|A1JUB7|YADA_YERE8
Name: sp|A1JUB7|YADA_YERE8
Description: sp|A1JUB7|YADA_YERE8 Adhesin YadA OS=Ye

### Download The Selected Information("fields") Of The Mapped IDs In .tsv Format AND Print First 50 Rows
### Export The Data In '.csv' Format

In [12]:
ID_mapping_results_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/stream/{jobId}"
ID_mapping_results_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "tsv",
    "fields": "accession,id,protein_name,gene_names,organism_name,organism_id,length,mass,cc_subcellular_location,ft_signal,ft_helix,ft_strand,ft_turn,fragment,reviewed"
}

result = requests.get(ID_mapping_results_url, params=ID_mapping_results_params)
tsv_antigen = pd.read_csv(StringIO(result.text), sep='\t')
tsv_antigen.to_csv("tsv_antigen.csv", index=False)
tsv_antigen.head(50)

Unnamed: 0,From,Entry,Entry Name,Protein names,Gene Names,Organism,Organism (ID),Length,Mass,Subcellular location [CC],Signal peptide,Helix,Beta strand,Turn,Fragment,Reviewed
0,A0PSI5,A0PSI5,MASZ_MYCUA,Malate synthase G (EC 2.3.3.9),glcB MUL_3055,Mycobacterium ulcerans (strain Agy99),362242,731,79479,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,,reviewed
1,A1JIP3,A1JIP3,CH60_YERE8,Chaperonin GroEL (EC 5.6.1.7) (60 kDa chaperon...,groEL groL YE0354,Yersinia enterocolitica serotype O:8 / biotype...,393305,550,57513,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,,reviewed
2,A1JNN2,A1JNN2,CLPP_YERE8,ATP-dependent Clp protease proteolytic subunit...,clpP YE3134,Yersinia enterocolitica serotype O:8 / biotype...,393305,207,23256,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,,reviewed
3,A1JUB7,A1JUB7,YADA_YERE8,Adhesin YadA (Protein Yop1) (Trimeric autotran...,yadA yopA YEP0066,Yersinia enterocolitica serotype O:8 / biotype...,393305,422,44139,SUBCELLULAR LOCATION: Cell surface {ECO:000026...,"SIGNAL 1..25; /evidence=""ECO:0000255""","HELIX 333..360; /evidence=""ECO:0007829|PDB:2LME""","STRAND 368..380; /evidence=""ECO:0007829|PDB:2L...","TURN 408..410; /evidence=""ECO:0007829|PDB:2LME""",,reviewed
4,A5CDL9,A5CDL9,CH60_ORITB,Chaperonin GroEL (EC 5.6.1.7) (60 kDa chaperon...,groEL groL OTBS_0917,Orientia tsutsugamushi (strain Boryong) (Ricke...,357244,554,59715,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,,reviewed
5,B2HSY2,B2HSY2,MASZ_MYCMM,Malate synthase G (EC 2.3.3.9),glcB MMAR_2713,Mycobacterium marinum (strain ATCC BAA-535 / M),216594,731,79479,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,"HELIX 14..22; /evidence=""ECO:0007829|PDB:6AXE""...","STRAND 4..7; /evidence=""ECO:0007829|PDB:6AXE"";...","TURN 151..153; /evidence=""ECO:0007829|PDB:6AXE...",,reviewed
6,I3NID5,I3NID5,ENCP2_MYCPA,Type 2A encapsulin shell protein (Major membra...,enc2 MAP_2121c,Mycolicibacterium paratuberculosis (strain ATC...,262316,307,33671,SUBCELLULAR LOCATION: Encapsulin nanocompartme...,,,,,,reviewed
7,I6XFZ8,I6XFZ8,Y3035_MYCTU,Protein Rv3035,Rv3035 RVBD_3035 P425_03163,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,411,42539,,,,,,,reviewed
8,I6YA32,I6YA32,CHIZ_MYCTU,Cell wall hydrolase ChiZ (EC 3.4.-.-),chiZ Rv2719c,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,165,17324,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,,,,,,reviewed
9,I6YET7,I6YET7,Y2963_MYCTU,Putative permease Rv2963,Rv2963 RVBD_2963 P425_03084,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,406,43729,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,,,,,,reviewed


### Extract Unique Organism IDs of the antigens and create a search query that could be used for downloading non-antigens

In [13]:
organism_ids = tsv_antigen['Organism (ID)'].unique().tolist()
organism_ids_query=[]
for i in organism_ids:
    organism_ids_query.append(f"organism_id:{i}")
organism_ids_query = " OR ".join(organism_ids_query)
organism_ids_query 

'organism_id:362242 OR organism_id:393305 OR organism_id:357244 OR organism_id:216594 OR organism_id:262316 OR organism_id:83332 OR organism_id:272631 OR organism_id:224326 OR organism_id:243276 OR organism_id:272561 OR organism_id:83558 OR organism_id:243277 OR organism_id:212717 OR organism_id:257313 OR organism_id:85962 OR organism_id:99287 OR organism_id:224914 OR organism_id:83333 OR organism_id:227377 OR organism_id:301447 OR organism_id:272634 OR organism_id:208964 OR organism_id:1392 OR organism_id:210007 OR organism_id:93061 OR organism_id:122586 OR organism_id:242619 OR organism_id:71421 OR organism_id:189518 OR organism_id:390236 OR organism_id:171101 OR organism_id:177416'

### Download Full Length(fragment:false) Reviewed(reviewed:true) Sequences in FASTA 
### We make sure that we get the non-antigens with the same Organism ID as the antigens using "{organism_ids_query}" in the query.
### Write The Sequences Into A File

In [14]:
non_antigens_url = "https://rest.uniprot.org/uniprotkb/stream"
non_antigens_search_params = {
    "query": f"({organism_ids_query}) AND reviewed:true AND fragment:false",
    "format": "fasta"
}

result = requests.get(non_antigens_url, params=non_antigens_search_params) 

with open("non_antigenic_sequences.fasta", "w") as file:
    file.write(result.text)

count=0
with open("non_antigenic_sequences.fasta", "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        count+=1
print("Total non-antigen sequences:", count)

Total non-antigen sequences: 25274


### Print First Ten FASTA Sequences

In [15]:
with open("non_antigenic_sequences.fasta", "r") as file:
    first_10 = list(islice(SeqIO.parse(file, "fasta"), 10))
    
for record in first_10:
    print(record)

ID: sp|A0A089QRB9|MSL3_MYCTU
Name: sp|A0A089QRB9|MSL3_MYCTU
Description: sp|A0A089QRB9|MSL3_MYCTU Mycolipanoate synthase OS=Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv) OX=83332 GN=msl3 PE=1 SV=2
Number of features: 0
Seq('MRTATATSVAVIGMACRLPGGIDSPQRLWEALLRGDDLVGEIPADRWDANVYYD...APV')
ID: sp|A0A0F7KYQ8|BLBV1_PSEAE
Name: sp|A0A0F7KYQ8|BLBV1_PSEAE
Description: sp|A0A0F7KYQ8|BLBV1_PSEAE Metallo-beta-lactamase VIM-1 OS=Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1) OX=208964 GN=VIM-1 PE=1 SV=1
Number of features: 0
Seq('MLKVISSLLVYMTASVMAVASPLAHSGEPSGEYPTVNEIPVGEVRLYQIADGVW...VAE')
ID: sp|A0A4Y1WBN6|YYCJ_BACAN
Name: sp|A0A4Y1WBN6|YYCJ_BACAN
Description: sp|A0A4Y1WBN6|YYCJ_BACAN Exodeoxyribonuclease YycJ OS=Bacillus anthracis OX=1392 GN=yycJ PE=1 SV=1
Number of features: 0
Seq('MGLHFSVLASGSTGNMLYVGTDEKKLLVDAGLSGKATEALFKQAELNINDVSGI...QYV')
ID: sp|A0PVU7|KGD_MYCUA
Name: sp|A0PVU7|KGD_MYCUA
Description: sp|A0PVU7|KGD

### Download The Selected Information("fields") Of The Sequences In .tsv Format AND Print First 50 Rows
### Export The Data In '.csv' format

In [16]:
non_antigens_url = "https://rest.uniprot.org/uniprotkb/stream"
non_antigens_search_params = {
    "query": f"({organism_ids_query}) AND reviewed:true AND fragment:false",
    "format": "tsv",
    "fields": "accession,id,protein_name,gene_names,organism_name,organism_id,length,mass,cc_subcellular_location,ft_signal,ft_helix,ft_strand,ft_turn,fragment,reviewed"
}

result = requests.get(non_antigens_url, params=non_antigens_search_params)
tsv_non_antigen = pd.read_csv(StringIO(result.text), sep='\t')
tsv_non_antigen.to_csv("tsv_non_antigen.csv", index=False)
tsv_non_antigen.head(50)

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Organism,Organism (ID),Length,Mass,Subcellular location [CC],Signal peptide,Helix,Beta strand,Turn,Fragment,Reviewed
0,A0A089QRB9,MSL3_MYCTU,Mycolipanoate synthase (EC 2.3.1.252) (Mycocer...,msl3 pks3 pks4 Rv1180/Rv1181,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,2085,220462,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,"SIGNAL 1..15; /evidence=""ECO:0000255|PROSITE-P...",,,,,reviewed
1,A0A0F7KYQ8,BLBV1_PSEAE,Metallo-beta-lactamase VIM-1 (EC 3.5.2.6),VIM-1 blaVIM-1,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,208964,266,28024,SUBCELLULAR LOCATION: Periplasm {ECO:0000250|U...,"SIGNAL 1..20; /evidence=""ECO:0000255""","HELIX 36..38; /evidence=""ECO:0007829|PDB:7UYA""...","STRAND 45..50; /evidence=""ECO:0007829|PDB:7UYA...","TURN 189..192; /evidence=""ECO:0007829|PDB:7UYA...",,reviewed
2,A0A4Y1WBN6,YYCJ_BACAN,Exodeoxyribonuclease YycJ (EC 3.1.11.-),yycJ GBAA_5711,Bacillus anthracis,1392,264,29293,,,,,,,reviewed
3,A0PVU7,KGD_MYCUA,Multifunctional 2-oxoglutarate metabolism enzy...,kgd MUL_4500,Mycobacterium ulcerans (strain Agy99),362242,1238,136742,,,,,,,reviewed
4,A1JIG4,FADB_YERE8,Fatty acid oxidation complex subunit alpha [In...,fadB YE0268,Yersinia enterocolitica serotype O:8 / biotype...,393305,729,78871,,,,,,,reviewed
5,A1JUB7,YADA_YERE8,Adhesin YadA (Protein Yop1) (Trimeric autotran...,yadA yopA YEP0066,Yersinia enterocolitica serotype O:8 / biotype...,393305,422,44139,SUBCELLULAR LOCATION: Cell surface {ECO:000026...,"SIGNAL 1..25; /evidence=""ECO:0000255""","HELIX 333..360; /evidence=""ECO:0007829|PDB:2LME""","STRAND 368..380; /evidence=""ECO:0007829|PDB:2L...","TURN 408..410; /evidence=""ECO:0007829|PDB:2LME""",,reviewed
6,A5A616,MGTS_ECOLI,Small protein MgtS,mgtS yneM b4599 JW1527.1,Escherichia coli (strain K12),83333,31,3509,SUBCELLULAR LOCATION: Cell inner membrane {ECO...,,"HELIX 3..24; /evidence=""ECO:0007829|PDB:5OQT""",,,,reviewed
7,B2HIL7,MSL7_MYCMM,Phenolphthiocerol synthesis polyketide synthas...,pks15/1 msl7 MMAR_1762,Mycobacterium marinum (strain ATCC BAA-535 / M),216594,2104,217744,,,,,,,reviewed
8,B2HN69,CAR_MYCMM,Carboxylic acid reductase (CAR) (EC 1.2.1.-) (...,car fadD9 MMAR_2117,Mycobacterium marinum (strain ATCC BAA-535 / M),216594,1174,127797,,,"HELIX 716..725; /evidence=""ECO:0007829|PDB:5MS...","STRAND 748..750; /evidence=""ECO:0007829|PDB:5M...","TURN 1148..1151; /evidence=""ECO:0007829|PDB:5MSO""",,reviewed
9,G3XCV0,FLEQ_PSEAE,Transcriptional regulator FleQ,fleQ PA1097,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,208964,490,55277,,,"HELIX 14..26; /evidence=""ECO:0007829|PDB:4WXM""...","STRAND 6..10; /evidence=""ECO:0007829|PDB:4WXM""...","TURN 36..38; /evidence=""ECO:0007829|PDB:4WXM"";...",,reviewed


### Organisms in antigens dataset

In [19]:
tsv_antigen['Organism'].value_counts()

Organism
Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)                                                               342
Chlamydia trachomatis serovar D (strain ATCC VR-885 / DSM 19411 / UW-3/Cx)                                            26
Mycobacterium leprae (strain TN)                                                                                      22
Borreliella burgdorferi (strain ATCC 35210 / DSM 4680 / CIP 102532 / B31) (Borrelia burgdorferi)                      17
Staphylococcus aureus (strain NCTC 8325 / PS 47)                                                                      13
Coxiella burnetii (strain RSA 493 / Nine Mile phase I)                                                                12
Helicobacter pylori (strain ATCC 700392 / 26695) (Campylobacter pylori)                                                8
Brucella melitensis biotype 1 (strain ATCC 23456 / CCUG 17765 / NCTC 10094 / 16M)                                      8
Chlamydia pneumoniae (C

### Organisms in non-antigens dataset

In [20]:
tsv_non_antigen['Organism'].value_counts()

Organism
Escherichia coli (strain K12)                                                                                        4530
Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)                                                               2326
Salmonella typhimurium (strain LT2 / SGSC1412 / ATCC 700720)                                                         1830
Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)                                                   1703
Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1)    1479
Vibrio cholerae serotype O1 (strain ATCC 39315 / El Tor Inaba N16961)                                                1000
Staphylococcus aureus (strain NCTC 8325 / PS 47)                                                                      812
Bacillus anthracis                                                                                                    758
Yersinia entero

### Verifying whether antigens and non-antigens have the same organism name and organism IDs 

In [22]:
if sorted(tsv_antigen['Organism (ID)'].unique().tolist()) == sorted(tsv_non_antigen['Organism (ID)'].unique().tolist()):
    print("Both have same organism IDs")
else:
    print("Both differ in organism IDs")
if sorted(tsv_antigen['Organism'].unique().tolist()) == sorted(tsv_non_antigen['Organism'].unique().tolist()):
    print("Both have same organism names")
else:
    print("Both differ in organism names")

Both have same organism IDs
Both have same organism names
