## This notebook fetches data from the UniProt and IEDB live APIs. If these servers are experiencing maintenance, downtime or are busy, the cells below may fail or time out. Pre-downloaded datasets are provided in the SEQUENCES-COLLECTION/ folder to ensure reproducibility without relying on live network calls.

### Import Packages

In [1]:
import pandas as pd
import requests
import time
import json
from io import StringIO
from Bio import SeqIO
import gzip
from itertools import islice

### Get Positive B-cell Data From IEDB

###### Objective: Fetch B-cell assay data from the Immune Epitope Database (IEDB).
###### Search Parameters:
###### -> Source: Bacteria (NCBITaxon:2)
###### -> Host: Homo sapiens (NCBITaxon:9606)
###### -> Disease: Infection (DOID:0050117)
###### -> Filter: Exclude negative results (neq.Negative)
###### -> Batch limit of 1000 and pagination (offset) were used to retrieve the full dataset without timing out the API.

In [2]:
bcell_url = "https://query-api.iedb.org/bcell_search"
bcell_search_params = {
    'source_organism_iri_search': 'cs.{"NCBITaxon:2"}',
    'host_organism_iri_search': 'cs.{"NCBITaxon:9606"}',
    'disease_iri_search': 'cs.{"DOID:0050117"}',
    'qualitative_measure': 'neq.Negative',
    'limit': 1000,
    'offset': 0,
    'order': 'bcell_iri'
}

bcell_results = requests.get(bcell_url, params=bcell_search_params)
bcell_search_params['offset'] += len(bcell_results.json())
print(bcell_search_params['offset'])
bcell_data = bcell_results.json()

while (bcell_results.json() != []):
    time.sleep(2)
    bcell_results = requests.get(bcell_url, params=bcell_search_params)
    if (bcell_results.json() != []):
        bcell_search_params['offset'] += len(bcell_results.json())
        print(bcell_search_params['offset'])
        bcell_data.extend(bcell_results.json())
    else:
        break

1000
2000
2798


### Convert The Retrieved Data Into A Dataframe

In [3]:
bcell_df = pd.DataFrame(bcell_data)
bcell_df

Unnamed: 0,bcell_id,bcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,host_mhc_types_present
0,1041040,IEDB_ASSAY:1041040,41948,IEDB_EPITOPE:41948,MLGNAPSVVPNTTLGM,Linear peptide,MLGNAPSVVPNTTLGM,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
1,1041055,IEDB_ASSAY:1041055,66693,IEDB_EPITOPE:66693,TTLGMHCGSFGSAPSNG,Linear peptide,TTLGMHCGSFGSAPSNG,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
2,1041060,IEDB_ASSAY:1041060,72741,IEDB_EPITOPE:72741,WLKLGLVEFGGVAKLNAEVMS,Linear peptide,WLKLGLVEFGGVAKLNAEVMS,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
3,1041067,IEDB_ASSAY:1041067,41952,IEDB_EPITOPE:41952,MLGTGTPNRARINFNC,Linear peptide,MLGTGTPNRARINFNC,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
4,1041069,IEDB_ASSAY:1041069,27635,IEDB_EPITOPE:27635,INFNCEVWSNVSETISGPRLY,Linear peptide,INFNCEVWSNVSETISGPRLY,"{'accession': 'A43589', 'name': 'mtp40 protein...",1000620,IEDB_REFERENCE:1000620,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2793,9067,IEDB_ASSAY:9067,41071,IEDB_EPITOPE:41071,MAKVNIKPLEDKILV,Linear peptide,MAKVNIKPLEDKILV,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2794,9080,IEDB_ASSAY:9080,72294,IEDB_EPITOPE:72294,WDEDGEKRIPLDVAE,Linear peptide,WDEDGEKRIPLDVAE,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2795,9082,IEDB_ASSAY:9082,35316,IEDB_EPITOPE:35316,LDVAEGDTVIYSKYG,Linear peptide,LDVAEGDTVIYSKYG,"{'accession': 'P09621.3', 'name': '10 kDa chap...",833,IEDB_REFERENCE:833,...,,,,,,,,,,
2796,9176063,IEDB_ASSAY:9176063,1125229,IEDB_EPITOPE:1125229,SEKQMPSVVNENAVTPEK,Linear peptide,SEKQMPSVVNENAVTPEK,"{'accession': 'WP_106459654.1', 'name': 'LPXTG...",1037215,IEDB_REFERENCE:1037215,...,,,,,,,,,,


### Get Positive T-cell Data From IEDB

###### Objective: Fetch T-cell assay data from the Immune Epitope Database (IEDB).
###### Search Parameters:
###### -> Source: Bacteria (NCBITaxon:2)
###### -> Host: Homo sapiens (NCBITaxon:9606)
###### -> Disease: Infection (DOID:0050117)
###### -> Filter: Exclude negative results (neq.Negative)
###### -> Batch limit of 1000 and pagination (offset) were used to retrieve the full dataset without timing out the API.

In [4]:
tcell_url = "https://query-api.iedb.org/tcell_search"
tcell_search_params = {
    'source_organism_iri_search': 'cs.{"NCBITaxon:2"}',
    'host_organism_iri_search': 'cs.{"NCBITaxon:9606"}',
    'disease_iri_search': 'cs.{"DOID:0050117"}',
    'qualitative_measure': 'neq.Negative',
    'limit':1000,
    'offset': 0,
    'order': 'tcell_iri'
}
tcell_data=[]
tcell_results = requests.get(tcell_url, params=tcell_search_params)
tcell_search_params['offset'] += len(tcell_results.json())
print(tcell_search_params['offset'])
tcell_data = tcell_results.json()

while (tcell_results.json() != []):
    time.sleep(2)
    tcell_results = requests.get(tcell_url, params=tcell_search_params)
    if (tcell_results.json() != []):
        tcell_search_params['offset'] += len(tcell_results.json())
        print(tcell_search_params['offset'])
        tcell_data.extend(tcell_results.json())
    else:
        break

1000
2000
3000
4000
4898


### Convert The Retrieved Data Into A Dataframe

In [5]:
tcell_df = pd.DataFrame(tcell_data)
tcell_df

Unnamed: 0,tcell_id,tcell_iri,structure_id,structure_iri,linear_sequence,structure_type,structure_description,curated_source_antigen,reference_id,reference_iri,...,non_peptidic_molecule_iri,non_peptidic_molecule_name,r_object_source_molecule_iri_search,r_object_source_molecule_iri,r_object_source_molecule_name,r_object_source_organism_iri_search,r_object_source_organism_iri,r_object_source_organism_name,e_related_object_type,host_mhc_types_present
0,10035,IEDB_ASSAY:10035,55139,IEDB_EPITOPE:55139,RPEAVLQHARTLAKI,Linear peptide,RPEAVLQHARTLAKI,"{'accession': 'P45487.1', 'name': '8-amino-7-o...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
1,10037,IEDB_ASSAY:10037,41499,IEDB_EPITOPE:41499,MFGCMNYSTRVTLAD,Linear peptide,MFGCMNYSTRVTLAD,"{'accession': 'P35901.1', 'name': 'Protein rec...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
2,10039,IEDB_ASSAY:10039,41499,IEDB_EPITOPE:41499,MFGCMNYSTRVTLAD,Linear peptide,MFGCMNYSTRVTLAD,"{'accession': 'P35901.1', 'name': 'Protein rec...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
3,10041,IEDB_ASSAY:10041,20811,IEDB_EPITOPE:20811,GLDSIISSASASLLT,Linear peptide,GLDSIISSASASLLT,"{'accession': 'Q07297.1', 'name': 'Serine-rich...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
4,10043,IEDB_ASSAY:10043,20811,IEDB_EPITOPE:20811,GLDSIISSASASLLT,Linear peptide,GLDSIISSASASLLT,"{'accession': 'Q07297.1', 'name': 'Serine-rich...",890,IEDB_REFERENCE:890,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,9815,IEDB_ASSAY:9815,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4894,9897,IEDB_ASSAY:9897,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4895,9898,IEDB_ASSAY:9898,69572,IEDB_EPITOPE:69572,VLKSYVLEGTLTAEK,Linear peptide,VLKSYVLEGTLTAEK,"{'accession': 'P14013.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,
4896,9913,IEDB_ASSAY:9913,69569,IEDB_EPITOPE:69569,VLKNFTLEGKVANDK,Linear peptide,VLKNFTLEGKVANDK,"{'accession': 'P0A3N7.1', 'name': 'Outer surfa...",626,IEDB_REFERENCE:626,...,,,,,,,,,,


### Extract Peptide-Only Entries Into A New Dataframe

In [6]:
peptide_bcell_df = bcell_df[bcell_df['structure_type'].str.contains("peptide")].reset_index(drop=True)
peptide_tcell_df = tcell_df[tcell_df['structure_type'].str.contains("peptide")].reset_index(drop=True)
peptide_df = pd.concat([peptide_bcell_df, peptide_tcell_df], axis=0).reset_index(drop=True)

### Extract Entries With non-null Values In 'parent_source_antigen_iri' Column
### Extract Uniprot IDs From 'parent_source_antigen_iri' Column
### Write The IDs Into A File 

In [7]:
peptide_df = peptide_df[~peptide_df['parent_source_antigen_iri'].isna()].reset_index(drop=True)
antigen_ids = list(set(peptide_df['parent_source_antigen_iri'].tolist()))

for i in range(len(antigen_ids)):
    antigen_ids[i] = antigen_ids[i].split(':')[1]

antigen_ids = ",".join(antigen_ids)
with open("antigen_ids.txt", "w") as file:
    file.write(antigen_ids)

print(antigen_ids)

Q5NGE4,P9WI45,Q83DJ3,P9WPQ1,Q5NIA7,P9WQ59,O06625,P55969,P9WLX1,O06608,O06574,P96821,P26623,P24094,A1UQW0,P9WKJ3,O53673,Q8YIE1,P9WGK3,P04958,P9WH35,Q9Z7K3,O53632,Q9RZX1,O69661,O84419,O53478,P9WKJ7,P9WHV9,Q9CC67,P0A6F5,O51624,O33269,P9WG63,Q83F57,Q9ZH99,O50850,Q50700,P9WIG1,I6YGB1,O53972,P9WIE5,Q83AP6,O06817,P33768,O84462,Q79FH3,Q83CY3,Q5NEM8,O53781,P9WKC9,P55980,P9WGY7,L0T8G4,Q79FS8,A0A1N0BRQ6,P9WJI7,P9WHX7,P9WH91,Q9CCU2,Q79FQ7,Q6MX34,P9WKX1,O53699,O84801,Q6MX50,P9WJV1,P9WMB9,Q9R344,Q8YDZ3,P9WFJ9,P27455,Q9CD21,Q9Z7A6,P71658,P0A0V0,X8CE55,P9WJK5,P13470,Q7AQJ8,P9WNJ7,O06404,I6Y293,P9WI79,P9WNF3,47466-other,Q5NFR6,P9WL47,L7N680,P0CC04,Q79G04,P0A4Q1,P60230,P95223,P9WHB9,P9WMX7,B4EDC1,P9WNK3,Q83BJ5,P9WI43,O84700,Q2FYP2,Q7MXK0,P9WI07,Q83AL4,P9WPU7,Q44849,I6YC99,Q2GI62,P96901,Q2FXB0,P9WKD1,P9WNJ1,P38056,Q83CA7,O84449,O84623,P9WL87,Q5NGG4,P9WI23,A1JNN2,O05576,L0T7Y7,P9WQN7,O84534,P9WP47,P9WM17,P9WFL9,O84646,O84229,P9WJD9,P96221,P43315,Q5NEY9,P9WMJ9,Q7ZAK5,O86351,L0T2W6,Q8YG32,P95225,Q9CCX9,P952

### Using UniProtKB ID-Mapping To Map antigen_ids



In [8]:
ID_mapping_post_url = "https://rest.uniprot.org/idmapping/run"
ID_mapping_post_params = {
    "from": "UniProtKB_AC-ID",
    "to": "UniProtKB",
    "ids": antigen_ids
}

jobId = requests.post(ID_mapping_post_url, data=ID_mapping_post_params).json()['jobId']
jobId

'wpYBObywl9'

### Checks The Status Of The Submitted Job

In [9]:
ID_mapping_status_url = f"https://rest.uniprot.org/idmapping/status/{jobId}"
status = requests.get(ID_mapping_status_url)

while status.json()['jobStatus'] in ["RUNNING", "NEW", "QUEUED"]:
    print("Fetching...")
    time.sleep(5)
    status = requests.get(ID_mapping_status_url)
    if 'jobStatus' not in status.json():
        print("Done")
        break

Fetching...
Done


### Download The Mapped IDs FASTA Sequences And Write Into A Compressed File

In [10]:
ID_mapping_results_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/stream/{jobId}"
ID_mapping_results_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "fasta"
}

result = requests.get(ID_mapping_results_url, params=ID_mapping_results_params) 

with gzip.open("antigenic_sequences.fasta.gz", "wt") as handle:
    handle.write(result.text)

count=0
with gzip.open("antigenic_sequences.fasta.gz", "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        count+=1
print(count)

494


### Print First Ten FASTA Sequences

In [11]:
with gzip.open("antigenic_sequences.fasta.gz", "rt") as handle:
    first_10 = list(islice(SeqIO.parse(handle, "fasta"), 10))

for record in first_10:
    print(record)

ID: sp|A0PSI5|MASZ_MYCUA
Name: sp|A0PSI5|MASZ_MYCUA
Description: sp|A0PSI5|MASZ_MYCUA Malate synthase G OS=Mycobacterium ulcerans (strain Agy99) OX=362242 GN=glcB PE=3 SV=1
Number of features: 0
Seq('MTDRVSAGNLRVARVLYDFVNNEALPGTDIDQDSFWAGVDKVVTDLTPQNQDLL...AGA')
ID: sp|A1JIP3|CH60_YERE8
Name: sp|A1JIP3|CH60_YERE8
Description: sp|A1JIP3|CH60_YERE8 Chaperonin GroEL OS=Yersinia enterocolitica serotype O:8 / biotype 1B (strain NCTC 13174 / 8081) OX=393305 GN=groEL PE=3 SV=1
Number of features: 0
Seq('MAAKDVKFGNDARIKMLRGVNILADAVKVTLGPKGRNVVLDKSFGSPTITKDGV...GMM')
ID: sp|A1JNN2|CLPP_YERE8
Name: sp|A1JNN2|CLPP_YERE8
Description: sp|A1JNN2|CLPP_YERE8 ATP-dependent Clp protease proteolytic subunit OS=Yersinia enterocolitica serotype O:8 / biotype 1B (strain NCTC 13174 / 8081) OX=393305 GN=clpP PE=3 SV=1
Number of features: 0
Seq('MSYSGERDQFAPNMALVPMVVEQTSRGERSYDIFSRLLKERIIFLTGQVEDHMA...RRD')
ID: sp|A1JUB7|YADA_YERE8
Name: sp|A1JUB7|YADA_YERE8
Description: sp|A1JUB7|YADA_YERE8 Adhesin YadA OS=Ye

### Download The Selected Information("fields") Of The Mapped IDs In .tsv Format AND Print First 50 Rows
### Export The Data In '.parquet' Format

In [12]:
ID_mapping_results_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/stream/{jobId}"
ID_mapping_results_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "tsv",
    "fields": "accession,id,protein_name,gene_names,organism_name,organism_id,length,mass,cc_subcellular_location,ft_signal,ft_helix,ft_strand,ft_turn,reviewed"
}

result = requests.get(ID_mapping_results_url, params=ID_mapping_results_params)
tsv_antigen = pd.read_csv(StringIO(result.text), sep='\t')
tsv_antigen.to_parquet("tsv_antigen.parquet", index=False)
tsv_antigen.head(50)

Unnamed: 0,From,Entry,Entry Name,Protein names,Gene Names,Organism,Organism (ID),Length,Mass,Subcellular location [CC],Signal peptide,Helix,Beta strand,Turn,Reviewed
0,A0PSI5,A0PSI5,MASZ_MYCUA,Malate synthase G (EC 2.3.3.9),glcB MUL_3055,Mycobacterium ulcerans (strain Agy99),362242,731,79479,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,reviewed
1,A1JIP3,A1JIP3,CH60_YERE8,Chaperonin GroEL (EC 5.6.1.7) (60 kDa chaperon...,groEL groL YE0354,Yersinia enterocolitica serotype O:8 / biotype...,393305,550,57513,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,reviewed
2,A1JNN2,A1JNN2,CLPP_YERE8,ATP-dependent Clp protease proteolytic subunit...,clpP YE3134,Yersinia enterocolitica serotype O:8 / biotype...,393305,207,23256,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,reviewed
3,A1JUB7,A1JUB7,YADA_YERE8,Adhesin YadA (Protein Yop1) (Trimeric autotran...,yadA yopA YEP0066,Yersinia enterocolitica serotype O:8 / biotype...,393305,422,44139,SUBCELLULAR LOCATION: Cell surface {ECO:000026...,"SIGNAL 1..25; /evidence=""ECO:0000255""","HELIX 333..360; /evidence=""ECO:0007829|PDB:2LME""","STRAND 368..380; /evidence=""ECO:0007829|PDB:2L...","TURN 408..410; /evidence=""ECO:0007829|PDB:2LME""",reviewed
4,A5CDL9,A5CDL9,CH60_ORITB,Chaperonin GroEL (EC 5.6.1.7) (60 kDa chaperon...,groEL groL OTBS_0917,Orientia tsutsugamushi (strain Boryong) (Ricke...,357244,554,59715,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,,,,reviewed
5,B2HSY2,B2HSY2,MASZ_MYCMM,Malate synthase G (EC 2.3.3.9),glcB MMAR_2713,Mycobacterium marinum (strain ATCC BAA-535 / M),216594,731,79479,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000255|H...,,"HELIX 14..22; /evidence=""ECO:0007829|PDB:6AXE""...","STRAND 4..7; /evidence=""ECO:0007829|PDB:6AXE"";...","TURN 151..153; /evidence=""ECO:0007829|PDB:6AXE...",reviewed
6,I3NID5,I3NID5,ENCP2_MYCPA,Type 2A encapsulin shell protein (Major membra...,enc2 MAP_2121c,Mycolicibacterium paratuberculosis (strain ATC...,262316,307,33671,SUBCELLULAR LOCATION: Encapsulin nanocompartme...,,,,,reviewed
7,I6XFZ8,I6XFZ8,Y3035_MYCTU,Protein Rv3035,Rv3035 RVBD_3035 P425_03163,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,411,42539,,,,,,reviewed
8,I6YA32,I6YA32,CHIZ_MYCTU,Cell wall hydrolase ChiZ (EC 3.4.-.-),chiZ Rv2719c,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,165,17324,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,,,,,reviewed
9,I6YET7,I6YET7,Y2963_MYCTU,Putative permease Rv2963,Rv2963 RVBD_2963 P425_03084,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,406,43729,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,,,,,reviewed


### Download Full Length(fragment:false) Reviewed(reviewed:true) FASTA Sequences Of Bacteria(taxonomy_id:2) 
### Write Into A Compressed File

In [13]:
non_antigens_url = "https://rest.uniprot.org/uniprotkb/stream"
non_antigens_search_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "fasta"
}

result = requests.get(non_antigens_url, params=non_antigens_search_params) 

with gzip.open("non_antigenic_sequences.fasta.gz", "wt") as handle:
    handle.write(result.text)

count=0
with gzip.open("non_antigenic_sequences.fasta.gz", "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        count+=1
print(count)

335285


### Print First Ten FASTA Sequences

In [14]:
with gzip.open("non_antigenic_sequences.fasta.gz", "rt") as handle:
    first_10 = list(islice(SeqIO.parse(handle, "fasta"), 10))
    
for record in first_10:
    print(record)

ID: sp|A0A009IHW8|ABTIR_ACIB9
Name: sp|A0A009IHW8|ABTIR_ACIB9
Description: sp|A0A009IHW8|ABTIR_ACIB9 2' cyclic ADP-D-ribose synthase AbTIR OS=Acinetobacter baumannii (strain 1295743) OX=1310613 GN=J512_3302 PE=1 SV=1
Number of features: 0
Seq('MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSD...LNR')
ID: sp|A0A059WI14|ARSI_BACX0
Name: sp|A0A059WI14|ARSI_BACX0
Description: sp|A0A059WI14|ARSI_BACX0 Trivalent organoarsenical cleaving enzyme OS=Bacillus sp. (strain MD1) OX=1501233 GN=arsI PE=1 SV=1
Number of features: 0
Seq('MKYAHVGLNVTNLEKSIEFYSKLFGAEPVKVKPDYAKFLLESPGLNFTLNLRDE...CCS')
ID: sp|A0A075BSX9|HLNO_SHIS7
Name: sp|A0A075BSX9|HLNO_SHIS7
Description: sp|A0A075BSX9|HLNO_SHIS7 (S)-6-hydroxynicotine oxidase OS=Shinella sp. (strain HZN7) OX=879274 GN=nctB PE=1 SV=1
Number of features: 0
Seq('MTEKIYDAIVVGAGFSGLVAARELSAQGRSVLIIEARHRLGGRTHVVNFLGRPV...ATA')
ID: sp|A0A089QRB9|MSL3_MYCTU
Name: sp|A0A089QRB9|MSL3_MYCTU
Description: sp|A0A089QRB9|MSL3_MYCTU Mycolipanoate synthase OS=Mycob

### Download The Selected Information("fields") Of The Sequences In .tsv Format AND Print First 50 Rows
### Export The Data In '.parquet' format

In [15]:
non_antigens_url = "https://rest.uniprot.org/uniprotkb/stream"
non_antigens_search_params = {
    "query": "taxonomy_id:2 AND reviewed:true AND fragment:false",
    "format": "tsv",
    "fields": "accession,id,protein_name,gene_names,organism_name,organism_id,length,mass,cc_subcellular_location,ft_signal,ft_helix,ft_strand,ft_turn,reviewed"
}

result = requests.get(non_antigens_url, params=non_antigens_search_params)
tsv_non_antigen = pd.read_csv(StringIO(result.text), sep='\t')
tsv_non_antigen.to_parquet("tsv_non_antigen.parquet", index=False)
tsv_non_antigen.head(50)

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Organism,Organism (ID),Length,Mass,Subcellular location [CC],Signal peptide,Helix,Beta strand,Turn,Reviewed
0,A0A009IHW8,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,J512_3302,Acinetobacter baumannii (strain 1295743),1310613,269,30922,,,"HELIX 143..145; /evidence=""ECO:0007829|PDB:7UW...","STRAND 135..142; /evidence=""ECO:0007829|PDB:7U...","TURN 146..149; /evidence=""ECO:0007829|PDB:7UWG...",reviewed
1,A0A059WI14,ARSI_BACX0,Trivalent organoarsenical cleaving enzyme (EC ...,arsI,Bacillus sp. (strain MD1),1501233,161,18109,,,,,,reviewed
2,A0A075BSX9,HLNO_SHIS7,(S)-6-hydroxynicotine oxidase ((S)-6HN oxidase...,nctB shn_30305,Shinella sp. (strain HZN7),879274,437,48736,,,"HELIX 15..26; /evidence=""ECO:0007829|PDB:6CR0""...","STRAND 6..11; /evidence=""ECO:0007829|PDB:6CR0""...","TURN 119..121; /evidence=""ECO:0007829|PDB:6CR0...",reviewed
3,A0A089QRB9,MSL3_MYCTU,Mycolipanoate synthase (EC 2.3.1.252) (Mycocer...,msl3 pks3 pks4 Rv1180/Rv1181,Mycobacterium tuberculosis (strain ATCC 25618 ...,83332,2085,220462,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,"SIGNAL 1..15; /evidence=""ECO:0000255|PROSITE-P...",,,,reviewed
4,A0A095AMW7,MLES_LEUME,Malolactic enzyme (MLE) (EC 4.1.1.101),mleS LH61_04880,Leuconostoc mesenteroides,1245,542,59205,,,,,,reviewed
5,A0A0A1H8I4,AIS_PSESP,Aconitate isomerase (AI) (EC 5.3.3.7),ais,Pseudomonas sp,306,262,27449,,"SIGNAL 1..22; /evidence=""ECO:0000269|PubMed:26...",,,,reviewed
6,A0A0A7HFE1,CAS10_STRTR,CRISPR system single-strand-specific deoxyribo...,cas10 csm1,Streptococcus thermophilus,1308,758,86822,,,"HELIX 6..13; /evidence=""ECO:0007829|PDB:6NUE"";...","STRAND 55..58; /evidence=""ECO:0007829|PDB:6NUE...","TURN 15..17; /evidence=""ECO:0007829|PDB:6NUE"";...",reviewed
7,A0A0A7HIF0,CSM3_STRTR,CRISPR system Cms endoribonuclease Csm3 (Csm3 ...,csm3,Streptococcus thermophilus,1308,220,24569,,,"HELIX 50..60; /evidence=""ECO:0007829|PDB:6NUE""...","STRAND 5..16; /evidence=""ECO:0007829|PDB:6NUE""...","TURN 61..63; /evidence=""ECO:0007829|PDB:6NUE"";...",reviewed
8,A0A0A7HIX6,CSM6A_STRTR,CRISPR system endoribonuclease Csm6 (EC 3.1.-....,csm6,Streptococcus thermophilus,1308,428,49361,,,,,,reviewed
9,A0A0B0QJR1,HEPT_APHF2,tRNA nuclease HepT (EC 3.1.27.-) (Toxin HEPN) ...,hepT hepn OA07_26455,Aphanizomenon flos-aquae (strain 2012/KM1/D3),1532906,147,17110,,,"HELIX 5..23; /evidence=""ECO:0007829|PDB:7AE6"";...","STRAND 67..69; /evidence=""ECO:0007829|PDB:7AE6...",,reviewed
