* Select only eukaryotic (superkingdom) proteins (protein evidence, reviewed)
* Filter out sequences shorter than 40 residues
* Filter out sequences having a signal peptide (SP), with any evidence
* Select only proteins experimentally verified to be localized into: cytosol, nucleus,
mitochondrion, plastid, peroxisome, cell membrane

OUTPUT:
* A TSV file reporting relevant information about the proteins included in the
dataset
  1. The protein UniProt accession
  2. The organism name
  3. The eukaryotic kingdom (Metazoa, Fungi, Viridiplantae [plants], Other)
  4. The protein length
  5. Whether the protein has a transmembrane helix starting in the first 90 residues (True or False)
* A FASTA file reporting the protein sequences



Advanced Search:
(taxonomy_id:2759) AND (existence:1) AND (reviewed:true) NOT (ft_signal:*) AND (length:[40 TO *]) AND (fragment:false) AND ((cc_scl_term_exp:SL-0091) OR (cc_scl_term_exp:SL-0191) OR (cc_scl_term_exp:SL-0173) OR (cc_scl_term_exp:SL-0209) OR (cc_scl_term_exp:SL-0204) OR (cc_scl_term_exp:SL-0039))


---


20210 results



Print only some fields from the API response, or keep only the needed fields after applying the extraction function.

In [None]:
#Import necessary packages and modules
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re

# Shared HTTP session used by every API call in this notebook.
session = requests.Session()

# Retry policy: up to 5 retries, backoff factor 0.25, triggered by the
# listed server-error status codes.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)

# Apply the retry policy to every URL starting with "https://".
session.mount("https://", HTTPAdapter(max_retries=retries))

In [None]:
#Define functions to handle API calls and pagination

# Captures the target of the rel="next" entry anywhere in a Link header.
# [^>]+ keeps the capture inside a single <...> pair, so other entries
# (e.g. rel="prev") listed before it cannot leak into the result.
_RE_NEXT_LINK = re.compile(r'<([^>]+)>;\s*rel="next"')


def get_next_link(headers):
    """Return the URL of the next results page, or None if there is none.

    Args:
        headers: mapping of response headers; only the "Link" key is read.

    Returns:
        The URL found inside the ``<...>`` of the ``rel="next"`` entry of
        the "Link" header, or None when the header is absent or has no
        such entry.
    """
    if "Link" not in headers:
        return None
    # search (not match): the "next" entry need not be the first one.
    match = _RE_NEXT_LINK.search(headers["Link"])
    return match.group(1) if match else None

def get_batch(batch_url):
    """Yield every page of a paginated search, starting at ``batch_url``.

    Each iteration performs one GET through the shared ``session`` and
    yields a ``(response, total)`` pair, where ``total`` is the raw value
    of the "x-total-results" response header. Iteration stops when
    ``get_next_link`` finds no further page. An error status code raises
    via ``response.raise_for_status()``.
    """
    next_url = batch_url
    while next_url:
        page = session.get(next_url)
        # Abort on any error status instead of parsing a failed response.
        page.raise_for_status()
        total_results = page.headers["x-total-results"]
        yield page, total_results
        # Follow the pagination link advertised by the page just fetched.
        next_url = get_next_link(page.headers)

In [None]:
# Advanced search query encoded in the URL below:
# (taxonomy_id:2759) AND (existence:1) AND (reviewed:true) NOT (ft_signal:*)
# AND (length:[40 TO *]) AND (fragment:false)
# AND ((cc_scl_term_exp:SL-0091) OR (cc_scl_term_exp:SL-0191)
#      OR (cc_scl_term_exp:SL-0173) OR (cc_scl_term_exp:SL-0209)
#      OR (cc_scl_term_exp:SL-0204) OR (cc_scl_term_exp:SL-0039))

# Number of entries requested per page
batch_size = 500

# The URL is assembled one query clause per line; the pieces concatenate to
# a single string.
url = (
    "https://rest.uniprot.org/uniprotkb/search?format=json&query="
    "%28%28taxonomy_id%3A2759%29"
    "+AND+%28existence%3A1%29"
    "+AND+%28reviewed%3Atrue%29"
    "+NOT+%28ft_signal%3A*%29"
    "+AND+%28length%3A%5B40+TO+*%5D%29"
    # This clause has no leading "+"; kept exactly as in the submitted query.
    "AND+%28fragment%3Afalse%29"
    "+AND+%28"
    "%28cc_scl_term_exp%3ASL-0091%29"
    "+OR+%28cc_scl_term_exp%3ASL-0191%29"
    "+OR+%28cc_scl_term_exp%3ASL-0173%29"
    "+OR+%28cc_scl_term_exp%3ASL-0209%29"
    "+OR+%28cc_scl_term_exp%3ASL-0204%29"
    "+OR+%28cc_scl_term_exp%3ASL-0039%29"
    "%29%29"
    f"&size={batch_size}"
)

# Names of the two output files (TSV table and FASTA sequences)
output_file_fasta = "negative.fasta"
output_file_TSV = "negative.tsv"

# Run the API call requiring JSON format and build our own TSV file

def extract_fields(entry):
    """Flatten one search-result entry into the tuple stored in the dataset.

    Args:
        entry: dict for a single result. Keys read: "primaryAccession",
            "organism" ("scientificName", "lineage"), "sequence"
            ("length", "value") and, optionally, "features".

    Returns:
        A 6-tuple ``(accession, organism name, kingdom, sequence length,
        has_tm_helix, sequence)``. ``kingdom`` is "Metazoa", "Fungi",
        "Viridiplantae" or "Other". ``has_tm_helix`` is True when a
        "Transmembrane" feature whose description contains "Helical"
        starts at position 90 or earlier.
    """
    lineage = entry["organism"]["lineage"]
    # First matching label wins, in this fixed order; "Other" otherwise.
    kingdom = next(
        (name for name in ("Metazoa", "Fungi", "Viridiplantae") if name in lineage),
        "Other",
    )

    # Default to False up front so entries with no features (or no
    # "features" key at all) do not leave the flag undefined.
    has_tm_helix = False
    for feature in entry.get("features", []):
        if feature["type"] != "Transmembrane":
            continue
        start = feature["location"]["start"]["value"]
        # A missing start position cannot be compared; treat it as no match.
        if start is None or start > 90:
            continue
        if "Helical" in feature.get("description", ""):
            has_tm_helix = True
            break

    return (
        entry["primaryAccession"],
        entry["organism"]["scientificName"],
        kingdom,
        entry["sequence"]["length"],
        has_tm_helix,
        entry["sequence"]["value"],
    )

In [None]:
def get_dataset(search_url, extract_function, output_file_name_TSV, output_file_name_fasta):
    """Download all search results and write them as a TSV and a FASTA file.

    Args:
        search_url: URL of the first results page, passed to ``get_batch``.
        extract_function: callable applied to each element of a page's
            "results" list; must return a sequence whose first element is
            the record identifier and whose last element is the sequence.
        output_file_name_TSV: path of the tab-separated output file.
        output_file_name_fasta: path of the FASTA output file.

    Side effects:
        Prints the number of extracted records and (over)writes both files.
    """
    records = []
    # Fetch page by page; the per-page total yielded by get_batch is unused.
    for batch, _total in get_batch(search_url):
        batch_json = json.loads(batch.text)
        records.extend(extract_function(entry) for entry in batch_json["results"])
    print(len(records))

    # TSV: one record per line, every element except the last one.
    # The with-block closes the file, so no explicit close is needed.
    with open(output_file_name_TSV, "w") as ofs:
        for record in records:
            print(*record[:-1], sep="\t", file=ofs)

    # FASTA: ">" + first element as header line, last element as body.
    with open(output_file_name_fasta, "w") as ofs:
        for record in records:
            print(">" + record[0] + "\n" + record[-1], file=ofs)

In [None]:
# Build the dataset: download every result page and write the TSV and FASTA files
get_dataset(
    search_url=url,
    extract_function=extract_fields,
    output_file_name_TSV=output_file_TSV,
    output_file_name_fasta=output_file_fasta,
)

20210
