* Select only eukaryotic (superkingdom) proteins with evidence at the protein level
* Filter out sequences shorter than 40 residues
* Filter out unreviewed proteins

* Select only proteins with experimental signal peptide (SP) evidence
* Filter out proteins with SP shorter than 14 residues


OUTPUT:
* A TSV file reporting relevant information about the proteins included in the
dataset
  1. The protein UniProt accession
  2. The organism name
  3. The Eukaryotic kingdom (Metazoa, Fungi, Viridiplantae, Other)
  4. The protein length
  5. The position of the signal peptide cleavage site
* A FASTA file reporting the protein sequences

Advanced Search:

(taxonomy_id:2759) AND (existence:1) AND (length:[40 TO *]) AND (reviewed:true) AND (ft_signal_exp:*) AND (fragment:false)

In [None]:
#Import necessary packages and modules
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re

#Global variables
# Retry policy: up to 5 attempts with a 0.25 back-off factor, applied only
# to the listed 5xx status codes.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)
# One shared session for every API call; the retry policy is attached to
# all URLs that start with "https://".
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

In [None]:
#Define functions to handle API calls and pagination

def get_next_link(headers):
    """Return the URL of the next result page, or None when there is none.

    Args:
        headers: Mapping of response headers. Only the "Link" entry is read.

    Returns:
        The target of the link whose relation is ``rel="next"``, or ``None``
        if the "Link" header is absent or holds no such link.
    """
    if "Link" not in headers:
        return None
    # `search` (not `match`) plus a capture that cannot cross a ">" picks out
    # the rel="next" target even when the header lists several links
    # (e.g. a rel="prev" link before it).
    found = re.search(r'<([^>]+)>;\s*rel="next"', headers["Link"])
    return found.group(1) if found else None

def get_batch(batch_url):
    """Yield ``(response, total)`` for each page, starting at *batch_url*.

    Each page is fetched with the module-level ``session``. An error status
    raises through ``raise_for_status()``. ``total`` is the raw value of the
    ``x-total-results`` response header. Iteration stops once
    ``get_next_link`` finds no further page.
    """
    next_url = batch_url
    while next_url:
        page = session.get(next_url)
        # Abort on an error status instead of yielding a failed page
        page.raise_for_status()
        yield page, page.headers["x-total-results"]
        # Follow the pagination link announced by the page just processed
        next_url = get_next_link(page.headers)

In [None]:
# Advanced search query, URL-encoded into `url` below:
# (taxonomy_id:2759) AND (existence:1) AND (length:[40 TO *]) AND (reviewed:true) AND (ft_signal_exp:*) AND (fragment:false)
# `batch_size` is interpolated into the URL as the `size` query parameter.
batch_size = 500
url = f"https://rest.uniprot.org/uniprotkb/search?format=json&query=%28%28taxonomy_id%3A2759%29+AND+%28existence%3A1%29+AND+%28length%3A%5B40+TO+*%5D%29+AND+%28reviewed%3Atrue%29+AND+%28ft_signal_exp%3A*%29+AND+%28fragment%3Afalse%29%29&size={batch_size}"

#Filter-out SP shorter than 14 residues
def _signal_features(entry):
    """Return the features of *entry* whose type is "Signal", in their original order."""
    return [
        feature
        for feature in entry.get("features", [])
        if feature.get("type") == "Signal"
    ]


def _signal_peptide_length(feature):
    """Return the length of a Signal feature, or None if it is not usable.

    A feature is usable when its description is empty and both its start and
    end positions are integers (an unknown position is not an integer).
    """
    # NOTE(review): a non-empty description presumably marks an atypical
    # signal (e.g. one that is not cleaved) - confirm against UniProt docs.
    if feature.get("description", "") != "":
        return None
    location = feature["location"]
    start = location["start"].get("value")
    end = location["end"].get("value")
    if not (isinstance(start, int) and isinstance(end, int)):
        return None
    return end - start + 1


def filter_entry(entry, min_sp_length=14):
    """Tell whether *entry* has a usable signal peptide that is long enough.

    Every feature of the entry is inspected, so the Signal feature does not
    have to be the first one in the list.

    Args:
        entry: One UniProt entry as parsed from the JSON response.
        min_sp_length: Minimum accepted signal peptide length in residues.

    Returns:
        True if at least one Signal feature has an empty description, integer
        start/end positions and a length of at least *min_sp_length*;
        False otherwise (including when the entry has no features).
    """
    for feature in _signal_features(entry):
        length = _signal_peptide_length(feature)
        if length is not None and length >= min_sp_length:
            return True
    return False


# Set the name of the output file, we want TSV output
output_file_TSV = "positive.tsv"
output_file_fasta = "positive.fasta"

# Run the API call requiring JSON format and build our own TSV file

def extract_fields(entry):
    """Build the TSV row describing *entry*.

    Args:
        entry: One UniProt entry as parsed from the JSON response.

    Returns:
        A tuple ``(accession, organism name, kingdom, sequence length,
        signal peptide end position)``. The kingdom is the first of
        "Metazoa", "Fungi", "Viridiplantae" found in the organism lineage,
        or "Other" when none is present.

    Raises:
        ValueError: If the entry has no Signal feature at all.
    """
    lineage = entry["organism"]["lineage"]
    kingdom = next(
        (name for name in ("Metazoa", "Fungi", "Viridiplantae") if name in lineage),
        "Other",
    )

    signals = _signal_features(entry)
    if not signals:
        raise ValueError(
            f"entry {entry['primaryAccession']} has no Signal feature"
        )
    # Look the Signal feature up by type instead of assuming it is the first
    # feature; prefer one with a known cleavage position.
    signal = next(
        (feature for feature in signals if _signal_peptide_length(feature) is not None),
        signals[0],
    )

    return (
        entry["primaryAccession"],
        entry["organism"]["scientificName"],
        kingdom,
        entry["sequence"]["length"],
        signal["location"]["end"]["value"],
    )

In [None]:
def get_dataset(search_url, filter_function, extract_function, output_file_name_TSV, output_file_name_fasta):
    """Download the search results, filter them and write TSV and FASTA files.

    Args:
        search_url: URL of the first result page; later pages are followed
            through ``get_batch``.
        filter_function: Called with one entry; the entry is kept when the
            result is truthy.
        extract_function: Called with one kept entry; returns the iterable of
            fields written as one tab-separated line.
        output_file_name_TSV: Path of the TSV file to write.
        output_file_name_fasta: Path of the FASTA file to write.

    Prints the number of entries seen and the number kept, separated by a
    space. Returns nothing.
    """
    filtered_json = []
    n_total = 0
    # Run the API call in batches; the per-page total is not needed here
    for batch, _total in get_batch(search_url):
        # parse the JSON body of the response
        batch_json = json.loads(batch.text)
        for entry in batch_json["results"]:
            n_total += 1
            # Keep only the entries that pass the filter
            if filter_function(entry):
                filtered_json.append(entry)
    print(n_total, len(filtered_json))

    # Extract every row before opening the file, so a failing extraction
    # does not leave a truncated TSV behind.
    rows = [extract_function(entry) for entry in filtered_json]

    #write the results in an external file using the TSV format
    # The `with` block closes the file; no explicit close is needed.
    with open(output_file_name_TSV, "w", encoding="utf-8") as ofs:
        for fields in rows:
            # Print the fields in TSV format (tab separated)
            print(*fields, sep="\t", file=ofs)

    #write the results in an external file using the FASTA format
    with open(output_file_name_fasta, "w", encoding="utf-8") as ofs:
        for entry in filtered_json:
            print(">"+entry["primaryAccession"]+"\n"+entry["sequence"]["value"], file=ofs)

In [None]:
#We call the above function to obtain our dataset
get_dataset(
    search_url=url,
    filter_function=filter_entry,
    extract_function=extract_fields,
    output_file_name_TSV=output_file_TSV,
    output_file_name_fasta=output_file_fasta,
)

2904 2887
