# **DATA COLLECTION**

Import necessary packages and modules


In [13]:
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re

Define global variables to handle the connection with the UniProt website. We use a Session object to allow retries in case of temporary service unavailability. We set 5 as the maximum number of retries.

In [14]:
# Retry policy: up to 5 retries, back-off factor 0.25, applied to responses
# with HTTP status 500, 502, 503 or 504.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)
# A single shared session; every "https://" request goes through the
# adapter configured with the retry policy above.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

Define functions to handle API calls and pagination (useful when the number of entries exceeds the current limit)

In [15]:
def get_next_link(headers):
    """Return the URL of the next result page advertised in a ``Link`` header.

    Args:
        headers: Mapping of response headers (for example ``response.headers``).

    Returns:
        The URL tagged ``rel="next"``, or ``None`` when there is no ``Link``
        header or none of its entries is a "next" link.
    """
    link_header = headers.get("Link") if headers else None
    if not link_header:
        return None
    # A Link header may carry several comma-separated targets (prev, next,
    # last, ...). Search anywhere in the value and stop the URL at the first
    # '>' so that exactly the target tagged rel="next" is captured.
    match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
    return match.group(1) if match else None

def get_batch(batch_url):
    """Iterate over the result pages of a search, one API call per page.

    Starting from ``batch_url``, each page is fetched through the shared
    ``session``; the generator then follows the "next" link returned by
    ``get_next_link`` and stops once no further link is available.

    Args:
        batch_url: URL of the first page to request.

    Yields:
        ``(response, total)`` pairs, where ``total`` is the raw value of the
        ``x-total-results`` response header, or ``0`` when it is absent.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an error status.
    """
    next_url = batch_url
    while next_url:
        page = session.get(next_url)
        # Abort on an error status code instead of yielding a broken page
        page.raise_for_status()
        yield page, page.headers.get("x-total-results", 0)
        # None here (no further page) terminates the loop
        next_url = get_next_link(page.headers)

# **POSITIVE DATASET**

#### Data Input

In [16]:
# Base search URL for the positive dataset.
# It was generated from the UniProtKB website (Advanced search) and is split
# here into one query clause per line for readability.
url_positive = (
    "https://rest.uniprot.org/uniprotkb/search?format=json"
    "&query=%28taxonomy_id%3A2759%29"
    "+AND+%28length%3A%5B40+TO+*%5D%29"
    "+AND+%28fragment%3Afalse%29"
    "+AND+%28reviewed%3Atrue%29"
    "+AND+%28ft_signal_exp%3A*%29"
    "&size=500"
)

#### Function for entry filtering
##### It takes an entry as input and returns True only if all the requirements are satisfied

In [17]:
def filter_entry_positive(entry, min_sp_length=14):
    """Tell whether an entry carries a usable signal-peptide annotation.

    The entry passes when at least one feature of type ``"Signal"``:
      * has an integer end position (a missing or non-integer value is
        rejected),
      * has no ``description`` (or an empty one),
      * ends at position ``min_sp_length`` or later.

    Args:
        entry: Dictionary describing one entry of the search results.
        min_sp_length: Smallest accepted end position of the signal peptide
            (inclusive). Defaults to 14.

    Returns:
        True if the entry satisfies all the requirements, False otherwise.
    """
    for feature in entry.get("features") or []:
        # Only signal-peptide features are relevant
        if feature.get("type") != "Signal":
            continue
        # Features carrying a description are discarded
        if feature.get("description"):
            continue
        end = feature.get("location", {}).get("end", {}).get("value")
        # bool is a subclass of int, so rule it out explicitly
        if isinstance(end, int) and not isinstance(end, bool) and end >= min_sp_length:
            return True
    return False

#### Function extracting specific fields for each entry.
##### It takes an entry as input and returns the extracted fields and the protein sequence.

In [18]:
def extract_fields_positive(entry):
    """Extract the TSV fields and the FASTA record of a positive entry.

    Args:
        entry: Dictionary describing one entry of the search results. It must
            provide ``primaryAccession``, ``organism`` (with ``scientificName``
            and a ``lineage`` of at least two items) and ``sequence`` (with
            ``length`` and ``value``).

    Returns:
        A two-item list ``[fields, fasta_line]`` where ``fields`` is the tuple
        ``(protein_id, organism_name, kingdom, protein_length,
        pos_cleavage_site)`` and ``fasta_line`` is the ``>accession`` header
        followed by the sequence on the next line. ``pos_cleavage_site`` is
        ``None`` when the entry has no feature of type ``"Signal"``.
    """
    protein_id = entry["primaryAccession"]  # UniProt accession
    organism = entry["organism"]
    organism_name = organism["scientificName"]  # organism name
    kingdom = organism["lineage"][1]  # second item of the lineage
    sequence = entry["sequence"]
    protein_length = sequence["length"]  # protein length

    # Start from None so that an entry without a Signal feature yields an
    # empty value instead of failing on an unbound local variable.
    pos_cleavage_site = None
    for feature in entry.get("features") or []:
        if feature["type"] == "Signal":
            # End of the signal peptide; if several Signal features exist,
            # the last one wins.
            pos_cleavage_site = feature["location"]["end"]["value"]

    fasta_line = ">" + protein_id + "\n" + sequence["value"]
    return [
        (protein_id, organism_name, kingdom, protein_length, pos_cleavage_site),
        fasta_line,
    ]

#### Definition of a function to execute the positive dataset retrieval.
##### It takes as input the url and the 2 output file names

In [19]:
def get_dataset_positive(search_url, output_file_tsv, output_file_fasta):
    """Download the positive dataset and write it as TSV and FASTA files.

    All result pages of ``search_url`` are fetched with ``get_batch``; entries
    accepted by ``filter_entry_positive`` are kept. The number of entries seen
    and the number kept are printed, then one TSV row and one FASTA record are
    written per kept entry.

    Args:
        search_url: URL of the first page of the search.
        output_file_tsv: Path of the TSV file to create (overwritten).
        output_file_fasta: Path of the FASTA file to create (overwritten).

    Returns:
        None.
    """
    filtered_json = []
    n_total = 0
    # Run the API call in batches; the per-batch total is not needed here
    for batch, _total in get_batch(search_url):
        # Parse the JSON body of the response
        batch_json = json.loads(batch.text)
        for entry in batch_json["results"]:
            n_total += 1
            # Keep only the entries that pass the filter
            if filter_entry_positive(entry):
                filtered_json.append(entry)
    print(n_total, len(filtered_json))

    # The with-statement closes both files, so no explicit close() is needed
    with open(output_file_tsv, "w") as f1, open(output_file_fasta, "w") as f2:
        print("protein_id", "organism_name", "kingdom", "protein_length", "pos_cleavage_site", sep="\t", file=f1)
        for entry in filtered_json:
            # Extract fields and sequence with a single call per entry
            fields, seq = extract_fields_positive(entry)
            # Print the fields in TSV format
            print(*fields, sep="\t", file=f1)
            # Print the sequence in FASTA format
            print(seq, file=f2)

#### Output

In [20]:
# Destination files: per-protein fields (TSV) and sequences (FASTA)
output_positive_file_tsv = "sp_positive.tsv"
output_positive_file_fasta = "sp_positive.fasta"

# Run the whole retrieval of the positive dataset
get_dataset_positive(
    search_url=url_positive,
    output_file_tsv=output_positive_file_tsv,
    output_file_fasta=output_positive_file_fasta,
)

2949 2932


# **NEGATIVE DATASET**

In [21]:
def get_dataset_negative(search_url, output_file_tsv, output_file_fasta):
    """Download the negative dataset and write it as TSV and FASTA files.

    All result pages of ``search_url`` are fetched with ``get_batch``. Every
    returned entry is kept (no additional filtering is applied). The number of
    entries is printed, then one TSV row and one FASTA record are written per
    entry using ``extract_fields_negative``.

    Args:
        search_url: URL of the first page of the search.
        output_file_tsv: Path of the TSV file to create (overwritten).
        output_file_fasta: Path of the FASTA file to create (overwritten).

    Returns:
        None.
    """
    entries = []
    # Run the API call in batches; the per-batch total is not needed here
    for batch, _total in get_batch(search_url):
        # Parse the JSON body of the response and keep every entry
        batch_json = json.loads(batch.text)
        entries.extend(batch_json["results"])
    print(len(entries))

    # The with-statement closes both files, so no explicit close() is needed
    with open(output_file_tsv, "w") as f1, open(output_file_fasta, "w") as f2:
        # The fifth column holds the transmembrane flag returned by
        # extract_fields_negative, not a cleavage-site position.
        print("protein_id", "organism_name", "kingdom", "protein_length", "helix_presence", sep="\t", file=f1)
        for entry in entries:
            # Extract fields and sequence with a single call per entry
            fields, seq = extract_fields_negative(entry)
            # Print the fields in TSV format
            print(*fields, sep="\t", file=f1)
            # Print the sequence in FASTA format
            print(seq, file=f2)


def extract_fields_negative(entry, n_terminal_window=90):
    """Extract the TSV fields and the FASTA record of a negative entry.

    Args:
        entry: Dictionary describing one entry of the search results. It must
            provide ``primaryAccession``, ``organism`` (with ``scientificName``
            and a ``lineage`` of at least two items) and ``sequence`` (with
            ``length`` and ``value``).
        n_terminal_window: A transmembrane feature is counted only when its
            start position is within the first ``n_terminal_window`` residues.
            Defaults to 90.

    Returns:
        A two-item list ``[fields, fasta_line]`` where ``fields`` is the tuple
        ``(protein_id, organism_name, kingdom, protein_length,
        helix_presence)`` and ``fasta_line`` is the ``>accession`` header
        followed by the sequence on the next line. ``helix_presence`` is a
        boolean; it is False when the entry has no features at all.
    """
    protein_id = entry["primaryAccession"]  # UniProt accession
    organism = entry["organism"]
    organism_name = organism["scientificName"]  # organism name
    kingdom = organism["lineage"][1]  # second item of the lineage
    sequence = entry["sequence"]
    protein_length = sequence["length"]  # protein length

    # Presence of a transmembrane segment in the first residues of the protein
    helix_presence = False
    for feature in entry.get("features") or []:
        if feature["type"] != "Transmembrane":
            continue
        # NOTE(review): every Transmembrane feature counts here, whatever its
        # description - confirm whether non-helical segments must be excluded.
        start = feature.get("location", {}).get("start", {}).get("value")
        # Features with a missing or non-integer start cannot be placed
        if isinstance(start, int) and start <= n_terminal_window:
            helix_presence = True
            break

    fasta_line = ">" + protein_id + "\n" + sequence["value"]
    return [
        (protein_id, organism_name, kingdom, protein_length, helix_presence),
        fasta_line,
    ]


In [22]:
# Destination files: per-protein fields (TSV) and sequences (FASTA)
output_negative_file_tsv = "sp_negative.tsv"
output_negative_file_fasta = "sp_negative.fasta"

# Search URL for the negative dataset, split into one query clause per line.
# The final group is an OR over six SL-* terms of the cc_scl_term_exp field.
url_negative = (
    "https://rest.uniprot.org/uniprotkb/search?format=json"
    "&query=%28fragment%3Afalse%29"
    "+AND+%28taxonomy_id%3A2759%29"
    "+AND+%28length%3A%5B40+TO+*%5D%29"
    "+NOT+%28ft_signal%3A*%29"
    "+AND+%28reviewed%3Atrue%29"
    "+AND+%28existence%3A1%29"
    "+AND+%28"
    "%28cc_scl_term_exp%3ASL-0091%29"
    "+OR+%28cc_scl_term_exp%3ASL-0191%29"
    "+OR+%28cc_scl_term_exp%3ASL-0173%29"
    "+OR+%28cc_scl_term_exp%3ASL-0209%29"
    "+OR+%28cc_scl_term_exp%3ASL-0204%29"
    "+OR+%28cc_scl_term_exp%3ASL-0039%29"
    "%29"
    "&size=500"
)

# Run the whole retrieval of the negative dataset
get_dataset_negative(
    search_url=url_negative,
    output_file_tsv=output_negative_file_tsv,
    output_file_fasta=output_negative_file_fasta,
)

20615
