
## Download FASTA file from UniProt

Before we start, we need to download a sample FASTA file from the [UniProt](https://www.uniprot.org/help/downloads) repository and save it to a Unity Catalog (UC) volume.

Source: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

You can run this notebook on serverless compute.

In [0]:
%pip install biopython==1.86
# Restart the Python process so the package installed by %pip above can be imported.
dbutils.library.restartPython()

In [0]:
%run ./utils

In [0]:
# remove_widgets / setup_uc_paths come from ./utils (loaded via %run above).
remove_widgets()

# NOTE: if you update the values in the widgets, it will automatically
# trigger an update of the UC paths.
uc_config = setup_uc_paths(spark=None, use_widgets=True)

# Pull the catalog, schema, and volume names out of the resolved config.
catalog_name, schema_name, volume_name = (
    uc_config[key] for key in ("catalog_name", "schema_name", "volume_name")
)

In [0]:
import requests
import gzip

url = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
# Write into the volume resolved from the UC config so this path matches the
# one the FASTA is read back from later (previously hard-coded to
# "protein_seq", which broke whenever a different volume was selected).
output_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/uniprot_sprot.fasta"

# stream=True avoids buffering the whole archive in memory; the
# (connect, read) timeout keeps the cell from hanging forever on a stalled
# connection. The context manager releases the connection when done.
with requests.get(url, stream=True, timeout=(10, 300)) as response:
    response.raise_for_status()

    # Decompress on the fly from the raw socket stream and write the plain
    # text FASTA straight to the volume.
    with gzip.open(response.raw, "rt") as gz_file, open(output_path, "w") as out_file:
        out_file.writelines(gz_file)

In [0]:
from Bio import SeqIO

# Location of the decompressed FASTA file inside the UC volume.
fasta_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/uniprot_sprot.fasta"

# Materialise every record from the parser into a list so the records can be
# counted and indexed.
records = [*SeqIO.parse(fasta_path, "fasta")]

# Quick sanity check: total count plus a peek at the first entry.
print(f"Number of sequences: {len(records)}")
first_record = records[0]
print(f"First record ID: {first_record.id}")
print(f"First record sequence: {first_record.seq[:50]}")