## Data Download from UniProt

After running this notebook, you should have:
- `data/raw/uniprot_raw.csv` with enzyme sequences
- Columns: `Entry`, `Sequence`, `EC number`

In [4]:
# Load the data configuration file and extract the UniProt query URL.

import yaml

# Pass the encoding explicitly so the YAML file is not decoded with the
# platform default (e.g. cp1252 on Windows).
with open("../configs/data.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# URL requested by the download cell; shown as the cell output for inspection.
query_url = cfg["query_url"]
query_url

'https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Csequence%2Cec&format=tsv&query=%28reviewed%3Atrue+AND+ec%3A*+AND+length%3A%5B50+TO+1000%5D%29+AND+%28reviewed%3Atrue%29'

In [5]:
# Download from UniProt

import requests
from io import BytesIO

# (connect, read) timeouts in seconds so a stalled connection cannot hang the kernel.
response = requests.get(query_url, timeout=(10, 120))
# Fail here on an HTTP error instead of handing an error body to the
# gzip/TSV parser in the next cell.
response.raise_for_status()

# Keep the downloaded payload in memory as a file-like object for pandas.
bio = BytesIO(response.content)
bio

<_io.BytesIO at 0x24d1a8421b0>

In [6]:
# Load raw data

import pandas
import os

os.makedirs("../data/raw", exist_ok=True)

# Rewind the buffer so this cell can be re-run: a previous read leaves the
# stream position at the end, and parsing would then see empty input.
bio.seek(0)
df = pandas.read_csv(bio, compression='gzip', sep='\t')
# Drop rows that have a missing value in any column.
df = df.dropna()

# Persist the cleaned table as comma-separated values, without the index column.
df.to_csv("../data/raw/uniprot_raw.csv", index=False)
df.head(3)

Unnamed: 0,Entry,Sequence,EC number
0,A0A009IHW8,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,3.2.2.-; 3.2.2.6
1,A0A023I7E1,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,3.2.1.39
2,A0A024RXP8,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,3.2.1.91
