In [1]:
import os

import time

import pandas as pd

import requests
from requests.adapters import HTTPAdapter, Retry

In [2]:
# Root directory for the notebook's input and output data files
# (a relative path, so it is resolved against the current working directory)
DATA_DIR = "../data/"

In [3]:
# Location of the KOMODO table (tab-separated) under the data directory
komodo_path = os.path.join(DATA_DIR, "komodo", "komodo.tsv")

# Read the table with "Taxon ID" kept as text, then discard every row
# that has no taxon ID
komodo_df = (
    pd.read_csv(komodo_path, sep="\t", dtype={"Taxon ID": "str"})
    .dropna(subset="Taxon ID")
)

komodo_df

Unnamed: 0,Organism DSMZ ID,Taxon ID,Organism Name,Media
1,9849,46125,Abiotrophia defectiva,PYG-MEDIUM (modified)
2,14247,291968,Acaricomes phytoseiuli,TRYPTICASE SOY Yeast extract medium
4,5522,2382,Acetitomaculum ruminis,ACETITOMACULUM medium | METHANOBACTERIUM medium
5,1870,35830,Acetivibrio cellulolyticus,ACETIVIBRIO CELLULOLYTICUS MEDIUM
6,3005,290052,Acetivibrio ethanolgignens,PYG medium WITH VOLATILE FATTY ACIDS
...,...,...,...,...
8622,18599,325472,Zymomonas mobilis subsp. francensis,ZYMOMONAS medium
8623,473,120045,Zymomonas mobilis subsp. mobilis,ZYMOMONAS medium
8624,22645,120044,Zymomonas mobilis subsp. pomaceae,ZYMOMONAS medium
8625,7201,86958,Zymophilus paucivorans,MEDIUM 58 MODIFIED FOR DSM 7201


In [4]:
# "Taxon ID" column of the KOMODO table. Despite the name this is a pandas
# Series (it keeps komodo_df's index), not a Python list.
taxon_id_list = komodo_df["Taxon ID"]
taxon_id_list

1        46125
2       291968
4         2382
5        35830
6       290052
         ...  
8622    325472
8623    120045
8624    120044
8625     86958
8626     86959
Name: Taxon ID, Length: 7819, dtype: object

In [5]:
def get_ec_values(record: dict) -> str:
    """Join the EC numbers of a record's recommended protein name.

    Args:
        record: One record dictionary, as found in the ``results`` list of
            the JSON search response.

    Returns:
        The ``value`` of every entry under
        ``proteinDescription.recommendedName.ecNumbers`` joined with ``"|"``.
        An empty string is returned when any level of that path is missing,
        null or empty.
    """
    # `or {}` / `or []` also covers keys that are present but hold None,
    # which `.get(key, default)` would hand through unchanged.
    description = record.get("proteinDescription") or {}
    recommended_name = description.get("recommendedName") or {}
    ec_records = recommended_name.get("ecNumbers") or []

    return "|".join(ec_record["value"] for ec_record in ec_records)


def get_record(record: dict) -> pd.DataFrame:
    """Flatten one record dictionary into a single-row DataFrame.

    Args:
        record: One record dictionary. ``entryType``, ``primaryAccession``,
            ``uniProtkbId`` and ``organism.taxonId`` are read with plain
            indexing, so a missing one raises ``KeyError``.

    Returns:
        A one-row DataFrame with the columns ``entryType``,
        ``primaryAccession``, ``uniProtkbId``, ``taxonId``, ``fullName`` and
        ``ecNumbers``. ``fullName`` is ``None`` when the record has no
        ``proteinDescription.recommendedName.fullName.value``.
    """
    # Mandatory top-level fields are copied over under their own names
    row = {
        key: record[key]
        for key in ("entryType", "primaryAccession", "uniProtkbId")
    }
    row["taxonId"] = record["organism"]["taxonId"]

    # Optional nested name: every missing level falls back to an empty dict
    name_info = record.get("proteinDescription", dict()).get("recommendedName", dict())
    row["fullName"] = name_info.get("fullName", dict()).get("value", None)

    row["ecNumbers"] = get_ec_values(record)

    # Series -> one-column frame -> transpose gives a single row whose
    # columns are the keys inserted above, in that order
    return pd.Series(row).to_frame().T

In [None]:
# Initialize results CSV file
results_path = os.path.join(
    DATA_DIR,
    "uniprot",
    "komodo_taxon_to_uniprot_ec.csv"
)
record_columns = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "taxonId",
    "fullName",
    "ecNumbers"
]
# Create the output directory if needed so the first write cannot fail on a
# fresh checkout
os.makedirs(os.path.dirname(results_path), exist_ok=True)
# Write only the header row; every page of results is appended below
pd.DataFrame(columns=record_columns).to_csv(
    results_path,
    index=False,
    mode="w"
)

# Retry transient server-side failures with exponential backoff
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504]
)
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

search_url = "https://rest.uniprot.org/uniprotkb/search"
search_fields = "id,organism_id,ec,cc_function,cc_pathway"
# The search endpoint is paginated (25 results per page unless `size` is
# given); 500 is the largest page size it accepts.
page_size = 500
# (connect, read) timeouts in seconds so a stalled connection cannot hang
# the loop forever
request_timeout = (10, 120)
# Pause between consecutive requests, in seconds
request_pause = 0.25

# A taxon ID may occur on several rows of the source table (TODO confirm);
# query each one only once, keeping first-seen order.
unique_taxon_ids = list(dict.fromkeys(taxon_id_list))

for taxon_idx, taxon_id in enumerate(unique_taxon_ids):
    print(f"[+] Retrieving EC numbers for taxon ID: {taxon_id}")

    url = (
        f"{search_url}?query=organism_id:{taxon_id}"
        f"&fields={search_fields}&size={page_size}"
    )
    processed_records = 0

    # Follow the `Link: <...>; rel="next"` header until the last page so
    # that no record beyond the first page is silently dropped.
    while url:
        response = session.get(url, timeout=request_timeout)
        response.raise_for_status()

        # Parse the body once per page
        results = response.json()["results"]

        if results:
            # Build the page's frame with a single concat instead of growing
            # it record by record
            response_df = pd.concat(
                [get_record(record) for record in results],
                axis=0,
                ignore_index=True
            )[record_columns]

            # Append the page to the results file right away so memory use
            # stays bounded by one page
            response_df.to_csv(
                results_path,
                index=False,
                header=False,
                mode="a"
            )

            processed_records += len(results)
            total_records = response.headers.get("x-total-results", "?")
            print(
                f"[+] Taxon ID {taxon_id} - "
                f"Processed record {processed_records} / {total_records}"
            )

        # `response.links` is requests' parsed view of the Link header;
        # there is no "next" entry on the last page.
        url = response.links.get("next", {}).get("url")
        if url:
            time.sleep(request_pause)

    if taxon_idx % 100 == 0:
        print(f"[+] Processed taxon ID {taxon_idx + 1} / {len(unique_taxon_ids)}")

    time.sleep(request_pause)
