In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from Bio.Blast import NCBIXML
from Bio import Entrez, SeqIO
import requests
import re
from tqdm import tqdm

In [2]:
data_folder = './data/blast_results'
paths = [f"{data_folder}/{x}" for x in os.listdir(data_folder)]
paths[:3]

['./data/blast_results/A0A1S4AUX8_blast_results.xml',
 './data/blast_results/P52579_blast_results.xml',
 './data/blast_results/A0A1S3Y1C1_blast_results.xml']

In [3]:
from ratelimit import limits, RateLimitException, sleep_and_retry

ONE_MINUTE = 60
MAX_CALLS_PER_MINUTE = 1

@sleep_and_retry
@limits(calls=MAX_CALLS_PER_MINUTE, period=ONE_MINUTE)
def upload_uniprotid(path):
    # The rest of your function code remains the same
    uniprot_regex = r'>sp\|[\w.-]+\|'
    # Open the XML file for reading
    count = 0
    url = "http://localhost:8004/protein/"  # Replace with the appropriate API URL
    try:
        print(path)
        if 'xml' in path:
            with open(path) as result_file:
                print(result_file)
                # Parse the Blast results
                blast_records = NCBIXML.read(result_file)
                # Iterate over each alignment (hit)
                for alignment in blast_records.alignments:
                    #print(alignment.hit_id)
                    database = (alignment.hit_id.split('|')[0])
                    id = (alignment.hit_id.split('|')[1]).split('.')[0]                      
                    #check if id already in database
                    
                    r = requests.get(f'http://localhost:8004/protein/{id}')
                    if 'exists' in list(r.json().keys()):
                        print('already exists')
                    else:
                        handle = Entrez.efetch(db="protein", id=id, rettype="gb", retmode="text")
                        record = SeqIO.read(handle, format="gb")
                        scientific_name = record.annotations['organism']
                        domain = record.annotations['taxonomy'][0]
                        kingdom = record.annotations['taxonomy'][1]
                        order = record.annotations['taxonomy'][8]
                        genus = record.annotations['taxonomy'][-1]
                        sequence = str(record.seq)

                        data = {
                        "primary_accession":id,
                        "sequence": sequence,
                        "scientific_name": scientific_name,
                        "superkingdom": domain,
                        "kingdom": kingdom,
                        "order": order,
                        "genus": genus
                        }
                        response = requests.post(url, json=data)
                        # Check the response status and print the result
                        if response.status_code == 200:
                            print("Data uploaded successfully")
                        else:
                            print(f"Error uploading data: {response.text}")               
    except Exception as e:
        print(f'error {e}')

In [4]:
r = requests.get(f'http://localhost:8004/protein/MPO1_TOBAC')

In [5]:
r.json()

{'exists': True}

In [6]:
from concurrent.futures import ProcessPoolExecutor
input_files = paths

In [7]:
def run_in_parallel(function, input_list, max_workers=8):
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(function, input_list))
    return results


In [8]:
run_in_parallel(upload_uniprotid, input_files)


./data/blast_results/A0A172WCA8_blast_results.xml./data/blast_results/A0A172WCA4_blast_results.xml./data/blast_results/B1NYI5_blast_results.xml
./data/blast_results/P52579_blast_results.xml./data/blast_results/A0A1S3Y1C1_blast_results.xml
<_io.TextIOWrapper name='./data/blast_results/A0A172WCA8_blast_results.xml' mode='r' encoding='UTF-8'>
./data/blast_results/Q9SEH5_blast_results.xml
./data/blast_results/A0A1S4AUX8_blast_results.xml<_io.TextIOWrapper name='./data/blast_results/A0A172WCA4_blast_results.xml' mode='r' encoding='UTF-8'><_io.TextIOWrapper name='./data/blast_results/B1NYI5_blast_results.xml' mode='r' encoding='UTF-8'>



./data/blast_results/A0A0K0K5B3_blast_results.xml<_io.TextIOWrapper name='./data/blast_results/A0A1S3Y1C1_blast_results.xml' mode='r' encoding='UTF-8'><_io.TextIOWrapper name='./data/blast_results/P52579_blast_results.xml' mode='r' encoding='UTF-8'>


<_io.TextIOWrapper name='./data/blast_results/A0A1S4AUX8_blast_results.xml' mode='r' encoding='UTF-8'>

<_i

            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.
            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will

Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(KKAQEEIDTKVGKDRWVEKSDIKDLVYLQAIVKEVLQLYPPGPLLVPHKNVKDCVVSGYHIPKGTKLFANVMKLQRDPKLWSNPEMFDPKRFIATDIDFRGHHYEYIPFGSGKQSCPGMTYALQVEHLTMAHLIQGFNYRTPNDEPLDMKEGAG) already exists.","type":"IntegrityError"}]}Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(MAGQTIIVSGLNPAAILQSTIGGGASPTAAAAENGTRKVIPLSRDALQDFMLSIITQKLQDEKQPFYVLDLGEVVSLIDQWKSALPNIRPFYAVKCNPEPSFLSILSAMGSNFDCASRAEIEYVLSLGISPDRIVFANPCKPESDIIFAAKVGVNLTTYDSEDEVYKIRKHHPKSELLLRIKPMFDGNARCPMGPKYGALPEEVEPLLRAAQAARLTVSGVSFHIGSGDADSNAYLGAIAAAKEVFETAAKLGMSKMTVLDVGGGFTSGHQFTTAAVAVRSALKQHFDDQPELTIIAEPGRFFAETAFTLATTIIGKRVRGELREYWINDGLYGSMNCVLYDHATVNATPLAVLSNRTNVTCGGSKTFPTTVFGPTCDALDTVLRDYQLPELQVNDWLVFPNMGAYTKAAGSNFNGFNTSAIVTHLAYAYPS) already exists.","type":"IntegrityError"}]}

Error uploading data: {"de

            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.
            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will

Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(METFLFTSESVNEGHPDKLCDQVSDAILDACLEQDPESKVACETCTKTNMVMVFGEITTKATVDYEKIVRDTCRGIGFTSADVGLDADNCKVLVNIEQQSPDIAQGVHGHLTK) already exists.","type":"IntegrityError"}]}
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
already exists
Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(MLKHSRTSTMTSLNFVFLSLLVVLLPFSSNVLSDTTNEKFYQCVCQNSDYCVPFSTAFVTPANASFTTILQSTAQNLRLLVPSVPKPQLIFTPMAESHVQAAVICSKQLGLQLRVRSGGHDYEGLSYISEMESPFIILDLSKLRGIEVNIEDNSVWAQAGATVGEVYYRISEKSKTHGFPAGLCTSLGIGGHITGGAYGTMMRKYGLGADNVEDARIVDANGRILDRQSMGEDLFWAIRGGGGASFGIILSWKLRLVPVPSIVTVFTVSKTLEQNGTKIIYKWQQVADKIDEDLFIRVIMNVVDKKDKKGEKTIQMAYNSLFLGRSDRLLEIMNE

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
r = view.map_async(wrapped_upload_uniprotid, paths)

In [37]:
upload_uniprotid(paths[3])

./data/blast_results/A0A172WCA8_blast_results.xml
<_io.TextIOWrapper name='./data/blast_results/A0A172WCA8_blast_results.xml' mode='r' encoding='UTF-8'>
ANF07088


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(KKAQEEIDTKVGKDRWVEKSDIKDLVYLQAIVKEVLQLYPPGPLLVPHKNVKDCVVSGYHIPKGTKLFANVMKLQRDPKLWSNPEMFDPKRFIATDIDFRGHHYEYIPFGSGKQSCPGMTYALQVEHLTMAHLIQGFNYRTPNDEPLDMKEGAG) already exists.","type":"IntegrityError"}]}
ANF07091
Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(LKKAQEEIDTKVGKDRWVEESDIKDLVYLQAIVKEVLRLYPPGPLLVPHENVKDCVVSGYHIPKGTRLFANVMKLQRDPKLWSNPDMFDPERFIASDIDFRGHHYEYIPFGSGRRSCPGMTYALQVEHLTMAHLIQGFNYRTPNDEPLDMKEGAG) already exists.","type":"IntegrityError"}]}
ANF07089
Error uploading data: {"detail":[{"loc":[],"msg":"duplicate key value violates unique constraint \"protein_sequence_key\"\nDETAIL:  Key (sequence)=(AQEEIDTKVGKDRWVEESDIKDLVYLQAIVKEVLRLYPPGPLLVPHENVKDCVVSGYHIPKGTRLFANVMKLQRDPKLWSNPETFDPERFIASDIDFRGHHYEYIPFGSGRRSCPGMTYALQVEHLTMAHLIQGFNY