In [None]:
# Installation
!pip install biopython



In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SearchIO
from Bio import Entrez

In [None]:
from google.colab import userdata

# NCBI Entrez credentials are read from the Colab userdata store instead of
# being hard-coded in the notebook. They are assigned to Entrez.email and
# Entrez.api_key inside get_protein_seq().
email, api_key = (
    userdata.get(secret_name) for secret_name in ('EntrezEmail', 'EntrezKey')
)

In [None]:
# Load the protein accessions to look up.
# Each non-empty line is split on tabs: the first field becomes the dictionary
# key and any remaining fields are kept as a list (an empty list when the line
# holds a single field).
ncbi_proteins = {}
# 'utf-8-sig' drops a leading byte-order mark so it cannot end up in the first key.
with open('/content/ncbi.txt', mode='r', encoding='utf-8-sig') as prediction:
    for line in prediction:
        fields = line.strip().split('\t')
        key = fields[0]
        if not key:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # would otherwise create an '' entry that is later sent to NCBI
            # as if it were a protein ID.
            continue
        ncbi_proteins[key] = fields[1:]

print(ncbi_proteins)

{'XP_027157697.1': [], 'XP_027181395.1': [], 'XP_027148929.1': [], 'XP_027157975.1': [], 'XP_027158018.1': [], 'XP_027158022.1': [], 'XP_027150783.1': [], 'XP_027158106.1': [], 'XP_027150148.1': [], 'XP_027150140.1': [], 'XP_027159923.1': [], 'XP_027158444.1': [], 'XP_027158452.1': [], 'XP_027158620.1': [], 'XP_027183147.1': [], 'XP_027158651.1': [], 'XP_027158696.1': [], 'XP_027158740.1': [], 'XP_027147841.1': [], 'XP_027158787.1': [], 'XP_027158752.1': [], 'XP_027158763.1': [], 'XP_027158798.1': [], 'XP_027151262.1': [], 'XP_027151315.1': [], 'XP_027159054.1': [], 'XP_027159078.1': [], 'XP_027168680.1': [], 'XP_027149786.1': [], 'XP_027165688.1': [], 'XP_027165897.1': [], 'XP_027165940.1': [], 'XP_027180107.1': [], 'XP_027150414.1': [], 'XP_027165999.1': [], 'XP_027166025.1': [], 'XP_027166035.1': [], 'XP_027166105.1': [], 'XP_027152446.1': [], 'XP_027166332.1': [], 'XP_027166342.1': [], 'XP_027166354.1': [], 'XP_027166380.1': [], 'XP_027177534.1': [], 'XP_027166395.1': [], 'XP_02716

In [None]:
import time

# Function to fetch protein sequence from NCBI
def get_protein_seq(protein_id):
    """Fetch one protein sequence from the NCBI ``protein`` database.

    Args:
        protein_id: Identifier passed to ``Entrez.efetch`` as ``id``
            (e.g. ``'XP_027157697.1'``).

    Returns:
        The ``seq`` attribute of the record parsed from the FASTA response
        (a ``Seq`` object, not a plain ``str``), or ``None`` if anything went
        wrong. Failures are reported with ``print`` and never raised, so a
        caller looping over many IDs keeps going.
    """
    try:
        # NCBI Identification
        Entrez.email = email
        Entrez.api_key = api_key

        handle = Entrez.efetch(db="protein", id=protein_id, rettype="fasta", retmode="text")
        try:
            protein_record = SeqIO.read(handle, "fasta")
        finally:
            # Close the connection even when parsing fails; otherwise every
            # bad ID would leak an open handle.
            handle.close()

        return protein_record.seq
    except Exception as e:
        print(f"Error fetching sequence for protein ID {protein_id}: {e}")
        return None

# Input
# Replace every value in ncbi_proteins with the fetched sequence.
# list(...) takes a snapshot of the keys so the dict can be updated safely
# while looping.
for key in list(ncbi_proteins):
    protein_seq = get_protein_seq(key)
    # Store None when the fetch failed (or returned an empty sequence) so the
    # placeholder list loaded from the input file is not left behind: the
    # FASTA-writing step only skips entries that are None.
    ncbi_proteins[key] = protein_seq if protein_seq else None

    # NCBI limits 10 queries/second using an API key
    time.sleep(1/10)

In [None]:
# Inspect the mapping after the fetch step: entries whose sequence was fetched
# successfully now hold Seq objects instead of the lists read from the file.
print(ncbi_proteins)

{'XP_027157697.1': Seq('MADAALSATIQVALQTVVSLAGDHVNLVREFPEELQRFNQSAAMIRGFLAGAEK...PEN'), 'XP_027181395.1': Seq('MAEGFLQPDSQNERMMETIGYEYLRILLQTSLLEEVNYGRRTRYKMHDLVHDFA...FKV'), 'XP_027148929.1': Seq('MADAAVSATIRVALQTVVSLAADHVNLAREFPKELERLEKSAAMIRGFLAGADK...LFS'), 'XP_027157975.1': Seq('MADAALSATIQVALQTVVSLAGDHVNLVREFPEELERFNQSAAMIRGFLAGAEK...LWY'), 'XP_027158018.1': Seq('MADAAVSATIKVALQAVVSRAADHREFPEELERLKKSAKIIRGFLAGADEAKYS...PSQ'), 'XP_027158022.1': Seq('MADAAVSATIQVALQAVVSLAADHVNLVREFPTELERLNKSAEMIRGFLAGADE...MKE'), 'XP_027150783.1': Seq('MVEIINEKKSASLRTLFLKGGIADDMLSKFKYLHVLKLFGADAKELPTSIGKLI...YLC'), 'XP_027158106.1': Seq('MADAAVSATIQVALQAVISLAADHVNLAREFPEELEKLDKSAAMIRGFLAGADE...CKH'), 'XP_027150148.1': Seq('MLVDGFPKKMSNLISMRHLHYDDDDTGREIQMPSGIGRLTCLQTLKFFNIGRQE...KSF'), 'XP_027150140.1': Seq('MGGLGKTTLAKAVYKNEQIVGHFDQTMWVCVAEKVDRIEVVFKMILESLIGGRV...ETL'), 'XP_027159923.1': Seq('MADAAVSATIQVGLQTVVSLAADHVNLVREFPTELERLNDSAEMIRGFLAGADE...KID'), 'XP_027158444.1': Seq('MADAAVSATIRVALQTVVS

In [None]:
# Writing sequences to FASTA files
# Records are written in batches of at most `max_entries_per_file` to
# ncbi_proteins_1.fasta, ncbi_proteins_2.fasta, ...
sequence_records = []
max_entries_per_file = 500
entry_count = 0
file_count = 1

for key, value in ncbi_proteins.items():
    # Only real sequences are written. Entries whose fetch failed hold either
    # None or the placeholder list loaded from the input file, and neither can
    # be wrapped in a SeqRecord.
    if not isinstance(value, Seq):
        continue

    # description="" keeps the FASTA header to the bare ID; without it the
    # SeqRecord default "<unknown description>" is appended to every header.
    seq_record = SeqRecord(value, id=key, description="")
    sequence_records.append(seq_record)
    entry_count += 1

    if entry_count == max_entries_per_file:
        # Write to a new file
        sequence_file = f'ncbi_proteins_{file_count}.fasta'
        SeqIO.write(sequence_records, sequence_file, 'fasta')

        # Reset variables for the next file
        sequence_records = []
        entry_count = 0
        file_count += 1

# Write any remaining entries to a new file
if sequence_records:
    sequence_file = f'ncbi_proteins_{file_count}.fasta'
    SeqIO.write(sequence_records, sequence_file, 'fasta')