In [19]:
import re
from Bio import Entrez
import time
import pandas as pd

Entrez.email = 'emma.tysinger@gmail.com'

def extract_protein_names(file_path):
    # Regular expression pattern to match the lines with protein names
    pattern = r"No ids found for (\w+)"

    # List to store extracted protein names
    protein_names = []

    # Open and read the file
    with open(file_path, 'r') as file:
        for line in file:
            # Use regular expression to find matches
            match = re.search(pattern, line)
            if match:
                # Append the found protein name to the list
                protein_names.append(match.group(1))

    return protein_names

# Replace 'missing_retrieval.txt' with the path to your text file
file_path = 'missing_retrieval.txt'
protein_names = extract_protein_names(file_path)

# Print extracted protein names
print(len(protein_names))
print(len(set(protein_names)))


5846
5551


In [7]:
print(protein_names[:20])

['FAM153CP', 'MT2P1', 'SNORD55', 'AATBC', 'LINC00908', 'MIR130A', 'MIR107', 'MIR219A2', 'MIRLET7I', 'MIR148A', 'MIR30B', 'MIR942', 'MIR370', 'MIR494', 'MIR539', 'MIR33B', 'MIR10B', 'MIR200B', 'MIRLET7G', 'MIR149']


In [31]:
outfile = 'gene_sequences_7.fasta'
gene_id_dict = {}
gene_id_filename = 'gene_id_dict_7.csv'
non_sequences = []
with open(outfile, 'w') as outfile:
    for i, prot in enumerate(protein_names[1400:]): 
        if i%100 == 0:
            print(f'Done with {i} of {len(protein_names[1400:])} sequences')
        try:
            handle = Entrez.esearch(db="nuccore", retmax=10, term=prot, idtype='acc')
            record = Entrez.read(handle)
            handle.close()
        except:
            non_sequences.append(prot)
            continue
        try:
            id = record['IdList'][0]
        except:
            non_sequences.append(prot)
            continue

        try:
            summary_result = Entrez.esummary(db="nuccore", id=record['IdList'][0])
            summary = Entrez.read(summary_result, validate=False)
        except:
            non_sequences.append(prot)
            continue 

        if int(summary[0]['Length']) < 1000:
            handle = Entrez.efetch(db="nuccore", id=id, rettype="fasta", retmode="text")
            gene_data = handle.read()
            gene_id_dict[id] = prot
            outfile.write(gene_data)
            pd.DataFrame(list(gene_id_dict.items()), columns=['gene_id', 'sequence']).to_csv(gene_id_filename, index=False)
        else:
            non_sequences.append(prot)
        time.sleep(1.5)

file_name = 'non_seuqnces.txt'

# Write the list to the file
with open(file_name, 'w') as file:
    for item in non_sequences:
        file.write(item + '\n')

Done with 0 of 4446 sequences
Done with 100 of 4446 sequences
Done with 200 of 4446 sequences
Done with 300 of 4446 sequences
Done with 400 of 4446 sequences
Done with 500 of 4446 sequences
Done with 600 of 4446 sequences
Done with 700 of 4446 sequences
Done with 800 of 4446 sequences
Done with 900 of 4446 sequences
Done with 1000 of 4446 sequences
Done with 1100 of 4446 sequences
Done with 1200 of 4446 sequences
Done with 1300 of 4446 sequences
Done with 1400 of 4446 sequences
Done with 1500 of 4446 sequences
Done with 1600 of 4446 sequences
Done with 1700 of 4446 sequences
Done with 1800 of 4446 sequences
Done with 1900 of 4446 sequences
Done with 2000 of 4446 sequences
Done with 2100 of 4446 sequences
Done with 2200 of 4446 sequences
Done with 2300 of 4446 sequences
Done with 2400 of 4446 sequences
Done with 2500 of 4446 sequences
Done with 2600 of 4446 sequences
Done with 2700 of 4446 sequences
Done with 2800 of 4446 sequences
Done with 2900 of 4446 sequences
Done with 3000 of 4446

In [29]:
summary_result = Entrez.esummary(db="nuccore", id='NG_009780.1')
summary = Entrez.read(summary_result, validate=False)
int(summary[0]['Length'])

21934

In [27]:
gene_data

'>NG_009780.1 Homo sapiens dyskerin pseudouridine synthase 1 (DKC1), RefSeqGene (LRG_55) on chromosome X\nTCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCCAACGTGGCAG\nTTTTAAACCAGGACTCCAGAATTCTTAACACTCCTCCCACTGAGAGGTGGGGTTAATGTTCCCTCCCCCT\nGAACCTGAGCAGGCTTGTGGCTGCTTCAACCATAGACTGCAACAAAAATGGCATGATGAGACTTCCAAGG\nCCGAGTCATAAAAGGTGATGCAGCTTTTTGCCTTTTTCACTCTAACCCTTGCTTTGGAGGTCCTGAGCCA\nCCAAGTACAGACTCCAGCTAGTCTAAGGCTGCCACGCCATAAAGAGAGGCTGTGTGTAGTAGGTGCTCAA\nGTTCCAGTGGAACCCAGCCTTCCAGCCATCCTAGTATAGGCAACAGGCATGTTAGTGAAGAAGCTTCCAG\nAAGATTATAGCCTCCAGCCATCCTGTCACCCCTATCCATCAAGTCATCTCAGCCAAGACCTCAGACATTA\nCAGAGCAAAGAGAAACCATTCTCACTGTGTCCTGTCTTGATTTCTCACCTGAAGAACCTGTCAGCACAAT\nGAAATGGCTGTTTAATGGCACTAAGTTTGGGGATGGTTTGTTATGCAGCAGTAGAAACCTAAACTGTGGA\nCATTAAACGACTGTGGATTACTGAAAGCATAATCAGGTGGTTATTTCAAGTGCTGCTACGGTTACAGGTG\nTGATCTATTTGCTAAAACAATTCCATACCTCTTCCAACATTCTGTAGGCAGCCGTTGATCTGGCCAATGA\nTAAAATGGCCTGTTGAAGATTCTGCTGTGTTCCTAGTTGGGACACAACACTCTTGTGAGGATCTGGACCA\nTACGCTGAACTAGAAACCATTAAATGGTGC

In [11]:
print(record['IdList'][0])
summary_result = Entrez.esummary(db="nuccore", id=record['IdList'][0])
summary = Entrez.read(summary_result, validate=False)
print(summary)

# Extract start and end positions
start = int(summary[0]['GenomicInfo'][0]['ChrStart'])
end = int(summary[0]['GenomicInfo'][0]['ChrStop'])

result = end - start + 1
print(result)


NC_000005.10
[{'Item': [], 'Id': '568815593', 'Caption': 'NC_000005', 'Title': 'Homo sapiens chromosome 5, GRCh38.p14 Primary Assembly', 'Extra': 'gi|568815593|gnl|ASM:GCF_000001305|5|ref|NC_000005.10||gpp|GPC_000001297.1||gnl|NCBI_GENOMES|5[568815593]', 'Gi': IntegerElement(568815593, attributes={}), 'CreateDate': '2002/08/22', 'UpdateDate': '2023/10/07', 'Flags': IntegerElement(544, attributes={}), 'TaxId': IntegerElement(9606, attributes={}), 'Length': IntegerElement(181538259, attributes={}), 'Status': 'live', 'ReplacedBy': '', 'Comment': '  ', 'AccessionVersion': 'NC_000005.10'}]


KeyError: 'GenomicInfo'

In [15]:
int(summary[0]['Length'])

181538259