In [43]:
from Bio import Entrez
from Bio import SeqIO
import io
from io import StringIO
import json
import pandas as pd
import logging
from datetime import datetime


In [3]:
# set up logging file
# Step 1: Configure the logging
logging.basicConfig(filename='../logs/tx_pull.log',  # Log file path
                    filemode='a',  # Append mode (use 'w' for overwrite mode)
                    level=logging.INFO,  # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
                    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
                    datefmt='%Y-%m-%d %H:%M:%S')  # Date format
# Step 2: Write log messages
# Generate a timestamp
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logging.info(f"Starting transcript pull: {timestamp}")  # Example of logging an informational message

In [16]:
## Set some parameters

Entrez.email = 'deanna.church@gmail.com' #let them know who I am
%config ZMQInteractiveShell.iopub_data_rate_limit=100000000  #throttle requests


In [44]:
def download_entrez_record(organism_ids, rettype="gbwithparts", retmode="text"):
    """Downloads a full Entrez record using the specified ID and parameters.

    Args:
        organism_id: NCBI database ID of the record to download.
        rettype: Entrez return type (e.g., "gbwithparts").
        retmode: Entrez return mode (e.g., "text").

    Returns:
        The downloaded record as a string, or None if an error occurs.
    """
    records_length = {}
    try:
        ids = ",".join(str(id) for id in organism_ids)
        handle = Entrez.efetch(db="nucleotide", id=ids, rettype=rettype, retmode=retmode)
        raw_records = handle.read()
        handle.close()
        
        # Parse the raw records using SeqIO
        for record in SeqIO.parse(StringIO(raw_records), "genbank"):
            records_length[record.id] = len(record.seq)
        
        return records_length
    except Exception as e:
        print(f"Error downloading or parsing records: {e}")
        return None

In [6]:
def chunker(seq, size):
    """Yield successive chunks from seq."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

## testing

In [45]:
test_id=['NM_130786.3, NM_000014.4']
raw_records = download_entrez_record(test_id)
print(raw_records)
#handle=Entrez.efetch(db='nucleotide', id='NM_130786.3,NM_000014.4', rettype="gbwithparts", retmode="text")

#raw_records=handle.read().strip().split('\n\n')
#handle.close()

    


{'NM_130786.3': 1766, 'NM_000014.4': 4678}


# Processing list

In [46]:
tx_df=pd.read_csv('../data/processed/tx_list.txt', header=None)
display(tx_df.head())

Unnamed: 0,0
0,NM_020963.4
1,NM_148978.2
2,NM_001270891.1
3,NM_025108.2
4,NM_173847.3


In [47]:
tx_list=tx_df.iloc[:, 0].tolist()
print(tx_list)

['NM_020963.4', 'NM_148978.2', 'NM_001270891.1', 'NM_025108.2', 'NM_173847.3', 'NM_020633.3', 'NM_001303516.1', 'NM_178498.3', 'NM_003446.3', 'NM_000171.3', 'NM_001099687.3', 'NM_213613.3', 'NM_001127321.1', 'NM_001671.4', 'NM_016433.3', 'NM_020957.3', 'NM_004957.5', 'NM_005391.4', 'NM_153443.4', 'NM_198565.2', 'NM_183337.2', 'NM_138364.3', 'NM_173854.5', 'NM_003585.4', 'NM_024831.6', 'NM_018423.2', 'NM_139074.3', 'NM_001040032.1', 'NM_001037735.3', 'NM_024578.2', 'NM_001192.2', 'NM_002076.3', 'NM_138638.4', 'NM_138960.3', 'NM_001009184.1', 'NM_016144.2', 'NM_005516.5', 'NM_006198.2', 'NM_018373.2', 'NM_001301782.1', 'NM_207320.2', 'NM_182972.2', 'NM_001190972.1', 'NM_018659.2', 'NM_012435.2', 'NM_024665.4', 'NM_002719.3', 'NM_022006.1', 'NM_001300953.1', 'NM_153813.2', 'NM_015515.4', 'NM_001289119.1', 'NM_015140.3', 'NM_002044.3', 'NM_003719.3', 'NM_001003794.2', 'NM_152520.4', 'NM_024632.5', 'NM_000355.3', 'NM_173625.3', 'NM_004824.3', 'NM_001252402.2', 'NM_001457.3', 'NM_054027.4', 

In [48]:
tx_info = {}
total_tx = len(tx_list)
processed = 0

for i, tx_chunk in enumerate(chunker(tx_list, 1000), start=1):
    records = download_entrez_record(tx_chunk)  # This function returns a dict {id: len(seq)}
    tx_info.update(records)  # Update tx_info with the returned records
    processed += len(tx_chunk)
    remaining = total_tx - processed
    print(f"Processed block {i}, {remaining} items remain")

print(tx_info)

Processed block 1, 18115 items remain
Processed block 2, 17115 items remain
Processed block 3, 16115 items remain
Processed block 4, 15115 items remain
Processed block 5, 14115 items remain
Processed block 6, 13115 items remain
Processed block 7, 12115 items remain
Processed block 8, 11115 items remain
Processed block 9, 10115 items remain
Processed block 10, 9115 items remain
Processed block 11, 8115 items remain
Processed block 12, 7115 items remain
Processed block 13, 6115 items remain
Processed block 14, 5115 items remain
Processed block 15, 4115 items remain
Processed block 16, 3115 items remain
Processed block 17, 2115 items remain
Processed block 18, 1115 items remain
Processed block 19, 115 items remain
Processed block 20, 0 items remain
{'NM_020963.4': 3767, 'NM_148978.2': 6198, 'NM_001270891.1': 796, 'NM_025108.2': 1650, 'NM_173847.3': 819, 'NM_020633.3': 1566, 'NM_001303516.1': 3617, 'NM_178498.3': 6253, 'NM_003446.3': 1695, 'NM_000171.3': 1811, 'NM_001099687.3': 1449, 'NM_2

In [49]:
print(len(tx_info))

19114


In [50]:
#write to disc
with open('../data/processed/tx_info.json', 'w') as json_file:
    json.dump(tx_info, json_file)

In [53]:
with open('../data/processed/tx_info.json', 'r') as file:
    tx_dict = json.load(file)

In [55]:
missing_transcripts = [tx for tx in tx_list if tx not in tx_dict]
print(missing_transcripts)


[nan]


Spot checking a couple to make sure length is correct.

In [56]:
print(tx_dict['NM_153045.3'])

4413


In [57]:
print(tx_dict['NM_173847.3'])

819


Things look pretty good!