In [None]:
## Import libraries
import pandas as pd
import numpy as np
import io
import os
import matplotlib.pyplot as plt
import re

In [None]:
## Set up NCBI Entrez
from Bio import Entrez, SeqIO

# Set email (required for NCBI access).
# The address can be overridden without editing the notebook by setting the
# ENTREZ_EMAIL environment variable; when it is unset or empty, the original
# address is used so existing runs behave exactly as before.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or "James.Chang@bcm.edu"

In [None]:
## Load files
# Locations of the two input files (absolute paths)
wastewater_table_path = "/home/azureuser/cloudfiles/code/Users/jc62/projects/llm_combined/data/big_merge_dec_2024.csv"
genbank_records_path = "/home/azureuser/cloudfiles/code/Users/jc62/projects/llm_combined/data/big_merge_entrez_records.gb"

# Wastewater table: the file is tab-delimited even though it carries a .csv extension
big_merge = pd.read_csv(wastewater_table_path, delimiter="\t")

# GenBank entries, materialised into a list so individual records can be indexed by position
record = [entry for entry in SeqIO.parse(genbank_records_path, "genbank")]

In [None]:
## Function to extract translation and metadata from GenBank records
def pull_translation_and_meta(record, index):
    """Extract record-level metadata and CDS translations from one GenBank entry.

    Parameters
    ----------
    record : sequence
        Indexable collection of parsed GenBank entries. Each entry must expose
        an ``annotations`` mapping and a ``features`` iterable whose items have
        a ``type`` string and a ``qualifiers`` mapping.
    index : int
        Position in ``record`` of the entry to extract.

    Returns
    -------
    extracted_annotations_per_accession : list of tuple
        A single ``(accession, organism, taxonomy)`` tuple. Missing values fall
        back to ``"Unknown"`` (``["Unknown"]`` for the taxonomy list).
    extracted_cds_per_accession : list of tuple
        One ``(gene, product, protein_id, translation)`` tuple per CDS feature,
        in feature order. Missing qualifiers fall back to ``"Unknown"`` /
        ``"No translation available"``.
    combined_record : pandas.DataFrame
        Cross join of the two lists with columns ``Gene ID``, ``Organism``,
        ``Taxonomy``, ``Gene``, ``Product``, ``Protein ID``, ``Translation``.
        It has one row per CDS feature and is therefore empty when the entry
        has no CDS features.
    """
    entry = record[index]
    annotations = entry.annotations

    # Gene annotation extraction.
    # `or` (rather than a .get default) also covers a present-but-empty
    # accession list, which would otherwise raise IndexError on [0].
    accessions = annotations.get("accessions") or ["Unknown"]
    # The organism annotation is a plain string, so its fallback is a string
    # too; a list default would put mixed types into the "Organism" column.
    organism = annotations.get("organism", "Unknown")
    taxonomy = annotations.get("taxonomy", ["Unknown"])
    extracted_annotations_per_accession = [(accessions[0], organism, taxonomy)]

    # Protein feature extraction: qualifier values are lists, so take the
    # first element of each.
    extracted_cds_per_accession = [
        (
            feature.qualifiers.get("gene", ["Unknown"])[0],
            feature.qualifiers.get("product", ["Unknown"])[0],
            feature.qualifiers.get("protein_id", ["Unknown"])[0],
            feature.qualifiers.get("translation", ["No translation available"])[0],
        )
        for feature in entry.features
        if feature.type == "CDS"
    ]

    # Combine extracted annotations and CDS data into a DataFrame; the cross
    # join repeats the single metadata row next to every CDS row.
    annotations_df = pd.DataFrame(
        extracted_annotations_per_accession,
        columns=["Gene ID", "Organism", "Taxonomy"],
    )
    cds_df = pd.DataFrame(
        extracted_cds_per_accession,
        columns=["Gene", "Product", "Protein ID", "Translation"],
    )
    combined_record = annotations_df.merge(cds_df, how="cross")

    return extracted_annotations_per_accession, extracted_cds_per_accession, combined_record

In [None]:
## Extract all translations and metadata for all records
# Visit every GenBank entry by position and keep only the third return value
# (the per-record combined DataFrame); the raw tuples are not needed here.
combined_record_sum = [
    pull_translation_and_meta(record, position)[2]
    for position in range(len(record))
]

# Stack the per-record frames into a single table with a fresh 0..n-1 index
combined_record_df = pd.concat(combined_record_sum, ignore_index=True)

In [None]:
## Example usage
# Pull the first 10 records for demonstration.
# .assign() builds a new DataFrame instead of writing a column onto a slice of
# big_merge, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
# The join key is the accession with its trailing ".<version>" suffix dropped,
# so it matches the unversioned accessions taken from the GenBank records.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
big_merge_10 = big_merge.head(10).assign(
    key=lambda frame: frame["accession"].str.rsplit(".", n=1).str[0]
)

In [None]:
# Give the GenBank accession column the same name ("key") as the join column
# on the wastewater side so the two tables can be merged on it
combined_record_df = combined_record_df.rename(columns={"Gene ID": "key"})

# Left join: every wastewater row is kept, and rows without a matching GenBank
# accession get missing values in the GenBank-derived columns
big_merge_10_translate = pd.merge(
    big_merge_10,
    combined_record_df,
    how="left",
    on="key",
)