<a href="https://colab.research.google.com/github/ebbettin/UCH_SRL/blob/main/Extract_CDS_from_Genomes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
!pip install biopython

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import os
import glob
import io
from datetime import datetime

# === Input files ===
# Folder that is globbed for "*.gb" GenBank files; each genome's
# "<name>_filtered.fasta" output is written back into this same folder.
gb_folder = "/content/"
# Text file listing the target locus tags, one per line (lines are stripped and
# blank lines are ignored when it is read).
tags_path = "/content/locus_tag.txt"

# === Log setup ===
# In-memory buffer that accumulates every logged line so the complete run
# summary can be written to disk in one go at the end.
log_stream = io.StringIO()


def log(msg):
    """Print ``msg`` and append it, newline-terminated, to ``log_stream``.

    ``msg`` must be a ``str``: it is concatenated with a newline before being
    buffered, so any other type raises ``TypeError`` after it has been printed.
    """
    print(msg)
    line = msg + "\n"
    log_stream.write(line)

# === Validate inputs ===
# Explicit exceptions are raised instead of using `assert`, because assert
# statements are stripped when Python runs with -O and the checks would then
# silently disappear.
if not os.path.isfile(tags_path):
    raise FileNotFoundError(f"❌ Locus tag file not found: {tags_path}")

# === Load target locus_tags ===
# One tag per line. Surrounding whitespace and blank lines are dropped, and
# duplicates collapse because the tags are stored in a set.
# "utf-8-sig" transparently removes a leading byte-order mark; left in place it
# would become part of the first tag and that tag could never match.
with open(tags_path, "r", encoding="utf-8-sig") as f:
    target_tags = {line.strip() for line in f if line.strip()}
if not target_tags:
    # Without any tags every genome would just produce an empty FASTA file.
    raise ValueError(f"❌ No locus tags listed in {tags_path}")
log(f"Loaded {len(target_tags)} target locus_tags from {tags_path}")

# === Find all GenBank files ===
# Sorted so genomes are processed (and logged) in a stable order.
gb_files = sorted(glob.glob(os.path.join(gb_folder, "*.gb")))
if not gb_files:
    raise FileNotFoundError(f"❌ No .gb files found in {gb_folder}")
log(f"Found {len(gb_files)} GenBank files to process.")

# === Process each GenBank file ===
# For every genome: collect each CDS feature whose locus_tag contains at least
# one target tag as a substring, write the extracted sequences to
# "<genome>_filtered.fasta" next to the input, then report any target tag that
# matched nothing in that genome.
for gb_path in gb_files:
    gb_filename = os.path.basename(gb_path)
    base_name = os.path.splitext(gb_filename)[0]
    fasta_path = os.path.join(gb_folder, f"{base_name}_filtered.fasta")
    cds_records = []
    found_tags = set()

    for record in SeqIO.parse(gb_path, "genbank"):
        for feature in record.features:
            # Only CDS features that actually carry a locus_tag are candidates.
            if feature.type != "CDS" or "locus_tag" not in feature.qualifiers:
                continue

            locus_tag = feature.qualifiers["locus_tag"][0]
            # Partial match: a target only has to occur somewhere inside the
            # locus_tag, so one target can select several CDSs.
            if not any(tag in locus_tag for tag in target_tags):
                continue

            product = feature.qualifiers.get("product", ["unknown_product"])[0]
            seq = feature.extract(record.seq)
            cds_record = SeqRecord(seq, id=locus_tag, description=product)
            cds_records.append(cds_record)
            found_tags.add(locus_tag)

    SeqIO.write(cds_records, fasta_path, "fasta")

    # Report summary for this file: a target counts as missing when it is not
    # a substring of any locus_tag extracted from this genome.
    missing = {
        target
        for target in target_tags
        if not any(target in hit for hit in found_tags)
    }
    log(f"✅ {base_name}: extracted {len(cds_records)} CDSs → {fasta_path}")
    if missing:
        missing_list = ', '.join(sorted(missing))
        log(f"⚠️  Missing {len(missing)} tags not found in {base_name}: {missing_list}")

log("🎉 Done processing all GenBank files.")

# === Save log file ===
# The summary contains non-ASCII characters (emoji status markers), so the file
# is written as UTF-8 explicitly; relying on the platform's default encoding
# can raise UnicodeEncodeError where that default is not UTF-8.
log_file = "/content/output_summary.txt"
with open(log_file, "w", encoding="utf-8") as f:
    f.write(log_stream.getvalue())
# Logged after the file has been written, so this final line is printed (and
# buffered) but is not part of the saved summary.
log(f"📝 Log saved to: {log_file}")


Loaded 9 target locus_tags from /content/locus_tag.txt
Found 6 GenBank files to process.
✅ 18NCF8220317: extracted 9 CDSs → /content/18NCF8220317_filtered.fasta
✅ 24SNM5151115: extracted 8 CDSs → /content/24SNM5151115_filtered.fasta
⚠️  Missing 1 tags not found in 24SNM5151115: _0697
✅ 32LMM2190317 : extracted 9 CDSs → /content/32LMM2190317 _filtered.fasta
✅ 6RUM2090716: extracted 9 CDSs → /content/6RUM2090716_filtered.fasta
✅ Fribourg-Blanc: extracted 12 CDSs → /content/Fribourg-Blanc_filtered.fasta
✅ LMNP-1: extracted 12 CDSs → /content/LMNP-1_filtered.fasta
🎉 Done processing all GenBank files.
📝 Log saved to: /content/output_summary.txt


In [3]:
import glob
import os
import shutil
import zipfile

from google.colab import files

# === Prepare ZIP ===
# Bundle every "*_filtered.fasta" in /content/ together with the run summary
# into one archive and hand it to the browser for download.
output_zip = "/content/filtered_fastas.zip"
fasta_files = sorted(glob.glob("/content/*_filtered.fasta"))
log_file = "/content/output_summary.txt"

if not fasta_files:
    print("⚠️ No *_filtered.fasta files found in /content/")
else:
    # Files that go into the archive; the log is optional so a missing summary
    # no longer aborts the download of the FASTA files.
    members = list(fasta_files)
    has_log = os.path.isfile(log_file)
    if has_log:
        members.append(log_file)
    else:
        print(f"⚠️ Log file not found, ZIP will contain FASTAs only: {log_file}")

    # Write the archive directly instead of copying into a reusable staging
    # folder: a staging folder that is never emptied lets files from earlier
    # runs leak into later archives. Opening with "w" replaces any old ZIP.
    with zipfile.ZipFile(output_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for path in members:
            # Store each file flat at the archive root under its own name.
            archive.write(path, arcname=os.path.basename(path))

    if has_log:
        print(f"✅ Created ZIP file with FASTAs + log: {output_zip}")
    else:
        print(f"✅ Created ZIP file with FASTAs: {output_zip}")

    # Download
    files.download(output_zip)


✅ Created ZIP file with FASTAs + log: /content/filtered_fastas.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>