<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/GeneArt_output-analysis_BamHI_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# ============================================================================
# FASTA PROCESSING AND ANALYSIS SCRIPT FOR GENEART DNA SEQUENCES
# ============================================================================
#
# DESCRIPTION:
#   This script processes DNA sequence files in FASTA format from GeneArt.
#   It recursively finds all .fasta files in a directory structure, compiles
#   them into a single file, analyzes restriction sites and stop codons,
#   and translates the sequences starting at the BamHI site.
#
# FEATURES:
#   - Finds all .fasta files in a directory and subdirectories
#   - Combines all sequences into a single compiled FASTA file
#   - Creates an HTML file with color-coded annotations for:
#     * BamHI sites (GGATCC) - Red background
#     * EcoRI sites (GAATTC) - Green background
#     * HindIII sites (AAGCTT) - Blue background
#     * In-frame stop codons (TAA, TAG, TGA) - Red, underlined text
#     * In frame upstream start codon (ATG)
#   - Translates DNA sequences starting from the BamHI site (including the site)
#   - Outputs translated protein sequences in FASTA format
#
#
# OUTPUT FILES:
#   - compiled_sequences.fasta: All DNA sequences in one file
#   - annotated_sequences.html: Interactive visualizations with highlighted features
#   - translated_sequences.fasta: Protein translations starting at BamHI
#   - procesing log file
#
# USAGE:
#   Run this script in Google Colab with mounted Google Drive access.
#   The script will prompt for the directory containing the FASTA files.
#
# AUTHOR:
#   Created with Claude AI assistant, March 2025
#
# ============================================================================

# First, install required packages
print("Installing required packages...")
try:
    import pip
    pip.main(['install', 'biopython'])
    print("Biopython successfully installed")
except:
    # In case the above method doesn't work in the current Colab environment
    print("Using alternative installation method...")
    !pip install biopython
    print("Installation complete")

# Now import the required packages
import os
import re
import glob
from pathlib import Path
import html
from google.colab import drive

# Import Biopython packages after installation
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Mount Google Drive
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive')
    print("Google Drive mounted at /content/drive")
except:
    print("Google Drive is already mounted or there was an issue mounting it.")

def find_fasta_files(directory):
    """
    Find all FASTA files recursively in the given directory,
    excluding any previously created output directories.
    """
    # Define the output directory name to exclude
    output_dir_name = "GeneArt_analysis_output"

    # Find all FASTA files
    all_files = []
    for root, dirs, files in os.walk(directory):
        # Skip the output directory if found
        if output_dir_name in dirs:
            dirs.remove(output_dir_name)  # Don't traverse into the output directory

        # Add any FASTA files found in the current directory
        for file in files:
            if file.endswith('.fasta'):
                all_files.append(os.path.join(root, file))

    return all_files

def compile_fasta_sequences(input_files, output_file, log_file):
    """Compile all FASTA sequences into a single file with clean headers."""
    all_records = []

    # Create a log file with detailed information
    with open(log_file, "w") as log:
        log.write("# DNA Sequence Processing Log\n")
        log.write("# This file contains detailed information about the sequences processed\n")
        log.write("# The compiled FASTA files have simplified headers for clarity\n\n")

        for file_path in input_files:
            # Get a more friendly sequence ID from the filename
            file_name = os.path.basename(file_path)
            seq_id = os.path.splitext(file_name)[0]

            # Log the detailed information
            log.write(f"Sequence ID: {seq_id}\n")
            log.write(f"Source file: {file_path}\n")

            for i, record in enumerate(SeqIO.parse(file_path, "fasta")):
                # Log original record details
                log.write(f"  Original ID: {record.id}\n")
                log.write(f"  Original description: {record.description}\n")
                log.write(f"  Sequence length: {len(record.seq)} bp\n\n")

                # Create a new record with clean ID
                new_record = SeqRecord(
                    record.seq,
                    id=seq_id,
                    description=""  # Empty description for clean header
                )
                all_records.append(new_record)

    # Write all records to the output file with clean headers
    with open(output_file, "w") as f:
        for record in all_records:
            f.write(f">{record.id}\n")
            # Write sequence with 60 characters per line (standard FASTA format)
            seq_str = str(record.seq)
            for i in range(0, len(seq_str), 60):
                f.write(f"{seq_str[i:i+60]}\n")

    return all_records

def analyze_dna_sequences(fasta_records, output_html):
    """
    Analyze DNA sequences and create an annotated HTML file highlighting:
    - BamHI sites (GGATCC)
    - EcoRI sites (GAATTC)
    - HindIII sites (AAGCTT)
    - Stop codons (TAA, TAG, TGA)
    """
    restriction_sites = {
        "BamHI": "GGATCC",
        "EcoRI": "GAATTC",
        "HindIII": "AAGCTT"
    }

    stop_codons = ["TAA", "TAG", "TGA"]

    with open(output_html, "w") as html_file:
        # Write HTML header
        html_file.write("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DNA Sequence Analysis</title>
    <style>
        body { font-family: monospace; line-height: 1.5; }
        .sequence { margin-bottom: 30px; white-space: pre-wrap; word-wrap: break-word; }
        .BamHI { background-color: #FFCCCC; font-weight: bold; }
        .EcoRI { background-color: #CCFFCC; font-weight: bold; }
        .HindIII { background-color: #CCCCFF; font-weight: bold; }
        .stop { text-decoration: underline; font-weight: bold; color: red; }
        .start { background-color: yellow; color: green; font-weight: bold; }
        h2 { border-bottom: 1px solid #ccc; }
        .legend { margin-bottom: 20px; }
        .legend-item { display: inline-block; margin-right: 15px; }
    </style>
</head>
<body>
    <h1>DNA Sequence Analysis</h1>
    <div class="legend">
        <div class="legend-item"><span class="BamHI">BamHI (GGATCC)</span></div>
        <div class="legend-item"><span class="EcoRI">EcoRI (GAATTC)</span></div>
        <div class="legend-item"><span class="HindIII">HindIII (AAGCTT)</span></div>
        <div class="legend-item"><span class="stop">Stop codons (TAA, TAG, TGA)</span></div>
        <div class="legend-item"><span class="start">Start codon (ATG) in frame with BamHI</span></div>
    </div>
""")

        # Process each sequence
        for record in fasta_records:
            seq_str = str(record.seq).upper()
            html_file.write(f"<h2>{record.id}</h2>\n<div class='sequence'>")

            # Find all sites to annotate
            annotations = []

            # Find restriction sites
            for site_name, site_seq in restriction_sites.items():
                for match in re.finditer(site_seq, seq_str):
                    start, end = match.span()
                    annotations.append((start, end, f"class='{site_name}'"))

            # Find BamHI site for reference
            bamhi_match = re.search("GGATCC", seq_str)
            if bamhi_match:
                bamhi_end = bamhi_match.end()
                bamhi_start = bamhi_match.start()

                # Find in-frame ATG start codons upstream of BamHI
                # Calculate the frame of BamHI site
                frame_offset = bamhi_start % 3

                # Look for ATG codons upstream of BamHI that are in the same frame
                for i in range(bamhi_start - 3, -1, -3):  # Go backwards by codon
                    if i >= 0 and i + 2 < len(seq_str):  # Make sure we have a full codon
                        codon = seq_str[i:i+3]
                        if codon == "ATG" and i % 3 == frame_offset:
                            annotations.append((i, i+3, "class='start'"))

                # Only highlight stop codons in the same reading frame as BamHI
                for i in range(bamhi_end, len(seq_str) - 2, 3):  # Start from BamHI end, increment by 3
                    codon = seq_str[i:i+3]
                    if codon in stop_codons:
                        annotations.append((i, i+3, "class='stop'"))
            else:
                # If no BamHI site, just look for stop codons in all frames
                for i in range(0, len(seq_str) - 2, 3):
                    codon = seq_str[i:i+3]
                    if codon in stop_codons:
                        annotations.append((i, i+3, "class='stop'"))

            # Sort annotations by start position
            annotations.sort(key=lambda x: x[0])

            # Generate HTML with annotations
            last_end = 0
            for start, end, tag in annotations:
                # Add any non-annotated sequence before this annotation
                if start > last_end:
                    html_file.write(html.escape(seq_str[last_end:start]))

                # Add the annotated sequence
                html_file.write(f"<span {tag}>{html.escape(seq_str[start:end])}</span>")
                last_end = end

            # Add any remaining sequence after the last annotation
            if last_end < len(seq_str):
                html_file.write(html.escape(seq_str[last_end:]))

            html_file.write("</div>\n")

        # Write HTML footer
        html_file.write("</body>\n</html>")

def translate_sequences(fasta_records, output_file):
    """
    Translate the DNA sequences starting exactly at the BamHI site (including it).
    Save as FASTA format with clean headers.
    """
    bamhi_site = "GGATCC"
    translated_records = []

    for record in fasta_records:
        seq_str = str(record.seq).upper()
        bamhi_match = re.search(bamhi_site, seq_str)

        if bamhi_match:
            # Start translation from the BamHI site itself (include GGATCC in translation)
            start_pos = bamhi_match.start()

            # Get the coding sequence starting from BamHI site
            coding_seq = seq_str[start_pos:]

            # Translate the sequence
            protein_seq = Seq(coding_seq).translate(to_stop=True)

            # Create a new record for the protein sequence with clean header
            translated_records.append({
                "id": record.id,
                "seq": str(protein_seq)
            })
        else:
            print(f"Warning: No BamHI site found in sequence {record.id}")

    # Write translated sequences with clean headers
    with open(output_file, "w") as f:
        # First write an informational comment (not part of any sequence)
        f.write("# Translated protein sequences starting at BamHI site\n")

        # Write each sequence with clean header
        for record in translated_records:
            f.write(f">{record['id']}\n")
            # Write protein sequence with 60 characters per line
            seq = record['seq']
            for i in range(0, len(seq), 60):
                f.write(f"{seq[i:i+60]}\n")

    return translated_records

# Run the main process
print("\nFASTA Processing Script for Google Colab\n")

# Pre-filled directory path
default_path = "/content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF"
directory = input(f"Enter the path to the directory in Google Drive containing FASTA files (press Enter to use default: {default_path}): ") or default_path

# Find all FASTA files
print(f"\nSearching for FASTA files in: {directory}")
fasta_files = find_fasta_files(directory)
print(f"Found {len(fasta_files)} FASTA files:")
for file in fasta_files[:5]:  # Show first 5 files
    print(f"  - {file}")
if len(fasta_files) > 5:
    print(f"  ... and {len(fasta_files) - 5} more")

if not fasta_files:
    print("No FASTA files found. Please check the directory path and try again.")
else:
    # Create output directory within the input directory
    output_dir_name = "GeneArt_analysis_output"
    output_dir = os.path.join(directory, output_dir_name)
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nCreated output directory: {output_dir}")

    # Define output files
    compiled_fasta = os.path.join(output_dir, "compiled_sequences.fasta")
    processing_log = os.path.join(output_dir, "processing_details.log")
    annotated_html = os.path.join(output_dir, "annotated_sequences.html")
    translated_fasta = os.path.join(output_dir, "translated_sequences.fasta")

    # Compile sequences
    print("\nCompiling sequences with clean headers...")
    fasta_records = compile_fasta_sequences(fasta_files, compiled_fasta, processing_log)
    print(f"Compiled {len(fasta_records)} sequences saved to {compiled_fasta}")
    print(f"Detailed processing information saved to {processing_log}")

    # Create annotated HTML
    print("\nCreating annotated HTML file...")
    analyze_dna_sequences(fasta_records, annotated_html)
    print(f"Annotated sequences saved to {annotated_html}")

    # Translate sequences
    translated_fasta = os.path.join(output_dir, "translated_sequences.fasta")
    print("\nTranslating sequences...")
    translate_sequences(fasta_records, translated_fasta)
    print(f"Translated sequences saved to {translated_fasta}")

    print("\nProcessing complete! All files saved to:", output_dir)
    print("You can access these files directly in your Google Drive.")

Installing required packages...


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Biopython successfully installed
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted at /content/drive

FASTA Processing Script for Google Colab

Enter the path to the directory in Google Drive containing FASTA files (press Enter to use default: /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF): 

Searching for FASTA files in: /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF
Found 21 FASTA files:
  - /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF/110287_mpnn3_model1_607599/110287_mpnn3_model1.fasta
  - /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF/150447_mpnn3_model2_607591/150447_mpnn3_model2.fasta
  - /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF/215645_mpnn10_model2_607588/215645_mpnn10_model2.fasta
  - /content/drive/MyDrive/Fasta-files/GeneArt-summary-2025AAH3KF/21_607613/21.fasta
  - /co

