<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/Fragment-size-calc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
###############################################################################
# Restriction Fragment Analysis Script
#
# Description: This script analyzes DNA sequences in FASTA format. For each
#              sequence it locates the first BamHI site (GGATCC) and the first
#              EcoRI site (GAATTC) downstream of it, and reports the number of
#              bases between the two sites (recognition sites excluded).
#              Results are written in both CSV and Excel formats.
#
# Author: Claude
# Date: April 8, 2025
#
# Usage:
#   1. Run the script in Google Colab
#   2. Upload your FASTA file when prompted
#   3. The script will analyze the sequences and provide downloadable results
#
# Input: FASTA file containing DNA sequences
# Output: CSV and Excel files with sequence IDs and fragment sizes
###############################################################################

import re
import pandas as pd
from google.colab import files
import io

# Function to parse FASTA file
def parse_fasta(fasta_content):
    """Parse FASTA-formatted text into a ``{header: sequence}`` dict.

    Args:
        fasta_content: The full text of a FASTA file.

    Returns:
        dict mapping each header line (text after ``>``, whitespace-stripped)
        to its sequence, with the record's lines stripped and concatenated.
        Records whose header or sequence is empty are left out. If a header
        occurs more than once, the last occurrence's sequence is kept.
    """
    records = {}
    header = ""
    chunks = []

    for raw_line in fasta_content.strip().split('\n'):
        if not raw_line.startswith('>'):
            # Sequence line: collect it for the record currently being read.
            chunks.append(raw_line.strip())
            continue

        # New header: store the record collected so far, then start afresh.
        body = "".join(chunks)
        if header and body:
            records[header] = body
        header = raw_line[1:].strip()  # drop the leading '>'
        chunks = []

    # The final record has no following header to trigger its storage.
    body = "".join(chunks)
    if header and body:
        records[header] = body

    return records

# Function to find restriction sites and calculate fragment size
def find_fragment_size(sequence, re1="GGATCC", re2="GAATTC"):
    """Return the number of bases between the first ``re1`` site and the next ``re2`` site.

    The first occurrence of ``re1`` is located, then the first occurrence of
    ``re2`` that starts at or after the end of that ``re1`` site. The result is
    the count of bases lying strictly between the two recognition sequences
    (neither site is included in the count).

    Matching is case-insensitive, so lower-case or mixed-case sequences are
    handled the same way as upper-case ones.

    Args:
        sequence: DNA sequence to search.
        re1: Recognition sequence of the first site (default: BamHI, GGATCC).
        re2: Recognition sequence of the second site (default: EcoRI, GAATTC).

    Returns:
        int number of bases between the sites, or ``None`` if ``re1`` is not
        found or no ``re2`` site follows it.
    """
    # Normalise case: FASTA files may be lower-case or mixed-case, and a
    # case-sensitive search would miss every site in such sequences.
    sequence = sequence.upper()
    re1 = re1.upper()
    re2 = re2.upper()

    re1_pos = sequence.find(re1)
    if re1_pos == -1:
        return None  # first site not found

    # Search for the second site starting right after the first one; passing
    # the start index to find() yields an absolute position without slicing.
    insert_start = re1_pos + len(re1)
    re2_pos = sequence.find(re2, insert_start)
    if re2_pos == -1:
        return None  # second site not found after the first

    # Distance from the end of the first site to the start of the second.
    return re2_pos - insert_start

# Main function
def process_fasta_file():
    """Analyze uploaded FASTA files and download the fragment-size results.

    Prompts for an upload through the Colab file widget. For every uploaded
    file it parses the FASTA records, computes the fragment size for each
    sequence with ``find_fragment_size`` (default sites), writes the results
    to ``<name>_results.csv`` and ``<name>_results.xlsx`` in the working
    directory, starts a browser download of both files, and prints the results
    table and a short summary.

    Returns:
        None. All results are delivered through files and printed output.
    """
    print("Please upload your FASTA file:")
    uploaded = files.upload()

    for filename, content in uploaded.items():
        # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading
        # byte-order mark. With plain 'utf-8' the BOM would sit in front of
        # the first '>' and the first record would be silently skipped.
        fasta_content = content.decode('utf-8-sig')

        # Parse the FASTA file
        sequences = parse_fasta(fasta_content)

        # Analyze each sequence
        results = []
        for seq_id, sequence in sequences.items():
            results.append({
                'Sequence_ID': seq_id,
                # Slicing already copes with IDs shorter than 7 characters.
                'ID_Prefix': seq_id[:7],
                'Fragment_Size': find_fragment_size(sequence),
            })

        # Fixing the columns keeps the header row even when nothing was parsed.
        df = pd.DataFrame(results, columns=['Sequence_ID', 'ID_Prefix', 'Fragment_Size'])
        # Nullable integer dtype: sizes stay whole numbers (256, not 256.0)
        # even when some sequences have no fragment.
        df['Fragment_Size'] = df['Fragment_Size'].astype('Int64')

        # Strip only the final extension so names containing extra dots
        # (e.g. "my.seqs.fasta") are not truncated at the first dot.
        base_filename = filename.rsplit('.', 1)[0]
        csv_filename = f"{base_filename}_results.csv"
        excel_filename = f"{base_filename}_results.xlsx"

        # Save to CSV
        df.to_csv(csv_filename, index=False)

        # Save to Excel
        df.to_excel(excel_filename, index=False)

        # Download the files
        print("\nDownloading CSV file...")
        files.download(csv_filename)

        print("Downloading Excel file...")
        files.download(excel_filename)

        # Display the results in Colab
        print(f"\nResults for {filename}:")
        print(df)

        # Summary stats, computed over sequences where a fragment was found
        found_sizes = [r['Fragment_Size'] for r in results if r['Fragment_Size'] is not None]
        sequences_with_sites = len(found_sizes)
        print("\nSummary:")
        print(f"Total sequences analyzed: {len(sequences)}")
        print(f"Sequences with both BamHI and EcoRI sites: {sequences_with_sites}")
        if sequences_with_sites > 0:
            avg_size = sum(found_sizes) / sequences_with_sites
            print(f"Average fragment size: {avg_size:.2f} bp")

# Entry point: prompts for a FASTA upload, analyzes each uploaded file, and
# starts downloads of the CSV and Excel results.
process_fasta_file()

Please upload your FASTA file:


Saving 20250408-check-RE.fasta to 20250408-check-RE.fasta

Downloading CSV file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading Excel file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Results for 20250408-check-RE.fasta:
             Sequence_ID ID_Prefix  Fragment_Size
0    110287_mpnn3_model1   110287_            256
1    150447_mpnn3_model2   150447_            358
2   215645_mpnn10_model2   215645_            370
3                     21        21            133
4    325188_mpnn2_model1   325188_            277
5    164418_mpnn2_model1   164418_            439
6                 296_32    296_32            322
7                1010_24   1010_24            439
8    311379_mpnn1_model1   311379_            436
9    164418_mpnn3_model1   164418_            439
10  825176_mpnn10_model2   825176_            346
11  437645_mpnn19_model2   437645_            304
12  871244_mpnn16_model2   871244_            322
13               4106_21   4106_21            133
14   898214_mpnn4_model1   898214_            286
15   333072_mpnn3_model2   333072_            316
16   993298_mpnn1_model2   993298_            391
17   611457_mpnn5_model2   611457_            322
18   451607_

In [7]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): nothing in the cells visible here reads from or writes to
# Drive — confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
