<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/20250129-dickin-about-Seq_analysis_RFdiff_v10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Script: Sequence Analysis Pipeline with Cluster Images
Author: [Your Name]
Date: [Current Date]

Description:
This script performs a sequence analysis pipeline, which includes the following steps:
1. Extracting designed sequences from an input FASTA file.
2. One-hot encoding the extracted sequences.
3. Computing pairwise Hamming distances between the encoded sequences.
4. Creating and saving a phylogenetic tree visualization based on the distance matrix.
5. Generating a similarity heatmap for the sequences.
6. Exporting individual images for each cluster in the dendrogram.

Dependencies:
- Biopython
- NumPy
- Pandas
- Seaborn
- Matplotlib
- Scikit-learn
- SciPy
- Joblib
- Regular Expressions (re)
"""

# Install required packages
!pip install biopython
!pip install numpy
!pip install pandas
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install joblib

import os
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist, squareform
from scipy.cluster import hierarchy
from joblib import Parallel, delayed
import re

def extract_designed_sequences(input_file, output_file):
    """Extract designed sequences from RF_diffusion output and save to a new FASTA file."""
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    extracted_sequences = []
    design_lengths = set()

    for idx, record in enumerate(SeqIO.parse(input_file, "fasta")):
        try:
            match = re.search(r'design:(\d+).*n:(\d+)', record.description)
            if match:
                new_id = f"d{match.group(1)}_n{match.group(2)}"

                if '/' in str(record.seq):
                    designed_seq = str(record.seq).split('/')[1].replace('-', '').strip()
                    design_lengths.add(len(designed_seq))
                    extracted_sequences.append(SeqRecord(Seq(designed_seq), id=new_id, description=""))
                else:
                    print(f"Warning: No '/' found in sequence {record.id}")
            else:
                print(f"Warning: Could not parse sequence {idx}")
        except Exception as e:
            print(f"Error processing sequence {idx}: {e}")

    if not extracted_sequences:
        raise ValueError("No valid sequences found in input file")

    SeqIO.write(extracted_sequences, output_file, "fasta")
    print(f"Extracted {len(extracted_sequences)} sequences, saved to: {output_file}")
    return extracted_sequences

def one_hot_encode(sequences):
    """One-hot encode amino acid sequences."""
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    aa_dict = {aa: i for i, aa in enumerate(amino_acids)}
    encoding = np.zeros((len(sequences), len(sequences[0]), len(amino_acids)))

    for i, seq in enumerate(sequences):
        for j, char in enumerate(seq):
            if char in aa_dict:
                encoding[i, j, aa_dict[char]] = 1
    return encoding.reshape(len(sequences), -1)

def compute_distance_matrix(encoded_seqs):
    """Compute pairwise Hamming distances using parallel processing."""
    return squareform(pdist(encoded_seqs, metric="hamming"))

def create_phylogenetic_tree(distance_matrix, sequence_names, output_dir, num_clusters=3):
    """Create and save a phylogenetic tree visualization, and separate cluster images."""
    os.makedirs(output_dir, exist_ok=True)

    if distance_matrix.shape[0] != len(sequence_names):
        raise ValueError(f"Mismatch: distance matrix size {distance_matrix.shape[0]} "
                         f"does not match sequence names count {len(sequence_names)}")

    condensed_dist = squareform(distance_matrix)
    Z = hierarchy.linkage(condensed_dist, method='average')

    # Create and save the main dendrogram
    plt.figure(figsize=(20, 5))
    dendro = hierarchy.dendrogram(Z, labels=sequence_names, leaf_rotation=90, leaf_font_size=10)
    plt.title("Sequence Similarity Tree")
    plt.xlabel("Sequence ID")
    plt.ylabel("Distance")
    plt.tight_layout()
    tree_plot_path = os.path.join(output_dir, 'sequence_tree.png')
    plt.savefig(tree_plot_path, dpi=300)
    plt.close()
    print(f"Tree visualization saved to: {tree_plot_path}")

    # Define clusters at a certain level
    cluster_labels = hierarchy.fcluster(Z, num_clusters, criterion='maxclust')

    # Create individual cluster visualizations
    for cluster_id in range(1, num_clusters + 1):
        # Identify which sequences belong to this cluster
        cluster_sequences = [sequence_names[i] for i in range(len(sequence_names)) if cluster_labels[i] == cluster_id]

        # Plot the dendrogram for this cluster
        plt.figure(figsize=(20, 5))
        hierarchy.dendrogram(Z, labels=sequence_names, leaf_rotation=90, leaf_font_size=10,
                             color_threshold=0, above_threshold_color='black', below_threshold_color='black')

        # Highlight the cluster in a different color
        for i, label in enumerate(sequence_names):
            if label in cluster_sequences:
                plt.gca().get_children()[i].set_color(f"C{cluster_id}")

        plt.title(f"Cluster {cluster_id}")
        plt.xlabel("Sequence ID")
        plt.ylabel("Distance")
        plt.tight_layout()

        cluster_plot_path = os.path.join(output_dir, f'cluster_{cluster_id}.png')
        plt.savefig(cluster_plot_path, dpi=300)
        plt.close()
        print(f"Cluster {cluster_id} visualization saved to: {cluster_plot_path}")

def analyze_sequences(input_file, output_dir):
    """Run complete sequence analysis pipeline."""
    os.makedirs(output_dir, exist_ok=True)
    extracted_file = os.path.join(output_dir, "extracted_sequences.fasta")
    sequences = extract_designed_sequences(input_file, extracted_file)
    sequence_list = [str(seq.seq) for seq in sequences]
    sequence_names = [seq.id for seq in sequences]

    print("Encoding sequences...")
    encoded_seqs = one_hot_encode(sequence_list)
    print("Computing distance matrix...")
    distance_matrix = compute_distance_matrix(encoded_seqs)

    print("Generating similarity heatmap...")
    plt.figure(figsize=(10, 10))
    sns.heatmap(1 - distance_matrix[:50, :50], cmap="viridis", square=True)
    plt.title("Sequence Similarity Matrix")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "similarity_heatmap.png"), dpi=300)
    plt.close()

    print("Creating phylogenetic tree...")
    create_phylogenetic_tree(distance_matrix, sequence_names, output_dir)

    print("Analysis complete. Results saved in", output_dir)
    return sequences, distance_matrix

if __name__ == "__main__":
    input_file = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta"
    output_dir = "/content/drive/MyDrive/Fasta-files/3NOB_90-110/analysis_output"
    analyze_sequences(input_file, output_dir)




FileNotFoundError: Input file not found: /content/drive/MyDrive/Fasta-files/3NOB_90-110/3NOB_90-110_design.fasta

In [None]:
from google.colab import drive
drive.mount('/content/drive')