<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/WIP-RFdiff_MSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
RF_Diffusion Sequence Analysis Pipeline
Made for Colabsheet outputs from RF_Diffusion 1.1.1

Author: Claude AI (Anthropic)
-------------------------------------
Created by: Claude AI (Anthropic) with input from user
Version: 1.0
Date: January 26, 2025

This script analyzes sequences from RF_diffusion output files (.fasta format).
It processes sequences that are in the format:
>design:0 n:0|mpnn:1.589|plddt:0.460|i_ptm:0.173|i_pae:20.678|rmsd:8.341 SEQUENCE1/SEQUENCE2

Features:
- Extracts, from each record, the chain that follows the first '/' delimiter
- Performs multiple sequence alignment using MUSCLE
- Generates phylogenetic trees using UPGMA method
- Calculates sequence conservation scores
- Creates position-specific scoring matrices (PSSM)
- Produces publication-quality visualizations:
    * Conservation plot (conservation_plot.png)
    * Phylogenetic tree (phylogenetic_tree.png)
    * PSSM heatmap (pssm_heatmap.png)

Usage:
1. Upload this script to Google Colab
2. Modify the fasta_path variable to point to your input file
3. Run the entire script

Requirements:
- Google Colab environment
- Input .fasta file from RF_diffusion
- Access to Google Drive (automatically mounted)

Output:
All files are saved in the same directory as the input file:
- conservation_plot.png: Shows conservation across sequence positions
- phylogenetic_tree.png: Visualizes sequence relationships
- pssm_heatmap.png: Shows amino acid frequencies at each position
- aligned.fasta: Multiple sequence alignment output

Credits:
Primary Developer: Claude AI (Anthropic)
Contributing developer: Dr Eoin Leen, University of Leeds.

License:
This code is provided "as is" for research and educational purposes.
"""


# Required installations
import sys
!{sys.executable} -m pip install bio
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install seaborn
!apt-get install muscle
!apt-get install -y hmmer

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import subprocess
import tempfile
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from Bio import SeqIO, AlignIO, Phylo
from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

class RFDiffusionAnalyzer:
    """Sequence-analysis pipeline for an RF_diffusion FASTA output file.

    The pipeline extracts the chain that follows the first '/' in every
    record, aligns those chains with MUSCLE, and derives per-column
    conservation scores, a UPGMA tree and a residue-frequency matrix from
    the alignment.  The alignment and all plots are written to the
    directory that contains the input file.

    Attributes are filled in lazily by the corresponding methods:
    ``sequences``, ``alignment``, ``tree``, ``conservation_scores``, ``pssm``.
    """

    # Row order of the PSSM and of the heatmap's y-axis labels.
    AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

    def __init__(self, fasta_path):
        """Remember the input path and derive the output directory from it.

        Args:
            fasta_path: Path to the RF_diffusion ``.fasta`` file.
        """
        self.fasta_path = fasta_path
        self.output_dir = os.path.dirname(os.path.abspath(fasta_path))
        self.sequences = []
        self.alignment = None
        self.tree = None
        self.conservation_scores = None
        self.pssm = None

        print(f"Analysis will use file: {self.fasta_path}")
        print(f"Outputs will be saved to: {self.output_dir}")

    def extract_sequences(self):
        """Extract the chain after the first '/' from every input record.

        Records without a '/' (or with nothing after it) are skipped.  Kept
        records are renamed ``design_<k>`` in order of appearance; the
        original header is preserved as the description.

        Returns:
            The list of extracted ``SeqRecord`` objects (also stored on
            ``self.sequences``).
        """
        sequences = []
        for record in SeqIO.parse(self.fasta_path, "fasta"):
            chains = str(record.seq).split('/')
            if len(chains) < 2:
                continue
            chain = chains[1].strip()
            if not chain:
                # An empty chain cannot be aligned; drop it instead of
                # handing MUSCLE a zero-length sequence.
                continue
            sequences.append(
                SeqRecord(
                    Seq(chain),
                    id=f"design_{len(sequences)}",
                    description=record.description,
                )
            )
        self.sequences = sequences
        return sequences

    def create_alignment(self):
        """Align ``self.sequences`` with MUSCLE and load the result.

        The alignment is written to ``aligned.fasta`` inside
        ``self.output_dir``; the unaligned input only lives in a temporary
        directory.

        Returns:
            The alignment read back with ``AlignIO`` (also stored on
            ``self.alignment``).

        Raises:
            ValueError: If there are no sequences to align.
            RuntimeError: If MUSCLE is not installed or exits with an error.
        """
        if not self.sequences:
            raise ValueError(
                "No sequences to align; run extract_sequences() first and "
                "check that the input records contain a '/' separator."
            )

        aligned_path = os.path.join(self.output_dir, "aligned.fasta")
        with tempfile.TemporaryDirectory() as tmp_dir:
            unaligned_path = os.path.join(tmp_dir, "temp_sequences.fasta")
            SeqIO.write(self.sequences, unaligned_path, "fasta")
            try:
                # Same -in/-out flags as the original shell call (MUSCLE v3
                # syntax).  check=True makes a failed run raise instead of
                # silently leaving a stale aligned.fasta to be read below.
                subprocess.run(
                    ["muscle", "-in", unaligned_path, "-out", aligned_path],
                    check=True,
                    capture_output=True,
                    text=True,
                )
            except FileNotFoundError as err:
                raise RuntimeError(
                    "MUSCLE executable not found; install it before aligning."
                ) from err
            except subprocess.CalledProcessError as err:
                raise RuntimeError(
                    f"MUSCLE failed (exit code {err.returncode}): "
                    f"{(err.stderr or '').strip()}"
                ) from err

        self.alignment = AlignIO.read(aligned_path, "fasta")
        return self.alignment

    def calculate_conservation(self):
        """Calculate a conservation score for each alignment column.

        The score is the fraction of sequences sharing the column's most
        common character.  Gap characters are counted like any other
        character.

        Returns:
            A list of floats, one per column (also stored on
            ``self.conservation_scores``).
        """
        if self.alignment is None:
            self.create_alignment()

        n_sequences = len(self.alignment)
        conservation_scores = []
        for i in range(self.alignment.get_alignment_length()):
            column = self.alignment[:, i]
            most_common = max(column.count(symbol) for symbol in set(column))
            conservation_scores.append(most_common / n_sequences)

        self.conservation_scores = conservation_scores
        return conservation_scores

    def generate_tree(self):
        """Build a UPGMA tree from identity distances over the alignment.

        Returns:
            The constructed tree (also stored on ``self.tree``).
        """
        if self.alignment is None:
            self.create_alignment()

        # The constructor computes the distance matrix itself from the
        # calculator, so no separate get_distance() call is needed.
        calculator = DistanceCalculator('identity')
        constructor = DistanceTreeConstructor(calculator, 'upgma')
        self.tree = constructor.build_tree(self.alignment)
        return self.tree

    def calculate_pssm(self):
        """Calculate per-column residue frequencies.

        Returns:
            A ``(20, alignment_length)`` array whose rows follow
            ``AMINO_ACIDS``; each entry is the fraction of sequences carrying
            that residue in that column (also stored on ``self.pssm``).
        """
        if self.alignment is None:
            self.create_alignment()

        n_sequences = len(self.alignment)
        length = self.alignment.get_alignment_length()
        pssm = np.zeros((len(self.AMINO_ACIDS), length))

        for i in range(length):
            column = self.alignment[:, i]
            for j, aa in enumerate(self.AMINO_ACIDS):
                pssm[j, i] = column.count(aa) / n_sequences

        self.pssm = pssm
        return pssm

    def plot_conservation(self):
        """Plot conservation scores and save them as conservation_plot.png."""
        if self.conservation_scores is None:
            self.calculate_conservation()

        fig, ax = plt.subplots(figsize=(15, 5))
        ax.plot(self.conservation_scores)
        ax.set_title('Sequence Conservation by Position')
        ax.set_xlabel('Position')
        ax.set_ylabel('Conservation Score')
        fig.savefig(os.path.join(self.output_dir, 'conservation_plot.png'))
        plt.show()
        plt.close(fig)

    def plot_tree(self):
        """Plot the tree and save it as phylogenetic_tree.png."""
        if self.tree is None:
            self.generate_tree()

        fig, ax = plt.subplots(figsize=(20, 20))
        # Draw onto our own axes and suppress Phylo's internal plt.show();
        # otherwise the tree lands on a separate figure that is already
        # shown before savefig runs, and the saved PNG is not the tree.
        Phylo.draw(self.tree, axes=ax, do_show=False)
        ax.set_title('Phylogenetic Tree of Designs')
        fig.savefig(os.path.join(self.output_dir, 'phylogenetic_tree.png'))
        plt.show()
        plt.close(fig)

    def plot_pssm_heatmap(self):
        """Plot the residue-frequency matrix and save it as pssm_heatmap.png."""
        if self.pssm is None:
            self.calculate_pssm()

        fig, ax = plt.subplots(figsize=(20, 10))
        sns.heatmap(self.pssm,
                    yticklabels=list(self.AMINO_ACIDS),
                    cmap='YlOrRd',
                    ax=ax)
        ax.set_title('Position-Specific Scoring Matrix')
        ax.set_xlabel('Position')
        ax.set_ylabel('Amino Acid')
        fig.savefig(os.path.join(self.output_dir, 'pssm_heatmap.png'))
        plt.show()
        plt.close(fig)

    def run_complete_analysis(self):
        """Run every analysis step in order and generate all plots."""
        print("Analysis starting...")
        print("Output will be saved to:", self.output_dir)

        print("\n1. Extracting sequences...")
        self.extract_sequences()

        print("2. Creating alignment...")
        self.create_alignment()

        print("3. Calculating conservation...")
        self.calculate_conservation()

        print("4. Generating tree...")
        self.generate_tree()

        print("5. Calculating PSSM...")
        self.calculate_pssm()

        print("\n6. Generating and saving plots...")
        self.plot_conservation()
        self.plot_tree()
        self.plot_pssm_heatmap()

        print("\nAnalysis complete! Files saved in:", self.output_dir)

def run_analysis(fasta_path):
    """Build an analyzer for *fasta_path* and run the full pipeline.

    Any exception raised along the way is reported on stdout rather than
    propagated.

    Returns:
        The analyzer instance on success, or ``None`` if an error occurred.
    """
    try:
        pipeline = RFDiffusionAnalyzer(fasta_path)
        pipeline.run_complete_analysis()
    except Exception as exc:
        print(f"Error occurred: {str(exc)}")
        return None
    else:
        return pipeline

# Specify the full path to your RF_diffusion .fasta file.
# The plots are written to the directory that contains this file.
fasta_path = "/content/drive/MyDrive/path/to/your/design.fasta"  # Modify this path

# Run analysis. `analyzer` is the populated RFDiffusionAnalyzer on success,
# or None if any step raised (run_analysis prints the error message).
analyzer = run_analysis(fasta_path)