<a href="https://colab.research.google.com/github/ebbettin/UCH_SRL/blob/main/Proteoform_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Proteoform Analysis**

# **Step 1: Install Required Libraries**

This section installs the necessary libraries for the analysis: Biopython, pandas, openpyxl, MAFFT, and xlsxwriter.

In [None]:
!pip install biopython pandas openpyxl

In [None]:
!apt-get install mafft

In [None]:
!pip install xlsxwriter

# **Step 2: Sequence Alignment**

This section will generate alignment files.
Sequences with an in-frame stop codon before the final codon are excluded from the alignment, and their headers are recorded in a text file.

Set `input_dir` to the folder containing your FASTA files.
Set `output_dir` and `output_txt_path` to the locations where the output files should be saved.

In [None]:
import os
from Bio import SeqIO
from Bio.Align.Applications import MafftCommandline

# Replace input_dir and output_dir as required.
input_dir = "/content/seq"
output_dir = "/content/aln"

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Define the path for the text file listing the excluded sequences
output_txt_path = "/content/excluded_sequences.txt"

# Scratch FASTA file handed to MAFFT; it is rewritten for every input file
temp_input_path = "/content/temp.fasta"

# DNA stop codons
stop_codons_dna = {'TAA', 'TAG', 'TGA'}


def _has_internal_stop_codon(sequence):
    """Return True if a stop codon occurs in frame before the end of `sequence`.

    Triplets are read from position 0 in steps of three, and only triplets that
    end before the final character are examined, so a terminal stop codon is
    allowed. The comparison is done on an upper-cased plain string so that
    lowercase sequences are checked too.
    """
    nucleotides = str(sequence).upper()
    return any(
        nucleotides[i:i + 3] in stop_codons_dna
        for i in range(0, len(nucleotides) - 3, 3)
    )


# Open the text file for writing
with open(output_txt_path, "w") as output_txt_file:

    # Iterate over all files in the input directory
    for file in os.listdir(input_dir):
        # Only fasta files are processed
        if not file.endswith(".fasta"):
            continue

        input_path = os.path.join(input_dir, file)

        # Split the records into those kept for alignment and those excluded
        sequences = []
        excluded_sequences = []
        for record in SeqIO.parse(input_path, "fasta"):
            if _has_internal_stop_codon(record.seq):
                excluded_sequences.append(record.id)
            else:
                sequences.append(record)

        # Report the excluded sequences on the console and in the text file
        if excluded_sequences:
            print(f"Excluded sequences with a stop codon before the last position in {file}: {', '.join(excluded_sequences)}")
            output_txt_file.write(f"Excluded sequences in {file}: {', '.join(excluded_sequences)}\n")

        # Nothing to align if every sequence was excluded (or the file was empty)
        num_sequences = len(sequences)
        if num_sequences == 0:
            continue

        # Write the retained sequences to the scratch file used as MAFFT input
        with open(temp_input_path, "w") as temp_input_file:
            SeqIO.write(sequences, temp_input_file, "fasta")

        # Keep the full input stem (only the final extension is dropped) so the
        # alignment file name always matches its input file, even when the
        # name contains additional dots.
        output_filename = f"{os.path.splitext(file)[0]}.fasta"
        output_path = os.path.join(output_dir, output_filename)

        # Run MAFFT with --auto and --quiet
        # NOTE(review): Bio.Align.Applications is deprecated in recent Biopython
        # releases; confirm it is available in the installed version.
        mafft_cline = MafftCommandline(input=temp_input_path, auto=True, quiet=True)
        stdout, _ = mafft_cline()

        # Write the aligned sequences to the output file
        with open(output_path, "w") as output_file:
            output_file.write(stdout)

        # Print the output path and the number of sequences
        print(f"Alignment of {num_sequences} sequences from {file} was completed. Aligned sequences saved to {output_path}")

# The scratch file is only needed while MAFFT runs
if os.path.exists(temp_input_path):
    os.remove(temp_input_path)

# Print the path to the text file
print(f"Excluded sequences information saved to: {output_txt_path}")



# **Step 3: Proteoform Analysis**

This section will generate sheets containing information about the identified proteoforms.

The analysis uses the raw sequence files and the alignments generated in the previous step.

In [None]:
import os
import pandas as pd
from openpyxl import Workbook, load_workbook
from openpyxl.styles import PatternFill
from Bio import SeqIO
from Bio.Seq import Seq

# Paths of the Excel workbooks written by the analysis, in the order they are saved
excel_files = []

# Folder the raw input sequences (FASTA) are read from
sequence_directory = '/content/seq'

# Folder the alignment files are read from
alignment_directory = '/content/aln'

# Excel workbooks are collected in the "tables" folder, created on demand
output_folder = '/content/tables'
os.makedirs(output_folder, exist_ok=True)

# Define a function to split a sequence into codons
def split_into_codons(sequence):
    """Cut `sequence` into consecutive chunks of three characters.

    The chunks are returned in order as a list. When the length is not a
    multiple of three, the final chunk holds the one or two leftover
    characters; an empty sequence gives an empty list.
    """
    chunks = []
    for start in range(0, len(sequence), 3):
        chunks.append(sequence[start:start + 3])
    return chunks

# Define a function to get amino acid from codon
def codon_to_aa(codon):
    """Translate `codon` and return the result as a string.

    The codon is wrapped in a `Seq` and translated. If building or translating
    the `Seq` raises for any reason, "*" is returned instead, so callers cannot
    tell a failed translation from a codon that translates to "*".
    """
    try:
        amino_acid = str(Seq(codon).translate())
    except Exception:
        # Any failure is reported with the same symbol as a stop codon
        return "*"
    return amino_acid

# Define a function to analyze sequences and save results to Excel
def analyze_and_save_to_excel(file_path, sheet_name):
    """Write the non-conserved codon positions of a FASTA file to an Excel sheet.

    Every sequence is split into codons and read up to and including the first
    codon that `codon_to_aa` reports as '*'. Codon positions reached by at
    least one sequence and holding more than one distinct codon across the
    file become columns (the header row shows the 1-based codon position).
    Each sequence is one row, its cells are formatted as "CODON (AA)", and
    every cell is filled with a colour derived from its codon.

    The workbook is saved in the module-level `output_folder` as
    "<input file stem>_analysis.xlsx" and its path is recorded once in the
    module-level `excel_files` list. If the workbook already exists its other
    sheets are kept, and a sheet named `sheet_name` is rebuilt from scratch.

    Args:
        file_path: Path of the FASTA file to analyse.
        sheet_name: Title of the worksheet to write.
    """
    import hashlib  # local import: only needed to derive the cell colours

    # Read the sequence file and split every sequence into codons
    sequences_codons = {}
    for record in SeqIO.parse(file_path, 'fasta'):
        sequences_codons[record.id] = split_into_codons(str(record.seq))

    # One row per sequence, one column per codon position; rows of shorter
    # sequences are padded with missing values by pandas
    df = pd.DataFrame.from_dict(sequences_codons, orient='index')

    # Analyze each sequence individually to stop at its own stop codon.
    # The codon lists are used directly (not the DataFrame rows) so that the
    # padding of shorter sequences is never mistaken for a stop codon.
    row_data_dict = {}
    for strain, codons in sequences_codons.items():
        row_data = []
        for idx, codon in enumerate(codons):
            aa = codon_to_aa(codon)
            row_data.append((idx, f"{codon} ({aa})"))
            if aa == '*':
                break
        row_data_dict[strain] = row_data

    # Codon positions that at least one sequence reached before stopping
    included_positions = set()
    for row in row_data_dict.values():
        included_positions.update(idx for idx, _ in row)

    # Keep only the reached positions that are non-conserved
    filtered_cols = sorted(
        idx for idx in included_positions if df[idx].nunique() > 1
    )

    # Derive the workbook name from the file stem so that any input extension
    # (".fasta", ".aln", ...) yields a proper ".xlsx" file
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, f"{base_name}_analysis.xlsx")

    # Create an Excel workbook or load it if it already exists
    if os.path.exists(output_file):
        workbook = load_workbook(output_file)
    else:
        workbook = Workbook()
        workbook.remove(workbook.active)

    # Start from an empty sheet: a sheet left over from a previous run is
    # replaced in place instead of having new rows appended below old ones
    if sheet_name in workbook.sheetnames:
        position = workbook.sheetnames.index(sheet_name)
        workbook.remove(workbook[sheet_name])
        sheet = workbook.create_sheet(title=sheet_name, index=position)
    else:
        sheet = workbook.create_sheet(title=sheet_name)

    # Add header row (codon positions are shown 1-based)
    sheet.append([""] + [col + 1 for col in filtered_cols])

    # Write sequence rows; positions a sequence never reached stay empty
    for strain, row_data in row_data_dict.items():
        row_dict = dict(row_data)
        sheet.append([strain] + [row_dict.get(col, "") for col in filtered_cols])

    def pastel_color(codon):
        """Return an opaque aRGB colour for `codon` with channels in 150-255.

        The colour is derived from a hash of the codon, so a given codon gets
        the same colour in every sheet, workbook and run.
        """
        digest = hashlib.sha256(codon.encode('utf-8')).digest()
        return 'FF' + ''.join(f"{150 + byte % 106:02x}" for byte in digest[:3])

    # Apply colors to the data cells, one fill object per distinct codon
    codon_fills = {}
    for row in sheet.iter_rows(min_row=2, min_col=2):
        for cell in row:
            value = cell.value
            if not (isinstance(value, str) and value):
                continue
            codon = value.split()[0]
            if codon not in codon_fills:
                color = pastel_color(codon)
                codon_fills[codon] = PatternFill(
                    start_color=color,
                    end_color=color,
                    fill_type='solid',
                )
            cell.fill = codon_fills[codon]

    # Save the workbook as an Excel file and record its path once
    workbook.save(output_file)
    if output_file not in excel_files:
        excel_files.append(output_file)

# Build the 'Original' sheet of each workbook from the raw sequence files
for filename in os.listdir(sequence_directory):
    if not filename.endswith('.fasta'):
        continue
    analyze_and_save_to_excel(os.path.join(sequence_directory, filename), 'Original')

# Build the 'Alignment' sheet of each workbook from the alignment files
for filename in os.listdir(alignment_directory):
    if not filename.endswith(('.fasta', '.aln')):
        continue
    analyze_and_save_to_excel(os.path.join(alignment_directory, filename), 'Alignment')

# Report every workbook path that was recorded during the analysis
print("Excel files saved in the 'tables' folder:")
for saved_path in excel_files:
    print(saved_path)

# **Step 4: Proteoform List**

This section will generate sheets containing the list of proteoforms identified.

In [None]:
def group_proteins_by_sequence(input_fasta_file, output_excel_file):
    """Group the records of a FASTA file by identical sequence and save to Excel.

    Each distinct sequence becomes one column of the sheet: the column header
    is the sequence string and the cells below it list the ids of the records
    carrying that sequence, in file order.

    Args:
        input_fasta_file: Path of the FASTA file to read.
        output_excel_file: Path of the Excel file to write. Its parent folder
            is created if it does not exist.
    """
    # Map every distinct sequence to the ids of the records that carry it
    sequence_dict = {}
    for record in SeqIO.parse(input_fasta_file, "fasta"):
        sequence_dict.setdefault(str(record.seq), []).append(record.id)

    # orient='index' builds one row per sequence; transposing turns each
    # sequence into a column with its record ids listed underneath
    df = pd.DataFrame.from_dict(sequence_dict, orient='index').transpose()

    # Create the output folder if needed. dirname is '' for a bare file name,
    # and os.makedirs('') raises, so only create a folder when one is given.
    output_folder = os.path.dirname(output_excel_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Write the table without the DataFrame's numeric row index
    df.to_excel(output_excel_file, index=False)

# Folder holding the input fasta files
input_folder = "/content/seq"

# Folder receiving one Excel file per input fasta file
output_folder = "/content/list"

# Write one proteoform list per fasta file found in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith(".fasta"):
        continue

    input_fasta_file = os.path.join(input_folder, filename)
    excel_name = filename.replace(".fasta", ".xlsx")
    output_excel_file = os.path.join(output_folder, excel_name)

    # Group the records by sequence and create the Excel sheet
    group_proteins_by_sequence(input_fasta_file, output_excel_file)

# **Step 5: Export files**

This section compresses each output folder into a .zip archive and downloads it.

In [None]:
import shutil
from google.colab import files

# Zip each output folder next to itself, then download the archive.
# Order matters only for the sequence of browser downloads.
export_targets = (
    ('/content/aln', '/content/aln.zip'),
    ('/content/tables', '/content/tables.zip'),
    ('/content/list', '/content/list.zip'),
)
for folder_path, archive_path in export_targets:
    shutil.make_archive(folder_path, 'zip', folder_path)
    files.download(archive_path)