<a href="https://colab.research.google.com/github/ebbettin/UCH_SRL/blob/main/Group_by_ortholog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython
!pip install python-Levenshtein

from pathlib import Path
from Bio import SeqIO
import shutil as sh
from collections import defaultdict
import Levenshtein

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinu

In [None]:
# --- Configuration -----------------------------------------------------------
# Folder holding the strain FASTA files (only "*.fasta" files are read from it).
input_dir = Path("/content/fasta_files")

# Folder that receives one FASTA file per ortholog group.
output_dir = Path("/content/genes_output")
# parents=True: create missing parent folders instead of raising
# FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
output_dir.mkdir(parents=True, exist_ok=True)

# Minimum Levenshtein.ratio (0.0 - 1.0) between a sequence and a group's
# representative sequence for the sequence to be placed in that group.
similarity_threshold = 0.9

# List of (representative_seq, [records]) tuples. The representative is the
# upper-cased sequence string of the record that opened the group.
# Re-initialised here so that re-running the cell starts from a clean state.
sequence_groups = []

# Lookup helper: which existing group (if any) should a sequence join?
def find_group(seq_str):
    """Return the record list of the group whose representative is closest to ``seq_str``.

    Every representative in the module-level ``sequence_groups`` is scored with
    ``Levenshtein.ratio``. Among the representatives that reach the module-level
    ``similarity_threshold`` the one with the highest ratio wins; on a tie the
    group that was created first is kept. Picking the best match instead of the
    first acceptable one makes the assignment independent of the position of a
    group in ``sequence_groups`` whenever one group is strictly closer.

    Args:
        seq_str: Sequence string to classify (callers pass it upper-cased, the
            same form in which representatives are stored).

    Returns:
        The mutable ``records`` list of the best-matching group, or ``None``
        when no representative reaches ``similarity_threshold`` (including the
        case where ``sequence_groups`` is still empty).
    """
    best_records = None
    best_score = -1.0
    for rep_seq, records in sequence_groups:
        # score_cutoff makes ratio() return 0 for pairs below the threshold,
        # which lets the library stop early on clearly dissimilar sequences.
        score = Levenshtein.ratio(
            rep_seq, seq_str, score_cutoff=similarity_threshold
        )
        # Strict ">" keeps the earliest group when two groups score equally.
        if score >= similarity_threshold and score > best_score:
            best_records = records
            best_score = score
            if best_score >= 1.0:
                # Identical to the representative: nothing can score higher.
                break
    return best_records

# Assign every record of every strain file to an ortholog group.
# Files are visited in sorted order so that group numbering is reproducible.
for fasta_path in sorted(input_dir.glob("*.fasta")):
    for record in SeqIO.parse(fasta_path, "fasta"):
        # Upper-case so the similarity comparison is case-insensitive.
        seq_str = str(record.seq).upper()
        group_records = find_group(seq_str)
        if group_records is not None:
            group_records.append(record)
        else:
            # No group is similar enough: this sequence opens a new group and
            # becomes its representative.
            sequence_groups.append((seq_str, [record]))

if not sequence_groups:
    raise ValueError("No sequences found in the input directory")

print(f"Found {len(sequence_groups)} ortholog groups based on {similarity_threshold*100:.1f}% similarity threshold.")

# Remove ortholog files left behind by an earlier run. Without this, a re-run
# that produces fewer groups would leave stale ortholog_N.fasta files in
# output_dir, and they would end up in the ZIP archive as well.
for stale_path in output_dir.glob("ortholog_*.fasta"):
    stale_path.unlink()

# Write one FASTA file per sequence group (numbered from 1 in creation order)
for idx, (rep_seq, records) in enumerate(sequence_groups, start=1):
    gene_filename = output_dir / f"ortholog_{idx}.fasta"
    SeqIO.write(records, gene_filename, "fasta")

print(f"Created {len(sequence_groups)} ortholog FASTA files in {output_dir}")


Found 8 ortholog groups based on 90.0% similarity threshold.
Created 8 ortholog FASTA files in /content/genes_output


In [None]:
# Bundle the contents of output_dir into "<output_dir>.zip", placed next to the folder.
archive_base = str(output_dir)  # make_archive appends the ".zip" suffix itself
zip_path = sh.make_archive(archive_base, "zip", root_dir=output_dir)
print(f"Zipped all output FASTA files to: {zip_path}")

Zipped all output FASTA files to: /content/genes_output.zip
