# Import python packages for pipeline

In [1]:
from Bio import SeqIO
from datetime import date
import subprocess
import platform
import shutil
import time
import os

# Install external softwares

In [2]:
# Try to install the external programs automatically
try:
    if shutil.which("mmseqs") is None:
        subprocess.run("conda install -c conda-forge -c bioconda mmseqs2", shell=True)
        print("mmseqs2 not installed")
    elif shutil.which("mmseqs") is not None:
        print("mmseqs2 already installed: ", shutil.which("mmsa"))
except:
    print("Was not able to install mmseqs2 automatically. Please install manually")
    
try:
    if "macOS" in platform.platform() and shutil.which("mafft") is None:
        subprocess.run("brew install mafft", shell=True)
        print("mafft not installed")
    elif "Linux" in platform.platform() and shutil.which("mafft") is None:
        print("mafft not installed: ", shutil.which("mafft"))
    else:
        print("mafft already installed: ", shutil.which("mafft"))
except:
    print("Was not able to install mafft automatically. Please install manually")

try:
    if "macOS" in platform.platform() and shutil.which("FastTree") is None:
        subprocess.run("brew install fasttree", shell=True)
        print("FastTree not installed")
    elif "Linux" in platform.platform() and shutil.which("FastTree") is None:
        subprocess.run("sudo apt install fasttree", shell=True)
        print("FastTree not installed")
    else:
        print("FastTree already installed: ", shutil.which("FastTree"))
except:
    print("Was not able to install FastTree automatically. Please install manually")

try:
    if "macOS" in platform.platform() and shutil.which("paml") is None:
        subprocess.run("brew install paml", shell=True)
        print("FastTree not installed")
    elif "Linux" in platform.platform() and shutil.which("paml") is None:
        subprocess.run("sudo apt install paml", shell=True)
        print("FastTree not installed")
    else:
        print("FastTree already installed: ", shutil.which("mmsa"))
except:
    print("Was not able to install PAML automatically. Please install manually")

mmseqs2 already installed:  None
mafft already installed:  /opt/homebrew/bin/mafft
FastTree already installed:  /opt/homebrew/bin/FastTree
FastTree not installed


To reinstall 4.9j, run:
  brew reinstall paml


# Setup working directory and assign fasta input file

In [22]:
# path to fasta file
fasta_file = "/Users/dominiquefastus/master_project/NuStru/nustruDB/protFAMS/ddl_fam_secstru_reduced.fasta"

job_id = f"{time.strftime("%Y_%m_%d_%H_%M", time.localtime())}"
# create tree structure for pipeline
cwd = os.getcwd()
if not os.path.exists(f'{cwd}/nustruTREE/{job_id}/'):
    mkdir = os.mkdir(f'{cwd}/nustruTREE/{job_id}/')
    mkdir = os.mkdir(f'{cwd}/nustruTREE/{job_id}/MSA/')
    mkdir = os.mkdir(f'{cwd}/nustruTREE/{job_id}/TREE/')
    
shutil.copy(fasta_file, f'{cwd}/nustruTREE/{job_id}/MSA/')

# global variables
global working_dir 
global working_name
working_dir = f'{cwd}/nustruTREE/{job_id}'
working_name = os.path.basename(fasta_file).split(".")[0]

In [23]:
import os
import shutil
import time

# Path to fasta file
fasta_file = "/Users/dominiquefastus/master_project/NuStru/nustruDB/protFAMS/ddl_fam_secstru_reduced.fasta"

# Generating a unique job ID based on the current time
job_id = time.strftime("%Y_%m_%d_%H_%M", time.localtime())

# Get the current working directory
cwd = os.getcwd()

# Create directories for the new job
job_path = os.path.join(cwd, 'nustruTREE', job_id)
msa_path = os.path.join(job_path, 'MSA')
tree_path = os.path.join(job_path, 'TREE')

try:
    # Ensure all directories exist
    os.makedirs(msa_path, exist_ok=True)
    os.makedirs(tree_path, exist_ok=True)

    # Copy the fasta file to the new directory
    shutil.copy(fasta_file, msa_path)
    print("File copied successfully!")

except FileNotFoundError as fnf_error:
    print(f"Error: {fnf_error}")
    print("Please check the path to your fasta file and ensure it exists.")

except PermissionError as perm_error:
    print(f"Permission Error: {perm_error}")
    print("Please check your permissions for the directories.")

except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Update global variables
global working_dir 
global working_name
working_dir = job_path
working_name = os.path.basename(fasta_file).split(".")[0]


File copied successfully!


# Run the sequence allignment with mmseqs2

In [24]:
subprocess.run(f"mmseqs createdb {working_dir}/MSA/{working_name}.fasta {working_dir}/MSA/{working_name}.fasta.db", shell=True)
subprocess.run(f"mmseqs linclust {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/tmp --min-seq-id 0.5 -c 0.7 --cov-mode 1", shell=True)
subprocess.run(f"mmseqs result2msa {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/cluster.fasta --msa-format-mode 2", shell=True)

subprocess.run(f"mmseqs createseqfiledb {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/cluster_seq", shell=True)
subprocess.run(f"mmseqs result2flat {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/{working_name}.fasta.db  {working_dir}/MSA/cluster_seq {working_dir}/MSA/cluster_seq.fasta", shell=True)

createdb /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/ddl_fam_secstru_reduced.fasta /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/ddl_fam_secstru_reduced.fasta.db 

MMseqs Version:       	15-6f452
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[===
Time for merging to ddl_fam_secstru_reduced.fasta.db_h: 0h 0m 0s 8ms
Time for merging to ddl_fam_secstru_reduced.fasta.db: 0h 0m 0s 14ms
Database type: Aminoacid
Time for processing: 0h 0m 0s 100ms
Create directory /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/tmp
linclust /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/ddl_fam_secstru_reduced.fasta.db /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_0

CompletedProcess(args='mmseqs result2flat /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/ddl_fam_secstru_reduced.fasta.db /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/ddl_fam_secstru_reduced.fasta.db  /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/cluster_seq /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE/2024_05_08_20_57/MSA/cluster_seq.fasta', returncode=0)

In [None]:
subprocess.run(f"mmseqs createsubdb {working_dir}/MSA/cluster {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/cluster_repr", shell=True)
subprocess.run(f"mmseqs convert2fasta {working_dir}/MSA/cluster_repr {working_dir}/MSA/cluster_repr.fasta", shell=True)

In [None]:
subprocess.run(f"mmseqs align {working_dir}/MSA/cluster_repr {working_dir}/MSA/cluster_repr {working_dir}/MSA/cluster_repr {working_dir}/MSA/cluster_repr_aln", shell=True)
subprocess.run(f"mmseqs result2msa {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/{working_name}.fasta.db {working_dir}/MSA/cluster_repr_aln {working_dir}/MSA/cluster_repr.msa.fasta --msa-format-mode 2", shell=True)

# Run the sequence allignment with mafft

In [None]:
subprocess.run(f"mafft --auto {working_dir}/MSA/{working_name}.fasta > {working_dir}/MSA/{working_name}_aligned.fasta", shell=True)
# subprocess.run(f"mafft --auto {working_dir}/MSA/cluster_repr.fasta > {working_dir}/MSA/cluster_repr_alligned.fasta", shell=True)

# Generate the phylogenetic tree with FastTree from MSA

In [None]:
# subprocess.run(f"FastTree {working_dir}/MSA/{working_name}_alligned.fasta > {working_dir}/TREE/cluster_seq.tre", shell=True)
subprocess.run(f"FastTree {working_dir}/MSA/{working_name}_aligned.fasta > {working_dir}/TREE/{working_name}.tree", shell=True)