# Import python packages for pipeline

In [None]:
import subprocess
import platform
import shutil
import uuid

import pandas as pd

from Bio import SeqIO
from ete3 import Tree
from ete3 import TreeStyle, NodeStyle


# Install external software

In [None]:
# Make sure the `mmseqs` executable (MMseqs2) is on PATH; if it is missing,
# try to install it from the conda-forge / bioconda channels.
mmseqs_exe = shutil.which("mmseqs")
if mmseqs_exe is not None:
    print("mmseqs2 already installed: ", mmseqs_exe)
else:
    print("mmseqs2 not installed, trying to install it with conda")
    try:
        # --yes: conda's confirmation prompt cannot be answered when the
        # command is started non-interactively from Python.
        # check=True: a failed install raises instead of passing silently,
        # so the fallback message below is actually reachable.
        subprocess.run(
            ["conda", "install", "--yes",
             "-c", "conda-forge", "-c", "bioconda", "mmseqs2"],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers the case that `conda` itself is not available.
        print("Was not able to install mmseqs2 automatically. Please install manually")
    
# Make sure the `mafft` executable is on PATH; if it is missing, try to
# install it with the platform's package manager (Homebrew on macOS, apt on
# Linux, matching the FastTree and PAML checks of this notebook).
mafft_exe = shutil.which("mafft")
if mafft_exe is not None:
    print("mafft already installed: ", mafft_exe)
else:
    print("mafft not installed")
    # platform.system() is "Darwin" on macOS and "Linux" on Linux; any other
    # platform has no automatic install command.
    mafft_install_cmd = {
        "Darwin": ["brew", "install", "mafft"],
        "Linux": ["sudo", "apt", "install", "-y", "mafft"],
    }.get(platform.system())
    try:
        if mafft_install_cmd is None:
            raise OSError("no automatic mafft installation for this platform")
        # check=True: a failed install raises instead of passing silently.
        subprocess.run(mafft_install_cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError also covers a missing package manager executable.
        print("Was not able to install mafft automatically. Please install manually")

# Make sure FastTree is on PATH; if it is missing, try to install it with the
# platform's package manager (Homebrew on macOS, apt on Linux).
# NOTE(review): the Debian/Ubuntu package appears to install the binary as
# lowercase `fasttree` -- both spellings are checked; confirm on the target
# system, since the tree-building cell has to call the same executable.
fasttree_exe = shutil.which("FastTree") or shutil.which("fasttree")
if fasttree_exe is not None:
    print("FastTree already installed: ", fasttree_exe)
else:
    print("FastTree not installed")
    # platform.system() is "Darwin" on macOS and "Linux" on Linux; any other
    # platform has no automatic install command.
    fasttree_install_cmd = {
        "Darwin": ["brew", "install", "fasttree"],
        "Linux": ["sudo", "apt", "install", "-y", "fasttree"],
    }.get(platform.system())
    try:
        if fasttree_install_cmd is None:
            raise OSError("no automatic FastTree installation for this platform")
        # check=True: a failed install raises instead of passing silently.
        subprocess.run(fasttree_install_cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError also covers a missing package manager executable.
        print("Was not able to install FastTree automatically. Please install manually")

# Make sure PAML is available; if it is missing, try to install it with the
# platform's package manager (Homebrew on macOS, apt on Linux).
# PAML is a suite of programs (codeml, baseml, ...) rather than a single
# `paml` executable, so `codeml` is used to detect an existing installation.
# NOTE(review): confirm that the package builds in use put `codeml` on PATH.
paml_exe = shutil.which("codeml")
if paml_exe is not None:
    print("PAML already installed: ", paml_exe)
else:
    print("PAML not installed")
    # platform.system() is "Darwin" on macOS and "Linux" on Linux; any other
    # platform has no automatic install command.
    paml_install_cmd = {
        "Darwin": ["brew", "install", "paml"],
        "Linux": ["sudo", "apt", "install", "-y", "paml"],
    }.get(platform.system())
    try:
        if paml_install_cmd is None:
            raise OSError("no automatic PAML installation for this platform")
        # check=True: a failed install raises instead of passing silently.
        subprocess.run(paml_install_cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError also covers a missing package manager executable.
        print("Was not able to install PAML automatically. Please install manually")

# Setup working directory and assign fasta input file

In [None]:
import os
import shutil
import time

# Path to the FASTA file and to the nustru dataframe (CSV file).
# Please define both paths: they select the sequences and the table from
# which the alignment and the tree are created.
fasta_file = None
dataframe_file = None

# Fail early with a readable message instead of an opaque error from
# pandas / os.path when the placeholders above were not filled in.
if fasta_file is None or dataframe_file is None:
    raise ValueError(
        "Please set `fasta_file` and `dataframe_file` before running this cell."
    )
df = pd.read_csv(dataframe_file)

# Every run gets a unique job id so that its files end up in a directory of
# their own below the current working directory.
job_id = uuid.uuid4()
cwd = os.getcwd()

# Directory layout of the job: <cwd>/nustruTREE/<job_id>/{MSA,TREE}
job_path = os.path.join(cwd, 'nustruTREE', str(job_id))
msa_path = os.path.join(job_path, 'MSA')
tree_path = os.path.join(job_path, 'TREE')

# Base name for all derived files: the FASTA file name without its extension.
# splitext keeps dots inside the name ("a.b.fasta" -> "a.b").
working_name = os.path.splitext(os.path.basename(fasta_file))[0]

try:
    os.makedirs(msa_path, exist_ok=True)
    os.makedirs(tree_path, exist_ok=True)

    # Store the copy as <working_name>.fasta: the later cells read exactly
    # that path, whatever extension the input file had.
    shutil.copy(fasta_file, os.path.join(msa_path, f"{working_name}.fasta"))
    print("File copied successfully!")

except FileNotFoundError as fnf_error:
    print(f"Error: {fnf_error}")
    print("Please check the path to your fasta file and ensure it exists.")
    # Re-raise: continuing would let later cells run on a job directory
    # that does not contain the input file.
    raise

except PermissionError as perm_error:
    print(f"Permission Error: {perm_error}")
    print("Please check your permissions for the directories.")
    raise

except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

# Job directory used by the following cells. These are plain module-level
# assignments; a `global` statement has no effect at module level.
working_dir = job_path

# Run the sequence alignment with mmseqs2

### This is an attempt to run the MSA with mmseqs2 (needs to be updated)

In [None]:
# Experimental MMseqs2 run on the input FASTA. The commands are executed one
# after the other through the shell:
#   createdb -> linclust -> result2msa, then createseqfiledb -> result2flat
linclust_dir = f"{working_dir}/MSA"
linclust_seq_db = f"{linclust_dir}/{working_name}.fasta.db"

createdb_cmd = f"mmseqs createdb {linclust_dir}/{working_name}.fasta {linclust_seq_db}"
linclust_cmd = (
    f"mmseqs linclust {linclust_seq_db} {linclust_dir}/cluster {linclust_dir}/tmp"
    " --min-seq-id 0.5 -c 0.7 --cov-mode 1"
)
cluster_msa_cmd = (
    f"mmseqs result2msa {linclust_seq_db} {linclust_seq_db}"
    f" {linclust_dir}/cluster {linclust_dir}/cluster.fasta --msa-format-mode 2"
)
seqfiledb_cmd = (
    f"mmseqs createseqfiledb {linclust_seq_db}"
    f" {linclust_dir}/cluster {linclust_dir}/cluster_seq"
)
flat_cmd = (
    f"mmseqs result2flat {linclust_seq_db} {linclust_seq_db}"
    f"  {linclust_dir}/cluster_seq {linclust_dir}/cluster_seq.fasta"
)

# Database creation, clustering and MSA export of the clusters.
subprocess.run(createdb_cmd, shell=True)
subprocess.run(linclust_cmd, shell=True)
subprocess.run(cluster_msa_cmd, shell=True)

# Per-cluster sequence files and their flat FASTA export.
subprocess.run(seqfiledb_cmd, shell=True)
subprocess.run(flat_cmd, shell=True)

In [None]:
# Experimental MMseqs2 run: createdb -> cluster -> createseqfiledb -> align
# -> convertalis. Every step consumes the output of an earlier one.
cluster_dir = f"{working_dir}/MSA"
input_db = f"{cluster_dir}/inputDB"
cluster_db = f"{cluster_dir}/clusterDB"
# One shared name for the database written by `align` and read by
# `convertalis`; the two steps previously used different spellings
# ("aligmentDB" / "alignmentDB"), so the last step could not find its input.
alignment_db = f"{cluster_dir}/alignmentDB"

mmseqs_steps = [
    ["mmseqs", "createdb", f"{cluster_dir}/{working_name}.fasta", input_db],
    ["mmseqs", "cluster", input_db, cluster_db, f"{cluster_dir}/tmp",
     "--min-seq-id", "0.9"],
    ["mmseqs", "createseqfiledb", input_db, cluster_db,
     f"{cluster_dir}/representDB"],
    ["mmseqs", "align", input_db, input_db, cluster_db, alignment_db],
    # NOTE(review): "ouput.a3m" looks like a typo of "output.a3m"; the file
    # name is kept unchanged so that anything reading it keeps working.
    ["mmseqs", "convertalis", input_db, input_db, alignment_db,
     f"{cluster_dir}/ouput.a3m"],
]

for mmseqs_step in mmseqs_steps:
    # Argument lists (no shell) keep paths containing spaces intact;
    # check=True stops the chain at the first failing step.
    subprocess.run(mmseqs_step, check=True)

In [None]:
# Build the sub-database `cluster_repr` from the `cluster` result and the
# sequence database, then export it as FASTA
# (presumably the cluster representatives -- verify against the mmseqs docs).
repr_dir = f"{working_dir}/MSA"
createsubdb_cmd = (
    f"mmseqs createsubdb {repr_dir}/cluster"
    f" {repr_dir}/{working_name}.fasta.db {repr_dir}/cluster_repr"
)
convert2fasta_cmd = (
    f"mmseqs convert2fasta {repr_dir}/cluster_repr {repr_dir}/cluster_repr.fasta"
)

subprocess.run(createsubdb_cmd, shell=True)
subprocess.run(convert2fasta_cmd, shell=True)

In [None]:
# Run `mmseqs align` on the cluster_seq database (used as query, target and
# result) and export the resulting alignment database with `result2msa`.
aln_dir = f"{working_dir}/MSA"
cluster_seq_db = f"{aln_dir}/cluster_seq"
fasta_db = f"{aln_dir}/{working_name}.fasta.db"

align_cmd = (
    f"mmseqs align {cluster_seq_db} {cluster_seq_db} {cluster_seq_db}"
    f" {aln_dir}/cluster_seq_aln"
)
result2msa_cmd = (
    f"mmseqs result2msa {fasta_db} {fasta_db} {aln_dir}/cluster_seq_aln"
    f" {aln_dir}/cluster_seq_msa.fasta --msa-format-mode 2"
)

subprocess.run(align_cmd, shell=True)
subprocess.run(result2msa_cmd, shell=True)

# Run the sequence alignment with mafft

In [None]:
# Align the input FASTA with mafft in auto mode and store the alignment as
# <working_name>_aligned.fasta in the MSA directory.
mafft_input = f"{working_dir}/MSA/{working_name}.fasta"
mafft_output = f"{working_dir}/MSA/{working_name}_aligned.fasta"

# mafft prints the alignment to stdout, so stdout is redirected into the
# output file. An argument list (no shell) keeps paths containing spaces
# intact; check=True raises when mafft fails instead of silently leaving an
# empty alignment file for the next step.
with open(mafft_output, "w") as mafft_stdout:
    subprocess.run(["mafft", "--auto", mafft_input], stdout=mafft_stdout, check=True)

# Generate the phylogenetic tree with FastTree from MSA

In [None]:
# Build the phylogenetic tree with FastTree from the aligned FASTA file and
# store it as <working_name>.tree in the TREE directory.
fasttree_input = f"{working_dir}/MSA/{working_name}_aligned.fasta"
fasttree_output = f"{working_dir}/TREE/{working_name}.tree"

# NOTE(review): the Debian/Ubuntu package appears to install the binary as
# lowercase `fasttree`; use whichever spelling is on PATH and fall back to
# the original name -- confirm on the target system.
fasttree_cmd = shutil.which("FastTree") or shutil.which("fasttree") or "FastTree"

# FastTree prints the tree to stdout, so stdout is redirected into the tree
# file. An argument list (no shell) keeps paths containing spaces intact;
# check=True raises when FastTree fails instead of leaving an empty tree file.
with open(fasttree_output, "w") as fasttree_stdout:
    subprocess.run([fasttree_cmd, fasttree_input], stdout=fasttree_stdout, check=True)

# Reroot the tree (automatically with mad or manually with ete3)

### Automatic rerooting of phylogenetic trees with mad

In [None]:
# Run automatic rooting with mad.
# mad roots an existing tree, so it is given the tree file written by the
# FastTree step (not the FASTA input).
# NOTE(review): confirm the expected input format against the mad README.
mad_script = os.path.expanduser("~/mad/mad.py")  # "~" is not expanded without a shell
mad_input_tree = f"{working_dir}/TREE/{working_name}.tree"

# An argument list (no shell) keeps paths containing spaces intact;
# check=True raises when mad fails.
subprocess.run(["python", mad_script, mad_input_tree], check=True)

### Manual rerooting of phylogenetic trees with identified outgroup

In [None]:
# Load the tree file written by the FastTree step into an ete3 Tree object.
newick_file = f"{working_dir}/TREE/{working_name}.tree"
tree = Tree(newick_file)

#### Visualize the tree to identify the root

In [None]:
# Tree layout: label every leaf and print the branch lengths.
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_length = True

# Draw the tree with the style above. The '%%inline' target is ete3's
# notebook mode: the image is shown in the cell output rather than written
# to a file (pass a file name instead to save it).
# When running locally, `tree.show(tree_style=ts)` opens the tree
# interactively instead.
tree.render('%%inline', tree_style=ts, w=1400, h=1800)

#### Reroot the tree

In [None]:
# Find the leaf id (column "primary_id") of the organism that is used as
# outgroup for rerooting.
outgroup_organism = "Escherichia coli"

# All primary ids whose "organism" column matches; a missing column raises
# KeyError instead of being reported as "organism not found".
outgroup_matches = df.loc[df["organism"] == outgroup_organism, "primary_id"]

if outgroup_matches.empty:
    # Reset the variable explicitly: otherwise a value left over from an
    # earlier run of this cell would silently be used for rerooting.
    outgroup = None
    print("Organism not found in the dataframe")
else:
    # Several rows may match; the first one is used.
    outgroup = outgroup_matches.iloc[0]
    print(outgroup)

In [None]:
# Look up the outgroup leaf by name (`tree & name` is ete3's node search)
# and make it the outgroup of the tree.
outgroup_node = tree & outgroup
tree.set_outgroup(outgroup_node)

# Write the rerooted tree to a file in the current working directory.
rerooted_file = "rerooted_tree_file.nwk"
tree.write(outfile=rerooted_file)