# Import python packages for pipeline

In [7]:
from Bio import SeqIO
import subprocess
import shutil
import os
import sys
from ete3 import Tree, TreeStyle, TextFace, NodeStyle

# Install external softwares

In [None]:
if shutil.which("mmseqs") is None:
    subprocess.run("conda install -c conda-forge -c bioconda mmseqs2", shell=True)
    print("mmseqs2 not installed")
else:
    print("mmseqs2 already installed: ", shutil.which("mmsa"))
    
'''
if shutil.which("kalign") is None:
    subprocess.run("conda install -c conda-forge -c bioconda mmseqs2", shell=True)
    print("mmseqs2 not installed")
else:
    print("mmseqs2 already installed: ", shutil.which("mmsa"))
'''


if shutil.which("FastTree") is None:
    subprocess.run("wget http://www.microbesonline.org/fasttree/FastTree.c", shell=True)
    print("mmseqs2 not installed")
else:
    print("mmseqs2 already installed: ", shutil.which("mmsa"))


# Setup working directory and assign fasta input file

In [19]:
# path to fasta file
fasta_file = "/Users/dominiquefastus/master_project/NuStru/nustruDB/filtered_NEW_ECOLI_FULL_uniprot_02.fasta"

# create tree structure for pipeline
cwd = os.getcwd()
if not os.path.exists(f'{cwd}/nustruTREE'):
    mkdir = os.mkdir(f'{cwd}/nustruTREE')
    mkdir = os.mkdir(f'{cwd}/nustruTREE/MSA')
    mkdir = os.mkdir(f'{cwd}/nustruTREE/TREE')
    
shutil.copy(fasta_file, f'{cwd}/nustruTREE/MSA')

# global variables
global working_dir 
working_dir = f'{cwd}/nustruTREE/'

# Run the sequence allignment with mmseqs2

In [22]:
subprocess.run(f"mmseqs createdb {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db", shell=True)
subprocess.run(f"mmseqs linclust {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/tmp --min-seq-id 0.9 -c 0.8 --cov-mode 1", shell=True)
subprocess.run(f"mmseqs result2msa {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/cluster.msa", shell=True)

subprocess.run(f"mmseqs createseqfiledb {working_dir}/MSA/cluster.msa {working_dir}/MSA/cluster.msa.db", shell=True)
subprocess.run(f"mmseqs result2flat {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/cluster.msa {working_dir}/MSA/cluster.msa.db {working_dir}/MSA/cluster.msa.fasta", shell=True)

/Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db exists and will be overwritten
createdb /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db 

MMseqs Version:       	15-6f452
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
Time for merging to filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db_h: 0h 0m 0s 61ms
Time for merging to filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db: 0h 0m 0s 360ms
Database type: Aminoacid
Time for processing: 0h 0m 1s 195ms
linclust /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db /Users/dominiquefastus/master_proj

Not enough input paths provided. 3 paths are required.
Too many input paths provided. Only 4 are allowed


CompletedProcess(args='mmseqs result2flat /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/cluster /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/cluster.msa /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/cluster.msa.db /Users/dominiquefastus/master_project/NuStru/nustruEVOL/nustruTREE//MSA/cluster.msa.fasta', returncode=1)

In [None]:
subprocess.run(f"mmseqs createsubdb {working_dir}/MSA/cluster.msa.db {working_dir}/MSA/cluster.msa.fasta {working_dir}/MSA/cluster.msa.fasta.db", shell=True)
subprocess.run(f"mmseqs result2fasta {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/filtered_NEW_ECOLI_FULL_uniprot_02.fasta.db {working_dir}/MSA/cluster {working_dir}/MSA/cluster.msa {working_dir}/MSA/cluster.msa.db {working_dir}/MSA/cluster.msa.fasta", shell=True)

# Generate the phylogenetic tree with FastTree from MSA

In [None]:
subprocess.run(f"FastTree {working_dir}/MSA/cluster.msa.fasta > {working_dir}/TREE/cluster.msa.fasta.tree", shell=True)

# Visualize the phylogenetic tree

In [None]:
tree = Tree(f"{working_dir}/TREE/tree.nwk")

# Define tree style
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_length = True
ts.show_branch_support = True

# Render the tree
tree.show(tree_style=ts)