# Generate the a3m and hmm files for the DBv3
***


## 0. Inspection

In [None]:
from tqdm import tqdm
import os 

# Directory scanned for the input files.
path_db = "/home/conchae/databases/depolymerase_building/make_db_try3"
# Full path of every directory entry whose name does not contain "MSA".
path_fasta = [f"{path_db}/{file}" for file in os.listdir(path_db) if file.count("MSA")<1]

# Maps each file's base name (the text before ".fasta") to the number of ">"
# header markers found in that file.
dico_length = {}
for path in tqdm(path_fasta) :
    ipr = path.split("/")[-1].split(".fasta")[0]
    # Read through a context manager so each handle is closed immediately
    # instead of staying open until garbage collection.
    with open(path) as handle :
        # Counting ">" gives the same number as len(text.split(">")[0:-1]).
        len_sequence = handle.read().count(">")
    dico_length[ipr] = len_sequence

***
## I. Build the MSA (FAMSA)

In [None]:
import pandas as pd
from tqdm import tqdm
import os 
import subprocess
from multiprocessing.pool import ThreadPool
from Bio import SeqIO

# Directory holding the FASTA files to align.
path_db = "/home/conchae/databases/depolymerase_building/make_db_try3"
# Full paths of all directory entries whose name has no "MSA" in it.
path_fasta = [
    f"{path_db}/{file}"
    for file in os.listdir(path_db)
    if "MSA" not in file
]

def run_famsa(path_fasta) :
    """Align one FASTA file with FAMSA and return the captured tool output.

    The alignment is written next to the input as ``<name>.MSA.fasta``, where
    ``<name>`` is the input file name up to its first ``.fasta``.

    Parameters
    ----------
    path_fasta : str
        Path of the FASTA file to align, using ``/`` as separator.

    Returns
    -------
    tuple
        ``(output, None)``: ``output`` holds the combined stdout/stderr bytes
        of the command; the second item is always ``None`` because stderr is
        redirected into stdout.
    """
    import shlex
    import subprocess
    # Derive both names from the argument itself. The body previously read a
    # name `path` that is not defined in this function, so it either raised
    # NameError or silently picked up an unrelated global.
    ipr = path_fasta.split("/")[-1].split(".fasta")[0]
    path_out = "/".join(path_fasta.split("/")[0:-1]) + f"/{ipr}.MSA.fasta"
    # Quote the paths so that spaces or shell metacharacters in a file name
    # cannot break the command line (plain paths are left unchanged).
    align_cmd = f"famsa -gt sl -t 10 {shlex.quote(path_fasta)} {shlex.quote(path_out)}"
    align_subprocess = subprocess.Popen(align_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    align_out, align_err = align_subprocess.communicate()
    return align_out, align_err
        
if __name__ == '__main__':
    # Run the alignments from three worker threads; each call blocks on an
    # external FAMSA process.
    with ThreadPool(3) as pool:
        # Pool.map expects an integer chunksize, and true division ("/")
        # yields a float. Use ceiling division so the inputs are split into
        # at most three chunks, and never go below 1 so that short input
        # lists are still processed.
        chunk = max(1, -(-len(path_fasta) // 3))
        pool.map(run_famsa, path_fasta, chunksize=chunk)
        
# *****************************************************************************************************************************
#!/bin/bash
#SBATCH --job-name=FAMSA
#SBATCH --partition=long
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=60
#SBATCH --mem=100gb
#SBATCH --time=01-00:00:00
#SBATCH --output=FAMSA__%j.log

# The job-name directive above previously read "#BATCH", which the scheduler
# treats as an ordinary comment, so the job name was never applied.

# Load conda's shell functions, then activate the environment used for the run.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate bio_phylo

# Launch the FAMSA alignment script.
python /home/conchae/databases/depolymerase_building/script_files/try_3/famsa_c50.py

***
## II. Filter the MSA

In [None]:
import pandas as pd
from tqdm import tqdm
import os 
import subprocess
from multiprocessing.pool import ThreadPool
from Bio import SeqIO

# Directory holding the alignments to filter.
path_db = "/home/conchae/databases/depolymerase_building/make_db_try3"
# Select the alignment files ("MSA.fasta" in the name) but skip files that are
# already filtered outputs ("filtered.MSA.fasta"). Without the second test a
# re-run of this cell would also pick up its own outputs and filter them again.
path_fasta = [
    f"{path_db}/{file}"
    for file in os.listdir(path_db)
    if "MSA.fasta" in file and "filtered.MSA.fasta" not in file
]

def filter_MSA(i_file) :
    """Run ``hhfilter -id 95`` on one alignment file and report completion.

    The output path is built in the same directory as ``i_file`` and is named
    ``<stem>.filtered.MSA.fasta``, where ``<stem>`` is the input file name up
    to its first ``.MSA.fasta``. The command's output is captured and
    discarded; the function prints ``<i_file><TAB>Done`` and returns ``None``.
    """
    # Split the path into its directory components and the file name.
    *dir_parts, base_name = i_file.split("/")
    parent_dir = "/".join(dir_parts)
    stem = base_name.partition(".MSA.fasta")[0]
    o_file = f"{parent_dir}/{stem}.filtered.MSA.fasta"
    # ***********************************
    filter_cmd = f"hhfilter -i {i_file} -o {o_file} -id 95"
    # stderr is merged into stdout; neither stream is used afterwards.
    subprocess.run(filter_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print(f"{i_file}\tDone")
    
# Filter every selected alignment in turn; filter_MSA has no return statement,
# so the collected list contains one None per input.
results = [filter_MSA(msa_path) for msa_path in path_fasta]

***
## III. Reformat the MSA into a3m

In [None]:
import pandas as pd
from tqdm import tqdm
import os 
import subprocess
from multiprocessing.pool import ThreadPool
from Bio import SeqIO

# Directory holding the filtered alignments to convert.
path_db = "/home/conchae/databases/depolymerase_building/make_db_try3"
# Full paths of all directory entries whose name contains "filtered.MSA.fasta".
path_fasta = [
    f"{path_db}/{file}"
    for file in os.listdir(path_db)
    if "filtered.MSA.fasta" in file
]


def reformat_MSA(i_file) :
    """Run ``reformat.pl fas a3m`` on one alignment file and report completion.

    The output path is built in the same directory as ``i_file`` and is named
    ``<stem>.a3m``, where ``<stem>`` is the input file name up to its first
    ``.fasta``. The command's output is captured and discarded; the function
    prints ``<i_file><TAB>Done`` and returns ``None``.
    """
    import subprocess
    # Everything before the last "/" is the directory, the rest the file name.
    parent_dir, _, base_name = i_file.rpartition("/")
    stem = base_name.partition(".fasta")[0]
    o_file = f"{parent_dir}/{stem}.a3m"
    # ***********************************
    reformat_cmd = f"reformat.pl fas a3m  {i_file} {o_file}"
    # stderr is merged into stdout; neither stream is used afterwards.
    subprocess.run(reformat_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print(f"{i_file}\tDone")
    
# Convert every selected alignment in turn; reformat_MSA has no return
# statement, so the collected list contains one None per input.
results = [reformat_MSA(msa_path) for msa_path in path_fasta]

***
## IV. Build the HMM profile

In [None]:
import pandas as pd
from tqdm import tqdm
import os 
import subprocess
from multiprocessing.pool import ThreadPool
from Bio import SeqIO

# Directory holding the filtered alignments used to build the profiles.
path_db = "/home/conchae/databases/depolymerase_building/make_db_try3"
# Full paths of all directory entries whose name contains "filtered.MSA.fasta".
path_fasta = [
    f"{path_db}/{file}"
    for file in os.listdir(path_db)
    if "filtered.MSA.fasta" in file
]

def build_hmm(i_file) :
    """Run ``hmmbuild`` on one alignment file and print what the tool reported.

    The profile path is built in the same directory as ``i_file`` and is named
    ``<stem>.hmm``, where ``<stem>`` is the input file name up to its first
    ``.a3m`` (the whole file name when it contains no ``.a3m``). The function
    prints the output path, the captured command output and ``None`` (stderr
    is merged into stdout), and returns ``None``.
    """
    import subprocess
    # Everything before the last "/" is the directory, the rest the file name.
    parent_dir, _, base_name = i_file.rpartition("/")
    # NOTE(review): the driver list in this cell selects "filtered.MSA.fasta"
    # files, so the ".a3m" strip never applies there and outputs end in
    # ".filtered.MSA.fasta.hmm" - confirm this naming is intended.
    stem = base_name.partition(".a3m")[0]
    o_file = f"{parent_dir}/{stem}.hmm"
    # ***********************************
    build_cmd = f"hmmbuild {o_file} {i_file}"
    completed = subprocess.run(build_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print(o_file, completed.stdout, completed.stderr)
    
# Build a profile for every selected alignment in turn; build_hmm has no
# return statement, so the collected list contains one None per input.
results = [build_hmm(msa_path) for msa_path in path_fasta]

***
## V. Make the DB

In [None]:
# ! conda activate HH-suite3
# Put the ffindex binaries on PATH for this shell and the processes it starts.
export PATH="/media/concha-eloko/Linux/softwares/hh-suite/lib/ffindex/src:$PATH"
# Export HHLIB: a plain assignment stays local to the shell, so the Python
# process started below would not find it in os.environ.
export HHLIB="/media/concha-eloko/Linux/conda_envs/HH-suite3"
# Workaround used while HHLIB was not exported - modify the script hhsuitedb.py :
# l 110 :  hhlib_environment = os.environ['HHLIB'] -- > hhlib_environment = "/media/concha-eloko/Linux/conda_envs/HH-suite3"
# The glob arguments are quoted so the patterns reach hhsuitedb.py unchanged,
# whatever globbing options the shell has enabled.
# NOTE(review): --ihhm presumably expects HH-suite profile files; confirm the
# *.hmm files in clean_files are in the format hhsuitedb.py requires.
python3 /media/concha-eloko/Linux/softwares/hh-suite/scripts/hhsuitedb.py \
-o /media/concha-eloko/Linux/depolymerase_project/DBsuite_depolymerase/depolymerase_db.suite \
--ihhm="/media/concha-eloko/Linux/depolymerase_project/clean_files/*.hmm" \
--ia3m="/media/concha-eloko/Linux/depolymerase_project/clean_files/*.a3m" \
--cpu=2 \
--force