# Generate an MSA for each candidate protein
***
## A. Run mmseqs on the fasta database vs Uniref90
## B. Generate an MSA for each protein
## C. Realign the MSAs with clustalo
***

A. Run the mmseqs command
> Install vmtouch in order to use the `--db-load-mode` option: <br>
 conda install -c conda-forge vmtouch

In [None]:
#!/bin/bash
# SLURM job: search the Millard protein database against UniRef90 with
# MMseqs2, then export the alignments as a tab-separated .m8 file.
#SBATCH --job-name=mmseqs_cmmd
#SBATCH --qos=medium
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=20
#SBATCH --mem=200gb 
#SBATCH --time=3-00:00:00 
#SBATCH --output=mmseqs__%j.log 

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate clustering

# Positional arguments: query DB, target DB, result DB, temporary directory.
# "-a true" keeps the backtrace so that convertalis can emit qaln/taln below.
mmseqs search \
/home/conchae/databases/Millard_jan_2023/5Jan2023_vConTACT2_proteins.faa.mmeqs_input \
/home/conchae/databases/mmseqs_db/uniref90/uniref90_mmseqs \
/home/conchae/ML_depolymerase/get_candidates/millard/millard_out_t4 \
/home/conchae/ML_depolymerase/get_candidates \
--num-iterations 2 \
-s 2 \
--max-seqs 150 \
--db-load-mode 2 \
-a true \
--max-accept 100


# Convert the result DB into a flat table; the first four columns
# (query, target, qseq, tseq) are the ones read by the MSA generation script.
mmseqs convertalis \
/home/conchae/databases/Millard_jan_2023/5Jan2023_vConTACT2_proteins.faa.mmeqs_input \
/home/conchae/databases/mmseqs_db/uniref90/uniref90_mmseqs \
/home/conchae/ML_depolymerase/get_candidates/millard/millard_out_t4 \
/home/conchae/ML_depolymerase/get_candidates/millard/millard_out_t4.m8 \
--db-load-mode 2 \
--format-output "query,target,qseq,tseq,evalue,bits,qaln,taln"


> B. Generate an MSA for each protein <br> Part I : The sequences

In [None]:
import polars as pl
import os
import pandas as pd
from tqdm import tqdm
# ThreadPool is defined in multiprocessing.pool; the top-level
# multiprocessing package does not export it.
from multiprocessing.pool import ThreadPool

path_db = "/home/conchae/ML_depolymerase/get_candidates"

# The .m8 table produced by `mmseqs convertalis` has no header row, so the
# first line must be read as data rather than consumed as column names.
# NOTE(review): recent polars releases name the `sep` argument `separator`;
# adjust to the installed version.
database_MSA = pl.read_csv(
    f"{path_db}/millard/millard_out_t4.m8",
    has_header=False,
    columns=[0, 1, 2, 3],
    new_columns=["query", "target", "qseq", "tseq"],
    sep="\t",
)

# One protein identifier per line: queries whose MSA was already written.
done_proteins = pd.read_csv(f"{path_db}/millard_MSA_complete.done", names=["done_proteins"])

# `proteins` is a pandas Series of unique query identifiers, so it is masked
# directly (no column lookup) against the *column* of finished identifiers.
proteins = database_MSA.get_column("query").unique().to_pandas()
to_do = proteins[~proteins.isin(done_proteins["done_proteins"])].tolist()
def write_df(query, df):
    """Write the hits of one query as a FASTA file.

    The file is named after ``query`` and placed in the ``MSA_candidates``
    directory under ``path_db``. It starts with the query record (taken from
    the first row of ``df``) and continues with one record per row holding
    the target identifier and target sequence.

    Args:
        query: Identifier used to build the output file name.
        df: Table exposing ``to_dicts()`` whose rows carry the keys
            ``query``, ``qseq``, ``target`` and ``tseq``.
    """
    hits = df.to_dicts()
    fasta_path = f"{path_db}/millard/MSA_candidates/{query}.MSA_complete.fasta"
    with open(fasta_path, "w") as handle:
        # The query sequence is written once, ahead of all its targets.
        if hits:
            first = hits[0]
            handle.write(f">{first['query']}\n{first['qseq']}\n")
        handle.writelines(f">{hit['target']}\n{hit['tseq']}\n" for hit in hits)

def get_MSA(query) :
    """Extract the rows of ``database_MSA`` for ``query`` and write them out.

    After the FASTA file is written through ``write_df``, the query
    identifier is appended to the ``.threading.done`` progress file.

    NOTE(review): the progress file written here has a different name from
    the ``.done`` file the script reads at start-up; confirm this is intended.
    """
    matching_rows = database_MSA.filter(pl.col("query") == query)
    write_df(query, matching_rows)

    progress_path = f"{path_db}/millard_MSA_complete.threading.done"
    with open(progress_path, "a+") as progress_log:
        progress_log.write(f"{query}\n")


if __name__ == '__main__':
    n_workers = 40
    # The chunk size must be at least 1: with fewer items than workers the
    # integer division yields 0, which is not a usable chunk size.
    chunk = max(1, len(to_do) // n_workers)
    with ThreadPool(n_workers) as pool:
        pool.map(get_MSA, to_do, chunksize=chunk)
        
# ****************************************************************
#!/bin/bash
# SLURM job: run the script that writes one FASTA file per query protein.
#SBATCH --job-name=generate_msa
#SBATCH --qos=short
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=40 
#SBATCH --mem=200gb 
#SBATCH --time=01-00:00:00 
#SBATCH --output=msa_generate__%j.log 

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate python_311

python /home/conchae/ML_depolymerase/scripts/generate_msa.threading.py

> Part II : Realign with clustalo

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# Working directory that receives the progress log of the realignment.
path_db = "/home/conchae/ML_depolymerase/get_candidates"
# Directory holding the per-query FASTA files to realign.
path_candidates = "/home/conchae/ML_depolymerase/get_candidates/millard/MSA_candidates"

# Full path of every entry found in the candidates directory.
paths = [
    f"{path_candidates}/{entry}"
    for entry in os.listdir(path_candidates)
]


def _first_sequence_length(fasta_path):
    """Return the length of the line following the first FASTA header.

    Returns 0 when the file holds no header line or nothing after it.
    """
    with open(fasta_path) as handle:
        for line in handle:
            if line.startswith(">"):
                return len(next(handle, "").rstrip("\n"))
    return 0


def realign_MSA(query) :
    """Realign one candidate FASTA file with clustalo.

    Files whose first sequence is longer than 200 residues are realigned and
    written in FASTA format to the ``scanned_MSA`` directory; shorter ones
    are skipped. In both cases the input path is appended to
    ``realigned_millard.txt`` under ``path_db``.

    Args:
        query: Path to the FASTA file to realign; the first record is
            expected to be the query sequence on a single line.
    """
    # Imported here because the script's import section does not provide it.
    import subprocess

    query_len = _first_sequence_length(query)
    if query_len > 200 :
        path_out = "/home/conchae/ML_depolymerase/get_candidates/millard/scanned_MSA"
        query_name = query.split("/")[-1].split(".MSA")[0]
        file_out = f"{path_out}/{query_name}.MSA.clustalo.fasta"
        # Argument list instead of a shell string: file names containing
        # spaces or shell metacharacters are passed to clustalo untouched.
        command_realign = [
            "clustalo",
            "-i", query,
            "-o", file_out,
            "--outfmt=fa",
            "--threads=4",
        ]
        completed = subprocess.run(
            command_realign,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # stderr is merged into stdout
        )
        print(completed.stdout, completed.stderr, "\n\n")
    with open(f"{path_db}/realigned_millard.txt","a+") as output :
        output.write(f"{query}\n")

# Each worker gets roughly an equal share of the files. The chunk size must
# be at least 1: with fewer than 15 files the integer division yields 0.
size = max(1, len(paths) // 15)
if __name__ == '__main__':
    with Pool(15) as p:
        p.map(realign_MSA, paths, chunksize = size)
        
# ********************************************************
#!/bin/bash
# SLURM job: run the script that realigns the candidate MSAs with clustalo.
#SBATCH --job-name=millard_generate
#SBATCH --qos=long
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=60 
#SBATCH --mem=200gb 
#SBATCH --time=05-00:00:00 
#SBATCH --output=millard_generate%j.log 

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate HH-suite3

python /home/conchae/ML_depolymerase/scripts/clustlo_msa.py