# Generate an MSA for each candidate protein
***
## A. Generate a fasta database with all the candidate proteins
## B. Run mmseqs on the fasta database vs Uniref90
## C. Generate an MSA for each protein
***

A. Generate the query database 

In [None]:
import os 
from tqdm import tqdm

# Root of the per-strain PhageBoost output tree and the session output directory.
path_strains = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
path_session = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

# Concatenate every file found under <strain>/tmp/<prophage>/ into a single
# multi-FASTA file that serves as the query database.
with open(f"{path_session}/all_prophage_proteins.db.fasta", "w") as outfile:
    for strain in tqdm(os.listdir(path_strains)):
        strain_tmp = f"{path_strains}/{strain}/tmp"
        for prophage in os.listdir(strain_tmp):
            prophage_dir = f"{strain_tmp}/{prophage}"
            for protein in os.listdir(prophage_dir):
                # Close each protein file right away: the original left one
                # open handle per protein for the garbage collector.
                with open(f"{prophage_dir}/{protein}") as infile:
                    sequence = infile.read()
                outfile.write(f"{sequence}\n")
                
                
# Walk the same <strain>/tmp/<prophage>/ tree and record every protein file
# name, one per line.
names_listing = f"{path_session}/all_prophage_proteins.names.db.fasta"
with open(names_listing, "w") as outfile:
    for strain in tqdm(os.listdir(path_strains)):
        tmp_root = f"{path_strains}/{strain}/tmp"
        for prophage in os.listdir(tmp_root):
            prophage_dir = f"{tmp_root}/{prophage}"
            for protein in os.listdir(prophage_dir):
                print(protein, file=outfile)
            
            

# ********************************************************************************************************************************************************************        
#!/bin/bash
# SLURM job: runs the two Python scripts that build the query FASTA database
# and the list of protein file names.
# Fix: the job-name directive was spelled "#BATCH", which the scheduler treats
# as a plain comment, so the job name was never applied.
#SBATCH --job-name=fasta_file
#SBATCH --partition=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=10gb
#SBATCH --time=1-00:00:00
#SBATCH --output=fasta_mmseqs__%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate python_311

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/generate_query_db.py
python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/generate_protein.names.py




***
B. Run the mmseqs command
> Install vmtouch in order to use the `--db-load-mode` option : <br>
 conda install -c conda-forge vmtouch

In [None]:
#!/bin/bash
# SLURM job: exports the MMseqs2 alignment result (prophage proteins vs UniRef90)
# as a tab-separated table.
# Fix: the job-name directive was spelled "#BATCH", which the scheduler treats
# as a plain comment, so the job name was never applied.
#SBATCH --job-name=mmseqs_cmmd
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=300gb
#SBATCH --time=5-00:00:00
#SBATCH --output=mmseqs__%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate clustering

# Search step, currently disabled. It writes the mmseqs_out_t4 result database
# that convertalis reads below (presumably it was run in an earlier submission).
#mmseqs search \
#/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/input_db/all_prophage_proteins.db.mmseqs_input \
#/home/conchae/databases/mmseqs_db/uniref90/uniref90_mmseqs \
#/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/mmseqs_out_t4 \
#/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III \
#--num-iterations 2 \
#-s 2 \
#--max-seqs 150 \
#--db-load-mode 2 \
#-a true \
#--max-accept 100 \


# Convert the result database to mmseqs_out_t4.m8 with eight columns; the
# Python cells that read this file rely on exactly this column order.
mmseqs convertalis \
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/input_db/all_prophage_proteins.db.mmseqs_input \
/home/conchae/databases/mmseqs_db/uniref90/uniref90_mmseqs \
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/mmseqs_out_t4 \
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/mmseqs_out_t4.m8 \
--db-load-mode 2 \
--format-output "query,target,qseq,tseq,evalue,bits,qaln,taln"



> C. Generate an MSA for each protein <br> Part I : The sequences

In [None]:
import os 
import pandas as pd 
import subprocess
from multiprocessing import Pool
from tqdm import tqdm

# Session directory: holds the mmseqs_out_t4.m8 table and the proteins_done log.
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
# Root of the per-strain output tree (get_MSA below defines its own local copy).
path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
# Column labels applied to the table, matching the --format-output list of the
# `mmseqs convertalis` command in this notebook.
names_db = ["query","target","qseq","tseq","evalue","bits","qaln","taln"]
# The whole tab-separated table is loaded into memory; `names=` supplies the
# column labels, so the file is read as having no header row.
database_MSA = pd.read_csv(f"{path_db}/mmseqs_out_t4.m8", names = names_db, sep= "\t")

# Distinct query identifiers; each one is handed to get_MSA by the worker pool.
proteins = database_MSA["query"].unique()

def get_MSA(query) :
    """Write the hits of one query protein as a FASTA file and log it as done.

    ``query`` is split on "__"; the first two fields are used as the strain
    and phage directory names. Every row of ``database_MSA`` whose "query"
    column equals ``query`` is written as ``>target`` / ``tseq`` to
    ``<strain>/mmseqs_out/<phage>/<query>.MSA.fasta``, then ``query`` is
    appended to the ``proteins_done`` file in ``path_db``.
    """
    path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
    name_fields = query.split("__")
    strain = name_fields[0]
    phage = name_fields[1]
    hits = database_MSA[database_MSA["query"] == query].to_dict("records")
    fasta_path = f"{path_strain}/{strain}/mmseqs_out/{phage}/{query}.MSA.fasta"
    with open(fasta_path, "w") as outfile:
        for hit in hits:
            outfile.write(f">{hit['target']}\n{hit['tseq']}\n")
    # Append-only progress log, read back by the progress-check cell.
    with open(f"{path_db}/proteins_done", "a+") as output:
        output.write(f"{query}\n")


if __name__ == "__main__":
    # Fan the query identifiers out over 40 worker processes.
    with Pool(processes=40) as pool:
        pool.map(get_MSA, proteins)

In [None]:
import polars as pl
import os 
import pandas as pd 
from multiprocessing import Pool
from tqdm import tqdm

# Session directory: holds the MMseqs2 result table and the proteins_done log.
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
# Root of the per-strain output tree (write_df below defines its own local copy).
path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
names_db = ["query","target","qseq","tseq","evalue","bits","qaln","taln"]

#database_MSA = pl.read_csv(f"{path_db}/mmseqs_out_t4.m8.final",columns = [0,1,3] , new_columns = names_db, sep= "\t")
# Only columns 0, 1 and 3 are loaded and renamed to query / target / tseq.
# NOTE(review): polars treats the first row as a header by default; if this
# file has no header line, pass has_header=False or the first hit is lost.
# NOTE(review): recent polars releases spell this keyword `separator` - confirm
# against the installed version.
database_MSA = pl.read_csv(f"{path_db}/mmseqs_out_t4.m8.gamma.final",columns = [0,1,3] , new_columns = ["query","target","tseq"] , sep= "\t")
#database_MSA = database_MSA.select(pl.col("query","target","tseq"))
# `select` returns a DataFrame, which has no `to_list`; take its single column
# as a Series first so a plain list of query identifiers is produced.
proteins = database_MSA.select(pl.col("query").unique()).to_series().to_list()

def write_df(query, df):
    """Write the rows of ``df`` as FASTA records for one query protein.

    ``query`` is split on "__"; the first two fields are used as the strain
    and phage directory names. Each row of ``df`` (obtained with
    ``to_dicts()``) becomes a ``>target`` header line followed by its
    ``tseq`` sequence in ``<strain>/mmseqs_out/<phage>/<query>.MSA.fasta``.
    """
    path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
    name_fields = query.split("__")
    strain = name_fields[0]
    phage = name_fields[1]
    records = df.to_dicts()
    fasta_path = f"{path_strain}/{strain}/mmseqs_out/{phage}/{query}.MSA.fasta"
    with open(fasta_path, "w") as outfile:
        outfile.writelines(
            f">{record['target']}\n{record['tseq']}\n" for record in records
        )
    
def get_MSA(query) :
    """Extract the hits of ``query`` from ``database_MSA`` and write them out.

    The filtered frame is passed to ``write_df``; afterwards ``query`` is
    appended to the ``proteins_done`` file in ``path_db``.
    """
    hits = database_MSA.filter(pl.col("query") == query)
    write_df(query, hits)
    # Append-only progress log, one query identifier per line.
    with open(f"{path_db}/proteins_done", "a+") as done_log:
        print(query, file=done_log)

if __name__ == "__main__":
    # Fan the query identifiers out over 40 worker processes.
    with Pool(processes=40) as pool:
        pool.map(get_MSA, proteins)

In [None]:
# Scratch cell: prints the "foo" entry of every item in `interest`.
# NOTE(review): `interest` is not defined anywhere in this cell; it presumably
# came from an earlier interactive session - confirm or delete the cell.
for row in interest :
    print(row["foo"])

> Check the progress

In [None]:
# Check advancement :

import os 
import pandas as pd 

path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

# Both files are read as single-column tables without a header row:
# the processed-query log and the full list of protein file names.
done_protein = pd.read_csv(f"{path_db}/proteins_done", header=None)
df_names = pd.read_csv(f"{path_db}/all_prophage_proteins.names.db.fasta", header=None)

# Percentage of logged queries relative to the number of listed proteins.
n_done = len(done_protein)
n_total = len(df_names)
ratio = (n_done / n_total) * 100

print(ratio)
# after 1:19h, 26%
# blocked at 76.09% ; 76.10 ; 76.11491960840425 ; 76.126858503136 ; 76.14408332072611 ; 76.15483743964478 ; 76.19065412384006 ;76.25208019291796 ; 77.75720115852847; 77.91659907368759

'''(base) [conchae@master 15122022_session]$ tail proteins_done -n 10 
GCF_016544565.1__phage1__61
GCF_003954625.1__phage1__75
GCF_019928215.1__phage4__791
GCF_016651625.1__phage29__154
GCF_900508465.1__phage1__34
GCF_900508885.1__phage16__81
GCF_007955155.1__phage17__4
GCF_009930975.1__phage27__15
GCF_021206795.1__phage7__11
GCF_905127515.1__phage0__7'''





In [None]:
import os 
import pandas as pd 

# Session directory: holds the progress log, the protein name list and the m8 table.
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

# proteins_done holds query identifiers; ".fasta" is appended to them so they
# can be compared with the entries of the protein name list.
done_protein = pd.read_csv(f"{path_db}/proteins_done", header = None)
done_protein_f = done_protein.astype(str) + ".fasta"
df_names = pd.read_csv(f"{path_db}/all_prophage_proteins.names.db.fasta", header = None) 
# Keep only the names that are not yet in the done log.
to_do = df_names[~df_names[0].isin(done_protein_f[0])]

# Drop everything from ".fast" onwards and save the remaining names.
to_do = to_do[0].str.split(".fast", n = 1, expand = True)[0]
to_do.to_csv(f"{path_db}/to_do.final.gamma.txt", index = False)


# NOTE(review): the list above is written to to_do.final.gamma.txt, but this
# statement reads to_do.final.txt - confirm which file is intended.
to_do = pd.read_csv(f"{path_db}/to_do.final.txt")
# NOTE(review): to_do_list is not used by any code visible in this cell.
to_do_list = to_do.values

names_db = ["query","target","qseq","tseq","evalue","bits","qaln","taln"]
# nrows=1000 loads only the first 1000 rows of the table (looks like a test run - confirm).
database_MSA = pd.read_csv(f"{path_db}/mmseqs_out_t4.m8", names = names_db, sep= "\t", nrows=1000)
# The worker pool below iterates over these queries, not over the to-do list.
proteins = database_MSA["query"].unique()

# program_df = proteins[proteins.isin(done_protein_f[0])]

def get_MSA(query) :
    """Write the hits of one query protein as a FASTA file and log it as done.

    ``query`` is split on "__"; the first two fields are used as the strain
    and phage directory names. Matching rows of ``database_MSA`` are written
    as ``>target`` / ``tseq`` records, then ``query`` is appended to the
    ``proteins_done`` file in ``path_db``.
    """
    path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
    name_fields = query.split("__")
    strain = name_fields[0]
    phage = name_fields[1]
    matching_rows = database_MSA[database_MSA["query"] == query]
    hits = matching_rows.to_dict("records")
    fasta_path = f"{path_strain}/{strain}/mmseqs_out/{phage}/{query}.MSA.fasta"
    with open(fasta_path, "w") as outfile:
        outfile.writelines(f">{hit['target']}\n{hit['tseq']}\n" for hit in hits)
    # Append-only progress log, one query identifier per line.
    with open(f"{path_db}/proteins_done", "a+") as output:
        output.write(f"{query}\n")


if __name__ == '__main__':
    # This cell imports only os and pandas, so Pool would be undefined when the
    # cell is run on its own; import it here to keep the cell self-contained.
    from multiprocessing import Pool

    # Fan the query identifiers out over 40 worker processes.
    with Pool(40) as p:
        p.map(get_MSA, proteins)

In [None]:
import os
import pandas as pd 
import subprocess
from multiprocessing import Pool
from tqdm import tqdm
import time
import dask.dataframe as dd

# Session directory: holds the m8 table, the to-do list and the run logs.
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

# `seconds` names the log file of this run and is also read by get_MSA, so it
# is set once and never reassigned: every message of the run lands in one file.
seconds = time.time()
local_time = time.ctime(seconds)

with open(f"{path_db}/evolution__{seconds}.txt", "w") as outfile :
    outfile.write(f"The command is starting : {local_time}\n")

names_db = ["query","target","qseq","tseq","evalue","bits","qaln","taln"]
# dask reads the table lazily; rows are only materialised by .compute().
database_MSA = dd.read_csv(f"{path_db}/mmseqs_out_t4.m8", names = names_db, sep= "\t")
# The first line of the to-do file is skipped and the single column is named "protein".
to_do = pd.read_csv(f"{path_db}/to_do.final.txt", names = ["protein"], header = None, skiprows= 1)


# Fix: the original took ctime() of the OLD timestamp and then replaced
# `seconds`, so this message carried the start time again and was appended to
# a different, newly named file. Refresh only the human-readable time.
local_time = time.ctime(time.time())

with open(f"{path_db}/evolution__{seconds}.txt", "a+") as outfile :
    outfile.write(f"The command really is starting : {local_time}\n")

def get_MSA(query) :
    """Write the hits of one query protein as FASTA, logging progress.

    ``query`` is split on "__"; the first two fields are used as the strain
    and phage directory names. The matching rows of the dask frame
    ``database_MSA`` are computed, printed, counted in the run log
    ``evolution__<seconds>.txt``, written as ``>target`` / ``tseq`` records,
    and finally ``query`` is appended to ``proteins_done``.
    """
    path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
    name_fields = query.split("__")
    strain = name_fields[0]
    phage = name_fields[1]
    # .compute() materialises the filtered rows as an in-memory frame.
    hits = database_MSA[database_MSA["query"] == query].compute().to_dict("records")
    n_hits = len(hits)
    print(hits)
    with open(f"{path_db}/evolution__{seconds}.txt", "a+") as run_log :
        run_log.write(f"{query} seems to work. Here is the length of the dico : {n_hits} \n")
    fasta_path = f"{path_strain}/{strain}/mmseqs_out/{phage}/{query}.MSA.fasta"
    with open(fasta_path, "w") as outfile :
        for hit in hits :
            outfile.write(f">{hit['target']}\n{hit['tseq']}\n")
    with open(f"{path_db}/proteins_done", "a+") as done_log :
        done_log.write(f"{query}\n")

if __name__ == "__main__":
    # Process the to-do proteins with 5 workers and print the list of results.
    with Pool(processes=5) as pool:
        results = pool.map(get_MSA, to_do["protein"])
        print(results)
        
# ****************************************************************
#!/bin/bash
# SLURM job: runs generate_msa.final.py in the python_311 conda environment.
# Fix: the job-name directive was spelled "#BATCH", which the scheduler treats
# as a plain comment, so the job name was never applied.
#SBATCH --job-name=generate_msa
#SBATCH --partition=long
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=50
#SBATCH --mem=600gb
#SBATCH --time=05-00:00:00
#SBATCH --output=msa_generate__%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate python_311

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/generate_msa.final.py



In [1]:
import os
import pandas as pd 
import numpy as np
import subprocess
from multiprocessing import Pool
from tqdm import tqdm
import time
#import dask.dataframe as dd

# `dd` is used below, but this cell's own dask import is commented out;
# import it here so the script runs on its own.
import dask.dataframe as dd

# Session directory: holds the m8 table, the to-do list and the control log.
path_db = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"

seconds = time.time()
local_time = time.ctime(seconds)

names_db = ["query","target","qseq","tseq","evalue","bits","qaln","taln"]
# dask reads the table lazily; rows are only materialised by .compute().
database_MSA = dd.read_csv(f"{path_db}/mmseqs_out_t4.m8", names = names_db, sep= "\t")
# The first line of the to-do file is skipped and the single column is named "protein".
to_do = pd.read_csv(f"{path_db}/to_do.final.txt", names = ["protein"], header = None, skiprows= 1)

# For every protein still to do, log its name followed by its hits.
with open(f"{path_db}/evolution__command_control.txt", "a+") as outfile :
    for protein in to_do["protein"] :
        outfile.write(protein + "\n")
        df_interest = database_MSA[database_MSA["query"] == protein]
        df_interest = df_interest.compute()
        len_dico = len(df_interest)
        # Fix: file.write() only accepts str, so passing the DataFrame itself
        # raised TypeError; write the hit count and a text rendering instead.
        outfile.write(f"{len_dico} hits\n{df_interest.to_string()}\n")

# ****************************************************************
#!/bin/bash
# SLURM job: runs control.py in the python_311 conda environment.
# Fix: the job-name directive was spelled "#BATCH", which the scheduler treats
# as a plain comment, so the job name was never applied.
#SBATCH --job-name=control_msa
#SBATCH --partition=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=20
#SBATCH --mem=100gb
#SBATCH --time=01-00:00:00
#SBATCH --output=control_msa%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate python_311

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/control.py


[1, 4, 9]


> Part II : Realign with Clustal Omega (`clustalo`, run from the HH-suite3 conda environment)

In [None]:
import os 
import pandas as pd 
import subprocess
from multiprocessing import Pool
from tqdm import tqdm


# Session directory: holds the protein name list and the MSA_done / MSA_missing logs.
path_session = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
# Root of the per-strain output tree (realign_MSA below defines its own local copy).
path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"

# One protein file name per line, read without a header into a "protein" column.
df_names = pd.read_csv(f"{path_session}/all_prophage_proteins.names.db.fasta", names = ["protein"], header = None) 


def realign_MSA(query) :
    """Align one protein with its MMseqs2 hits using clustalo.

    ``query`` is a protein file name; it is split on "__" and the first two
    fields are used as the strain and phage directory names. The text before
    ".fasta" is used as the stem of the generated files.

    If ``<stem>.MSA.fasta`` exists for the protein, the query file and the
    hits are concatenated into ``<stem>.multi.complete.fasta`` and aligned
    with clustalo into ``<stem>.MSA.a2m`` (written with ``--outfmt=fa``).
    The protein is appended to ``MSA_done`` only when clustalo exits with
    status 0. If the hits file does not exist, the protein is appended to
    ``MSA_missing`` instead.

    Changes from the previous version: the commands are no longer built as
    shell strings, the concatenation is done in Python instead of ``cat``,
    and a failed alignment is reported instead of being logged as done.
    """
    import shutil  # local import: not part of this cell's import block

    path_strain = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_decipher/phageboost/phageboost_70_20102022"
    name_fields = query.split("__")
    strain, phage = name_fields[0], name_fields[1]
    stem = query.split('.fasta')[0]
    out_dir = f"{path_strain}/{strain}/mmseqs_out/{phage}"
    file_query = f"{path_strain}/{strain}/tmp/{phage}/{query}"
    file_target = f"{out_dir}/{stem}.MSA.fasta"

    if not os.path.isfile(file_target):
        # No hits file for this protein: record it and stop.
        with open(f"{path_session}/MSA_missing", "a+") as output:
            output.write(f"{query}\n")
        return

    # Build the multi-FASTA input: the query sequence first, then its hits.
    input_file = f"{out_dir}/{stem}.multi.complete.fasta"
    try:
        with open(input_file, "wb") as merged:
            for part in (file_query, file_target):
                with open(part, "rb") as source:
                    shutil.copyfileobj(source, merged)
    except OSError as err:
        # Report and skip this protein so one bad file does not abort the pool.
        print(f"could not build {input_file}: {err}")
        return
    print(input_file)

    # Align with clustalo; stderr is merged into stdout so a single stream is
    # captured and printed to the job log.
    # NOTE(review): clustalo is expected to refuse to overwrite an existing
    # output file unless --force is given, so reruns may fail here - confirm.
    file_out = f"{out_dir}/{stem}.MSA.a2m"
    command_realign = ["clustalo", "-i", input_file, "-o", file_out, "--outfmt=fa", "--threads=4"]
    try:
        result = subprocess.run(command_realign, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError as err:
        print(f"could not run clustalo for {query}: {err}")
        return
    print(result.stdout, "\n\n")

    if result.returncode != 0:
        # Do not log a failed alignment as done.
        print(f"clustalo failed for {query} (exit code {result.returncode})")
        return

    with open(f"{path_session}/MSA_done", "a+") as output:
        output.write(f"{query}\n")
        
if __name__ == "__main__":
    # Fan the protein file names out over 20 worker processes.
    with Pool(processes=20) as pool:
        pool.map(realign_MSA, df_names["protein"])
        
# ********************************************************
#!/bin/bash
# SLURM job: runs realign_MSA.py in the HH-suite3 conda environment.
# Fix: the job-name directive was spelled "#BATCH", which the scheduler treats
# as a plain comment, so the job name was never applied.
#SBATCH --job-name=generate_msa
#SBATCH --partition=long
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=600gb
#SBATCH --time=05-00:00:00
#SBATCH --output=msa_generate__%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate HH-suite3

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/realign_MSA.py