In [6]:
import re
import sys
import time
import os
import argparse
import pandas as pd
import subprocess
from subprocess import Popen, PIPE

In [7]:
# 设置工作目录
workdir = '/Users/dongjiacheng/Desktop/code/mtd/tools/'
workdir_db_prot = os.path.join(workdir,'blast','db_prot')
workdir_msa = os.path.join(workdir,'MSA')

/Users/dongjiacheng/Desktop/code/mtd/tools/
/Users/dongjiacheng/Desktop/code/mtd/tools/blast
/Users/dongjiacheng/Desktop/code/mtd/tools/blast/db_prot
/Users/dongjiacheng/Desktop/code/mtd/tools/MSA


In [8]:
def blastp(blast_input_txt, blast_output, db_prot, evalue):
    """
    Description: 根据输入序列信息，对蛋白库进行blastp比对，得到比对结果
    Args:
        blast_input_txt: 输入txt文件
        blast_output: 输出txt文件
        db_prot: 菌种数据库
        evalue: evalue值

    # blasp -query input.txt -out output.txt -db db_prot -outfmt 6 -evalue 1e-6
    """
    cmd =f'blastp -query {blast_input_txt} -out {blast_output} -db {db_prot} -outfmt 6 -evalue {evalue}'
    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    returncode = p.wait() #状态码
    stdout, stderr = p.communicate()
    if stderr:
        return stderr.decode('utf-8')
    return stdout.decode('utf-8')

In [9]:
# 运行blastp
blastp('blast_input.txt', 
       'blast_output.txt', 
       '/Users/dongjiacheng/Desktop/code/mtd/tools/blast/db_prot/Mt-Nc' , 
       1e-6)

''

In [10]:
"""
让用户得到匹配到的蛋白的序列
1. 从记录了所有菌种蛋白id与序列对应关系的表中根据蛋白id，提取对应序列
2. 将输入的序列与提取的序列合并为一个文件，记录为fasta格式
"""

def get_sequences_for_blast(seq_file):
    """获得blast比对到的蛋白序列，并与输入序列合并为一个文件
    Args:
        df_blast: blast比对结果
        df_all_fungi_seq: 记录了所有菌种蛋白id与序列对应关系的表
        seq_file: 输出文件名
    Returns:
        None    
    """
    # 读取blast比对结果
    df_blast_path = os.path.join(workdir_msa, 'blast_output.txt')
    df_blast = pd.read_csv(df_blast_path, sep='\t', header=None)
    df_blast.columns = ['query_id', 'subject_id', 'pct_identity', 'aln_length', 'mismatches',
                    'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bit_score']
    
    # 读取记录了所有菌种蛋白id与序列对应关系的表
    df_all_fungi_seq_path = os.path.join(workdir_msa, 'All_Fungi.tsv')
    df_all_fungi_seq = pd.read_csv(df_all_fungi_seq_path, sep='\t')

    # 数据匹配
    merged_df = pd.merge(df_blast, df_all_fungi_seq, left_on='subject_id', right_on='Protein ID', how='inner')
    fasta_sequences = ""

    # 将blast_input.txt中的序列写入到fasta_sequences中
    with open('blast_input.txt', 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                fasta_sequences += "{}\n".format(line)
            else:
                fasta_sequences += "{}\n".format(line)

    # 将数据写入FASTA格式
    for index, row in merged_df.iterrows():
        fasta_sequences += ">{}\n{}\n".format(row['Protein ID'], row['Sequence'])
        # fasta_sequences += ">{}-{}\n{}\n".format(row['Protein ID'], row['Species'], row['Sequence']) # 添加了菌种名

    # 写入文件
    try:
        with open(seq_file, 'w') as file:
            file.write(fasta_sequences)
    except Exception as e:
        print("Error writing file:", str(e))
        sys.exit(1)


get_sequences_for_blast('1.GetSeq.fasta')

In [9]:
def kalign(input_fasta, output_fasta):
    """使用kalign对序列进行比对
    Args:
        input_fasta: 输入fasta文件
        output_fasta: 输出fasta文件
    Returns:
        None
    # kalign -i input.fasta -o output.fasta
    """
    cmd = "kalign -i {} -o {}".format(input_fasta, output_fasta)
    subprocess.call(cmd, shell=True)
    
# # mafft对齐
# def mafft(extract_output, mafft_output):
#     cmd =  "mafft --auto {} > {}".format(extract_output, mafft_output)
#     os.system(cmd)

In [5]:
# fasttree对蛋白序列
def fasttree(model, aligned_fasta, fasttree_prot_output):  
    """使用fasttree对蛋白序列进行进化树构建
    Args:
        aligned_fasta: 输入fasta文件
        fasttree_prot_output: 输出文件
        model: 模型
    Returns:
        None
    
    # FastTree -out output.nwk input.fasta
    # FastTree -out output.nwk -lg input.fasta
    # FastTree -out output.nwk -wag input.fasta
    """
    # 如果model为空，则使用默认的JTT模型
    if model == "jtt":
        cmd = "FastTree -out {} {}".format(fasttree_prot_output, aligned_fasta)
    else:
        cmd = "FastTree -out {} -{} {}".format(fasttree_prot_output, model, aligned_fasta)
    subprocess.call(cmd, shell=True)

# 调用fasttree_prot
fasttree('./2.AlignmentSeq.fasta', '3.tree.nwk', 'jtt')

/bin/sh: FastTree: command not found
