In [4]:
import re
import pandas as pd

fungi_name = 'Saccharomyces_cerevisiae_S288C'


def _header_field(pattern, header):
    """Return capture group 1 of ``pattern`` in ``header``, or None if it does not match."""
    match = re.search(pattern, header)
    return match.group(1) if match else None


def read_cds_records(path):
    """Parse a CDS nucleotide FASTA file into a list of per-record dicts.

    Each ``>`` header line starts a new record with the keys ``gene``
    (the ``locus_tag=`` value), ``protein_id`` (the ``protein_id=`` value),
    ``ncbi_gene_id`` (the digits after ``GeneID:``) and ``sequence`` (all
    following non-header lines, stripped and concatenated).

    A field that is absent from a header is stored as None instead of
    raising, so one incomplete header does not abort the whole export.
    """
    records = []
    current = None
    chunks = []
    with open(path) as handle:
        for line in handle:
            if line.startswith(">"):
                # A new header closes the record that was being collected.
                if current is not None:
                    current["sequence"] = "".join(chunks)
                    records.append(current)
                current = {
                    # Capture up to the closing bracket rather than ``\w+``:
                    # systematic names such as YAL067W-A contain a hyphen and
                    # would otherwise be cut down to YAL067W.
                    "gene": _header_field(r"locus_tag=([^\]\s]+)", line),
                    "protein_id": _header_field(r"protein_id=(\w+\.\d+)", line),
                    "ncbi_gene_id": _header_field(r"GeneID:(\d+)", line),
                }
                chunks = []
            elif current is not None:
                # Sequence lines are only meaningful once a header has been seen.
                chunks.append(line.strip())

    # Flush the last record, which has no following header to close it.
    if current is not None:
        current["sequence"] = "".join(chunks)
        records.append(current)
    return records


# Read the nucleotide (CDS) sequences.
genes = read_cds_records(fungi_name + "/genome/cds_from_genomic.fna")

# Convert the list to a pandas DataFrame and save it as a comma-separated CSV file.
df = pd.DataFrame(genes, columns=["gene", "protein_id", "ncbi_gene_id", "sequence"])
df.to_csv("Nucleic_acid_sequence.csv", sep=",", index=False)

In [3]:
from Bio import SeqIO
import pandas as pd

def _split_description(description):
    """Split a FASTA description into ``(protein_description, species)``.

    The description is expected to look like ``<id> <product> [<organism>]``.
    The organism is taken from the LAST bracket group, matched from the end
    of the string so that brackets inside the product name or a nested
    organism name (``[[Candida] glabrata]``) are handled. When there is no
    trailing bracket group the species is returned as an empty string.
    """
    # Drop the leading record ID; a header with only an ID leaves an empty text.
    _, _, text = description.partition(" ")
    text = text.strip()
    if not text.endswith("]"):
        return text, ""

    # Walk backwards to find the "[" that balances the final "]".
    depth = 0
    for pos in range(len(text) - 1, -1, -1):
        char = text[pos]
        if char == "]":
            depth += 1
        elif char == "[":
            depth -= 1
            if depth == 0:
                return text[:pos].rstrip(), text[pos + 1:-1].strip()

    # Unbalanced brackets: keep the whole text as the description.
    return text, ""


# Read the amino-acid (protein) sequences.
# NOTE: ``fungi_name`` is defined in the nucleotide-sequence cell; run that cell first.
fasta_file = fungi_name + "/genome/protein.faa"

data = []
for record in SeqIO.parse(fasta_file, "fasta"):
    protein_desc, species = _split_description(record.description)
    data.append((record.id, protein_desc, species, str(record.seq)))

df = pd.DataFrame(data, columns=["Protein ID", "Protein Description", "Species", "Sequence"])

df.to_csv("Amino_acid_sequence.csv", sep=',', index=False)