In [12]:
import pandas as pd
from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Parse the PFAM prediction results and generate a FASTA file

In [7]:
# Load the combined PFAM hit table; the file is comma-separated, which is
# already the delimiter pandas uses by default.
# The extraction cell further down selects rows via the "species name" and
# "gene name" columns, so the CSV is expected to provide both.
# NOTE(review): the saved output of this cell shows the headers
# "Unnamed: 0.1", "Unnamed: 0", "# target name" and "query name" instead --
# confirm the CSV really carries "species name" / "gene name" before re-running.
df = pd.read_csv("PFAM_combined_results.csv")

Unnamed: 0.1,Unnamed: 0,# target name,query name
0,Glycine max,Peptidase_S8,Glyma.16G018900.1.p
1,Glycine max,Peptidase_S8,Glyma.16G019000.2.p
2,Glycine max,Peptidase_S8,Glyma.16G201700.1.p
3,Glycine max,Peptidase_S8,Glyma.16G012700.1.p
4,Glycine max,Peptidase_S8,Glyma.16G182500.1.p
...,...,...,...
311,Vigna unguiculata,Peptidase_S8,Vigun02g110700.1.p
312,Vigna unguiculata,Peptidase_S8,Vigun07g225100.1.p
313,Vigna unguiculata,Peptidase_S8,Vigun07g045300.1.p
314,Vigna unguiculata,Peptidase_S8,Vigun07g161600.1.p


In [6]:
# Lookup table: species name (as it appears in the results table) -> file name
# of that species' protein fasta. The extraction cell prefixes these names
# with "proteomes/" when opening them.
# NOTE(review): the saved output of the load cell also lists
# "Vigna unguiculata", which has no entry here -- add its proteome file
# before running the extraction.
file_names = {}
file_names["Glycine max"] = "Gmax_508_Wm82.a4.v1.protein_primaryTranscriptOnly.fa"
file_names["Arabidopsis thaliana"] = "Athaliana_167_TAIR10.protein_primaryTranscriptOnly.fa"

In [27]:
# Build a fasta file containing the protein sequence of every gene listed in
# the results table.
#
# Reads:  df          -- needs the columns "species name" and "gene name"
#         file_names  -- species name -> proteome fasta file name
#         proteomes/<file name> for each species present in df
# Writes: PFAM_combined_results.fasta
# Raises: KeyError if a species in df has no entry in file_names.

# Maps gene name -> SeqRecord; keying by name means a gene listed more than
# once in the table is written only once.
gene_dict = dict()

# unique() keeps first-appearance order, so the output order is the same on
# every run (iterating a set of strings is not stable across kernels).
species_names = list(df["species name"].unique())

# Fail before parsing any (potentially large) proteome if a species cannot be
# resolved to a file, and say which one is missing.
missing_species = [name for name in species_names if name not in file_names]
if missing_species:
    raise KeyError(
        "No proteome fasta listed in file_names for: "
        + ", ".join(str(name) for name in missing_species)
    )

for species in species_names:
    # Genes requested for this species; a set gives constant-time membership
    # checks while scanning every record of the proteome.
    wanted_genes = set(df.loc[df["species name"] == species, "gene name"])
    # Parse the fasta file using the file_names dictionary
    for record in SeqIO.parse("proteomes/" + file_names[species], "fasta"):
        if record.name in wanted_genes:
            # Re-wrap the record so the fasta header is just the gene name
            # (empty description).
            gene_dict[record.name] = SeqRecord(
                seq=record.seq, id=record.name, description=""
            )

# Write the gene_dict values to a fasta file
with open("PFAM_combined_results.fasta", "w") as handle:
    SeqIO.write(gene_dict.values(), handle, "fasta")