## Preface

The purpose of this notebook is to match Populus trichocarpa EMBL gene IDs with their RefSeq counterparts. We use the EMBL peptide library as the BLAST query and the RefSeq proteins as the subject, and then map the BLAST hits to orthogroups. We BLAST against Manihot esculenta, the most closely related plant in our dataset whose genome sequencing is complete and assembled to chromosome level (the alternatives, Hevea brasiliensis and Ricinus communis, are assembled only to scaffold level).

The working directory is the jobs folder.

In [1]:
import os as os
import subprocess

import pandas as pd
import Bio.SeqIO as SeqIO
from Bio import SearchIO

## FASTA inputs

Here we are loading in the Populus trichocarpa EMBL sequences.

In [2]:
# Parse the peptide FASTA into a dict of records keyed by record id.
embl = SeqIO.to_dict(SeqIO.parse("../raw_data/Populus_trichocarpa.Pop_tri_v3.pep.all.fa", "fasta"))

# Record ids, in the dict's iteration order.
embl_ids = list(embl)

# Sequences are collected by walking embl_ids, so embl_seq[k] always belongs to
# embl_ids[k] and the two lists can be indexed side by side.
embl_seq = [embl[record_id].seq for record_id in embl_ids]

## BLAST alignment

BLAST the Populus peptide sequences against the Manihot protein database with blastp. The results are saved to disk because the alignments are computationally intensive.

In [5]:
# Build the protein BLAST database from the Manihot esculenta RefSeq FASTA.
# The database files are written with the prefix ../blast_db/manihot_esculenta,
# so make sure that directory exists first (no error if it already does).
os.makedirs("../blast_db", exist_ok=True)

# check=True raises CalledProcessError when makeblastdb exits non-zero, so a
# failed build cannot go unnoticed. Passing an argument list avoids the shell.
# The cell still evaluates to the exit status (0 on success).
subprocess.run(
    [
        "makeblastdb",
        "-in", "../20200324_genome_analyses/raw_data/protein_fasta/AM_refseq_manihot_esculenta.FAA",
        "-out", "../blast_db/manihot_esculenta",
        "-dbtype", "prot",
    ],
    check=True,
).returncode

0

In [3]:
# Writing one FASTA file per sequence is slow and only has to happen once.
# This cell is safe to re-run: directories that already exist are accepted and
# files that are already on disk are not rewritten (delete the sequence
# directory to force regeneration). It still has to be executed in every
# session because seq_files is used by later cells.
blast_out_dir = "../processed_data/Q-populus_S-manihot"
seq_dir = "../raw_data/populus_trichocarpa_embl_seq"

#construct directory to store blast alignments
# (exist_ok avoids the FileExistsError that os.mkdir raises on a second run)
os.makedirs(blast_out_dir, exist_ok=True)

#write out embl sequences to file to make life easier
os.makedirs(seq_dir, exist_ok=True)

for record in SeqIO.parse("../raw_data/Populus_trichocarpa.Pop_tri_v3.pep.all.fa", "fasta"):
    seq_path = seq_dir + "/" + record.id + ".FAA"
    if not os.path.exists(seq_path):
        SeqIO.write(record, seq_path, "fasta")

# File names (not full paths) of every per-sequence FASTA file.
seq_files = os.listdir(seq_dir)

In [4]:
# Sanity check: the number of per-sequence files, the number of EMBL ids and the
# number of distinct EMBL ids should all agree. Also show one example file name.
counts = (len(seq_files), len(embl_ids), len(set(embl_ids)))
print(*counts, sep="\n")
print(seq_files[0])

73012
73012
73012
PNT54524.FAA


In [6]:
# This cell has already been run once and its outputs are on disk; it only needs
# to be re-run if the query sequences or the BLAST database change.

#run blast alignments and save to disk
db = "../blast_db/manihot_esculenta"
query_prefix = "../raw_data/populus_trichocarpa_embl_seq/"
out_prefix = "../processed_data/Q-populus_S-manihot/"

# Queries whose blastp call exited non-zero. The loop keeps going past them, but
# they are reported at the end instead of being silently ignored.
failed_queries = []

for i in seq_files:
    # Argument list (no shell): file names are passed to blastp verbatim.
    cmd = [
        "blastp",
        "-query", query_prefix + i,
        "-db", db,
        "-outfmt", "7",
        "-out", out_prefix + i + ".txt",
        "-num_threads", "3",
    ]
    if subprocess.run(cmd).returncode != 0:
        failed_queries.append(i)

if failed_queries:
    print(str(len(failed_queries)) + " of " + str(len(seq_files)) + " blastp runs failed, e.g. " + failed_queries[0])

## Extracting BLAST data

In [7]:
#iterate through blast files and save most relevant information
# For every report only the first listed hit and its first HSP are kept.
popu_gene = []
mani_gene = []
ident_pcts = []
aln_spans = []
evalues = []

file_prefix = "../processed_data/Q-populus_S-manihot/"
files = os.listdir(file_prefix)

# (file name, reason) for every report that produced no usable top hit,
# e.g. a query without any hit or a file that could not be parsed.
skipped = []

for i in files:
    try:
        blast_in = SearchIO.read(file_prefix + i, "blast-tab", comments = True)
        top_hsp = blast_in[0][0]
        # Read every field before touching the result lists, so a failure on any
        # of them cannot leave the five parallel lists with different lengths.
        row = (top_hsp.query_id, top_hsp.hit_id, top_hsp.ident_pct, top_hsp.aln_span, top_hsp.evalue)
    except Exception as err:
        # Best effort: skip this report but remember why. Unlike a bare except,
        # this does not swallow KeyboardInterrupt or SystemExit.
        skipped.append((i, repr(err)))
        continue

    popu_gene.append(row[0])
    mani_gene.append(row[1])
    ident_pcts.append(row[2])
    aln_spans.append(row[3])
    evalues.append(row[4])

print(str(len(popu_gene)) + " top hits collected; " + str(len(skipped)) + " of " + str(len(files)) + " files skipped")

In [8]:
# Combine the parallel BLAST lists into one table (one row per query) and cache
# it on disk as a tab-separated file.
blast_columns = ["popu_gene", "mani_gene", "percent_identity", "alignment_length", "evalue"]
blast_rows = list(zip(popu_gene, mani_gene, ident_pcts, aln_spans, evalues))

blast_df = pd.DataFrame(blast_rows, columns=blast_columns)
blast_df.to_csv("../processed_data/populus_blast_results.tsv", sep="\t")

In [9]:
# Reload the cached BLAST table; the first column of the file is the row index.
blast_df = pd.read_csv(
    "../processed_data/populus_blast_results.tsv",
    sep="\t",
    index_col=0,
)

# Relatively stringent cut-off: a hit is accepted only when its e-value is
# strictly below 0.0001.
blast_df["evalue_acceptance"] = blast_df["evalue"].lt(0.0001)

## Matching orthogroup data

In [10]:
# Load the orthogroup table. The file is tab-separated despite its .csv
# extension; read_table uses a tab delimiter by default.
ortho_meta = pd.read_table("../20200324_genome_analyses/metadata/Orthogroups.csv")

In [11]:
# Build a long-format table with one row per Manihot esculenta protein and the
# orthogroup it is listed under. The name brach_ortho is kept because later
# cells reference it, even though the table holds Manihot data.

# Keep the orthogroup label column ("Unnamed: 0") and the Manihot column, and
# drop orthogroups that have no entry for this species.
ortho_filt = ortho_meta[["Unnamed: 0", "AM_refseq_manihot_esculenta"]].dropna()

#match proteins to orthogroups
orthogroup = []
protein = []

# Each Manihot cell is a ", "-separated list of protein ids. Iterating the two
# columns directly avoids a positional .iloc lookup for every single cell.
for group_id, members in zip(ortho_filt["Unnamed: 0"], ortho_filt["AM_refseq_manihot_esculenta"]):
    for member in members.split(", "):
        orthogroup.append(group_id)
        protein.append(member)

brach_ortho = pd.DataFrame(list(zip(orthogroup, protein)),
                          columns = ["orthogroup", "mani_gene"])

In [73]:
# Attach an orthogroup to every BLAST top hit. pd.merge performs an inner join
# by default, so hits whose Manihot protein does not appear in brach_ortho are
# dropped at this step.
blast_df2 = blast_df.merge(brach_ortho, on="mani_gene")

# Hits that failed the e-value cut-off get a placeholder instead of an orthogroup.
failed_cutoff = ~blast_df2["evalue_acceptance"]
blast_df2.loc[failed_cutoff, "orthogroup"] = "not_in_manihot"

#have gene names that are compatible with calabrese et al 2019 names
entrez = pd.read_table("../metadata/Populus_trichocarpa.Pop_tri_v3.51.ena.tsv", sep="\t")
entrez["gene_stable_id"] = (
    entrez["gene_stable_id"]
    .str.replace("POPTR_", "Potri.")
    .str.replace("v3", "")
)

# Columns 2 and 3 are gene_stable_id and transcript_stable_id.
entrez = entrez.iloc[:, [2, 3]]

# Link each BLAST query id to its gene through the transcript_stable_id column.
blast_df2 = blast_df2.merge(entrez, left_on="popu_gene", right_on="transcript_stable_id")

blast_df2.to_csv("../processed_data/20210718_populus_blast_with_orthogroup.tsv", sep="\t")

In [74]:
# Render the gene/transcript id mapping table to inspect the renamed gene ids.
display(entrez)

Unnamed: 0,gene_stable_id,transcript_stable_id
0,ENSRNA049911435,ENSRNA049911435-T1
1,ENSRNA049911436,ENSRNA049911436-T1
2,ENSRNA049911437,ENSRNA049911437-T1
3,ENSRNA049911438,ENSRNA049911438-T1
4,ENSRNA049911439,ENSRNA049911439-T1
...,...,...
74019,Potri.T181700,PNS13733
74020,Potri.T181800,PNS13731
74021,Potri.T181900,PNS13732
74022,Potri.T182000,PNS13730
