In [None]:
import gzip
import os

from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

# Small Secreted Peptides (SSPs) Finder

The code below is intended to reproduce the analysis conducted in:
Li, Y.L., Dai, X.R., Yue, X., Gao, X.-Q. and Zhang, X.S. (2014) Identification of small secreted peptides (SSPs) in maize and expression analysis of partial SSP genes in reproductive tissues. Planta, 240, 713–728. (https://pubmed.ncbi.nlm.nih.gov/25048445/)

According to the paper, SSPs are defined as:
1. Proteins shorter than 200aa
2. Contain a signal peptide (predicted using SignalP)
3. Do not contain transmembrane helices (predicted using TMHMM)

In [None]:
# Load the compressed FASTA file and parse it using Biopython.
# The `with` block guarantees the gzip handle is closed once parsing is done
# (previously it stayed open for the lifetime of the kernel).
od = OrderedDict()
with gzip.open("protein_seqs.fa.gz", mode='rt') as fasta_file:
    record_iterator = SeqIO.parse(fasta_file, "fasta")
    for record in record_iterator:
        # Keep only proteins of at most 200 aa; longer ones are discarded.
        # NOTE(review): the SSP definition says "shorter than 200aa", while this
        # test also keeps proteins of exactly 200 aa - confirm the intended cutoff.
        if len(record.seq) <= 200:
            # Keyed by record id, so a repeated id keeps only the last record seen.
            od[record.id] = record

# Save the length-filtered sequences to short_seqs.fa, which is the input
# passed to the signalp and tmhmm commands.
with open("short_seqs.fa", 'w') as handle:
    SeqIO.write(od.values(), handle, 'fasta')

In [None]:
# Use signalp5 (signalp-5.0b.Linux.tar.gz) to predict signal peptides
# for the sequences in short_seqs.fa.
signalp_status = os.system("signalp "+" -fasta short_seqs.fa -prefix signalp")
# os.system only reports the exit status; without this check a failed or
# missing signalp would go unnoticed and a stale summary file from an earlier
# run could be read instead.
if signalp_status != 0:
    raise RuntimeError("signalp failed with exit status " + str(signalp_status))

In [None]:
# Load the signalp summary table: the first line of the file is skipped,
# the file is tab-separated, and the first column becomes the row index.
sp = pd.read_csv(
    "signalp_summary.signalp5",
    sep="\t",
    skiprows=1,
    index_col=0,
)
# Retain only the rows whose prediction is a signal peptide (SP(Sec/SPI)).
has_signal_peptide = sp["Prediction"] == "SP(Sec/SPI)"
spp = sp.loc[has_signal_peptide]

In [None]:
# Use tmhmm (tmhmm-2.0c.Linux.tar.gz) to predict transmembrane domains;
# its standard output is redirected into tmhmm_short_seqs.tsv.
tmhmm_status = os.system("tmhmm "+" -short short_seqs.fa > tmhmm_short_seqs.tsv")
# The shell redirect creates the output file even when tmhmm itself fails,
# so check the exit status instead of letting the parse below run on an
# empty or partial table.
if tmhmm_status != 0:
    raise RuntimeError("tmhmm failed with exit status " + str(tmhmm_status))
# Read the tmhmm result table (tab-separated, no header row; the first
# column is used as the index, so the remaining columns are labelled 1, 2, ...).
tm = pd.read_csv("tmhmm_short_seqs.tsv", sep="\t", index_col=0, header=None)
# Keep the rows with 0 predicted helices (column 4 holds the "PredHel=N" field)
tmp = tm[tm[4]=="PredHel=0"]

In [None]:
# https://stackoverflow.com/questions/5094083/find-the-overlap-between-2-python-lists
# IDs present in both filtered tables: the signalp rows kept in `spp`
# and the tmhmm rows kept in `tmp`.
signalp_ids = set(spp.index)
tmhmm_ids = set(tmp.index)
intersection = signalp_ids.intersection(tmhmm_ids)
print("The intersection has", len(intersection), "genes. SignalP:", len(spp), "TMHMM:", len(tmp))

In [None]:
# Re-read the protein FASTA and export every record whose id is in
# `intersection`, as both a FASTA file and a TSV table.
# The TSV also includes a column with the last 30 aa of the protein.
# NOTE(review): this input file name differs from the "protein_seqs.fa.gz"
# used for the length filter - confirm both refer to the same protein set.
od = OrderedDict()

# Both handles are managed by `with` so they are closed on exit; previously
# the TSV handle was never closed, so buffered rows might not reach disk
# while the kernel stayed alive.
with gzip.open("Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2.protein.longest.fa.gz", mode='rt') as fasta_file, \
        open("predicted_SSPs.tsv", "w") as outFile:
    outFile.write("GeneID\tlast30aa\tSequence\n")

    record_iterator = SeqIO.parse(fasta_file, "fasta")
    for record in record_iterator:
        if record.id in intersection:
            od[record.id] = record
            sequence = str(record.seq)
            # Last 30 residues, ignoring any trailing '*' stop marker.
            # The former slice [-31:-1] always dropped the final character,
            # which loses a real residue when the sequence has no trailing '*'.
            last30aa = sequence.rstrip("*")[-30:]
            outFile.write(record.id+'\t'+last30aa+'\t'+sequence+"\n")

# Write the selected records as FASTA.
with open("predicted_SSPs.fa", 'w') as handle:
    SeqIO.write(od.values(), handle, 'fasta')