In [1]:
import gzip
import os

from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd

# Small Secreted Peptides (SSPs) Finder (with Phobius)

Check the next section to see the reproduction of the Li et al. 2014 paper about SSPs.

The section below uses Phobius as an alternative method for SSP-discovery.

In this case I am using the output TSV file from InterProScan, but Phobius can also be run as a stand-alone tool.

According to the paper, SSPs are defined as:
1. Proteins shorter than 200aa
2. Contain a signal peptide (using signalp)
3. Don't contain transmembrane helices (using TMHMM)

In [108]:
# Load the InterProScan TSV output. The file is read without a header row,
# so every column is addressed by its integer position.
df = pd.read_csv("InterProScan_results.tsv", sep="\t", header=None)

# Step 1: keep only the rows whose column 2 (used here as the protein
# length) is at most 200.
is_short = df[2] <= 200
df = df[is_short]

# Step 2: keep only the rows produced by Phobius (column 3 names the tool).
print("InterProScan tools:", df[3].unique())
# Column 0 holds the gene ID and column 4 the Phobius annotation; column 5
# is carried along only so that it can receive the per-group counts below.
phobius = df.loc[df[3] == "Phobius", [0, 4, 5]]

# Step 3: restrict to the two annotations the SSP definition depends on.
print("Phobius annotations:", phobius[4].unique())
relevant_annotations = ["SIGNAL_PEPTIDE", "TRANSMEMBRANE"]
phobius = phobius[phobius[4].isin(relevant_annotations)]

# Count the rows per (gene ID, annotation) pair. After this step column 5
# holds the number of non-null column-5 entries in each group.
phobius = phobius.groupby([0, 4]).count().reset_index()

# Genes with a TRANSMEMBRANE annotation. The ">= 0" test accepts every
# count, so any gene with at least one transmembrane row is selected.
# Author's variant: use "phobius[5] == 1" here and drop the "~" below if a
# single TM segment is acceptable.
is_transmembrane = phobius[4] == "TRANSMEMBRANE"
selected_genes = phobius[is_transmembrane & (phobius[5] >= 0)]

# Keep the signal-peptide rows whose gene is not in the transmembrane set.
has_signal_peptide = phobius[4] == "SIGNAL_PEPTIDE"
lacks_transmembrane = ~phobius[0].isin(selected_genes[0])
phobius = phobius[has_signal_peptide & lacks_transmembrane]

# Optional extra check (not required): keep only genes with exactly one
# annotated signal peptide via "phobius = phobius[phobius[5] == 1]".

# Write the selected gene IDs, one per line.
phobius[0].to_csv("ref_SSPs.txt", header=False, index=False)

InterProScan tools: ['CDD' 'SUPERFAMILY' 'Pfam' 'ProSiteProfiles' 'PANTHER' 'ProSitePatterns'
 'Gene3D' 'Phobius' 'MobiDBLite' 'PRINTS' 'SMART' 'Coils' 'TMHMM' 'PIRSF'
 'Hamap' 'TIGRFAM' 'SFLD']
Phobius annotations: ['CYTOPLASMIC_DOMAIN' 'NON_CYTOPLASMIC_DOMAIN' 'TRANSMEMBRANE'
 'SIGNAL_PEPTIDE_C_REGION' 'SIGNAL_PEPTIDE_N_REGION' 'SIGNAL_PEPTIDE'
 'SIGNAL_PEPTIDE_H_REGION']


In [109]:
# Save the Phobius-selected SSPs as a TSV table and as a FASTA file.
# NOTE(review): the "original method" section further down writes to the same
# ref_SSPs.tsv / ref_SSPs.fa names, so running both overwrites these files.

# Build the ID lookup once; a set gives constant-time membership tests
# instead of rebuilding a list for every FASTA record.
ssp_ids = set(phobius[0])

od = OrderedDict()

# The TSV holds the gene ID, the last 30 aa of the protein, and the full
# sequence. Context managers make sure both the compressed input and the
# TSV output are flushed and closed when the loop ends.
with gzip.open("protein_seqs.fa.gz", mode='rt') as fasta_file:
    with open("ref_SSPs.tsv", "w") as outFile:
        outFile.write("GeneID\tlast30aa\tSequence\n")

        for record in SeqIO.parse(fasta_file, "fasta"):
            if record.id not in ssp_ids:
                continue
            # Strip every "*" character from the sequence before export.
            record.seq = record.seq.replace("*", "")
            od[record.id] = record
            sequence = str(record.seq)
            # "*" is already gone, so the last 30 residues are simply [-30:]
            # (the previous [-31:-1] slice dropped the final residue).
            outFile.write(record.id + '\t' + sequence[-30:] + '\t' + sequence + "\n")

# Write the same records, in input order, as FASTA.
with open("ref_SSPs.fa", 'w') as handle:
    SeqIO.write(od.values(), handle, 'fasta')

# Small Secreted Peptides (SSPs) Finder (original method)

The code below is intended to reproduce the analysis conducted in -
Li, Y.L., Dai, X.R., Yue, X., Gao, X.-Q. and Zhang, X.S. (2014) Identification of small secreted peptides (SSPs) in maize and expression analysis of partial SSP genes in reproductive tissues. Planta, 240, 713–728. (https://pubmed.ncbi.nlm.nih.gov/25048445/)

According to the paper, SSPs are defined as:
1. Proteins shorter than 200aa
2. Contain a signal peptide (using signalp)
3. Don't contain transmembrane helices (using TMHMM)

In [None]:
# Load the compressed FASTA file, parse it with Biopython, and keep only the
# short sequences.
od = OrderedDict()

# The context manager closes the gzip handle once parsing is finished.
with gzip.open("protein_seqs.fa.gz", mode='rt') as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Keep records whose sequence is at most 200 characters long.
        # NOTE(review): the length includes any "*" present in the sequence.
        if len(record.seq) <= 200:
            od[record.id] = record

# Save the retained records, in input order, to short_seqs.fa so that the
# external prediction tools can read them.
with open("short_seqs.fa", 'w') as handle:
    SeqIO.write(od.values(), handle, 'fasta')

In [None]:
# Use signalp5 (signalp-5.0b.Linux.tar.gz) to predict signal peptides.
# With "-prefix signalp" the summary is expected at signalp_summary.signalp5.
signalp_status = os.system("signalp -fasta short_seqs.fa -prefix signalp")

# Fail loudly instead of letting later steps read a missing or stale
# summary file from a previous run.
if signalp_status != 0:
    raise RuntimeError(
        "signalp returned non-zero status {}; check that it is installed "
        "and that short_seqs.fa exists".format(signalp_status)
    )

In [None]:
# Load the signalp summary table. The first line of the file is skipped so
# that the second line serves as the header; the first column becomes the index.
sp = pd.read_csv(
    "signalp_summary.signalp5",
    sep="\t",
    skiprows=1,
    index_col=0,
)

# Retain only the rows predicted to carry a signal peptide (SP(Sec/SPI)).
has_signal_peptide = sp["Prediction"] == "SP(Sec/SPI)"
spp = sp[has_signal_peptide]

In [None]:
# Use tmhmm (tmhmm-2.0c.Linux.tar.gz) to predict transmembrane domains.
# The short-format output is redirected into a TSV file by the shell.
tmhmm_status = os.system("tmhmm -short short_seqs.fa > tmhmm_short_seqs.tsv")

# Fail loudly: the redirection creates the output file even when tmhmm
# itself fails, which would otherwise yield an empty or partial table.
if tmhmm_status != 0:
    raise RuntimeError(
        "tmhmm returned non-zero status {}; check that it is installed "
        "and that short_seqs.fa exists".format(tmhmm_status)
    )

# Read the tmhmm result table (no header row; the first column is the index).
tm = pd.read_csv("tmhmm_short_seqs.tsv", sep="\t", index_col=0, header=None)
# Keep the rows with 0 predicted helices (column 4 holds the "PredHel=" field).
tmp = tm[tm[4]=="PredHel=0"]

In [None]:
# Set-intersection approach taken from:
# https://stackoverflow.com/questions/5094083/find-the-overlap-between-2-python-lists
# Gene IDs found in both the signalp selection and the tmhmm selection.
signalp_ids = set(spp.index)
tmhmm_ids = set(tmp.index)
intersection = signalp_ids.intersection(tmhmm_ids)

# Report the size of the overlap next to the size of each input table.
print(
    "The intersection has", len(intersection),
    "genes. SignalP:", len(spp),
    "TMHMM:", len(tmp),
)

In [None]:
# Save the SSPs found by the signalp/tmhmm intersection as a TSV table and as
# a FASTA file.
# NOTE(review): the Phobius section above writes to the same ref_SSPs.tsv /
# ref_SSPs.fa names, so running both overwrites these files.
od = OrderedDict()

# The TSV holds the gene ID, the last 30 aa of the protein, and the full
# sequence. Context managers make sure both the compressed input and the
# TSV output are flushed and closed when the loop ends.
with gzip.open("protein_seqs.fa.gz", mode='rt') as fasta_file:
    with open("ref_SSPs.tsv", "w") as outFile:
        outFile.write("GeneID\tlast30aa\tSequence\n")

        for record in SeqIO.parse(fasta_file, "fasta"):
            if record.id not in intersection:
                continue
            od[record.id] = record
            sequence = str(record.seq)
            # Ignore any trailing "*" before taking the last 30 residues.
            # This matches the previous [-31:-1] slice when the sequence ends
            # in "*", and no longer drops a residue when it does not.
            last30aa = sequence.rstrip("*")[-30:]
            # The Sequence column is written unchanged.
            outFile.write(record.id + '\t' + last30aa + '\t' + sequence + "\n")

# Write the same records, in input order, as FASTA.
with open("ref_SSPs.fa", 'w') as handle:
    SeqIO.write(od.values(), handle, 'fasta')