In [1]:
import numpy as np
import pandas as pd
import pickle
from Bio.Seq import Seq

In [2]:
# Read in generated SPs
# The file is consumed in fixed-size records of RECORD_LINES lines: the last
# line of each record closes it (its own text is discarded), and blank lines
# inside a record are skipped. Each record becomes one list of fields.
RECORD_LINES = 7
FIELD_NAMES = ['seqID', 'prot_seq', '75', '90', '95', '99']

with open('data/prot_sps_bs5.txt', 'r') as f:
    seqs = f.readlines()

entries = []
entry = []
for i, line in enumerate(seqs):
    line = line.strip()
    if i % RECORD_LINES == RECORD_LINES - 1:
        # Closing line of the record: store what was collected and start over.
        entries.append(entry)
        entry = []
    elif line:
        entry.append(line)

# If the file ends without the closing line of its final record, the loop
# above never stores that record. Keep it when it is complete; an incomplete
# trailing record is still dropped, as before.
if len(entry) == len(FIELD_NAMES):
    entries.append(entry)
    entry = []

df = pd.DataFrame(entries)
df.columns = FIELD_NAMES
# NOTE(review): writing the legacy .xls format needs the xlwt engine, which
# newer pandas releases no longer support — confirm the installed version or
# switch to .xlsx. Path left unchanged so downstream consumers are unaffected.
df.to_excel("data/sp_prot_translations.xls")

In [3]:
# Protein sequences: every `prot_seq` value with its first and last
# character removed.
prots = [seq[1:-1] for seq in df['prot_seq'].values]

# SP strings: the four threshold columns read row by row, each value with its
# trailing six characters removed, gathered into one flat array.
sps = df[['75', '90', '95', '99']].values
sps = np.array([cell[:-6] for cell in sps.ravel()])

# Every SP paired with every protein (proteins vary slowest), once joined
# directly and once with a 'GA' linker in between.
sp_prots = np.array([sp + prot for prot in prots for sp in sps])
sp_ga_prots = np.array([sp + 'GA' + prot for prot in prots for sp in sps])

In [4]:
# Total number of SP strings in the flattened `sps` array.
len(sps)

164

In [5]:
# Write FASTA file for input to SignalP 5.0
import os  # not used in this cell; kept in case later cells rely on it

# Records beyond this count are diverted to the overflow file.
# NOTE(review): presumably a SignalP per-submission limit — confirm.
MAX_MAIN_ENTRIES = 5000

entries = 0  # number of records written to the main FASTA file so far
ga_entries = dict()  # FASTA ID -> sequence, for every record (main and overflow)

# Both handles are owned by the `with`, so they are closed even if a write
# fails part-way. Opening in 'w' mode truncates any previous output.
with open('outputs/sp_overflow_GA_appended.fasta', 'w') as overflow, \
        open('outputs/sp_ga_appended_prots.fasta', 'w') as f:
    for i, prot in enumerate(prots):
        for j, sp in enumerate(sps):
            # ID layout: sp_<j//4 + 1>_<j%4 + 1>_prot_<i + 1>_ga (all 1-based).
            # Assumes `sps` holds four consecutive entries per source row, as
            # built from the four threshold columns in an earlier cell.
            seq_id = 'sp_{}_{}_prot_{}_ga'.format(j // 4 + 1, j % 4 + 1, i + 1)

            # NOTE(review): IDs and file names say "ga", but the sequence is
            # sp + prot with no 'GA' linker (an earlier cell builds
            # sp + 'GA' + prot as `sp_ga_prots`) — confirm which is intended.
            sequence = sp + prot

            if entries < MAX_MAIN_ENTRIES:
                target = f
                entries += 1
            else:
                target = overflow

            target.write('>' + seq_id + '\n')
            target.write(sequence + '\n')

            ga_entries[seq_id] = sequence

# Persist the ID -> sequence lookup; the handle is closed by the `with`.
with open('outputs/ga_entries_dict.p', 'wb') as pickle_file:
    pickle.dump(ga_entries, pickle_file)

### The FASTA files are run through SignalP; its results are then loaded here

In [6]:
from glob import glob

df_list = []

# glob returns matches in arbitrary order; sort so the row order of the
# concatenated table (and of the CSV written below) is reproducible.
signalp_files = sorted(glob("data/signalp_stats/*.txt"))
if not signalp_files:
    # pd.concat would raise a ValueError here anyway; say why.
    raise ValueError("No SignalP summary files match data/signalp_stats/*.txt")

# Read in glob data, format correctly
# NOTE(review): this loop rebinds `f` and `df` (names used by earlier cells);
# kept as-is so any later cell relying on them is unaffected — consider
# renaming.
for f in signalp_files:
    # After reset_index() the first data row holds the real column names:
    # promote it to the header, then drop it from the data.
    # NOTE(review): presumably each file starts with a banner line above the
    # column-name line — confirm against the SignalP summary format.
    df = pd.read_table(f).reset_index()
    df = df.rename(columns=df.iloc[0]).drop(df.index[0])
    df_list.append(df)

# Concat to master df
signalp_df = pd.concat(df_list)

# Convert appropriate columns to floats:
float_cols = ['SP(Sec/SPI)', 'TAT(Tat/SPI)', 'LIPO(Sec/SPII)','OTHER',]
float_dic = {i:'float' for i in float_cols}
signalp_df = signalp_df.astype(float_dic)

# sp_prob is the complement of the OTHER column.
signalp_df['sp_prob'] = 1 - signalp_df['OTHER']

signalp_df.to_csv('outputs/signalp_stats.df.csv')

In [7]:
# Display the concatenated SignalP table (long frames are shown truncated).
signalp_df

Unnamed: 0,# ID,Prediction,SP(Sec/SPI),TAT(Tat/SPI),LIPO(Sec/SPII),OTHER,CS Position,sp_prob
1,sp_33_1_prot_33_ga,OTHER,0.011738,0.001768,0.001774,0.984719,,0.015281
2,sp_24_3_prot_21_ga,OTHER,0.169062,0.002067,0.032688,0.796183,,0.203817
3,sp_24_4_prot_21_ga,OTHER,0.229497,0.002131,0.057309,0.711063,,0.288937
4,sp_12_2_prot_39_ga,OTHER,0.115341,0.006321,0.323075,0.555262,,0.444738
5,sp_12_2_prot_19_ga,OTHER,0.144268,0.002467,0.399141,0.454124,,0.545876
...,...,...,...,...,...,...,...,...
107,sp_33_2_prot_33_ga,LIPO(Sec/SPII),0.442475,0.009517,0.547412,0.000595,CS pos: 18-19. AAG-CG. Pr: 0.5510,0.999405
108,sp_26_2_prot_26_ga,SP(Sec/SPI),0.969920,0.000231,0.029331,0.000518,CS pos: 19-20. AQA-AT. Pr: 0.8226,0.999482
109,sp_13_3_prot_5_ga,SP(Sec/SPI),0.997570,0.001009,0.001121,0.000300,CS pos: 26-27. AHA-ET. Pr: 0.9319,0.999700
110,sp_25_2_prot_25_ga,LIPO(Sec/SPII),0.002732,0.000205,0.996805,0.000258,CS pos: 17-18. VAG-CS. Pr: 0.9979,0.999742
