In [1]:
import ast
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO

In [2]:
from deepcoil import DeepCoil
from deepcoil.utils import plot_preds

In [3]:
# FASTA file holding the input protein sequences.
input_file = "/ebio/abt1_share/prediction_hendecads/data/new_prot_fam_data/final_dataset.fasta"

# Read every record into memory inside a `with` block so the file handle is
# closed deterministically instead of being left open for the kernel's lifetime.
with open(input_file) as fasta_handle:
    fasta_sequences = list(SeqIO.parse(fasta_handle, 'fasta'))

# Total number of records; used for progress reporting further down.
n_seq = len(fasta_sequences)

In [4]:
# Build (or load) the table with one row per annotated stretch:
#   id          - record id of the parent sequence
#   seq         - full parent sequence
#   stretch_ix  - the stretch's index pair as parsed from the description
#   stretch_seq - the slice seq[start:end]
#
# NOTE(review): no cell visible in this notebook writes the cache file below, so
# the `else` branch is recomputed on every fresh run unless the CSV was produced
# elsewhere. When loaded from CSV, `stretch_ix` is a string such as "[1, 71]"
# rather than a list -- confirm downstream code tolerates both.
stretches_csv = '/ebio/abt1_share/prediction_hendecads/data/stretches.csv'

if os.path.exists(stretches_csv):
    df = pd.read_csv(stretches_csv)

else:
    pattern = r'\[\[.*?\]\]'  # pattern to extract stretches

    # Collect plain dicts and build the frame once at the end: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    records = []

    for seq_ix, seq in enumerate(fasta_sequences):

        print(f"Processing sequence {seq_ix+1}/{n_seq}", end='\r')

        s = str(seq.seq)
        d = str(seq.description)

        # The last '|||'-separated field of the description carries a nested
        # list literal such as [[1, 71], ...]. literal_eval parses it without
        # executing arbitrary code that a malformed header could contain.
        stretches = ast.literal_eval(re.findall(pattern, d.split('|||')[-1])[0])

        for st in stretches:
            records.append({'id': seq.id, 'seq': s, 'stretch_ix': st, 'stretch_seq': s[st[0]:st[1]]})

    # Passing `columns` keeps the expected schema even when no stretch was found.
    df = pd.DataFrame(records, columns=['id', 'seq', 'stretch_ix', 'stretch_seq'])

df.head()

Unnamed: 0,id,seq,stretch_ix,stretch_seq
0,MCD6041253.1,Mrlvyvavaailcsfsttslagaektakragkfvektatragkfve...,"[1, 71]",rlvyvavaailcsfsttslagaektakragkfvektatragkfver...
1,MCD7737945.1,MQGRVFFREAAALILAAALSMAGLPASAAANSGIEAAALRTeeete...,"[41, 238]",eeetepstkeavqetavetdtgekpesgedgqeesaesteeeqeed...
2,MYF28459.1,Merlqtdllkeihalrgemhaefasvrqemhagfasirqemhaeta...,"[1, 73]",erlqtdllkeihalrgemhaefasvrqemhagfasirqemhaetas...
3,WP_168920948.1,MSDVFLTASYADREKVKTLGARWNPAEKRWYVPSGRDLSPFAAWLP...,"[437, 540]",aqslvveikhaasqqlllarhvvparmaevtaegrqalrtakaqsq...
4,WP_026306873.1,MLLRRIARPLLSAAFIAEGIDILQNPGPLADRLSPALDFTRRRSQH...,"[172, 342]",slgwrgrraardakdhaealaataaaiaatarergtnlvdtarert...


In [5]:
# Destination FASTA: one record per stretch, with the header built from the
# sequence id and the stretch index pair (spaces stripped).
out_file = '/ebio/abt1_share/prediction_hendecads/tmp/stretches.fasta'

with open(out_file, 'w') as f:
    # Walk the three needed columns in lockstep with the index labels.
    stretch_rows = zip(df.index, df['id'], df['stretch_ix'], df['stretch_seq'])

    for i, seq_id, stretch_ix, stretch_seq in stretch_rows:

        # Carriage-return progress line, overwritten on each iteration.
        print(f"Writing sequence {i+1}/{len(df)}", end='\r')

        f.write(f">{seq_id}_{str(stretch_ix).replace(' ', '')}\n{stretch_seq}\n")

Writing sequence 40590/40590

In [24]:
# Score every stretch with DeepCoil in fixed-size batches and store the mean
# per-residue 'cc' value of each stretch in df['avg_cc'].
BATCH_SIZE = 1000  # sequences handed to DeepCoil per predict() call
scores_csv = '/ebio/abt1_share/prediction_hendecads/data/stretches_w_values.csv'

dc = DeepCoil(use_gpu=False)
inp = {str(entry.id): str(entry.seq) for entry in SeqIO.parse(out_file, "fasta")}

# Scores are written back to df by row position, so the FASTA must contain
# exactly one uniquely named record per row. Duplicate ids collapse in the dict
# above and would silently shift every later score onto the wrong row.
if len(inp) != len(df):
    raise ValueError(
        f"{out_file} yielded {len(inp)} unique ids but df has {len(df)} rows; "
        "cannot align predictions with rows"
    )

# Materialise the key order once instead of rebuilding the list every batch.
keys = list(inp)

# Make sure the target column exists so it can be addressed by position.
if 'avg_cc' not in df.columns:
    df['avg_cc'] = np.nan
avg_cc_col = df.columns.get_loc('avg_cc')

for i in range(0, len(keys), BATCH_SIZE):
    print(f"Processing sequences {i+1}-{min(i+BATCH_SIZE, len(df))}", end='\r')

    tmp_keys = keys[i:i+BATCH_SIZE]
    tmp_inp = {k: inp[k] for k in tmp_keys}

    preds = dc.predict(tmp_inp)

    # Look each result up by the key that was submitted rather than relying on
    # the iteration order of the returned mapping.
    # NOTE(review): assumes predict() returns one entry per input key -- confirm.
    averages = [np.mean(preds[key]['cc']) for key in tmp_keys]

    # Positional write: independent of df's index labels and of the inclusive
    # end point of label-based .loc slices.
    df.iloc[i:i+len(tmp_keys), avg_cc_col] = averages

    # Checkpoint after every batch so a crash does not lose finished work.
    df.to_csv(scores_csv, index=False)

Processing sequences 1-1000



Processing sequences 25001-26000

In [23]:
# Inspection cell: size of a label-based slice starting at the batch offset `i`
# left over from the prediction loop. `.loc` slices include both end labels.
avg_cc_window = df.loc[i:i + 1000, 'avg_cc']
len(avg_cc_window)

1001

In [None]:
# Histogram of the per-stretch average 'cc' value stored in df['avg_cc'].
ax = df['avg_cc'].hist(bins=25, edgecolor='black', grid=False)
ax.set_xlabel('Avg. CC Probability')

plt.show()