# Preprocessing NetSurfP output to generate input for the CNN model

This notebook contains all the steps needed to convert sequences with secondary structure (SS) predictions from NetSurfP2.0 into the format required as input for GT-CNN fold prediction.
- Read in the <file>.csv file generated by NetSurfP2.0 directly.
- Perform Domain-based filtering of the sequences.
- Add padding so that all sequences and SS predictions are 798 characters long.
- Organize into a csv file to be used as input for next step.

Inputs:
- SS prediction file generated by NetSurfP2.0 (<file>.csv)<br>
    Note: If the SS prediction is done with another tool, convert its output to the same format as the table produced at the end of this notebook and go directly to the next step.<br>
- Domain annotation file<br>
    Generated by running sequences through Batch-CD-Search and processing the output.<br>
    A tab separated file with 3 columns: SeqID | DomainStart | DomainEnd<br>

### Imports, functions and definitions

In [1]:
# import necessary package 
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.patches as patches
from matplotlib.ticker import MultipleLocator
from scipy.stats import norm
from scipy.stats import genextreme
import seaborn as sns
import random
sns.set(style="ticks", color_codes=True)
sns.set_palette("hls")

In [2]:
#### This function is intended to concatenate seq processed by NetSurfP into a single row, needed when new seqs are provided
# a helper function to concatenate seq into a row

def Transfer_Function(Data, val = False, fold_name=False, family_name=False):
    """Collapse per-residue prediction rows into one row per sequence.

    Parameters
    ----------
    Data : pandas.DataFrame
        Table with one row per residue. The columns ``id``, ``q3`` and
        ``seq`` are read; rows that share an ``id`` are concatenated in
        their existing order.
    val : bool, default False
        When truthy, every sequence is labelled with ``fold_name`` and
        ``family_name``. Otherwise both labels are parsed from the ID:
        the family is the text before the first ``|`` and the fold is the
        second ``-``-separated field of the family (``"GT1-A|x"`` gives
        family ``"GT1-A"`` and fold ``"A"``).
    fold_name, family_name
        Labels used when ``val`` is truthy.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A table with columns ``Name, fold, family, q3seq, rawseq`` and a
        single-column table holding the length of each ``q3seq``.

    Raises
    ------
    IndexError
        If ``val`` is falsy and an ID has no ``-`` before its first ``|``.
    """
    columns = ["Name", "fold", "family", "q3seq", "rawseq"]
    records = []
    lengths = []

    # sort=False keeps the sequences in order of first appearance, i.e. the
    # same order as Data['id'].unique(). Rows with a missing id are skipped
    # by groupby. Building plain lists and creating each DataFrame once
    # avoids DataFrame.append, which was removed in pandas 2.0.
    for name, residues in Data.groupby('id', sort=False):
        if val:
            fold = fold_name
            family = family_name
        else:
            family = name.split("|")[0]
            fold = family.split("-")[1]
        q3seq = ''.join(str(state) for state in residues['q3'])
        rawseq = ''.join(str(residue) for residue in residues['seq'])
        records.append([name, fold, family, q3seq, rawseq])
        lengths.append(len(q3seq))

    return pd.DataFrame(records, columns=columns), pd.DataFrame(lengths)

#### The following two functions are intended to cut the sequences based on domain bounds.

## Add domain bound information to the seq file by matching the IDs
def Add_Domain_Bound(Data, Domain_bound_file):
    """Trim each annotated sequence to its domain region.

    Parameters
    ----------
    Data : pandas.DataFrame
        Sequence table whose columns are, in order,
        ``Name, fold, family, q3seq, rawseq``. The output rows are built in
        that order and labelled with ``Data.columns``.
    Domain_bound_file : pandas.DataFrame
        Annotation table with columns ``Full_Sequence_ID``,
        ``Domain_start`` and ``Domain_end``. When an ID occurs more than
        once, the first occurrence is used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The trimmed sequences (rows without an annotation are dropped) and
        a ``fold, family, length`` table with the trimmed ``q3seq`` lengths.
    """
    # One pass over the annotations instead of filtering the whole table
    # twice per sequence; setdefault keeps the first annotation per ID.
    bounds = {}
    for seq_id, start, end in zip(Domain_bound_file['Full_Sequence_ID'],
                                  Domain_bound_file['Domain_start'],
                                  Domain_bound_file['Domain_end']):
        bounds.setdefault(seq_id, (start, end))

    trimmed_rows = []
    length_rows = []
    for _, row in Data.iterrows():
        name = row['Name']
        if name not in bounds:
            continue
        start, end = bounds[name]
        # NOTE(review): the bounds are used directly as a Python slice, so a
        # 1-based inclusive Domain_start would lose the first domain residue.
        # Confirm the coordinate convention of the annotation file.
        q3seq = row['q3seq'][start:end]
        rawseq = row['rawseq'][start:end]
        trimmed_rows.append([name, row['fold'], row['family'], q3seq, rawseq])
        length_rows.append([row['fold'], row['family'], len(q3seq)])

    # Build each frame once; DataFrame.append was removed in pandas 2.0.
    returnDate = pd.DataFrame(trimmed_rows, columns=Data.columns)
    df_seq = pd.DataFrame(length_rows, columns=["fold", "family", "length"])
    return returnDate, df_seq

# Cut sequences based on the domain bounds
def Domain_bound_cutting(Data, threshold):
    """Cut over-long sequences to a window that still contains the domain.

    For every row whose domain (``Domain_end - Domain_start``) is at most
    ``threshold`` long:

    * if the sequence itself is at most ``threshold`` long, it is kept
      unchanged;
    * otherwise a window of ``threshold`` characters is taken from both
      ``q3seq`` and ``rawseq``. The window start is drawn with
      ``random.randint`` from the positions that keep the domain inside
      the window, so results vary between runs unless the ``random``
      module is seeded.

    Rows whose domain is longer than ``threshold`` are dropped.

    Parameters
    ----------
    Data : pandas.DataFrame
        Table whose columns are, in order, ``Name, fold, family,
        Domain_start, Domain_end, q3seq, rawseq``. The output rows are
        built in that order and labelled with ``Data.columns``.
    threshold : int
        Maximum sequence length to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The kept or cut sequences, and a ``fold, family, length`` table
        with the resulting ``q3seq`` lengths. ``Domain_start`` and
        ``Domain_end`` are copied through unchanged, so after cutting they
        still refer to positions in the full-length sequence.
    """
    kept_rows = []
    length_rows = []
    for _, row in Data.iterrows():
        q3seq = row['q3seq']
        rawseq = row['rawseq']
        bound_start = row['Domain_start']
        bound_end = row['Domain_end']

        # The domain itself does not fit: skip the sequence.
        if bound_end - bound_start > threshold:
            continue

        if len(q3seq) > threshold:
            # Earliest window start that still reaches the domain end.
            # When the domain ends before `threshold`, any start from 0 up
            # to the domain start works.
            earliest = bound_end - threshold if bound_end >= threshold else 0
            window_start = random.randint(earliest, bound_start)
            q3seq = q3seq[window_start:window_start + threshold]
            rawseq = rawseq[window_start:window_start + threshold]

        kept_rows.append([row['Name'], row['fold'], row['family'],
                          bound_start, bound_end, q3seq, rawseq])
        length_rows.append([row['fold'], row['family'], len(q3seq)])

    # Build each frame once; DataFrame.append was removed in pandas 2.0.
    returnDate = pd.DataFrame(kept_rows, columns=Data.columns)
    df_seq = pd.DataFrame(length_rows, columns=["fold", "family", "length"])
    return returnDate, df_seq

#### If no domain bound information, use this to cut the sequences

def Cutting(Data, threshold):
    """Keep only the sequences that are at most ``threshold`` long.

    No sequence is shortened: rows whose ``q3seq`` is longer than
    ``threshold`` are dropped.

    Parameters
    ----------
    Data : pandas.DataFrame
        Table whose columns are, in order,
        ``Name, fold, family, q3seq, rawseq``. The output rows are built in
        that order and labelled with ``Data.columns``.
    threshold : int
        Maximum ``q3seq`` length to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The kept rows (re-indexed from 0) and a ``fold, family, length``
        table with their ``q3seq`` lengths.
    """
    kept_rows = []
    length_rows = []
    for _, row in Data.iterrows():
        seq_length = len(row['q3seq'])
        if seq_length > threshold:
            continue
        kept_rows.append([row['Name'], row['fold'], row['family'],
                          row['q3seq'], row['rawseq']])
        length_rows.append([row['fold'], row['family'], seq_length])

    # Build each frame once; DataFrame.append was removed in pandas 2.0.
    returnDate = pd.DataFrame(kept_rows, columns=Data.columns)
    df_length = pd.DataFrame(length_rows, columns=["fold", "family", "length"])
    return returnDate, df_length

#### Add zero padding to sequences to get them to length 798
# a helper function for two way padding

def Zero_Padding(data, maxlength):
    """Pad ``q3seq`` and ``rawseq`` on both sides with ``'-'`` to ``maxlength``.

    The padding is split evenly between the two ends; when the amount is
    odd, the extra ``'-'`` goes on the right. The amount is computed from
    the length of ``q3seq`` and applied to both strings.

    A sequence that is already longer than ``maxlength`` is returned
    unchanged with ``paddings`` set to 0; it is not truncated.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with columns ``Name, fold, family, q3seq, rawseq``.
    maxlength : int
        Target length of the padded strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, fold, family, q3seq, rawseq, paddings``, where
        ``paddings`` is the number of ``'-'`` characters added on the left.
    """
    columns = ["Name", "fold", "family", "q3seq", "rawseq", "paddings"]
    records = []
    for _, row in data.iterrows():
        q3seq = row['q3seq']
        rawseq = row['rawseq']
        # Clamp at 0 so an over-long sequence neither gets a stray trailing
        # '-' nor reports a negative padding count.
        total = max(maxlength - len(q3seq), 0)
        left = total // 2
        right = total - left
        records.append([
            row['Name'], row['fold'], row['family'],
            '-' * left + q3seq + '-' * right,
            '-' * left + rawseq + '-' * right,
            left,
        ])
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

#### Partition data

def Partition(data, maxwordCount=587):
    """Split each sequence into a list of single-character tokens.

    Rows whose ``q3seq`` is longer than ``maxwordCount`` are reported with
    a printed message and left out of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with columns ``Name, fold, family, q3seq, rawseq, paddings``.
    maxwordCount : int, default 587
        Maximum ``q3seq`` length to tokenize.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, fold, family, q3seq, rawseq, q3seqTokens,
        rawseqTokens, paddings``; the two token columns hold Python lists
        with one character per element.
    """
    columns = ['Name', 'fold', 'family', 'q3seq', 'rawseq',
               'q3seqTokens', 'rawseqTokens', "paddings"]
    records = []
    for _, row in data.iterrows():
        if len(row.q3seq) > maxwordCount:
            # Skip the row. Falling through would store the previous row's
            # tokens for it, or fail if it is the first row.
            print("Jump extra-long tokens")
            continue
        records.append([row["Name"], row.fold, row.family, row.q3seq,
                        row.rawseq, list(row.q3seq), list(row.rawseq),
                        row.paddings])
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

In [3]:
def draw_ss(ss, ypos=6, height=.6, xlim=(0,100)):
    """Draw one secondary-structure string as a row of coloured rectangles.

    Each character of ``ss`` becomes a rectangle of width 1 centred on its
    position, added to the current matplotlib axes:

    * ``'E'`` - blue, full ``height``
    * ``'H'`` - red, full ``height``
    * ``'C'`` - black, a thin bar one third of ``height``
    * ``'-'`` - light grey, a thin bar one quarter of ``height``

    Any other character is skipped and leaves a gap in the row.

    Parameters
    ----------
    ss : str
        Secondary-structure string to draw.
    ypos : number, default 6
        Y coordinate of the bottom edge of the full-height rectangles.
    height : number, default 0.6
        Height of the full-height rectangles.
    xlim : tuple, default (0, 100)
        X-axis limits applied to the current axes. The y-axis limits are
        always set to ``(0, 10)``.
    """
    # Thin bars are raised by 3/8 of the row height. The literal 0.375
    # avoids integer division of 3/8 on Python 2.
    thin_offset = height * 0.375
    # symbol -> (fill colour, bar height, vertical offset from ypos)
    styles = {
        'E': ('#0000ff', height, 0),
        'H': ('#ff0000', height, 0),
        'C': ('#000000', height / 3, thin_offset),
        '-': ('#D3D3D3', height / 4, thin_offset),
    }

    ax = plt.gca()
    for p, s in enumerate(ss):
        style = styles.get(s)
        if style is None:
            # Unknown symbol: draw nothing rather than fail on an unbound
            # patch variable.
            continue
        colour, bar_height, offset = style
        ax.add_patch(patches.Rectangle(xy=(p - 0.5, ypos + offset), width=1,
                                       height=bar_height, fc=colour, lw=0))

    # The limits do not depend on the residue, so set them once.
    plt.xlim(xlim)
    plt.ylim(0,10)

def plotSsCam(df,ss='SecStr',xlim=(0,800)):
    """Plot the secondary-structure string of every row of ``df``.

    A new figure is created, ``xlim[1] / 50`` wide and one unit high per
    row. Each row's string (column ``ss``) is drawn with ``draw_ss`` at a
    y position equal to the row's index label, using a bar height of 0.8.
    The y axis gets one major tick per unit and is labelled from the
    ``Name`` column, preceded by two placeholder ``0`` labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Name`` column and the column named by ``ss``.
    ss : str, default 'SecStr'
        Name of the column holding the secondary-structure strings.
    xlim : tuple, default (0, 800)
        X-axis limits; ``xlim[1]`` also determines the figure width.
    """
    n_rows = df.shape[0]
    bar_height = 0.8
    plt.figure(figsize=(xlim[1]/50, n_rows))

    # The index label doubles as the vertical position of the row.
    for row_label in df.index:
        draw_ss(df[ss][row_label], ypos=row_label, height=bar_height)

    plt.xlim(xlim)
    plt.ylim(0, n_rows)

    axis = plt.gca()
    axis.yaxis.set_major_locator(MultipleLocator(1))
    # Two leading placeholders shift the names along the tick sequence.
    tick_labels = [0, 0] + list(df['Name'])
    axis.set_yticklabels(tick_labels)

In [3]:
pwd

'/Users/rtaujale/Dropbox (Edison_Lab@UGA)/Projects/GT_informatics/GT/GT_strML/Github_Folder/GT-CNN/Codes'

### Read csv file produced by netsurfp; IDs need to be edited with family and fold information

In [None]:
# NetSurfP2.0 predictions; Transfer_Function reads the 'id', 'q3' and 'seq' columns of this table.
new_GT = pd.read_csv("../Datasets/all_gtu.netsurfp.csv")
# Domain annotations; merged with the sequence table on 'Name' further down, supplying 'Domain_start' and 'Domain_end'.
domain_file = pd.read_csv("../Datasets/all_gtu.domainAnnotation.csv")

In [8]:
# Collapse the per-residue rows into one row per sequence. With val=True every
# sequence is labelled with the given fold/family instead of labels parsed from
# its ID. The second return value (sequence lengths) is not needed here.
new_GT_seq, _ = Transfer_Function(new_GT, val = True, fold_name="u", family_name='GT-u')

In [6]:
# Number of sequences per family label.
new_GT_seq['family'].value_counts()

GT29-u     627
GT11-u     467
GT106-u    321
GT71-u     306
GT26-u     288
GT53-u     253
GT89-u     232
GT76-u     189
GT48-u     185
GT110-u    172
GT105-u    154
GT101-u    138
GT73-u     118
GT92-u     114
GT44-u     111
GT91-u     110
GT108-u     99
GT42-u      71
GT95-u      60
GT100-u     48
GT109-u     36
GT98-u      35
GT96-u      31
GT102-u     29
GT99-u      26
GT69-u      23
GT103-u     20
GT74-u      15
GT97-u       8
Name: family, dtype: int64

## Read in the transferred file; Start here!!

### Cutting the sequence (domain based or direct)

#### If domain bound file is available

In [23]:
# Merge domain bounds with the sequence and ss info
new_GT_seq2= pd.merge(new_GT_seq, domain_file, on='Name')
new_GT_seq2 = new_GT_seq2[['Name', 'fold', 'family','Domain_start','Domain_end', 'q3seq', 'rawseq']]
new_GT_seq2.shape

(679, 7)

In [24]:
# Display the merged table.
new_GT_seq2

Unnamed: 0,Name,fold,family,Domain_start,Domain_end,q3seq,rawseq
0,GT107-B|AUL15450|B.bronchisepticaA310_,u,GT107-u,1,685,CEEEECHHHHCCCCHHHHHCCCEEEEEECCCCCCCCCCEEEEECCC...,MIELHSSGIARIPYLTELLGAPVTRYRRFWPPGGAAPRAVAGWGAR...
1,GT107-B|ABM49925|B.malleiSAVP1_,u,GT107-u,1,525,CEEECCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCEEEEEEC...,MLLIDERKYSQGIGAVATRNNRGAFERMIRAARAAHPDAEFWLART...
2,GT107-B|AMD49973|B.holmesiiF627_,u,GT107-u,1,676,CEEEECHHHHCCCHHHHHHCCCEEEECCCCCCCCCCEEEECCCCCC...,MIGVFSKGILRIPYLDVFLGQPVQACNRRTPVTGLSAIVGWGMRPS...
3,GT107-B|AHK74629|C.coliRM5611_,u,GT107-u,1,675,CCEEECCHHHHHCHHHHHCCCHHHHHHCCCCCCEEEEECCCHHHHH...,MKFHTTSKKLIKNVKDFYKIVLYKAYKSIGKEDVFVGWGRKNSGLK...
4,GT107-B|AHI64692|B.thailandensisH0587_,u,GT107-u,1,676,CCCCCCCCCCCCCCCCCCCHHHHHCCCCCCCCCCCHHHHHHHHHHH...,MAAPTAPRFDLRGITRGSRLASWLARRSNCLWLDHWSGRAALRVAR...
...,...,...,...,...,...,...,...
674,GT114-u|AWW47291.1|-,u,GT114-u,1,597,CCCEEEEEEECCCCCCCHHHHHHHHHHHHHCCCCEEEEEECHHHHH...,MNQITLILIYSGSSELPQYLRETFEITSRIAKNSRIVFLANQSNYQ...
675,GT113-u|AWY97577.1|-,u,GT113-u,1,350,CCEEEEEECCCCCCCHHHHHHHHHHHHHHHCCCEEEEECCCCCCCC...,MARFLLHLDIGEDFHAGSKAMKDCETVLTKKNYRLLRIHRCEKAKG...
676,GT111-u|AUJ33132.1|-,u,GT111-u,1,254,CCEEEEEEECCCCCCCCCCCEEEEEEECCCCCCCCCCCCCCCCCHH...,MKAEVYIVSHKLVKLPQDRMYVPIQVGTADENFTGFVRDNTGDNIA...
677,GT112-u|AVM45539.1|-,u,GT112-u,1,429,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MPAESLSVNHDIPSPITIEQPAAPVSVPQPVPEQPQQKSIYGKVPE...


### Cut to get only the domain regions

In [26]:
# Work on a copy so the merged table keeps the full-length sequences.
new_GT_seq_cut=new_GT_seq2.copy()
# Slice from Domain_start-1 up to (not including) index Domain_end, i.e. the
# bounds are treated as 1-based and inclusive.
new_GT_seq_cut['q3seq']=new_GT_seq_cut.apply(lambda x: x['q3seq'][(x['Domain_start']-1):x['Domain_end']], axis=1)
new_GT_seq_cut['rawseq']=new_GT_seq_cut.apply(lambda x: x['rawseq'][(x['Domain_start']-1):x['Domain_end']], axis=1)
new_GT_seq_cut

Unnamed: 0,Name,fold,family,Domain_start,Domain_end,q3seq,rawseq
0,GT107-B|AUL15450|B.bronchisepticaA310_,u,GT107-u,1,685,CEEEECHHHHCCCCHHHHHCCCEEEEEECCCCCCCCCCEEEEECCC...,MIELHSSGIARIPYLTELLGAPVTRYRRFWPPGGAAPRAVAGWGAR...
1,GT107-B|ABM49925|B.malleiSAVP1_,u,GT107-u,1,525,CEEECCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCEEEEEEC...,MLLIDERKYSQGIGAVATRNNRGAFERMIRAARAAHPDAEFWLART...
2,GT107-B|AMD49973|B.holmesiiF627_,u,GT107-u,1,676,CEEEECHHHHCCCHHHHHHCCCEEEECCCCCCCCCCEEEECCCCCC...,MIGVFSKGILRIPYLDVFLGQPVQACNRRTPVTGLSAIVGWGMRPS...
3,GT107-B|AHK74629|C.coliRM5611_,u,GT107-u,1,675,CCEEECCHHHHHCHHHHHCCCHHHHHHCCCCCCEEEEECCCHHHHH...,MKFHTTSKKLIKNVKDFYKIVLYKAYKSIGKEDVFVGWGRKNSGLK...
4,GT107-B|AHI64692|B.thailandensisH0587_,u,GT107-u,1,676,CCCCCCCCCCCCCCCCCCCHHHHHCCCCCCCCCCCHHHHHHHHHHH...,MAAPTAPRFDLRGITRGSRLASWLARRSNCLWLDHWSGRAALRVAR...
...,...,...,...,...,...,...,...
674,GT114-u|AWW47291.1|-,u,GT114-u,1,597,CCCEEEEEEECCCCCCCHHHHHHHHHHHHHCCCCEEEEEECHHHHH...,MNQITLILIYSGSSELPQYLRETFEITSRIAKNSRIVFLANQSNYQ...
675,GT113-u|AWY97577.1|-,u,GT113-u,1,350,CCEEEEEECCCCCCCHHHHHHHHHHHHHHHCCCEEEEECCCCCCCC...,MARFLLHLDIGEDFHAGSKAMKDCETVLTKKNYRLLRIHRCEKAKG...
676,GT111-u|AUJ33132.1|-,u,GT111-u,1,254,CCEEEEEEECCCCCCCCCCCEEEEEEECCCCCCCCCCCCCCCCCHH...,MKAEVYIVSHKLVKLPQDRMYVPIQVGTADENFTGFVRDNTGDNIA...
677,GT112-u|AVM45539.1|-,u,GT112-u,1,429,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MPAESLSVNHDIPSPITIEQPAAPVSVPQPVPEQPQQKSIYGKVPE...


### Padding

In [27]:
# Pad every domain sequence on both sides with '-' up to 798 characters.
new_GT_seq_pad = Zero_Padding(new_GT_seq_cut, 798)

In [28]:
# Preview the first padded rows.
new_GT_seq_pad.head()

Unnamed: 0,Name,fold,family,q3seq,rawseq,paddings
0,GT107-B|AUL15450|B.bronchisepticaA310_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,56
1,GT107-B|ABM49925|B.malleiSAVP1_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,136
2,GT107-B|AMD49973|B.holmesiiF627_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,61
3,GT107-B|AHK74629|C.coliRM5611_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,61
4,GT107-B|AHI64692|B.thailandensisH0587_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,61


### Partition the data

In [29]:
# Tokenize the padded sequences; maxwordCount matches the padded length of 798.
new_GT_seq_final = Partition(new_GT_seq_pad, maxwordCount=798)

### Save processed table to csv

In [None]:
# Write the processed table; to_csv also writes the row index as the first column by default.
new_GT_seq_final.to_csv("../ExampleOutputs/allgtu_processed.csv")