In [1]:
import pandas as pd

In [2]:
# Training sequences CSV; loaded further down with pd.read_csv(..., header=None).
raw_train_datafile = '../data/LatestData/TE_Seq_Truncated_1-27-21.csv'
# Test (candidate) sequences CSV; loaded the same way as the training file.
raw_test_datafile = '../data/LatestData/C10TE_candidates_koehneana_lanceolata.csv'
# Excel workbook holding the regression labels; loaded with pd.read_excel(..., header=None).
raw_regression_labelfile = '../data/ThioesteraseProductFractionLabel.xlsx'

In [3]:
'''Files needed:
- 1 csv file with enz_alias,enz_sequence
- 1 csv file with test_enz_alias,test_enz_sequence
- 3 csv files with enz_alias,enz_labels(multiclass,binary,regression)
- 1 csv file with enz_alias,enz_name mapping
- 1 csv file with test_enz_alias,test_enz_name mapping
- 1 fasta file with header as enz_alias and second line as sequence
- 1 fasta file with header as test_enz_alias and second line as test_enz_sequence'''

# header=None makes pandas treat the first row as data and label the columns
# with integers; descriptive column names are assigned in the next cell.
raw_train_df, raw_test_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (raw_train_datafile, raw_test_datafile)
)

In [4]:
# Give both raw frames the same three column names (train first, then test).
raw_train_df.columns = raw_test_df.columns = ['enzNames','sequence','label']

#make all the sequences upper and get rid of -
def upSeq(seqString):
    """Return ``seqString`` upper-cased with every '-' character removed."""
    uppercased = seqString.upper()
    # Splitting on '-' and re-joining the pieces drops every dash.
    return ''.join(uppercased.split('-'))

# Normalise the sequences of both raw frames in place (upper-case, no '-').
raw_train_df['sequence'] = raw_train_df['sequence'].apply(upSeq)
raw_test_df['sequence'] = raw_test_df['sequence'].apply(upSeq)

# Keep only the rows whose sequence contains none of B, J, O, U, X or Z.
train_df = raw_train_df[~raw_train_df['sequence'].str.contains('B|J|O|U|X|Z')]
test_df = raw_test_df[~raw_test_df['sequence'].str.contains('B|J|O|U|X|Z')]

# Aliases are numbered by position in the *filtered* frames, so they are
# contiguous even though the frames keep their original index labels.
enz_alias_train = [f'enz_{i}' for i in range(len(train_df))]
train_df = train_df.assign(enzAlias=enz_alias_train)
enz_alias_test = [f'test_enz_{i}' for i in range(len(test_df))]
test_df = test_df.assign(enzAlias=enz_alias_test)

# Lookup tables from alias back to the original enzyme name.
enz_train_mapdict = {
    alias: name for alias, name in zip(train_df['enzAlias'], train_df['enzNames'])
}
enz_test_mapdict = {
    alias: name for alias, name in zip(test_df['enzAlias'], test_df['enzNames'])
}

# create binary labels
def binaryLabel(val):
    """Collapse a class label to a binary flag: 1 when ``val`` equals 3, otherwise 0."""
    return 1 if val==3 else 0

# Derive the binary label column from the multiclass 'label' column.
train_df['binaryLabel'] = train_df['label'].apply(binaryLabel)

In [5]:
# Preview the first rows to check that the enzAlias and binaryLabel columns were added.
train_df.head()

Unnamed: 0,enzNames,sequence,label,enzAlias,binaryLabel
0,A._hypogaea_l._(AhFatA),MAVGFGYPMNRVLSVRAIVSDRDGAVVNRVGAEAGTLADRLRLGSL...,1,enz_0,0
1,Arabidopsis_thaliana,MLPDWSMLLAAITTIFLAAEKQWMMLDWKPRRSDMLVDPFGIGRIV...,1,enz_1,0
2,Auxenochlorella_protothecoides,MVENKRVFLEEHRIRGNEAGPSQHVTIAAVANILQEAAGNHAVAMW...,1,enz_2,0
3,Brassica_juncea_(BjFatB1),MLPDWSMLLAAITTVFLAAEKQWMMLDWKPRRSDVIMYPFGLGRIV...,1,enz_3,0
4,Brassica_juncea_(BjFatB2),MLPDWSMLLAAITTVFLAAEKQWMMLDWKPRRSDMIMDPFGLGRIV...,1,enz_4,0


In [6]:
# create regression labels
raw_df_reg = pd.read_excel(raw_regression_labelfile, header=None)

# Overwrite the enzyme name (column 0) of the rows labelled 111 and 112.
# The original author tagged both overrides as "useless modification".
# NOTE(review): the row labels are hard-coded, so these lines silently patch
# the wrong rows if the spreadsheet is ever reordered; the new names look
# like they were chosen to match the names stored in train_df -- confirm.
raw_df_reg.loc[111, 0] = 'UcFatB1(R197M,'
raw_df_reg.loc[112, 0] = 'UcFatB1(197M,'

# Inner join on the enzyme name: sheet rows without a matching train_df
# entry (and train_df rows without a label) are dropped from df_reg.
df_reg = pd.merge(
    raw_df_reg,
    train_df[['enzNames', 'sequence', 'enzAlias']],
    how='inner',
    left_on=0,
    right_on='enzNames',
)

In [7]:
# Names in the regression sheet that have no counterpart in train_df.
set(raw_df_reg[0]) - set(train_df.enzNames)

{'Cuphea_inflata_(Ci1FatB1)', 'Sorghum_bicolor_3', 'Ulmus_americana'}

In [8]:
# Names in train_df that have no counterpart in the regression sheet.
set(train_df.enzNames) - set(raw_df_reg[0])

{'Cuphea_viscosissima_(CvFatB1)', 'Cuphea_viscosissima_(CvFatB2)'}

In [9]:
# Display the merged table: regression-sheet columns 0 and 1 joined to enzNames, sequence and enzAlias.
df_reg

Unnamed: 0,0,1,enzNames,sequence,enzAlias
0,Cuphea_palustris_(CpFatB1),0.980000,Cuphea_palustris_(CpFatB1),MLLTAITTVFVAPEKRWTMFDRKSKRPNMLMDSFGLERVVQDGLVF...,enz_23
1,Iris_germanica_1,0.050000,Iris_germanica_1,MLPDWSVLLAAITTIFLAAEKQWTLIDWKRGGPDMLTDAFGLGKII...,enz_81
2,Iris_germanica_2,0.090000,Iris_germanica_2,MLPDWSVLLAAITTIFLAAEKQWTLIDWKRGGPDMLSDAFGLPKII...,enz_82
3,Sorghum_bicolor_1,0.060000,Sorghum_bicolor_1,MLPDWSMLLAAVTTIFLAAEKQWTLLDWKPKKPDMLVDTFGFGRII...,enz_107
4,Sorghum_bicolor_2,0.100000,Sorghum_bicolor_2,MLPDWSMLLAAITTIFLAAEKQWTMLDWKPRRPDMLTDTFGFGRII...,enz_108
...,...,...,...,...,...
108,"UcFatB1(R197M,",0.091167,"UcFatB1(R197M,",MLEWKPKLPQLLDDHFGLHGLVFRRTFAIRSYEVGPDRSTSILAVM...,enz_111
109,"UcFatB1(197M,",0.405450,"UcFatB1(197M,",MLEWKPKLPQLLDDHFGLHGLVFRRTFAIRSYEVGPDRSTSILAVM...,enz_110
110,UcFatB1(T231K),0.827146,UcFatB1(T231K),MLEWKPKLPQLLDDHFGLHGLVFRRTFAIRSYEVGPDRSTSILAVM...,enz_112
111,Prunus_sibirica_L._(Siberian,0.006725,Prunus_sibirica_L._(Siberian,MLPDWSVLLAAITTIFLAAEKQWTMLDWKPKRPDMLIDPFGLGRIV...,enz_87


In [10]:
# All CSV exports share one shape: selected columns, no header row, no index.
def _save_columns(frame, columns, csv_path):
    """Write ``columns`` of ``frame`` to ``csv_path`` without header or index.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table.
    columns : list
        Column labels to export, in output order.
    csv_path : str
        Destination file; its parent directory must already exist.
    """
    # header/index are documented as booleans; the explicit False replaces
    # the former None, which only worked because None is falsy.
    frame.loc[:, columns].to_csv(csv_path, header=False, index=False)


# save the csv sequence file
_save_columns(train_df, ['enzAlias','sequence'], '../data/SeqFile/EnzymeSequence.csv')
_save_columns(test_df, ['enzAlias','sequence'], '../data/SeqFile/TestEnzymeSequence.csv')

#save the csv label files
_save_columns(train_df, ['enzAlias','label'], '../data/LabelFiles/EnzymeLabelsMultiClass.csv')
_save_columns(train_df, ['enzAlias','binaryLabel'], '../data/LabelFiles/EnzymeLabelsBinary.csv')
_save_columns(df_reg, ['enzAlias',1], '../data/LabelFiles/EnzymeLabelsRegression.csv')

#save the enzyme name mappings
_save_columns(train_df, ['enzAlias','enzNames'], '../data/EnzymeNameMap.csv')
_save_columns(test_df, ['enzAlias','enzNames'], '../data/TestEnzymeNameMap.csv')

def _write_fasta(fasta_path, aliases, sequences):
    """Write one two-line FASTA record per (alias, sequence) pair.

    Each record is a '>' + alias header line followed by the sequence on a
    single line. ``aliases`` and ``sequences`` are iterated in parallel and
    are expected to yield strings.
    """
    with open(fasta_path, 'w') as fasta_file:
        fasta_file.writelines(
            '>' + alias + '\n' + sequence + '\n'
            for alias, sequence in zip(aliases, sequences)
        )


#createFastafileTrain
_write_fasta('../data/SeqFile/EnzymeFasta.fa', train_df.enzAlias, train_df.sequence)

#createFastafileTest
_write_fasta('../data/SeqFile/TestEnzymeFasta.fa', test_df.enzAlias, test_df.sequence)