In [1]:
import os

import pandas as pd

In [2]:
# Input workbooks, given as paths relative to the current working directory.
# Master list workbook: its 'characterized_TE' sheet is loaded with pd.read_excel.
raw_datafile = 'data/Thioesterase_Master_List_modified.xlsx'
# Regression-label workbook: loaded with pd.read_excel(header=None) and later
# merged with the cleaned enzyme table on the enzyme name.
raw_regression_labelfile = 'data/ThioesteraseProductFractionLabel.xlsx'

In [3]:
# The string below is a note listing the output files this notebook produces;
# it is a bare expression and has no effect on execution.
'''Files needed:
- 1 csv file with enz_alias,enz_sequence
- 3 csv files with enz_alias,enz_labels(multiclass,binary,regression)
- 1 csv file with enz_alias,enz_name mapping
- 1 fasta file with header as enz_alias and second line as sequence.'''

# Load three columns (selected by position) from the 'characterized_TE' sheet,
# reading at most 116 rows. The columns are given descriptive names afterwards.
raw_df = pd.read_excel(
    raw_datafile,
    sheet_name='characterized_TE',
    usecols=[1, 7, 11],
    nrows=116,
)

In [4]:
#assign proper column names
# raw_df holds exactly the three columns picked by position when the sheet was
# read (usecols=[1,7,11]); the labels below are assigned to them positionally.
raw_df.columns = ['enzNames','sequence','label']

#make all the sequences upper and get rid of -
def upSeq(seqString):
    """Return ``seqString`` upper-cased with every '-' character removed.

    Parameters
    ----------
    seqString : str
        Sequence text; it must provide ``upper()`` and ``replace()``.

    Returns
    -------
    str
        The upper-case sequence without any '-' characters.
    """
    upper_cased = seqString.upper()
    without_dashes = upper_cased.replace('-','')
    return without_dashes

# Normalise every sequence through upSeq (upper-case, '-' removed).
raw_df['sequence'] = raw_df['sequence'].apply(upSeq)

#get rid of sequences with values in 'B', 'J', 'O', 'U', 'X' and 'Z'
has_excluded_letter = raw_df['sequence'].str.contains('B|J|O|U|X|Z')
df = raw_df.loc[~has_excluded_letter]

# give enz_alias: enz_0, enz_1, ... numbered by position among the kept rows
enz_alias = [f'enz_{i}' for i in range(df.shape[0])]
df = df.assign(enzAlias=enz_alias)

# create dictionary mapping between alias and enzyme names
enz_mapdict = {
    alias: name
    for alias, name in zip(df['enzAlias'], df['enzNames'])
}

# create binary labels
def binaryLabel(val):
    """Collapse a label into two classes.

    Parameters
    ----------
    val : object
        Label value, compared against 3 with ``==``.

    Returns
    -------
    int
        1 when ``val == 3``, otherwise 0.
    """
    return 1 if val==3 else 0

# Derive the binary label column from the existing 'label' column.
df['binaryLabel'] = df['label'].apply(binaryLabel)

# create regression labels
# The workbook is read without a header row, so its columns are labelled 0, 1, ...
raw_df_reg = pd.read_excel(raw_regression_labelfile, header=None)

# Overwrite the enzyme name (column 0) on three rows before joining on it.
# The original author tagged each of these as "useless modification".
# NOTE(review): presumably these align spellings with the master list — confirm.
name_overrides = {
    114: 'Prunus_sibirica_L._(Siberian apricot)',
    111: 'UcFatB1(R197M, M199H, T231K)',
    112: 'UcFatB1(197M, M199H)',
}
for row_label, enzyme_name in name_overrides.items():
    raw_df_reg.loc[row_label, 0] = enzyme_name

# Join the regression rows to the cleaned enzyme table on the enzyme name,
# which brings in the sequence and alias for every matching name.
alias_lookup = df.loc[:, ['enzNames', 'sequence', 'enzAlias']]
df_reg = raw_df_reg.merge(alias_lookup, left_on=0, right_on='enzNames')

In [5]:
# Create the output folders up front: to_csv() and open(..., 'w') raise an
# OSError when the target directory is missing (e.g. on a fresh checkout),
# which would make a clean Restart-and-Run-All fail at this cell.
for out_dir in ('data/SeqFile', 'data/LabelFiles'):
    os.makedirs(out_dir, exist_ok=True)

# Every CSV below is written as two columns with no header row and no index column.

# save the csv sequence file
df.loc[:,['enzAlias','sequence']].to_csv('data/SeqFile/EnzymeSequence.csv',header=False,index=False)

#save the csv label files
df.loc[:,['enzAlias','label']].to_csv('data/LabelFiles/EnzymeLabelsMultiClass.csv',header=False,index=False)
df.loc[:,['enzAlias','binaryLabel']].to_csv('data/LabelFiles/EnzymeLabelsBinary.csv',header=False,index=False)
# Column 1 of the header-less regression frame is written as the label value.
df_reg.loc[:,['enzAlias',1]].to_csv('data/LabelFiles/EnzymeLabelsRegression.csv',header=False,index=False)

#save the enzyme name mappings
df.loc[:,['enzAlias','enzNames']].to_csv('data/EnzymeNameMap.csv',header=False,index=False)

#createFastafile
# One record per enzyme: a '>' + alias header line followed by the sequence line.
with open('data/SeqFile/EnzymeFasta.fa','w') as f:
    for enzal,enzseq in zip(df.enzAlias,df.sequence):
        f.write(f'>{enzal}\n{enzseq}\n')