In [1]:
import pandas as pd

In [2]:
# Excel workbook holding the raw thyme enzyme names and sequences
# (path is relative to the notebook's working directory).
raw_thyme_datafile = "../data/raw/thyme_dataset_truncV2.xlsx"

In [5]:
# Files needed:
# - 1 csv file with thyme_enz_alias,thyme_enz_name mapping
# - 1 csv file with thyme_enz_alias,thyme_enz_sequence (from signal peptides)
# - 1 csv file with thyme_enz_alias,thyme_enz_sequence (from Mike's truncated list)
# - 1 fasta file with header as thyme_enz_alias and second line as thyme_sequence (from signal peptides)
# - 1 fasta file with header as thyme_enz_alias and second line as thyme_sequence (from Mike's truncated list)

# Load both sheets with a single pass over the workbook: when sheet_name is a
# list, read_excel returns a dict of DataFrames keyed by sheet name.
# header=None makes pandas treat the first row of each sheet as data, so the
# resulting columns are simply numbered 0, 1, ...
thyme_sheet_names = ['thyme_dataset_signalP5.0', 'thyme_dataset_MAJ_trunc']
thyme_sheets = pd.read_excel(raw_thyme_datafile, sheet_name=thyme_sheet_names, header=None)

raw_thyme_df1 = thyme_sheets['thyme_dataset_signalP5.0']  # signal-peptide sheet
raw_thyme_df2 = thyme_sheets['thyme_dataset_MAJ_trunc']   # truncated-list sheet

In [6]:
# Pair the two sheets row-by-row on their positional index. The join is an
# inner join (the merge default), so a row present in only one sheet is dropped.
raw_thyme_df = pd.merge(
    raw_thyme_df1,
    raw_thyme_df2,
    how='inner',
    left_index=True,
    right_index=True,
)

In [7]:
# Give the merged frame readable column names: the first name/sequence pair
# comes from raw_thyme_df1, the second pair from raw_thyme_df2.
raw_thyme_df.columns = [
    'enzNames1',
    'sequence1',
    'enzNames2',
    'sequence2',
]

# normalise a sequence string: strip '-' characters and upper-case it
def upSeq(seqString):
    """Return ``seqString`` upper-cased with every '-' character removed.

    Parameters
    ----------
    seqString : str
        Sequence text, possibly lower-case and possibly containing '-'.

    Returns
    -------
    str
        The same sequence in upper case without any '-' characters.
    """
    gap_free = seqString.replace('-', '')
    return gap_free.upper()

# Normalise both sequence columns with upSeq (upper-case, '-' removed).
for seq_col in ('sequence1', 'sequence2'):
    raw_thyme_df[seq_col] = raw_thyme_df[seq_col].apply(upSeq)

# Drop rows whose sequence contains any of the letters B, J, O, U, X or Z.
# The filter runs in two steps: first on sequence1, then on sequence2 of the
# rows that survived the first step.
has_excluded_letter_seq1 = raw_thyme_df['sequence1'].str.contains('B|J|O|U|X|Z')
thyme_df_raw = raw_thyme_df.loc[~has_excluded_letter_seq1]

has_excluded_letter_seq2 = thyme_df_raw['sequence2'].str.contains('B|J|O|U|X|Z')
thyme_df = thyme_df_raw.loc[~has_excluded_letter_seq2]

# Assign each remaining row a sequential alias: thyme_enz_0, thyme_enz_1, ...
# The numbering follows row position, not the (now gappy) original index.
enz_alias_thyme = [f'thyme_enz_{i}' for i in range(thyme_df.shape[0])]
thyme_df = thyme_df.assign(enzAlias=enz_alias_thyme)

# Lookup table: alias -> enzyme name (taken from the enzNames1 column).
enz_thyme_mapdict = {
    alias: enz_name
    for alias, enz_name in zip(thyme_df['enzAlias'], thyme_df['enzNames1'])
}

In [8]:
# Preview the first five rows of the cleaned, aliased table.
thyme_df.head(n=5)

Unnamed: 0,enzNames1,sequence1,enzNames2,sequence2,enzAlias
0,Uncharacterized_protein__ECO_0000313_EMBL_EMT1...,MAGSVASGFFPTPGSSPAASARGSKNMSGELPESLSVRGMVAKPNT...,Uncharacterized protein {ECO:0000313|EMBL:EMT1...,MHIEIYRYPAWGDVVEIETWCQSEGRIGTRRDWILKDIANAEVTGC...,thyme_enz_0
1,Uncharacterized_protein__ECO_0000313_EMBL_EMT1...,MAGSIAAAAFFPGPPAPPPPKSALGERPDSLDVRGMAAKQASSSSA...,Uncharacterized protein {ECO:0000313|EMBL:EMT1...,MHIEIYRYPAWGDVVEIETWCQSEGRIGTRRDWILKDIANAEVTGC...,thyme_enz_1
2,Uncharacterized_protein__ECO_0000313_EMBL_EMT1...,MANTTLSSGTIYHSRVLIRCSSSERGGQQRSNAAVRVNGAAHRATL...,Uncharacterized protein {ECO:0000313|EMBL:EMT1...,MSDVVEIETWCQTEGRIGTRRDWIIKDFASSEVIGRATSKWVMMNQ...,thyme_enz_2
3,Uncharacterized_protein__ECO_0000313_EMBL_EMT3...,MGQPHIISPSSANYKTVDGRQYPRRICQCFIKHRITVPRIDAYWPS...,Uncharacterized protein {ECO:0000313|EMBL:EMT3...,MRKLHLIWVTSRMHIEIYKYPAWTDVVEIETWCQSEGRIGTRRDWI...,thyme_enz_3
5,Uncharacterized_protein__ECO_0000313_EMBL_ERN0...,MAMANRVGLVNLLGLVETENFRERKCNYVLRCEKREVRGVRNVNRV...,Uncharacterized protein {ECO:0000313|EMBL:ERN0...,MDVVEIEHWCQGEGKIGTRRDWILKDLASGEVIGRATSKWVMMNQD...,thyme_enz_4


In [9]:
# save the csv sequence files: one "alias,sequence" row per enzyme,
# written without a header row and without the index column
thyme_df.loc[:, ['enzAlias', 'sequence1']].to_csv('../data/seq/ThymeEnzymeSequence1.csv', header=False, index=False)
thyme_df.loc[:, ['enzAlias', 'sequence2']].to_csv('../data/seq/ThymeEnzymeSequence2.csv', header=False, index=False)


# save the enzyme name mappings ("alias,enzyme name" rows, same layout as above)
thyme_df.loc[:, ['enzAlias', 'enzNames1']].to_csv('../data/ThymeEnzymeNameMap.csv', header=False, index=False)


def write_fasta(fasta_path, aliases, sequences):
    """Write paired aliases and sequences to ``fasta_path`` as FASTA records.

    Each record takes two lines: a ``>alias`` header line followed by the
    sequence on a single line. An existing file at ``fasta_path`` is
    overwritten.

    Parameters
    ----------
    fasta_path : str
        Destination file path.
    aliases : iterable of str
        Record identifiers, written after the leading '>'.
    sequences : iterable of str
        Sequences, paired positionally with ``aliases``.
    """
    with open(fasta_path, 'w') as fasta_file:
        for alias, sequence in zip(aliases, sequences):
            fasta_file.write(f'>{alias}\n{sequence}\n')


# create the FASTA file for sequence1
write_fasta('../data/seq/ThymeEnzymeFasta1.fa', thyme_df.enzAlias, thyme_df.sequence1)

# create the FASTA file for sequence2
write_fasta('../data/seq/ThymeEnzymeFasta2.fa', thyme_df.enzAlias, thyme_df.sequence2)