In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
def process_enzyme_file(f):
    labels = []
    sequences = []
    for x in f:

        splits = x.split('>')
        sequence = splits[1].strip()
        label = splits[0][1:]

        labels.append(label)
        sequences.append(sequence)
    Enzyme_seq_len = [len(seq) for seq in sequences]
    enzyme_data = pd.DataFrame({'label' : labels, 'sequence': sequences, 'AAlen': Enzyme_seq_len })
    enyzme_class = []
    enzyme_subclass = []


    for label in labels:
       # print(label)
        splits = label.split('.')
        enyzme_class.append(splits[0])
        enzyme_subclass.append(splits[1])
    enzyme_data['enzyme_class'] = enyzme_class
    enzyme_data['enzyme_subclass'] = enzyme_subclass
    enzyme_data['id'] = enzyme_data.index + 1
    enzyme_data['id'] = 'ENZY' + enzyme_data['id'].astype(str)
    return enzyme_data
    
    

In [3]:
def process_nonenzyme_file(f):
    labels = []
    sequences = []
    for x in f:
        splits = x.split('>')
        sequence = splits[1].strip()
        label = splits[0][1:] + 'NENZY'
        labels.append(label)
        sequences.append(sequence)
    NEnzyme_seq_len = [len(seq) for seq in sequences]
    Nenzyme_data = pd.DataFrame({'label' : labels, 'sequence': sequences, 'AAlen': NEnzyme_seq_len })
    Nenzyme_data['id'] = Nenzyme_data.index + 1
    Nenzyme_data['id'] = 'NENZY' + Nenzyme_data['id'].astype(str)
    
    return Nenzyme_data
    
        

In [4]:
def subset_data(df, Num, AAlen):
    df = df.loc[df['AAlen'] < AAlen]
    rows = df.shape[0]
    print("Data has %s enzymes or nonenzymes of length < %s" %(rows, AAlen))
    rand_rows = np.random.randint(rows, size=Num)
    print("From %s rows Choosing random %s rows %s" %(rows,Num,rand_rows))
    df = df.iloc[rand_rows]
    return df
    
    

In [25]:
f = open("data/new_data_label_sequence.txt", "r")
enzyme_data = process_enzyme_file(f)
enzyme_data.head()

Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
0,1.1.1.100,ATAQEQPEGEAPDKVESPVVXXXEAXIVLVNYAREADVDAMMKDAV...,201,1,1,ENZY1
1,1.1.1.100,MATTSIAGPRLTSLKTAAGKLGYREICHVRQWAPVHSPMPHFGMLR...,328,1,1,ENZY2
2,1.1.1.100,MHPYFSLAGRIALVTGGSRGIGQMIAQGLLEAGARVFICARDAEAC...,256,1,1,ENZY3
3,1.1.1.100,MHYLPVAIVTGATRGIGKAICQKLFQKGLSCIILGSTKESIERTAI...,278,1,1,ENZY4
4,1.1.1.100,MKTTKKIAVITGANRGLGKGIAEELSNTNNITVIGTSTSQKGCKII...,245,1,1,ENZY5


In [26]:
f = open("data/non_enzyme_new_data_sequence.txt", "r")
Nenzyme_data = process_nonenzyme_file(f)
Nenzyme_data.head()


Unnamed: 0,label,sequence,AAlen,id
0,NENZY,MNQQLSWRTIVGYSLGDVANNFAFAMGALFLLSYYTDVAGVGAAAA...,457,NENZY1
1,NENZY,HGLPAQCPNADGTMVHTCCLHGMPTFKLNFDSHFTIKTVVAQNGTE...,208,NENZY2
2,NENZY,YSPALNKMFCQLAKTCPVYLRISSPPPPGTRVRAMAIYKKSEFMTE...,207,NENZY3
3,NENZY,MTWLLLCLLAQYENGGKVLALSSSAKPYKTSVRFDPKTAHPNLVVS...,115,NENZY4
4,NENZY,MALPLPPASNLHSILLVTKSRSLGPRLVFHYPPLSPSAAALAGAKD...,870,NENZY5


### Subset the data.  Specify the number to sample, and the AA len cut-off

In [27]:
Num = 3500  ## overshoot and get 2500 unique
AAlen = 600
Enzyme_subset = subset_data(enzyme_data, Num, AAlen)

Data has 17289 enzymes or nonenzymes of length < 600
From 17289 rows Choosing random 3500 rows [ 3966  6916  9096 ... 13507 11076 12777]


In [28]:
NEnzyme_subset = subset_data(Nenzyme_data, Num, AAlen)

Data has 18086 enzymes or nonenzymes of length < 600
From 18086 rows Choosing random 3500 rows [ 7824 13560  3809 ... 14439  4923 10638]


In [29]:
Enzyme_subset.shape

(3500, 6)

In [30]:
Enzyme_subset.drop_duplicates(inplace = True)
NEnzyme_subset.drop_duplicates(inplace = True)
Enzyme_subset = Enzyme_subset.head(2500)
NEnzyme_subset = NEnzyme_subset.head(2500)


In [31]:
Enzyme_subset.head()


Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
4502,2.1.1.74,MIIAAPPPQPASNQPVHVIGGGLAGSEATWQLARAGVRVVLHEMRP...,482,2,1,ENZY4503
7934,2.7.1.36,MSLPFLTSAPGKVIIFGEHSAVYNKPAVAASVSALRTYLLISESSA...,443,2,7,ENZY7935
11544,2.7.8.7,MIVGVGIDVLEVERVPEKFAERILGESEKRLFLTRKRRREFIAGRF...,169,2,7,ENZY11545
18036,4.1.1.65,MKDKIFIFLQYIIPHSLTSRLVSKLAESKNKHLKNYLINLAIKKFK...,281,4,1,ENZY18037
9810,2.7.2.4,MALIIQKFGGTSVANIERIKKLVPIIKAEITKNNQVIVVVSAMAGV...,448,2,7,ENZY9811


In [32]:
NEnzyme_subset.shape

(2500, 4)

In [33]:
len(Enzyme_subset['id'].unique())

2500

In [34]:
NEnzyme_subset.shape

(2500, 4)

In [35]:
NEnzyme_subset.head()

Unnamed: 0,label,sequence,AAlen,id
9593,NENZY,MASKYPEEGPITEGVEEDFNSHSTSGLDLTSVGKNPEHPRRILLVL...,387,NENZY9594
16638,NENZY,MSAEIEEATNAVNNLSINDSEQQPRAPTHKTVIDPEDTIFIGNVAH...,294,NENZY16639
4724,NENZY,MHLRDSTKDTLASSGNRDDDEIDDGQHAQFLDADTVDSEESIEAEP...,427,NENZY4725
1171,NENZY,MVSSLFINPFSEDAREIVRKYGSLDTIDDTRDELLEIGRRTRGQNL...,434,NENZY1172
21367,NENZY,MMDFLSKTPEPPYYAVIFSSVKSENDTGYGETAERMVSLAADQPGF...,114,NENZY21368


## TAPE files

### Labels

### Enzyme-NonEnzyme Labels

In [49]:
e = dict(zip(Enzyme_subset.id,[1 for i in range(Enzyme_subset.shape[0])]))
ne = dict(zip(NEnzyme_subset.id,[0 for i in range(NEnzyme_subset.shape[0])]))
e.update(ne)

In [50]:
e

{'ENZY4503': 1,
 'ENZY7935': 1,
 'ENZY11545': 1,
 'ENZY18037': 1,
 'ENZY9811': 1,
 'ENZY7689': 1,
 'ENZY2907': 1,
 'ENZY16992': 1,
 'ENZY9919': 1,
 'ENZY16891': 1,
 'ENZY5877': 1,
 'ENZY16807': 1,
 'ENZY5495': 1,
 'ENZY18691': 1,
 'ENZY15765': 1,
 'ENZY12278': 1,
 'ENZY20112': 1,
 'ENZY3730': 1,
 'ENZY15974': 1,
 'ENZY2380': 1,
 'ENZY4439': 1,
 'ENZY9563': 1,
 'ENZY15313': 1,
 'ENZY20164': 1,
 'ENZY19186': 1,
 'ENZY14826': 1,
 'ENZY1766': 1,
 'ENZY2383': 1,
 'ENZY19109': 1,
 'ENZY3517': 1,
 'ENZY6278': 1,
 'ENZY6388': 1,
 'ENZY3908': 1,
 'ENZY5543': 1,
 'ENZY15984': 1,
 'ENZY2497': 1,
 'ENZY6775': 1,
 'ENZY7150': 1,
 'ENZY7968': 1,
 'ENZY18312': 1,
 'ENZY16093': 1,
 'ENZY12003': 1,
 'ENZY12409': 1,
 'ENZY18776': 1,
 'ENZY13711': 1,
 'ENZY12977': 1,
 'ENZY4927': 1,
 'ENZY6514': 1,
 'ENZY981': 1,
 'ENZY20562': 1,
 'ENZY18986': 1,
 'ENZY8912': 1,
 'ENZY17255': 1,
 'ENZY6494': 1,
 'ENZY12594': 1,
 'ENZY4808': 1,
 'ENZY10787': 1,
 'ENZY5550': 1,
 'ENZY3612': 1,
 'ENZY7503': 1,
 'ENZY12270':

In [86]:
with open("data/MVP/NonEnzymes_Enzymes_labels.json", "w") as outfile:  
    json.dump(e, outfile, indent = 4)

### Enzyme Class labels

In [52]:
data = dict(zip(Enzyme_subset.id,Enzyme_subset.enzyme_class))

In [83]:
with open("data/MVP/Enzymes_Class_labels.json", "w") as outfile:  
    json.dump(data, outfile, indent = 4) 

### Enzyme SubClass labels

In [84]:
data = dict(zip(Enzyme_subset.id,Enzyme_subset.enzyme_subclass))

In [85]:
with open("data/MVP/Enzymes_SubClass_labels.json", "w") as outfile:  
    json.dump(data, outfile, indent = 4) 

### Fasta file

In [71]:
Enzyme_subset.shape

(2500, 7)

In [72]:

Enzyme_subset['id>'] = '>' + Enzyme_subset['id'].astype(str)
NEnzyme_subset['id>'] = '>' + NEnzyme_subset['id'].astype(str)


In [73]:
if not os.path.exists('data/MVP/tmp'):
    os.mkdir('data/MVP/tmp') 
i=0
for row in Enzyme_subset[['id>', 'sequence']].values:
    filename = 'data/MVP/tmp/enzyme{}.fasta'.format(i)
    row.tofile(filename, sep="\n", format="%s")
    i+=1
i=0
for row in NEnzyme_subset[['id>', 'sequence']].values:
    filename = 'data/MVP/tmp/Nenzyme{}.fasta'.format(i)
    row.tofile(filename, sep="\n", format="%s")
    i+=1


In [75]:
### combine all the indiv fasta a format for TAPE
#!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/enzyme*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/Enzymes.fasta
#!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/NonEnzymes_Enzymes.fasta
#rm -r ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp




/bin/bash: /bin/cat: Argument list too long
/bin/bash: /bin/cat: Argument list too long


In [81]:
f = open('data/MVP/Enzymes.fasta', 'w')
f2 = open('data/MVP/NonEnzymes_Enzymes.fasta', 'w')
for row in Enzyme_subset[['id>', 'sequence']].values:
    #print(row)
    f.write("\n".join(row) + "\n")
    f2.write("\n".join(row) + "\n")
f.close()

for row in NEnzyme_subset[['id>', 'sequence']].values:
    #print(row)
    f2.write("\n".join(row) + "\n")
f2.close()
    
    

## ESM files

In [60]:
Edata = dict(zip(Enzyme_subset.id,Enzyme_subset.sequence))
Ndata = dict(zip(NEnzyme_subset.id,NEnzyme_subset.sequence))
Edata.update(Ndata)

In [61]:
#Edata

In [82]:
with open('data/MVP/ESM_NonEnzymesEnzymes.json', 'w') as json_file:
    json.dump(Edata, json_file, indent = 4)


In [33]:
### Read in the saved dictionary of fasta into a list of tuples
with open('data/MVP/ESM_NonEnzymesEnzymes.json') as json_file: 
    data = json.load(json_file) 

In [34]:
data_list = []

for id in data:
    data_list.append((id, data[id]))

In [35]:
data_list


[('ENZY17952',
  'MSPQTETKASVGFKAGVKEYKLNYYTPDYDTKDTDILAAFRVSPQPGVPPEEAGAAVAAESSTGTWTTVWTDGLTSLDRYKGRCYQIEPVAGEENQFIAYVAYPLDLFEEGSVTNMLTSIVGXVFGFKALCALRLEDLRIPPA'),
 ('ENZY7957',
  'MVRVSVPATSANLGPGFDTLGVALELRNVIEMDETGIDDVVIEVEGAGAGALEDPGRNMVYQAARLVFQRLGYEPNGLLIREKVAIPVARGMGSSAAAIVGGLVAANALVQKRTGGPGLDREELLRMAVAIEGHPDNVTPALLGGFTVSCMDPDRGPLYLCFPPPRGLRAVVVMPEVQIKGRKTEQSRGVLPAQVSLRDAVYNLNRTALLVAAVAQGRTDLLRVAMQDRLHQPYRAALVPGMRSVFEAALSAGALGVALSGAGPSVIALVAESAEPVALAMEAAFQWAGSNARSLTMDLAREGARVLSGPGREQDMLDRPPHWG'),
 ('ENZY19600',
  'MGRSKCFMDISIGGELEGRIVIELYDDVVPKTAENFRLLCTGEKGLGPNTGVPLHYKGNRFHRVIKGFMIQGGDISANDGTGGESIYGLKFDDENFELKHERKGMLSMANSGPNTNGSQFFITTTRTSHLDGKHVVFGRVTKGMGVVRSIEHVSIEEQSCPSQDVVIHDCGEIPEGADDGICDFFKDGDVYPDWPIDLNESPAELSWWMETVDFVKAHGNEHFKKQDYKMALRKYRKALRYLDICWEKEGIDEETSTALRKTKSQIFTNSAACKLKFGDAKGALLDTEFAMRDEDNNVKALFRQGQAYMALNNVDAAAESLEKALQFEPNDAGIKKEYAAVMKKIAFRDNEEKKQYRKMFV'),
 ('ENZY1038',
  'MPDIDTCPICVESPLEDSTTFNNIAWLQCDICNQWFHASCLKIPKIEVNNLHSYHCEGCSKSHGPSIPKRKSKRSKVQIDYVALNDGDVF

In [None]:
### continue on with the ESM ipynb