In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
def process_enzyme_file(f):
    """Parse an enzyme label/sequence file into a DataFrame.

    Every non-blank line is split on ``'>'``. The first piece, minus its
    leading character, is the label; the second piece (stripped of
    surrounding whitespace) is the sequence. Presumably lines look like
    ``<1.1.1.100>MKT...`` -- confirm against the data file.

    Parameters
    ----------
    f : iterable of str
        An open text file, or any iterable yielding one record per item.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``label``, ``sequence``, ``AAlen`` (length of the
        sequence), ``enzyme_class`` and ``enzyme_subclass`` (first and second
        dot-separated fields of the label, kept as strings) and ``id``
        (``'ENZY'`` followed by the 1-based row number).

    Raises
    ------
    IndexError
        If a non-blank line contains no ``'>'`` or a label contains no ``'.'``.
    """
    labels = []
    sequences = []
    for line in f:
        # Blank lines (typically a trailing newline at end of file) hold no
        # record; without this guard they crash the parser with IndexError.
        if not line.strip():
            continue
        parts = line.split('>')
        labels.append(parts[0][1:])
        sequences.append(parts[1].strip())

    enzyme_data = pd.DataFrame({
        'label': labels,
        'sequence': sequences,
        'AAlen': [len(seq) for seq in sequences],
    })

    # Split each label once and reuse the pieces for both class columns.
    label_fields = [label.split('.') for label in labels]
    enzyme_data['enzyme_class'] = [fields[0] for fields in label_fields]
    enzyme_data['enzyme_subclass'] = [fields[1] for fields in label_fields]

    # Ids are 1-based row numbers prefixed with 'ENZY'.
    enzyme_data['id'] = enzyme_data.index + 1
    enzyme_data['id'] = 'ENZY' + enzyme_data['id'].astype(str)
    return enzyme_data
    
    

In [3]:
def process_nonenzyme_file(f):
    """Parse a non-enzyme sequence file into a DataFrame.

    Every non-blank line is split on ``'>'``. The first piece, minus its
    leading character and with ``'NENZY'`` appended, becomes the label; the
    second piece (stripped of surrounding whitespace) is the sequence.

    Parameters
    ----------
    f : iterable of str
        An open text file, or any iterable yielding one record per item.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``label``, ``sequence``, ``AAlen`` (length of the
        sequence) and ``id`` (``'NENZY'`` followed by the 1-based row number).

    Raises
    ------
    IndexError
        If a non-blank line contains no ``'>'``.
    """
    labels = []
    sequences = []
    for line in f:
        # Blank lines (typically a trailing newline at end of file) hold no
        # record; without this guard they crash the parser with IndexError.
        if not line.strip():
            continue
        parts = line.split('>')
        labels.append(parts[0][1:] + 'NENZY')
        sequences.append(parts[1].strip())

    Nenzyme_data = pd.DataFrame({
        'label': labels,
        'sequence': sequences,
        'AAlen': [len(seq) for seq in sequences],
    })

    # Ids are 1-based row numbers prefixed with 'NENZY'.
    Nenzyme_data['id'] = Nenzyme_data.index + 1
    Nenzyme_data['id'] = 'NENZY' + Nenzyme_data['id'].astype(str)

    return Nenzyme_data
    
        

In [4]:
def subset_data(df, Num, AAlen, seed=None):
    """Randomly sample rows whose ``AAlen`` is below a length cut-off.

    Row positions are drawn with replacement, so the result can contain the
    same row more than once; de-duplicate afterwards if unique rows are needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``AAlen`` column that can be compared to ``AAlen``.
    Num : int
        Number of row positions to draw.
    AAlen : int
        Exclusive upper bound: only rows with ``df['AAlen'] < AAlen`` are kept.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState`` so the draw can be
        reproduced. With the default ``None`` the global NumPy random state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``Num`` sampled rows, or an empty frame when no row passes the filter.
    """
    df = df.loc[df['AAlen'] < AAlen]
    rows = df.shape[0]
    print("Data has %s enzymes or nonenzymes of length < %s" %(rows, AAlen))
    if rows == 0:
        # Nothing to sample from: randint(0, ...) would raise ValueError.
        return df
    sampler = np.random if seed is None else np.random.RandomState(seed)
    rand_rows = sampler.randint(rows, size=Num)
    print("From %s rows Choosing random %s rows %s" %(rows,Num,rand_rows))
    return df.iloc[rand_rows]
    
    

In [5]:
# Parse the enzyme file inside a context manager so the handle is closed as
# soon as parsing finishes (the bare open() left it open for the whole session).
with open("data/new_data_label_sequence.txt", "r") as f:
    enzyme_data = process_enzyme_file(f)
enzyme_data.head()

Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
0,1.1.1.100,ATAQEQPEGEAPDKVESPVVXXXEAXIVLVNYAREADVDAMMKDAV...,201,1,1,ENZY1
1,1.1.1.100,MATTSIAGPRLTSLKTAAGKLGYREICHVRQWAPVHSPMPHFGMLR...,328,1,1,ENZY2
2,1.1.1.100,MHPYFSLAGRIALVTGGSRGIGQMIAQGLLEAGARVFICARDAEAC...,256,1,1,ENZY3
3,1.1.1.100,MHYLPVAIVTGATRGIGKAICQKLFQKGLSCIILGSTKESIERTAI...,278,1,1,ENZY4
4,1.1.1.100,MKTTKKIAVITGANRGLGKGIAEELSNTNNITVIGTSTSQKGCKII...,245,1,1,ENZY5


In [6]:
# Parse the non-enzyme file inside a context manager so the handle is closed
# as soon as parsing finishes (the bare open() left it open for the session).
with open("data/non_enzyme_new_data_sequence.txt", "r") as f:
    Nenzyme_data = process_nonenzyme_file(f)
Nenzyme_data.head()


Unnamed: 0,label,sequence,AAlen,id
0,NENZY,MNQQLSWRTIVGYSLGDVANNFAFAMGALFLLSYYTDVAGVGAAAA...,457,NENZY1
1,NENZY,HGLPAQCPNADGTMVHTCCLHGMPTFKLNFDSHFTIKTVVAQNGTE...,208,NENZY2
2,NENZY,YSPALNKMFCQLAKTCPVYLRISSPPPPGTRVRAMAIYKKSEFMTE...,207,NENZY3
3,NENZY,MTWLLLCLLAQYENGGKVLALSSSAKPYKTSVRFDPKTAHPNLVVS...,115,NENZY4
4,NENZY,MALPLPPASNLHSILLVTKSRSLGPRLVFHYPPLSPSAAALAGAKD...,870,NENZY5


### Subset the data: specify the number of rows to sample and the amino-acid (AA) length cut-off

In [113]:
enzyme_data.shape

(22168, 6)

In [115]:
# Number of row positions to draw. subset_data samples with replacement, so
# the draw is deliberately large and repeated rows are dropped in a later cell.
Num = 20713  ## overshoot: the draw contains repeats that are removed afterwards
# Exclusive length cut-off: only sequences with AAlen < 1024 are eligible.
AAlen = 1024
Enzyme_subset = subset_data(enzyme_data, Num, AAlen)

Data has 20713 enzymes or nonenzymes of length < 1024
From 20713 rows Choosing random 20713 rows [12168 15355 18610 ... 14309  7637 10851]


In [117]:
Num = 20864

In [118]:
NEnzyme_subset = subset_data(Nenzyme_data, Num, AAlen)

Data has 20864 enzymes or nonenzymes of length < 1024
From 20864 rows Choosing random 20864 rows [ 5741 19806  7673 ...  4383  2582   398]


In [119]:
Enzyme_subset.shape

(20713, 6)

In [120]:
# The subsets were drawn with replacement, so remove repeated rows.
# Reassigning (rather than inplace=True) avoids mutating a frame that was
# sliced out of another DataFrame and keeps the chain of names explicit.
Enzyme_subset = Enzyme_subset.drop_duplicates()
NEnzyme_subset = NEnzyme_subset.drop_duplicates()
# Optional cap on the subset size (currently disabled):
# Enzyme_subset = Enzyme_subset.head(2500)
# NEnzyme_subset = NEnzyme_subset.head(2500)


In [54]:
Enzyme_subset.head()


Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
22060,6.5.1.1,MPWDVKFSHGLYLPQLGWWLDAHFPQKRSFVSHAHSDHTATHDEIL...,1017,6,5,ENZY22061
4776,2.3.1.101,MADPAATPKSLSIACVSIKDTFAEAFDMKATRLIVTADDRRWCDES...,334,2,3,ENZY4777
16971,3.6.3.31,MNMQEDKSIIEVSHVSKFFGDKTALDDVTLNVKKGEFVTILGPSGC...,463,3,6,ENZY16972
20965,6.1.1.22,MAGLESKVSDMKIKSDVVFYIDEASGNDETGNGSQTSPFKTAIHAL...,568,6,1,ENZY20966
6963,2.5.1.75,MFYEFALIGTTASGKSSFSIELAKEIEAVILSLDSLCLYKNIDIAS...,300,2,5,ENZY6964


In [121]:
NEnzyme_subset.shape, Enzyme_subset.shape

((13262, 4), (13074, 6))

In [122]:
len(Enzyme_subset['id'].unique())

13074

In [123]:
NEnzyme_subset.shape

(13262, 4)

In [124]:
NEnzyme_subset.head()

Unnamed: 0,label,sequence,AAlen,id
6108,NENZY,MEEYTREPCPYRIGDDIGSAFAMGLVGGSIFQAFGGYKNAAKGKKL...,181,NENZY6109
21039,NENZY,MSITKLARSNAFKPIPNFVRSSLRNAGVESSQNTEYPPYKPKKHHI...,611,NENZY21040
8164,NENZY,ILLDEHIRLLGIDPYSIDGYSRECFSHLAVASTVARGGADLAIGSE...,113,NENZY8165
9642,NENZY,MIDSPTLESFLVKSNLTINLSALVDRLIPLIFENQKLYKETGGYRS...,289,NENZY9643
7000,NENZY,MVQRCLVVALLVVVVAAALCSAQLNFTPNWGTGKRDAGDYGDPYSF...,63,NENZY7001


## TAPE files

### Labels

### Enzyme-NonEnzyme Labels

In [125]:
# Binary labels keyed by sequence id: 1 for every enzyme, 0 for every
# non-enzyme, merged into the single dictionary `e`.
e = dict.fromkeys(Enzyme_subset.id, 1)
ne = dict.fromkeys(NEnzyme_subset.id, 0)
e.update(ne)

In [126]:
e

{'ENZY13089': 1,
 'ENZY16482': 1,
 'ENZY19988': 1,
 'ENZY2853': 1,
 'ENZY4911': 1,
 'ENZY8621': 1,
 'ENZY15620': 1,
 'ENZY10807': 1,
 'ENZY1668': 1,
 'ENZY8809': 1,
 'ENZY6274': 1,
 'ENZY17798': 1,
 'ENZY5545': 1,
 'ENZY11409': 1,
 'ENZY8458': 1,
 'ENZY21945': 1,
 'ENZY15136': 1,
 'ENZY3767': 1,
 'ENZY3664': 1,
 'ENZY2723': 1,
 'ENZY2754': 1,
 'ENZY5388': 1,
 'ENZY6083': 1,
 'ENZY12954': 1,
 'ENZY13554': 1,
 'ENZY8461': 1,
 'ENZY13600': 1,
 'ENZY448': 1,
 'ENZY8790': 1,
 'ENZY17490': 1,
 'ENZY10184': 1,
 'ENZY12992': 1,
 'ENZY22021': 1,
 'ENZY1376': 1,
 'ENZY7319': 1,
 'ENZY273': 1,
 'ENZY19655': 1,
 'ENZY19010': 1,
 'ENZY2555': 1,
 'ENZY9570': 1,
 'ENZY11971': 1,
 'ENZY5345': 1,
 'ENZY19273': 1,
 'ENZY1475': 1,
 'ENZY20466': 1,
 'ENZY4990': 1,
 'ENZY12965': 1,
 'ENZY16434': 1,
 'ENZY18662': 1,
 'ENZY6545': 1,
 'ENZY9084': 1,
 'ENZY18249': 1,
 'ENZY11383': 1,
 'ENZY4119': 1,
 'ENZY15382': 1,
 'ENZY536': 1,
 'ENZY14501': 1,
 'ENZY2634': 1,
 'ENZY21092': 1,
 'ENZY14035': 1,
 'ENZY382': 1

In [127]:
# Create the output directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("data/MVP", exist_ok=True)
with open("data/MVP/1024_NonEnzymes_Enzymes_labels.json", "w") as outfile:
    json.dump(e, outfile, indent = 4)

### Enzyme Class labels

In [128]:
# Map each enzyme id to its enzyme class.
data = Enzyme_subset.set_index('id')['enzyme_class'].to_dict()

In [129]:
# Save the id -> enzyme class mapping as indented JSON.
with open("data/MVP/1024_Enzymes_Class_labels.json", "w") as outfile:
    outfile.write(json.dumps(data, indent=4))

### Enzyme SubClass labels

In [130]:
# Map each enzyme id to its enzyme subclass.
data = Enzyme_subset.set_index('id')['enzyme_subclass'].to_dict()

In [131]:
# Save the id -> enzyme subclass mapping as indented JSON.
with open("data/MVP/1024_Enzymes_SubClass_labels.json", "w") as outfile:
    outfile.write(json.dumps(data, indent=4))

### Fasta file

In [132]:
Enzyme_subset.shape

(13074, 6)

In [133]:

# Build the header text for each record: the id prefixed with '>'.
Enzyme_subset['id>'] = Enzyme_subset['id'].map('>{}'.format)
NEnzyme_subset['id>'] = NEnzyme_subset['id'].map('>{}'.format)


In [134]:
# if not os.path.exists('data/MVP/tmp'):
#     os.mkdir('data/MVP/tmp') 
# i=0
# for row in Enzyme_subset[['id>', 'sequence']].values:
#     filename = 'data/MVP/tmp/enzyme{}.fasta'.format(i)
#     row.tofile(filename, sep="\n", format="%s")
#     i+=1
# i=0
# for row in NEnzyme_subset[['id>', 'sequence']].values:
#     filename = 'data/MVP/tmp/Nenzyme{}.fasta'.format(i)
#     row.tofile(filename, sep="\n", format="%s")
#     i+=1


In [135]:
### Combine all the individual FASTA files into a single file in the format TAPE expects
#!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/enzyme*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/Enzymes.fasta
#!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/NonEnzymes_Enzymes.fasta
#rm -r ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp




In [136]:
# Write two files: one with the enzyme records only, and one with the enzyme
# records followed by the non-enzyme records. Each record is the 'id>' value
# on one line and the sequence on the next. The context managers close both
# files even if a write raises part-way through.
with open('data/MVP/1024_Enzymes.fasta', 'w') as f, \
        open('data/MVP/1024_NonEnzymes_Enzymes.fasta', 'w') as f2:
    for row in Enzyme_subset[['id>', 'sequence']].values:
        record = "\n".join(row) + "\n"
        f.write(record)
        f2.write(record)

    for row in NEnzyme_subset[['id>', 'sequence']].values:
        f2.write("\n".join(row) + "\n")
    
    

## ESM files

In [60]:
# id -> sequence for enzymes, then merge in the non-enzyme id -> sequence map.
Edata = Enzyme_subset.set_index('id')['sequence'].to_dict()
Ndata = NEnzyme_subset.set_index('id')['sequence'].to_dict()
Edata.update(Ndata)

In [61]:
#Edata

In [82]:
# Save the combined id -> sequence mapping as indented JSON.
with open('data/MVP/ESM_NonEnzymesEnzymes.json', 'w') as json_file:
    json_file.write(json.dumps(Edata, indent=4))


In [33]:
### Load the saved id -> sequence dictionary back from JSON
with open('data/MVP/ESM_NonEnzymesEnzymes.json') as json_file:
    data = json.loads(json_file.read())

In [34]:
# Turn the mapping into a list of (key, value) tuples. dict.items() does this
# directly and avoids the loop variable `id`, which shadowed the builtin id()
# for the rest of the notebook session.
data_list = list(data.items())

In [35]:
data_list


[('ENZY17952',
  'MSPQTETKASVGFKAGVKEYKLNYYTPDYDTKDTDILAAFRVSPQPGVPPEEAGAAVAAESSTGTWTTVWTDGLTSLDRYKGRCYQIEPVAGEENQFIAYVAYPLDLFEEGSVTNMLTSIVGXVFGFKALCALRLEDLRIPPA'),
 ('ENZY7957',
  'MVRVSVPATSANLGPGFDTLGVALELRNVIEMDETGIDDVVIEVEGAGAGALEDPGRNMVYQAARLVFQRLGYEPNGLLIREKVAIPVARGMGSSAAAIVGGLVAANALVQKRTGGPGLDREELLRMAVAIEGHPDNVTPALLGGFTVSCMDPDRGPLYLCFPPPRGLRAVVVMPEVQIKGRKTEQSRGVLPAQVSLRDAVYNLNRTALLVAAVAQGRTDLLRVAMQDRLHQPYRAALVPGMRSVFEAALSAGALGVALSGAGPSVIALVAESAEPVALAMEAAFQWAGSNARSLTMDLAREGARVLSGPGREQDMLDRPPHWG'),
 ('ENZY19600',
  'MGRSKCFMDISIGGELEGRIVIELYDDVVPKTAENFRLLCTGEKGLGPNTGVPLHYKGNRFHRVIKGFMIQGGDISANDGTGGESIYGLKFDDENFELKHERKGMLSMANSGPNTNGSQFFITTTRTSHLDGKHVVFGRVTKGMGVVRSIEHVSIEEQSCPSQDVVIHDCGEIPEGADDGICDFFKDGDVYPDWPIDLNESPAELSWWMETVDFVKAHGNEHFKKQDYKMALRKYRKALRYLDICWEKEGIDEETSTALRKTKSQIFTNSAACKLKFGDAKGALLDTEFAMRDEDNNVKALFRQGQAYMALNNVDAAAESLEKALQFEPNDAGIKKEYAAVMKKIAFRDNEEKKQYRKMFV'),
 ('ENZY1038',
  'MPDIDTCPICVESPLEDSTTFNNIAWLQCDICNQWFHASCLKIPKIEVNNLHSYHCEGCSKSHGPSIPKRKSKRSKVQIDYVALNDGDVF

In [None]:
### Continue with the ESM notebook (.ipynb)