In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
def process_enzyme_file(f):
    labels = []
    sequences = []
    for x in f:

        splits = x.split('>')
        sequence = splits[1].strip()
        label = splits[0][1:]

        labels.append(label)
        sequences.append(sequence)
    Enzyme_seq_len = [len(seq) for seq in sequences]
    enzyme_data = pd.DataFrame({'label' : labels, 'sequence': sequences, 'AAlen': Enzyme_seq_len })
    enyzme_class = []
    enzyme_subclass = []


    for label in labels:
       # print(label)
        splits = label.split('.')
        enyzme_class.append(splits[0])
        enzyme_subclass.append(splits[1])
    enzyme_data['enzyme_class'] = enyzme_class
    enzyme_data['enzyme_subclass'] = enzyme_subclass
    enzyme_data['id'] = enzyme_data.index + 1
    enzyme_data['id'] = 'ENZY' + enzyme_data['id'].astype(str)
    return enzyme_data
    
    

In [3]:
def process_nonenzyme_file(f):
    labels = []
    sequences = []
    for x in f:
        splits = x.split('>')
        sequence = splits[1].strip()
        label = splits[0][1:] + 'NENZY'
        labels.append(label)
        sequences.append(sequence)
    NEnzyme_seq_len = [len(seq) for seq in sequences]
    Nenzyme_data = pd.DataFrame({'label' : labels, 'sequence': sequences, 'AAlen': NEnzyme_seq_len })
    Nenzyme_data['id'] = Nenzyme_data.index + 1
    Nenzyme_data['id'] = 'NENZY' + Nenzyme_data['id'].astype(str)
    
    return Nenzyme_data
    
        

In [4]:
def subset_data(df, Num, AAlen):
    df = df.loc[df['AAlen'] < AAlen]
    rows = df.shape[0]
    print("Data has %s enzymes or nonenzymes of length < %s" %(rows, AAlen))
    rand_rows = np.random.randint(rows, size=Num)
    print("From %s rows Choosing random %s rows %s" %(rows,Num,rand_rows))
    df = df.iloc[rand_rows]
    return df
    
    

In [5]:
f = open("data/new_data_label_sequence.txt", "r")
enzyme_data = process_enzyme_file(f)
enzyme_data.head()

Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
0,1.1.1.100,ATAQEQPEGEAPDKVESPVVXXXEAXIVLVNYAREADVDAMMKDAV...,201,1,1,ENZY1
1,1.1.1.100,MATTSIAGPRLTSLKTAAGKLGYREICHVRQWAPVHSPMPHFGMLR...,328,1,1,ENZY2
2,1.1.1.100,MHPYFSLAGRIALVTGGSRGIGQMIAQGLLEAGARVFICARDAEAC...,256,1,1,ENZY3
3,1.1.1.100,MHYLPVAIVTGATRGIGKAICQKLFQKGLSCIILGSTKESIERTAI...,278,1,1,ENZY4
4,1.1.1.100,MKTTKKIAVITGANRGLGKGIAEELSNTNNITVIGTSTSQKGCKII...,245,1,1,ENZY5


In [6]:
f = open("data/non_enzyme_new_data_sequence.txt", "r")
Nenzyme_data = process_nonenzyme_file(f)
Nenzyme_data.head()


Unnamed: 0,label,sequence,AAlen,id
0,NENZY,MNQQLSWRTIVGYSLGDVANNFAFAMGALFLLSYYTDVAGVGAAAA...,457,NENZY1
1,NENZY,HGLPAQCPNADGTMVHTCCLHGMPTFKLNFDSHFTIKTVVAQNGTE...,208,NENZY2
2,NENZY,YSPALNKMFCQLAKTCPVYLRISSPPPPGTRVRAMAIYKKSEFMTE...,207,NENZY3
3,NENZY,MTWLLLCLLAQYENGGKVLALSSSAKPYKTSVRFDPKTAHPNLVVS...,115,NENZY4
4,NENZY,MALPLPPASNLHSILLVTKSRSLGPRLVFHYPPLSPSAAALAGAKD...,870,NENZY5


### Subset the data.  Specify the number to sample, and the AA len cut-off

In [7]:
Num = 2500
AAlen = 600
Enzyme_subset = subset_data(enzyme_data, Num, AAlen)

Data has 17289 enzymes or nonenzymes of length < 600
From 17289 rows Choosing random 2500 rows [13853  6938 15344 ... 12310  3503 16650]


In [8]:
NEnzyme_subset = subset_data(Nenzyme_data, Num, AAlen)

Data has 18086 enzymes or nonenzymes of length < 600
From 18086 rows Choosing random 2500 rows [ 1149 15547  2310 ... 11738 13354  1463]


In [9]:
Enzyme_subset.shape

(2500, 6)

In [10]:
Enzyme_subset.head()


Unnamed: 0,label,sequence,AAlen,enzyme_class,enzyme_subclass,id
17951,4.1.1.39,MSPQTETKASVGFKAGVKEYKLNYYTPDYDTKDTDILAAFRVSPQP...,143,4,1,ENZY17952
7956,2.7.1.39,MVRVSVPATSANLGPGFDTLGVALELRNVIEMDETGIDDVVIEVEG...,324,2,7,ENZY7957
19599,5.2.1.8,MGRSKCFMDISIGGELEGRIVIELYDDVVPKTAENFRLLCTGEKGL...,361,5,2,ENZY19600
1037,1.14.11.27,MPDIDTCPICVESPLEDSTTFNNIAWLQCDICNQWFHASCLKIPKI...,514,1,14,ENZY1038
15290,3.4.21.35,MWFLILFLALFLGGIDAAPPVQSRIIGGFNCEKNSQPWHVAVYRFA...,263,3,4,ENZY15291


In [11]:
NEnzyme_subset.shape

(2500, 4)

In [12]:
NEnzyme_subset.head()

Unnamed: 0,label,sequence,AAlen,id
1417,NENZY,MGVKLEVFRMTIYLTFPVAMFWIANQAEWFEDYVIQRKRELWPPEK...,76,NENZY1418
19054,NENZY,MMNWRALSQTKQDRIWSEVNKIIKWKPGSRCHHIIPPDPYRVFDIS...,162,NENZY19055
2865,NENZY,YCFQKINRPGESDEGCILDGKLYPFGEISRTENCYRCSCSRDAMRC...,90,NENZY2866
4454,NENZY,MKASVIVSVALGASMCLATTLAELPACSQACLQSMLGKAAELGCPP...,264,NENZY4455
13598,NENZY,MAKRLQAELSCPVCLDFFSCSISLSCTHVFCFDCIQRYILENHDFR...,263,NENZY13599


## TAPE files

### Labels

### Enzyme-NonEnzyme Labels

In [16]:
e = dict(zip(Enzyme_subset.id,[1 for i in range(Enzyme_subset.shape[0])]))
ne = dict(zip(NEnzyme_subset.id,[0 for i in range(NEnzyme_subset.shape[0])]))
e.update(ne)

In [17]:
e

{'ENZY17952': 1,
 'ENZY7957': 1,
 'ENZY19600': 1,
 'ENZY1038': 1,
 'ENZY15291': 1,
 'ENZY21379': 1,
 'ENZY4551': 1,
 'ENZY12340': 1,
 'ENZY219': 1,
 'ENZY15960': 1,
 'ENZY2589': 1,
 'ENZY18970': 1,
 'ENZY18707': 1,
 'ENZY6697': 1,
 'ENZY5398': 1,
 'ENZY6589': 1,
 'ENZY7220': 1,
 'ENZY5620': 1,
 'ENZY7279': 1,
 'ENZY10067': 1,
 'ENZY842': 1,
 'ENZY10906': 1,
 'ENZY15990': 1,
 'ENZY2375': 1,
 'ENZY6742': 1,
 'ENZY6856': 1,
 'ENZY19737': 1,
 'ENZY7057': 1,
 'ENZY10962': 1,
 'ENZY3764': 1,
 'ENZY12612': 1,
 'ENZY14481': 1,
 'ENZY490': 1,
 'ENZY6273': 1,
 'ENZY1106': 1,
 'ENZY13848': 1,
 'ENZY2160': 1,
 'ENZY20721': 1,
 'ENZY14032': 1,
 'ENZY21076': 1,
 'ENZY1122': 1,
 'ENZY9946': 1,
 'ENZY4080': 1,
 'ENZY21505': 1,
 'ENZY9226': 1,
 'ENZY3972': 1,
 'ENZY18360': 1,
 'ENZY14497': 1,
 'ENZY7036': 1,
 'ENZY12679': 1,
 'ENZY11605': 1,
 'ENZY3615': 1,
 'ENZY7525': 1,
 'ENZY859': 1,
 'ENZY17471': 1,
 'ENZY12736': 1,
 'ENZY9328': 1,
 'ENZY10051': 1,
 'ENZY3757': 1,
 'ENZY6826': 1,
 'ENZY10287': 1,


In [29]:
with open("data/MVP/NonEnzymes_Enzymes_labels.json", "w") as outfile:  
    json.dump(e, outfile)

### Enzyme Class labels

In [19]:
data = dict(zip(Enzyme_subset.id,Enzyme_subset.enzyme_class))

In [20]:
with open("data/MVP/Enzymes_Class_labels.json", "w") as outfile:  
    json.dump(data, outfile) 

### Enzyme SubClass labels

In [21]:
data = dict(zip(Enzyme_subset.id,Enzyme_subset.enzyme_subclass))

In [22]:
with open("data/MVP/Enzymes_SubClass_labels.json", "w") as outfile:  
    json.dump(data, outfile) 

### Fasta file

In [26]:

Enzyme_subset['id>'] = '>' + Enzyme_subset['id'].astype(str)
NEnzyme_subset['id>'] = '>' + NEnzyme_subset['id'].astype(str)


In [27]:
if not os.path.exists('data/MVP/tmp'):
    os.mkdir('data/MVP/tmp') 
i=0
for row in Enzyme_subset[['id>', 'sequence']].values:
    filename = 'data/MVP/tmp/enzyme{}.fasta'.format(i)
    row.tofile(filename, sep="\n", format="%s")
    i+=1
i=0
for row in NEnzyme_subset[['id>', 'sequence']].values:
    filename = 'data/MVP/tmp/Nenzyme{}.fasta'.format(i)
    row.tofile(filename, sep="\n", format="%s")
    i+=1


In [None]:
pwd

In [28]:
### combine all the indiv fasta a format for TAPE
!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/enzyme*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/Enzymes.fasta
!cat ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp/*fasta | sed 's/>/\n>/g' |  tr 'n' '\n' > ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/NonEnzymes_Enzymes.fasta
!rm -r ~/Documents/DSE/Capstone/git/protein_modeling_capstone/data/MVP/tmp




/bin/bash: /bin/cat: Argument list too long


## ESM files

In [30]:
Edata = dict(zip(Enzyme_subset.id,Enzyme_subset.sequence))
Ndata = dict(zip(NEnzyme_subset.id,NEnzyme_subset.sequence))
Edata.update(Ndata)

In [31]:
#Edata

In [32]:
with open('data/MVP/ESM_NonEnzymesEnzymes.json', 'w') as json_file:
    json.dump(Edata, json_file)


In [33]:
### Read in the saved dictionary of fasta into a list of tuples
with open('data/MVP/ESM_NonEnzymesEnzymes.json') as json_file: 
    data = json.load(json_file) 

In [34]:
data_list = []

for id in data:
    data_list.append((id, data[id]))

In [35]:
data_list


[('ENZY17952',
  'MSPQTETKASVGFKAGVKEYKLNYYTPDYDTKDTDILAAFRVSPQPGVPPEEAGAAVAAESSTGTWTTVWTDGLTSLDRYKGRCYQIEPVAGEENQFIAYVAYPLDLFEEGSVTNMLTSIVGXVFGFKALCALRLEDLRIPPA'),
 ('ENZY7957',
  'MVRVSVPATSANLGPGFDTLGVALELRNVIEMDETGIDDVVIEVEGAGAGALEDPGRNMVYQAARLVFQRLGYEPNGLLIREKVAIPVARGMGSSAAAIVGGLVAANALVQKRTGGPGLDREELLRMAVAIEGHPDNVTPALLGGFTVSCMDPDRGPLYLCFPPPRGLRAVVVMPEVQIKGRKTEQSRGVLPAQVSLRDAVYNLNRTALLVAAVAQGRTDLLRVAMQDRLHQPYRAALVPGMRSVFEAALSAGALGVALSGAGPSVIALVAESAEPVALAMEAAFQWAGSNARSLTMDLAREGARVLSGPGREQDMLDRPPHWG'),
 ('ENZY19600',
  'MGRSKCFMDISIGGELEGRIVIELYDDVVPKTAENFRLLCTGEKGLGPNTGVPLHYKGNRFHRVIKGFMIQGGDISANDGTGGESIYGLKFDDENFELKHERKGMLSMANSGPNTNGSQFFITTTRTSHLDGKHVVFGRVTKGMGVVRSIEHVSIEEQSCPSQDVVIHDCGEIPEGADDGICDFFKDGDVYPDWPIDLNESPAELSWWMETVDFVKAHGNEHFKKQDYKMALRKYRKALRYLDICWEKEGIDEETSTALRKTKSQIFTNSAACKLKFGDAKGALLDTEFAMRDEDNNVKALFRQGQAYMALNNVDAAAESLEKALQFEPNDAGIKKEYAAVMKKIAFRDNEEKKQYRKMFV'),
 ('ENZY1038',
  'MPDIDTCPICVESPLEDSTTFNNIAWLQCDICNQWFHASCLKIPKIEVNNLHSYHCEGCSKSHGPSIPKRKSKRSKVQIDYVALNDGDVF

In [None]:
### continue on with the ESM ipynb