In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import json
import pickle

In [3]:
with open("./ECPRED_class_labels.json") as f:
    ecpred_class_labels = json.load(f)

In [4]:
ecpred_class_labels

{'Q65GK1': '2',
 'P16616': '2',
 'Q1LU25': '2',
 'Q7VRM4': '2',
 'Q491Z6': '2',
 'Q7WKM1': '2',
 'Q7W785': '2',
 'A9IKF3': '2',
 'Q2KIN5': '2',
 'B2S842': '2',
 'C0RFD1': '2',
 'B0CID9': '2',
 'B8D8B0': '2',
 'B4E8C4': '2',
 'Q62LC0': '2',
 'A4XK06': '2',
 'A0RNT3': '2',
 'Q8KCJ4': '2',
 'A7FSG8': '2',
 'P87214': '4',
 'Q0VTD7': '1',
 'Q89SC2': '1',
 'Q57BW7': '1',
 'C0REI6': '1',
 'P63851': '1',
 'B1JW43': '1',
 'Q0BD83': '1',
 'A3MI45': '1',
 'A1V2G0': '1',
 'B0T8D1': '1',
 'Q1QTJ9': '1',
 'A7MKX5': '1',
 'A9KC01': '1',
 'A7ZPN6': '1',
 'B7MHT7': '1',
 'B7NPX2': '1',
 'B5YZY1': '1',
 'C4ZX09': '1',
 'Q2A3H9': '1',
 'A0Q6H6': '1',
 'A4IY09': '1',
 'Q42840': '1',
 'B5XVR1': '1',
 'Q5ZW72': '1',
 'Q1H4H0': '1',
 'P36552': '1',
 'Q7N6Z9': '1',
 'B7V0Q9': '1',
 'Q1MDJ5': '1',
 'C3PM25': '1',
 'Q1RGQ3': '1',
 'Q9ZC86': '1',
 'A8GU63': '1',
 'B5F0I0': '1',
 'B5R4G0': '1',
 'Q5PI28': '1',
 'B5BB50': '1',
 'Q8Z4U8': '1',
 'B8E3K7': '1',
 'A5F4C4': '1',
 'C3LPC8': '1',
 'Q8DDD5': '1',
 'Q8PF76

In [5]:
a = np.load("./test_ecpred_from_esm.npz", allow_pickle=True)

In [6]:
a_data = {}

for data in a:
    a_data[data] = a[data]

#### Verify the test data is not part of the train data

In [60]:
c = np.load("./all_ecpred_from_esm.npz", allow_pickle=True)

In [123]:
c['B7M2G1']

numpy.ndarray

In [61]:
c_data = {}

for data in c:
    c_data[data] = c[data]

In [64]:
intersection = set(list(a_data.keys())) & set(list(c_data.keys()))

In [65]:
intersection

set()

In [7]:
test_data = pd.DataFrame(a_data.items(), columns=['identifier', 'embeddings'])

In [9]:
test_data['classes'] = test_data['identifier'].map(ecpred_class_labels)

In [78]:
test_data['enzyme_non_enzyme'] = "1"

In [79]:
test_data

Unnamed: 0,identifier,embeddings,classes,enzyme_non_enzyme
0,Q8HVP5,"[0.033683684, 0.049727384, -0.016321301, 0.039...",1,1
1,Q8HVS5,"[0.033807687, 0.04896432, -0.015200496, 0.0365...",1,1
2,P0C386,"[0.018022297, 0.02248324, -0.06277644, 0.06475...",1,1
3,Q8HVN4,"[0.033129636, 0.049638327, -0.010831154, 0.034...",1,1
4,Q0B4Z6,"[0.11726774, 0.2692161, -0.05544941, 0.0621133...",2,1
...,...,...,...,...
355,Q8Z080,"[0.049787696, 0.09494831, -0.014217152, 0.0826...",2,1
356,Q3V7K1,"[0.04744051, 0.07840378, -0.02636614, 0.056081...",2,1
357,Q8Z4K6,"[0.04725945, 0.07858272, -0.023864051, 0.05558...",2,1
358,Q3BZR9,"[0.0028418622, 0.08328892, 0.015781334, 0.0413...",2,1


In [24]:
b = np.load("./all_deeppre_from_esm.npz", allow_pickle=True)

In [25]:
b_data = {}

for data in b:
    b_data[data] = b[data]

In [126]:
b['ENZY12425']

array([ 0.12746994,  0.27281544,  0.06692323, ...,  0.03192677,
       -0.12083121,  0.10100751], dtype=float32)

In [31]:
with open("./1024_NonEnzymes_Enzymes_labels.json") as f:
    nonenzyme_enzyme_labels = json.load(f)

In [32]:
ne_e_test_data = pd.DataFrame(b_data.items(), columns=['identifier', 'embeddings'])

In [38]:
ne_e_test_data['enzyme_nonenzyme'] = ne_e_test_data['identifier'].map(nonenzyme_enzyme_labels)

In [39]:
ne_e_test_data

Unnamed: 0,identifier,embeddings,enzyme_nonenzyme
0,ENZY12425,"[0.12746994, 0.27281544, 0.06692323, 0.1448938...",1
1,ENZY3790,"[0.12582903, 0.11324516, 0.11739114, 0.1498377...",1
2,ENZY5288,"[0.13101612, 0.30415857, 0.17695493, -0.022740...",1
3,ENZY13326,"[0.026205149, 0.21668276, 0.05241565, 7.565544...",1
4,ENZY2952,"[0.07479797, -0.0067801224, 0.077018805, 0.008...",1
...,...,...,...
26330,ENZY11657,"[-0.02991949, 0.19900143, 0.07266715, 0.152193...",1
26331,ENZY1891,"[0.1251773, 0.10361989, 0.052353505, -0.006737...",1
26332,ENZY21537,"[-0.070309326, 0.012636613, -0.058369204, 0.15...",1
26333,ENZY10759,"[0.12393214, 0.13972586, 0.039379448, 0.215272...",1


In [80]:
x_test = list(test_data['embeddings'])
y_test_class = list(test_data['classes'])
y_test_binary = list(test_data['enzyme_non_enzyme'])

In [75]:
x_test

[array([ 0.03368368,  0.04972738, -0.0163213 , ..., -0.13593875,
        -0.03433039, -0.06656401], dtype=float32),
 array([ 0.03380769,  0.04896432, -0.0152005 , ..., -0.13321853,
        -0.03336693, -0.0679692 ], dtype=float32),
 array([ 0.0180223 ,  0.02248324, -0.06277644, ..., -0.1031374 ,
        -0.01176274, -0.0366175 ], dtype=float32),
 array([ 0.03312964,  0.04963833, -0.01083115, ..., -0.12932996,
        -0.03356467, -0.07291717], dtype=float32),
 array([ 0.11726774,  0.2692161 , -0.05544941, ..., -0.02116028,
        -0.02082298,  0.07637652], dtype=float32),
 array([ 0.0784464 ,  0.28879067, -0.04174797, ..., -0.08063894,
        -0.05886092,  0.06509774], dtype=float32),
 array([ 0.14412081,  0.33220333, -0.1289441 , ..., -0.05485492,
        -0.10075965, -0.00375912], dtype=float32),
 array([ 0.13952522,  0.3299176 , -0.1396554 , ..., -0.05689486,
        -0.10297772,  0.0057671 ], dtype=float32),
 array([ 0.1049448 ,  0.24859513, -0.11302238, ..., -0.06344181,
       

In [141]:
x_binary_test = np.array(list(ne_e_test_data['embeddings']))
y_binary_test = list(ne_e_test_data['enzyme_nonenzyme'])

  x_binary_test = np.array(list(ne_e_test_data['embeddings']))


In [142]:
x_binary_test.shape

(26335,)

In [150]:
x_binary_test = x_binary_test.reshape(1, -1)

In [153]:
x_binary_test.shape

(1, 26335)

In [164]:
x_binary_test = list(ne_e_test_data['embeddings'])

In [165]:
x_binary_test

[array([ 0.12746994,  0.27281544,  0.06692323, ...,  0.03192677,
        -0.12083121,  0.10100751], dtype=float32),
 array([ 0.12582903,  0.11324516,  0.11739114, ..., -0.10598259,
        -0.09380031,  0.09344083], dtype=float32),
 array([ 0.13101612,  0.30415857,  0.17695493, ..., -0.1529716 ,
        -0.10064235,  0.05371748], dtype=float32),
 array([ 0.02620515,  0.21668276,  0.05241565, ..., -0.0518866 ,
        -0.02595983, -0.06790142], dtype=float32),
 array([ 0.07479797, -0.00678012,  0.0770188 , ..., -0.09401439,
        -0.16186218, -0.05884096], dtype=float32),
 array([ 0.16155864,  0.15980445, -0.0059483 , ..., -0.12044955,
        -0.1110381 ,  0.25093448], dtype=float32),
 array([-0.08676058,  0.17472704, -0.14077938, ..., -0.01626236,
        -0.05927157, -0.01815291], dtype=float32),
 array([-0.00168251,  0.22733687, -0.00960301, ..., -0.17274092,
         0.04870052,  0.0334609 ], dtype=float32),
 array([ 0.05386835,  0.3409739 , -0.05055678, ..., -0.09174938,
       

In [169]:
train, test = train_test_split(ne_e_test_data, test_size=0.2)

In [184]:
x_e_train = train['embeddings']
x_e_test = test['embeddings']

In [181]:
y_e_train = train["enzyme_nonenzyme"]
y_e_test = test["enzyme_nonenzyme"]

In [185]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_e_train, y_e_train)

ValueError: setting an array element with a sequence.

### Naive Bayes

#### Class Classification

In [110]:
infile = open("./gb_enzyme_class_esm.pkl",'rb')

In [111]:
model = pickle.load(infile)



In [112]:
y_pred = model.predict(x_test)

In [113]:
accuracy_score(y_test_class, y_pred)

0.3527777777777778

#### Enzyme Non Enzyme Classification

In [166]:
infile = open("./gb_enzyme_non_enzyme_esm.pkl",'rb')

In [167]:
model = pickle.load(infile)



In [168]:
y_pred = model.predict(x_binary_test)

  return array(a, dtype, copy=False, order=order)


ValueError: Expected 2D array, got 1D array instead:
array=[array([ 0.12746994,  0.27281544,  0.06692323, ...,  0.03192677,
        -0.12083121,  0.10100751], dtype=float32)
 array([ 0.12582903,  0.11324516,  0.11739114, ..., -0.10598259,
        -0.09380031,  0.09344083], dtype=float32)
 array([ 0.13101612,  0.30415857,  0.17695493, ..., -0.1529716 ,
        -0.10064235,  0.05371748], dtype=float32)                ...
 array([-0.07030933,  0.01263661, -0.0583692 , ..., -0.08494956,
         0.01382006,  0.01726289], dtype=float32)
 array([ 0.12393214,  0.13972586,  0.03937945, ...,  0.01177074,
        -0.0671177 , -0.04921646], dtype=float32)
 array([ 0.07451792,  0.0815477 , -0.00480333, ..., -0.03443815,
        -0.07907216,  0.0944355 ], dtype=float32)               ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [82]:
accuracy_score(y_test_binary, y_pred)

0.25833333333333336

###  Random Forest

#### Class Classification

In [52]:
infile = open("./rf_enzyme_class_esm.pkl",'rb')
model = pickle.load(infile)



In [54]:
y_pred = model.predict(x_test)

In [55]:
accuracy_score(y_test_class, y_pred)

0.4361111111111111

#### Enzyme Non Enzyme Classification

In [83]:
infile = open("./rf_enzyme_non_enzyme_esm.pkl",'rb')
model = pickle.load(infile)



In [84]:
y_pred = model.predict(x_test)

In [85]:
accuracy_score(y_test_binary, y_pred)

  score = y_true == y_pred


0.0

### KNN

#### Class Classification

In [56]:
infile = open("./knn_enzyme_class_esm.pkl",'rb')
model = pickle.load(infile)



In [57]:
y_pred = model.predict(x_test)
accuracy_score(y_test_class, y_pred)

0.5333333333333333

#### Enzyme Non Enzyme Classification

In [116]:
infile = open("./knn_enzyme_non_enzyme_esm.pkl",'rb')
model = pickle.load(infile)



In [120]:
y_pred = model.predict(x_test)

In [121]:
accuracy_score(y_test_binary, y_pred)

  score = y_true == y_pred


0.0

### MLP

#### Class Classification

In [58]:
infile = open("./mlp_enzyme_class_esm.pkl",'rb')
model = pickle.load(infile)



In [59]:
y_pred = model.predict(x_test)
accuracy_score(y_test_class, y_pred)

0.6388888888888888

#### Enzyme Non Enzyme Classification

In [88]:
infile = open("./mlp_enzyme_non_enzyme_esm.pkl",'rb')
model = pickle.load(infile)



In [89]:
y_pred = model.predict(x_test)
accuracy_score(y_test_binary, y_pred)

  score = y_true == y_pred


0.0

### SVC

#### Class Classification

#### Enzyme Non Enzyme Classification

In [90]:
infile = open("./svc_enzyme_non_enzyme_esm.pkl",'rb')
model = pickle.load(infile)



In [None]:
y_pred = model.predict(x_test)
accuracy_score(y_test_binary, y_pred)