In [1]:
import time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Make individual FASTA files for the training set

In [2]:
!python makeIndFastaFiles.py

# Create the similarity matrix for the training dataset

In [3]:
!python makeSeqSimilarityMatrix.py

*******************************************************************************************************************

# Model Simulation

In [1]:
!python similarity-model-check.py

93.27724933624268


# Model Results

In [2]:
!python plot-similarity-model-results.py

# Create the similarity matrix for the test dataset

In [5]:
!python makeSeqSimilarityMatrix-test.py

****

# Run a single instance of the model

In [16]:
import time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Comma-delimited matrix of pairwise scores for the training sequences;
# read by get_matrix().
similarity_matrix = '../similarity/similarity_matrix.txt'
# CSV whose second column is the integer class label; read by get_labels().
label_file = '../data/label/train_enz_label.csv'

# Comma-delimited matrix of scores for the test sequences; read by
# get_test_matrix(). Its columns are later indexed with training-set indices,
# so each column is assumed to correspond to one training sequence -- TODO confirm.
test_similarity_matrix = '../similarity/test_similarity_matrix.txt'


def _load_distance_matrix(path):
    """Read a comma-delimited matrix and return its element-wise reciprocal.

    The notebook feeds the result to a KNN with ``metric='precomputed'``, so
    the reciprocal of each stored similarity is what gets used as a distance.

    Parameters
    ----------
    path : str or path-like
        Text file readable by ``np.loadtxt`` with ``delimiter=','``.

    Returns
    -------
    numpy.ndarray
        Always 2-D. Entries that were exactly 0 in the file come back as
        ``1 / 1e-9`` (about 1e9) instead of ``inf``.
    """
    # ndmin=2 keeps a one-row file as shape (1, n) instead of (n,), so callers
    # can always slice columns with mat[:, idx].
    sim = np.loadtxt(path, delimiter=',', ndmin=2)
    # A stored value of exactly 0 would divide by zero below; swap in a tiny
    # positive number so the pair ends up very far apart but still finite.
    sim[sim == 0] = 1e-9
    return 1 / sim


def get_matrix(path=None):
    """Load the training matrix as reciprocal-of-similarity values.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the module-level ``similarity_matrix`` path,
        looked up at call time.

    Returns
    -------
    numpy.ndarray
        2-D array; see ``_load_distance_matrix`` for the zero handling.
    """
    return _load_distance_matrix(similarity_matrix if path is None else path)


def get_test_matrix(path=None):
    """Load the test matrix as reciprocal-of-similarity values.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the module-level ``test_similarity_matrix``
        path, looked up at call time.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per line of the file, even when the file holds
        a single row.
    """
    return _load_distance_matrix(test_similarity_matrix if path is None else path)


def get_labels(path=None):
    """Read the integer class label from the second column of a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the module-level ``label_file`` path, looked
        up at call time.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one entry per non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has no second comma-separated field.
    ValueError
        If the second field cannot be parsed by ``int()``.
    """
    source = label_file if path is None else path
    labels = []
    with open(source, 'rt') as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                # Blank lines (e.g. an extra newline at the end of the file)
                # carry no label and would otherwise fail on the column lookup.
                continue
            labels.append(int(record.split(',')[1]))
    # Explicit dtype so an empty file still yields an integer array.
    return np.array(labels, dtype=int)


def get_train_valid_split(mat, y, n_samples=115, n_train=80, random_state=None):
    """Randomly split a precomputed pairwise matrix into train and validation parts.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D pairwise matrix; only its first ``n_samples`` rows and columns
        are used.
    y : numpy.ndarray
        Labels aligned with the rows of ``mat``.
    n_samples : int, default 115
        Number of leading rows of ``mat`` that take part in the split.
    n_train : int, default 80
        Number of rows drawn (without replacement) for the training part; the
        remaining ``n_samples - n_train`` rows form the validation part.
    random_state : int, optional
        Seed for a private ``RandomState``. When omitted the global
        ``np.random`` state is used, so ``np.random.seed`` still controls it.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid, train_idx)`` where ``X_train``
        is ``(n_train, n_train)``, ``X_valid`` is
        ``(n_samples - n_train, n_train)`` and ``train_idx`` holds the chosen
        row indices in draw order.

    Raises
    ------
    ValueError
        From ``choice`` when ``n_train`` exceeds ``n_samples``.
    """
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    candidates = np.arange(n_samples)
    train_idx = sampler.choice(candidates, size=n_train, replace=False)

    # Set lookup instead of scanning the index array once per candidate.
    chosen = set(train_idx.tolist())
    valid_idx = [i for i in range(n_samples) if i not in chosen]

    # Columns are restricted to the training rows in both parts: with a
    # precomputed metric every sample is described by its values against the
    # training samples only.
    X_train = mat[train_idx, :][:, train_idx]
    X_valid = mat[valid_idx, :][:, train_idx]

    y_train = y[train_idx]
    y_valid = y[valid_idx]
    return X_train, X_valid, y_train, y_valid, train_idx


def get_model_metrics(y_valid, y_hat_valid, labels=(3,)):
    """Score predictions with precision, recall and accuracy.

    Parameters
    ----------
    y_valid : array-like
        True labels.
    y_hat_valid : array-like
        Predicted labels, aligned with ``y_valid``.
    labels : iterable of int, default (3,)
        Class label(s) that precision and recall are restricted to. They are
        passed to scikit-learn as ``labels=`` with ``average='micro'``.

    Returns
    -------
    tuple of float
        ``(precision, recall, accuracy)`` -- note the order. Accuracy is
        computed over every class, not just ``labels``.
    """
    focus = list(labels)
    acc = accuracy_score(y_valid, y_hat_valid)
    rec = recall_score(y_valid, y_hat_valid, labels=focus, average='micro')
    prec = precision_score(y_valid, y_hat_valid, labels=focus, average='micro')
    return prec, rec, acc

In [40]:
# Seed the global NumPy generator so the random train/validation split below --
# and every metric and prediction derived from it -- is identical on each
# Restart & Run All. Outputs recorded from earlier unseeded runs may differ.
np.random.seed(0)

# get the matrix (element-wise reciprocal of the stored similarities)
sim_mat = get_matrix()

# get the labels
y = get_labels()

# get train valid data; train_idx is kept because the test matrix must later
# be sliced to the same training columns
Xtrain, Xvalid, ytrain, yvalid, train_idx = get_train_valid_split(sim_mat, y)

# metric='precomputed' makes the classifier treat Xtrain as a ready-made
# (n_train, n_train) distance matrix instead of feature vectors
neigh = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='precomputed')
neigh.fit(Xtrain, ytrain)

# each row of Xvalid holds one validation sample's distances to the training samples
yhatvalid = neigh.predict(Xvalid)

In [41]:
# Score the validation predictions. get_model_metrics returns
# (precision, recall, accuracy); precision and recall are restricted to class
# label 3, while accuracy covers every class.
prec, rec, acc = get_model_metrics(yvalid, yhatvalid)

In [42]:
# Show the validation metrics in the order (precision, recall, accuracy).
prec,rec,acc

(0.75, 0.8571428571428571, 0.6285714285714286)

In [43]:
# Load the test-set matrix; get_test_matrix() returns the element-wise
# reciprocal of the stored values, the same transform get_matrix() applies.
test_sim_mat = get_test_matrix()

In [44]:
# Keep only the columns for the samples the classifier was fitted on, in the
# same order as train_idx, mirroring how Xvalid was built.
# Assumes the test matrix's columns follow the training matrix's row order -- TODO confirm.
Xtest = test_sim_mat[:, train_idx]

In [45]:
# Predict a class label for each row of Xtest with the fitted KNN.
ytest_pred = neigh.predict(Xtest)

In [46]:
# Display the predicted labels for the test rows.
ytest_pred

array([3, 3, 3, 3])

In [47]:
# Inspect the neighbours behind each test prediction: returns (distances, indices).
# The indices are positions within the fitted training subset (Xtrain), so map
# them through train_idx to recover rows of the full training matrix.
neigh.kneighbors(Xtest)

(array([[0.01089004, 0.01211138, 0.0121622 , 0.01226858, 0.01229166],
        [0.01016994, 0.01052321, 0.01055977, 0.01118593, 0.01149584],
        [0.01019462, 0.01055398, 0.01058672, 0.01121617, 0.01149584],
        [0.01194543, 0.01206593, 0.01214536, 0.01214919, 0.01217523]]),
 array([[32, 17, 12, 78, 41],
        [13, 12, 41, 73, 50],
        [13, 12, 41, 73, 50],
        [12, 17, 33, 13, 41]]))