In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

# Each entry is (name, integer). Later cells use the name to build the dataset
# file name and the per-type column names, and convert the integer with int().
# NOTE(review): the meaning of the integers (150 / 130 / 70) is not visible
# in this notebook — confirm with the author.
types = [
    ('IC50', 150),
    ('Ki', 130),
    ('Kd', 70),
]

In [None]:
def classify(classifier, train_x, train_y, validate_x, validate_y, test_x, test_y):
    """Fit ``classifier`` and print cross-validation and test-set metrics.

    Steps, in order:
      1. fit the estimator on ``train_x`` / ``train_y``;
      2. run ``cross_val_score`` with ``cv=10`` on ``validate_x`` /
         ``validate_y`` and print the mean and twice the standard deviation;
      3. predict on ``test_x`` and print the fraction of exact matches with
         ``test_y`` plus micro-averaged precision, recall and F1.

    Parameters
    ----------
    classifier : object providing ``fit`` and ``predict``.
    train_x, train_y : data passed to ``fit``.
    validate_x, validate_y : data passed to ``cross_val_score``.
    test_x : data passed to ``predict``.
    test_y : true labels; must provide ``tolist()``.

    Returns
    -------
    None. Every result is printed.

    Raises
    ------
    ZeroDivisionError
        If ``predict`` returns an empty sequence.
    """
    fitted = classifier.fit(train_x, train_y)

    # NOTE(review): the cross-validation below only ever sees the validation
    # split; presumably it re-fits the estimator per fold, so it would not
    # measure the fit made on train_x above — confirm this is intended.
    cv_scores = cross_val_score(estimator=fitted, X=validate_x, y=validate_y, cv=10)
    print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

    predicted = fitted.predict(test_x)
    expected = test_y.tolist()

    # Plain accuracy: number of positions where prediction equals truth,
    # divided by the number of predictions.
    n_correct = sum(1 for i in range(len(predicted)) if predicted[i] == expected[i])
    accuracy = n_correct / len(predicted)
    print("Classifier Score: %0.2f" % accuracy)

    micro_reports = (
        ('Precision Score - micro : %0.2f', precision_score),
        ('Recall Score - micro : %0.2f', recall_score),
        ('F1 Score - micro : %0.2f', f1_score),
    )
    for template, metric in micro_reports:
        print(template % metric(expected, predicted, average='micro'))

In [None]:
# Select which entry of `types` this run analyses (index 0 = first entry).
# `x` is the type name, used below for the file name and column names;
# `y` is the entry's integer value coerced with int().
type_ = types[0]
x, y = type_[0], int(type_[1])
print('-----', x, '-----')

In [None]:
# Load the binned dataset for the selected type; the first line of the file
# is skipped (skiprows=1).
df = pd.read_csv('../datasets/protLigBindDB_'+x+'_binned',skiprows=1)

# Remove the four columns that must not reach the classifiers. `axis` is
# passed by keyword: pandas 2.0 no longer accepts it positionally
# (`df.drop(label, 1)` raises TypeError there). A single call with a list
# drops the same columns and still raises KeyError if any is missing.
df = df.drop(
    ['PDB', x, 'log(' + x + ')', 'Log_Binned_Binding_Affinity'],
    axis=1,
)

In [None]:
# Fixed seed so that Restart & Run All reproduces the same partitions;
# without it every run shuffles differently and the reported scores drift.
RANDOM_STATE = 42

# 60% train, then the held-out 40% is halved into validation and test
# (20% / 20% of the full frame).
train, rest = train_test_split(df, test_size=0.4, random_state=RANDOM_STATE)
validate, test = train_test_split(rest, test_size=0.5, random_state=RANDOM_STATE)

In [None]:
def _features_and_target(frame, target='Binned_Binding_Affinity'):
    """Split ``frame`` into ``(features, labels)`` NumPy arrays.

    ``features`` holds every column except ``target`` (2-D); ``labels`` is the
    ``target`` column as a 1-D array.

    ``.values`` replaces ``DataFrame.as_matrix()``, which was removed in
    pandas 1.0, and ``axis`` is passed by keyword because pandas 2.0 rejects
    the positional form ``drop(label, 1)``.
    """
    features = frame.drop(target, axis=1).values
    labels = frame[target].values
    return features, labels


train_x, train_y = _features_and_target(train)
validate_x, validate_y = _features_and_target(validate)
test_x, test_y = _features_and_target(test)

# KNeighborsClassifier
Neighbors-based classification is a type of instance-based learning or non-generalizing learning: it does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the nearest neighbors of each point: a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.

In [None]:
print("Nearest Neighbors")
from sklearn.neighbors import KNeighborsClassifier

# 10 uniformly weighted neighbours, brute-force search, Minkowski metric
# with p=2, single job.
classifier = KNeighborsClassifier(
    n_neighbors=10,
    weights='uniform',
    algorithm='brute',
    metric='minkowski',
    p=2,
    n_jobs=1,
)
classify(
    classifier=classifier,
    train_x=train_x, train_y=train_y,
    validate_x=validate_x, validate_y=validate_y,
    test_x=test_x, test_y=test_y,
)

# SVC
The implementation is based on libsvm. The fit time complexity is more than quadratic in the number of samples, which makes it hard to scale to datasets with more than a few tens of thousands of samples.
The multiclass support is handled according to a one-vs-one scheme.

In [None]:
print("Linear SVM")
from sklearn.svm import SVC

# Linear-kernel SVC with C=1; probability estimates off, shrinking on,
# and max_iter=-1.
classifier = SVC(
    C=1,
    kernel='linear',
    probability=False,
    shrinking=True,
    max_iter=-1,
)
classify(
    classifier=classifier,
    train_x=train_x, train_y=train_y,
    validate_x=validate_x, validate_y=validate_y,
    test_x=test_x, test_y=test_y,
)

In [None]:
print("RBF SVM")
from sklearn.svm import SVC

# Fix: this cell is labelled "RBF SVM" but previously built the SVC with
# kernel='linear' (copied from the Linear SVM cell), so it silently repeated
# the linear experiment. Use the RBF kernel the label promises.
classifier = SVC(
    C=1,
    kernel='rbf',
    probability=False,
    shrinking=True,
    max_iter=-1,
)
classify(
    classifier=classifier,
    train_x=train_x, train_y=train_y,
    validate_x=validate_x, validate_y=validate_y,
    test_x=test_x, test_y=test_y,
)

# Random Forest Classifier
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size, but the samples are drawn with replacement if bootstrap=True (default).

In [None]:
print("Random Forest")
from sklearn.ensemble import RandomForestClassifier

# 20 entropy-criterion trees with no depth or leaf limits and bootstrap
# sampling. random_state is pinned so that the bootstrap draws, and
# therefore the printed scores, are the same on every run.
classifier = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    max_features=None,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_leaf_nodes=None,
    bootstrap=True,
    random_state=42,
)
classify(
    classifier=classifier,
    train_x=train_x, train_y=train_y,
    validate_x=validate_x, validate_y=validate_y,
    test_x=test_x, test_y=test_y,
)

# MLPClassifier
The MLPClassifier class implements a multi-layer perceptron (MLP) algorithm that trains using backpropagation.
MLP trains on two arrays: array X of size (n_samples, n_features), which holds the training samples represented as floating point feature vectors; and array y of size (n_samples,), which holds the target values (class labels) for the training samples.

In [None]:
print("Neural Net")
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 logistic units, trained with Adam for at most
# 200 iterations with shuffling enabled. random_state is pinned so that
# weight initialisation and shuffling, and therefore the printed scores,
# are the same on every run.
classifier = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='logistic',
    solver='adam',
    max_iter=200,
    shuffle=True,
    random_state=42,
)
classify(
    classifier=classifier,
    train_x=train_x, train_y=train_y,
    validate_x=validate_x, validate_y=validate_y,
    test_x=test_x, test_y=test_y,
)