In [1]:
import torch
import numpy as np
import math, time

import warnings
warnings.filterwarnings('ignore')

from sklearn import datasets
from torch.autograd import Variable

In [2]:
def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in
    # test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

def euclidean_distance(x1, x2):
    """ Calculates the l2 distance between two vectors """
    distance = 0
    # Squared distance between each coordinate
    for i in range(len(x1)):
        distance += pow((x1[i] - x2[i]), 2)
    return math.sqrt(distance)

def normalize(X, axis=-1, order=2):
    """ Normalize the dataset X """
    l2 = np.atleast_1d(np.linalg.norm(X, order, axis))
    l2[l2 == 0] = 1
    return X / np.expand_dims(l2, axis)

def accuracy_score(y_true, y_pred):
    """ Compare y_true to y_pred and return the accuracy """
    accuracy= np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

def to_categorical(x, n_col=None):
    """ One-hot encoding of nominal values """
    if not n_col:
        n_col = np.amax(x) + 1
    one_hot = np.zeros((x.shape[0], n_col))
    one_hot[np.arange(x.shape[0]), x] = 1
    return one_hot

In [3]:
# class KNN():
#     """ K Nearest Neighbors classifier.

#     Parameters:
#     -----------
#     k: int
#         The number of closest neighbors that will determine the class of the 
#         sample that we wish to predict.
#     """
#     def __init__(self, k=5):
#         self.k = k

#     def _vote(self, neighbor_labels):
#         """ Return the most common class among the neighbor samples """
#         counts = np.bincount(neighbor_labels.astype('int'))
#         return counts.argmax()

#     def predict(self, X_test, X_train, y_train):
#         y_pred = np.empty(X_test.shape[0])
#         # Determine the class of each sample
#         for i, test_sample in enumerate(X_test):
#             # Sort the training samples by their distance to the test sample and get the K nearest
#             idx = np.argsort([euclidean_distance(test_sample, x) for x in X_train])[:self.k]
#             # Extract the labels of the K nearest neighboring training samples
#             k_nearest_neighbors = np.array([y_train[i] for i in idx])
#             # Label sample as the most common class label
#             count= self._vote(k_nearest_neighbors)
#             y_pred[i] = count

#         return y_pred

# data = datasets.load_iris()
# X = data.data
# y = data.target
# X_train, X_test, y_train, y_test = train_test_split(normalize(X), y, test_size=0.2)

# clf = KNN(k=5)
# y_pred = clf.predict(X_test, X_train, y_train)
# accuracy_his = accuracy_score(y_test, y_pred)

# from supervised_learning.knn import KNN

# device = torch.device("cuda:0" if (torch.cuda.is_available() and 1 > 0) else "cpu")
# X_train, X_test, y_train, y_test= Variable(torch.from_numpy(X_train)), Variable(torch.from_numpy(X_test)), Variable(torch.from_numpy(y_train)), Variable(torch.from_numpy(y_test))

# classifier= KNN(k= 5)#.to(device)
# y_pred_mine = classifier.predict(X_test, X_train, y_train)

# def accuracy_score(y_true, y_pred):
#     """ Compare y_true to y_pred and return the accuracy """
#     accuracy = torch.sum(y_true == y_pred, axis=0).item() / len(y_true)
#     return accuracy

# accuracy = accuracy_score(y_test, y_pred_mine)
# print ("Accuracy his:", accuracy_his)
# print ("Accuracy mine:", accuracy)

In [4]:
# from mlfromscratch.deep_learning.activation_functions import Sigmoid

# class LogisticRegression():
#     """ Logistic Regression classifier.
#     Parameters:
#     -----------
#     learning_rate: float
#         The step length that will be taken when following the negative gradient during
#         training.
#     gradient_descent: boolean
#         True or false depending if gradient descent should be used when training. If
#         false then we use batch optimization by least squares.
#     """
#     def __init__(self, learning_rate=.1, gradient_descent=True):
#         self.param = None
#         self.learning_rate = learning_rate
#         self.gradient_descent = gradient_descent
#         self.sigmoid = Sigmoid()

#     def _initialize_parameters(self, X):
#         n_features = np.shape(X)[1]
#         # Initialize parameters between [-1/sqrt(N), 1/sqrt(N)]
#         limit = 1 / math.sqrt(n_features)
#         self.param = np.random.uniform(-limit, limit, (n_features,))

#     def fit(self, X, y, n_iterations=4000):
#         self._initialize_parameters(X)
#         # Tune parameters for n iterations
#         for i in range(n_iterations):
#             # Make a new prediction
#             y_pred = self.sigmoid(X.dot(self.param))
#             if self.gradient_descent:
#                 # Move against the gradient of the loss function with
#                 # respect to the parameters to minimize the loss
#                 diff= -(y -y_pred)
#                 self.param -= self.learning_rate * diff.dot(X)
#             else:
#                 # Make a diagonal matrix of the sigmoid gradient column vector
#                 diag_gradient = make_diagonal(self.sigmoid.gradient(X.dot(self.param)))
#                 # Batch opt:
#                 self.param = np.linalg.pinv(X.T.dot(diag_gradient).dot(X)).dot(X.T).dot(diag_gradient.dot(X).dot(self.param) + y - y_pred)

#     def predict(self, X):
#         y_pred = np.round(self.sigmoid(X.dot(self.param))).astype(int)
#         return y_pred

# data = datasets.load_iris()
# X = normalize(data.data[data.target != 0])
# y = data.target[data.target != 0]
# y[y == 1] = 0
# y[y == 2] = 1

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

# clf = LogisticRegression(gradient_descent=True)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print ("Accuracy:", accuracy)

# from supervised_learning.logisticregression import LogisticRegression

# device = torch.device("cuda:0" if (torch.cuda.is_available() and 1 > 0) else "cpu")
# X_train, X_test, y_train, y_test= Variable(torch.from_numpy(X_train)), Variable(torch.from_numpy(X_test)), Variable(torch.from_numpy(y_train)), Variable(torch.from_numpy(y_test))
# clf= LogisticRegression(gradient_descent= True).to(device)
# clf.fit(X_train, y_train)
# y_pred= clf.predict(X_test)

# def accuracy_score(y_true, y_pred):
#     """ Compare y_true to y_pred and return the accuracy """
#     accuracy = torch.sum(y_true == y_pred, axis=0).item() / len(y_true)
#     return accuracy

# accuracy = accuracy_score(y_test, y_pred)
# print ("Accuracy:", accuracy)

In [5]:
from mlfromscratch.deep_learning.activation_functions import Sigmoid, Softmax
from mlfromscratch.deep_learning.loss_functions import CrossEntropy

class MultilayerPerceptron():
    """Multilayer Perceptron classifier. A fully-connected neural network with one hidden layer.
    Unrolled to display the whole forward and backward pass.

    Parameters:
    -----------
    n_hidden: int:
        The number of processing nodes (neurons) in the hidden layer. 
    n_iterations: float
        The number of training iterations the algorithm will tune the weights for.
    learning_rate: float
        The step length that will be used when updating the weights.
    """
    def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
        self.n_hidden = n_hidden
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.hidden_activation = Sigmoid()
        self.output_activation = Softmax()
        self.loss = CrossEntropy()

    def _initialize_weights(self, X, y):
        n_samples, n_features = X.shape
        _, n_outputs = y.shape
        # Hidden layer
        limit   = 1 / math.sqrt(n_features)
#         print((n_features, self.n_hidden))
        self.W  = np.random.uniform(-limit, limit, (n_features, self.n_hidden))
        self.w0 = np.zeros((1, self.n_hidden))
        # Output layer
        limit   = 1 / math.sqrt(self.n_hidden)
#         print((self.n_hidden, n_outputs))
        self.V  = np.random.uniform(-limit, limit, (self.n_hidden, n_outputs))
        self.v0 = np.zeros((1, n_outputs))

    def fit(self, X, y):

        self._initialize_weights(X, y)

        for i in range(self.n_iterations):

            # ..............
            #  Forward Pass
            # ..............

            # HIDDEN LAYER
            hidden_input = X.dot(self.W) + self.w0
            hidden_output = self.hidden_activation(hidden_input)
            # OUTPUT LAYER
#             print('Output Layer: ', hidden_output.shape, self.V.shape)
            output_layer_input = hidden_output.dot(self.V) + self.v0
            y_pred = self.output_activation(output_layer_input)

            # ...............
            #  Backward Pass
            # ...............

            # OUTPUT LAYER
            # Grad. w.r.t input of output layer
            grad_wrt_out_l_input = self.loss.gradient(y, y_pred) * self.output_activation.gradient(output_layer_input)
            grad_v = hidden_output.T.dot(grad_wrt_out_l_input)
            grad_v0 = np.sum(grad_wrt_out_l_input, axis=0, keepdims=True)
            # HIDDEN LAYER
            # Grad. w.r.t input of hidden layer
            grad_wrt_hidden_l_input = grad_wrt_out_l_input.dot(self.V.T) * self.hidden_activation.gradient(hidden_input)
            grad_w = X.T.dot(grad_wrt_hidden_l_input)
            grad_w0 = np.sum(grad_wrt_hidden_l_input, axis=0, keepdims=True)
            
#             print(grad_v.shape, self.V.shape)
#             print(grad_v0.shape, self.v0.shape)
#             print(grad_w.shape, self.W.shape)
#             print(grad_w0.shape, self.w0.shape)
            
            # Update weights (by gradient descent)
            # Move against the gradient to minimize loss
            self.V  -= self.learning_rate * grad_v
            self.v0 -= self.learning_rate * grad_v0
            self.W  -= self.learning_rate * grad_w
            self.w0 -= self.learning_rate * grad_w0

    # Use the trained model to predict labels of X
    def predict(self, X):
        # Forward pass:
        hidden_input = X.dot(self.W) + self.w0
        hidden_output = self.hidden_activation(hidden_input)
        output_layer_input = hidden_output.dot(self.V) + self.v0
        y_pred = self.output_activation(output_layer_input)
        return y_pred


data = datasets.load_iris()
X = normalize(data.data[data.target != 0])
y = data.target[data.target != 0]
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, seed=1)
clf= MultilayerPerceptron(n_hidden= 20, n_iterations= 10000, learning_rate= 1e-4)
clf.fit(X_train, y_train)
y_pred = np.argmax(clf.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print ("Accuracy:", accuracy)

from supervised_learning.multilayerperceptron import MultiLayerPerceptron

device = torch.device("cuda:0" if (torch.cuda.is_available() and 1 > 0) else "cpu")
X_train, X_test, y_train, y_test= Variable(torch.from_numpy(X_train)), Variable(torch.from_numpy(X_test)), Variable(torch.from_numpy(y_train)), Variable(torch.from_numpy(y_test))

clf= MultiLayerPerceptron(n_hidden= 20, n_iterations= 10000, learning_rate= 1e-4).to(device)
clf.fit(X_train, y_train)

y_pred = torch.argmax(clf.predict(X_test), dim=1)
def accuracy_score(y_true, y_pred):
    """ Compare y_true to y_pred and return the accuracy """
    accuracy = torch.sum(y_true == y_pred, axis=0).item() / len(y_true)
    return accuracy

accuracy = accuracy_score(y_test, y_pred)
print ("Accuracy:", accuracy)

Accuracy: 0.41379310344827586
Accuracy: 0.4482758620689655
