In [30]:
# python 3.6.3
# import statements
import os
import numpy as np
import math
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import itertools
import time

In [31]:
# global declarations
np.set_printoptions(precision=2, suppress=True) # remove scientific notation in printing

In [32]:
# return all unique elements and number of occurence from the input array, as a dict
def make_dict_counts(Y, start, end):
    unique, counts = np.unique(Y, return_counts=True)
    dict_counts = dict(zip(unique, counts))
        
    for key in range(start, end):
        if not key in dict_counts:
            dict_counts[key] = 0
            
    return dict_counts

In [33]:
# read file and split data on X and Y
def get_X_Y_counts(filename):
    data = np.loadtxt(open(filename, "rb"), delimiter=";", skiprows=1)
    X, Y = np.split(data, [-1], axis=1)
    Y = Y.astype(int)
    
    dict_counts = make_dict_counts(Y, 0, 10)
            
    return X, Y, dict_counts

In [34]:
# make train and test set by taking equal percentage of examples from each class
def make_train_test(X, Y, dict_counts, test_perc):
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    
    dict_counts_temp = dict_counts.copy()
    
    for i in range(0, X.shape[0]):
        if dict_counts_temp.get(Y[i][0], 0) > test_perc * dict_counts.get(Y[i][0], 0):
            X_train.append(X[i])
            Y_train.append(Y[i])
        else:
            X_test.append(X[i])
            Y_test.append(Y[i])
        dict_counts_temp[Y[i][0]] -= 1
    
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    
    dict_counts_train = make_dict_counts(Y_train, 1, 10)
    dict_counts_test = make_dict_counts(Y_test, 1, 10)
    
    return X_train, Y_train, X_test, Y_test, dict_counts_train, dict_counts_test

In [35]:
# concatinate data for red and white wine
def concatinate_train_test(X_red_train, Y_red_train, X_red_test, Y_red_test,
                           X_white_train, Y_white_train, X_white_test, Y_white_test):
    X_train = np.append(X_red_train, X_white_train, axis=0)
    Y_train = np.append(Y_red_train, Y_white_train, axis=0)
    X_test = np.append(X_red_test, X_white_test, axis=0)
    Y_test = np.append(Y_red_test, Y_white_test, axis=0)
    
    return X_train, Y_train, X_test, Y_test;

In [36]:
# print info about shapes of red and white wine, and in total
def print_shapes(X, Y, X_train, Y_train, X_test, Y_test, red_white):
    print(str(red_white) + " wine:")
    print("X shape: " + str(X.shape))
    print("Y shape: " + str(Y.shape))
    print("Train set X shape: " + str(X_train.shape))
    print("Train set Y shape: " + str(Y_train.shape))
    print("Test set X shape: " + str(X_test.shape))
    print("Test set Y shape: " + str(Y_test.shape))

In [37]:
# print shapes of training and test sets
def print_result_shapes(X_train, Y_train, X_test, Y_test):
    print("Split data:")
    print("Train set X shape: " + str(X_train.shape))
    print("Train set Y shape: " + str(Y_train.shape))
    print("Test set X shape: " + str(X_test.shape))
    print("Test set Y shape: " + str(Y_test.shape))

In [38]:
# plot the distribution of data over the classes
def plot_counts(dict_counts, title, i):
    #plt.figure(figsize=(8, 40))
    plt.subplot(6, 1, i)
    plt.bar(dict_counts.keys(), dict_counts.values())
    plt.title(title)
    labs = [3, 4, 5, 6, 7, 8, 9]
    plt.xticks(labs, labs)
    plt.xlabel('Class')
    plt.ylabel('Number of examples')
    plt.show()

In [39]:
def plot_count_stats(dict_counts_red, dict_counts_red_train, dict_counts_red_test,
                     dict_counts_white, dict_counts_white_train, dict_counts_white_test):
    #plt.figure(1)
    plot_counts(dict_counts_red, "Original Red Wine Data Distribution", 1)
    #plot_counts(dict_counts_red_train, "Red Wine Train Set Distribution", 2)
    #plot_counts(dict_counts_red_test, "Red Wine Test Set Distribution", 3)
    
    plot_counts(dict_counts_white, "Original White Wine Data Distribution", 4)
    #plot_counts(dict_counts_white_train, "White Wine Train Set Distribution", 5)
    #plot_counts(dict_counts_white_test, "White Wine Test Set Distribution", 6)

In [40]:
# take an array with n classes, return same array with 2 classes (bad, good)
def convert_to_two_classes(Y):
    res = Y
    for i in range(0, Y.shape[0]):
        if Y[i] >= 0 and Y[i] <= 5:
            res[i] = 1
        else:
            res[i] = 0
    return res

In [41]:
# read data from filenames and return train and test sets
def read_data(foldername, filename_red, filename_white):
    test_perc = 0.21
    
    X_red, Y_red, dict_counts_red = get_X_Y_counts(foldername + "/" + filename_red)
    X_white, Y_white, dict_counts_white = get_X_Y_counts(foldername + "/" + filename_white)
    
    X_red_train, Y_red_train, X_red_test, Y_red_test, dict_counts_red_train, dict_counts_red_test = make_train_test(X_red, Y_red, dict_counts_red, test_perc)
    X_white_train, Y_white_train, X_white_test, Y_white_test, dict_counts_white_train, dict_counts_white_test = make_train_test(X_white, Y_white, dict_counts_white, test_perc)    
    
    X_train, Y_train, X_test, Y_test = concatinate_train_test(X_red_train, Y_red_train, X_red_test, Y_red_test, X_white_train, Y_white_train, X_white_test, Y_white_test)
    
    Y_train = convert_to_two_classes(Y_train)
    Y_test = convert_to_two_classes(Y_test)
    
    X_train = preprocessing.scale(X_train)
    X_test = preprocessing.scale(X_test)
    
    return X_train.T, Y_train, X_test.T, Y_test

#X_train, Y_train_orig, X_test, Y_test_orig = read_data("../wine", "winequality-red.csv", "winequality-white.csv")

In [42]:
# create TF placeholders for arrays X and Y
def create_placeholders(n_x, n_y):
    X = tf.placeholder(tf.float32, shape=[n_x, None], name = "X")
    Y = tf.placeholder(tf.float32, shape=[n_y, None], name = "Y")
    return X, Y

In [43]:
# initialize parameters for a deep neural network
def initialize_parameters_deep(layer_dims):
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = tf.get_variable("W" + str(l), [layer_dims[l], layer_dims[l-1]], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
        parameters['b' + str(l)] = tf.get_variable("b" + str(l), [layer_dims[l], 1], initializer = tf.zeros_initializer())
        
    return parameters

In [44]:
# forward propagation of a deep neural network
def forward_propagation_deep(X, parameters):
    A = X
    L = len(parameters) // 2  # number of layers in the neural network
    
    for l in range(1, L):
        Z = tf.add(tf.matmul(parameters['W' + str(l)], A), parameters['b' + str(l)])
        A = tf.nn.relu(Z)
        
    Z = tf.add(tf.matmul(parameters['W' + str(L)], A), parameters['b' + str(L)])
            
    return Z

In [45]:
# compute cost of a NN, use_reg determines whether to use regularization or not
def compute_cost_deep(Z3, Y, parameters, use_reg):
    logits = tf.transpose(Z3)
    labels = tf.transpose(Y)
    L = len(parameters) // 2
    
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels))
    #cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels))
    #cost = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits = logits, targets = labels, pos_weight = 1.72))
    
    if use_reg == True:
        beta = 0.005
        regularizers = tf.nn.l2_loss(parameters['W1'])
        for l in range(2, L + 1):
            regularizers += tf.nn.l2_loss(parameters['W' + str(l)])
        cost = tf.reduce_mean(cost + beta * regularizers)
    
    return cost

In [46]:
# convert a matrix to one hot matrix
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)].T
    return Y

In [47]:
# creates a list of random minibatches from (X, Y)
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    m = X.shape[1] # number of training examples
    mini_batches = []
    np.random.seed(seed)
    
    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0],m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = math.floor(m/mini_batch_size) # number of mini batches of size mini_batch_size in your partitionning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : k * mini_batch_size + mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : k * mini_batch_size + mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    return mini_batches

In [48]:
# split data on two classes
def split_two_classes(X, Y):
    X = X.T
    Y = Y.T
    
    X_first = []
    Y_first = []
    X_second = []
    Y_second = []
    
    for i in range(0, Y.shape[0]):
        if Y[i][1] == 1: # because it's one hot!!! with two classes
            X_first.append(X[i])
            Y_first.append(Y[i])
        else:
            X_second.append(X[i])
            Y_second.append(Y[i])
            
    X_first = np.array(X_first)
    Y_first = np.array(Y_first)
    X_second = np.array(X_second)
    Y_second = np.array(Y_second)
    
    return X_first.T, Y_first.T, X_second.T, Y_second.T

In [49]:
# plot confusion matrix of two classes, bad and good wine
def plot_confusion_matrix(cm, labs):
    plt.matshow(cm, cmap=plt.cm.Blues)
    #plt.tight_layout()
    labels = ['Bad', 'Good']
    plt.xticks(labs, labels)
    plt.yticks(labs, labels)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    fmt = ''
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.show()

In [50]:
# plot a cost function
def plot_cost(costs, learning_rate):
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

In [51]:
# training a network, evaluating performance on a test set, returning parameters of a neural network
def train(X_train, Y_train, learning_rate = 0.01, num_epochs = 10000, minibatch_size = 256, print_cost = True):

    ops.reset_default_graph() 
    (n_x, m) = X_train.shape # (n_x: input size, m : number of examples in the train set)
    n_y = Y_train.shape[0] # n_y : output size
    costs = [] 

    X, Y = create_placeholders(n_x, n_y)
    
    # Deep
    layers_dims = (n_x, 20, 20, 20, n_y)
    parameters = initialize_parameters_deep(layers_dims)
    Z3 = forward_propagation_deep(X, parameters)
    cost = compute_cost_deep(Z3, Y, parameters, True)

    # Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
    
    # Initialize all the variables
    init = tf.global_variables_initializer()
    
    X_first, Y_first, X_second, Y_second = split_two_classes(X_train, Y_train)

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:
        
        # Run the initialization
        sess.run(init)
        
        # Do the training loop
        for epoch in range(num_epochs):

            epoch_cost = 0 # Defines a cost related to an epoch
            
            num_minibatches = int(X_second.shape[1] / minibatch_size) # number of minibatches of size minibatch_size in the train set
            minibatches_first = random_mini_batches(X_first, Y_first, minibatch_size)
            minibatches_second = random_mini_batches(X_second, Y_second, minibatch_size)

            #for minibatch in minibatches:
            for i in range(0, len(minibatches_first)):
                minibatch_first = minibatches_first[i]
                minibatch_second = minibatches_second[i]
                # Select a minibatch
                (minibatch_first_X, minibatch_first_Y) = minibatch_first
                (minibatch_second_X, minibatch_second_Y) = minibatch_second
                
                minibatch_X = np.append(minibatch_first_X, minibatch_second_X, axis=1)
                minibatch_Y = np.append(minibatch_first_Y, minibatch_second_Y, axis=1)
                
                # Run the session to execute the "optimizer" and the "cost", the feedict should contain a minibatch for (X,Y).
                _ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
                
                epoch_cost += minibatch_cost / num_minibatches

            #_, epoch_cost = sess.run([optimizer, cost], feed_dict={X: X_train, Y: Y_train})
            # Print the cost every epoch
            if print_cost == True and epoch % 100 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if epoch % 5 == 0:
                costs.append(epoch_cost)
                
        # plot the cost
        plot_cost(costs, learning_rate)
        
        # Save the parameters in a variable
        parameters = sess.run(parameters)
        #print ("Parameters have been trained and saved!")

        # Calculate the correct predictions
        correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))

        # Get average from the results
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        print ("Train Accuracy:", str(round(accuracy.eval({X: X_train, Y: Y_train}), 3)))

        test_tensor = tf.argmax(Z3)
        
        # Conf matrix
        labs = [0, 1]
        cm = confusion_matrix(np.argmax(Y_train, 0), test_tensor.eval({X: X_train}), labels=labs)
        plot_confusion_matrix(cm, labs)
        tp, fn, fp, tn = cm.ravel()
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        print("Train class 0 recall: " + str(round(recall, 3)))
        print("Train class 1 recall: " + str(round(tn / (tn + fp), 3)))
        #print("Train precision: " + str(precision))
        #print("Train F-measure: " + str(2 * precision * recall / (precision + recall)))
        
        return parameters

In [52]:
# run and evaluate trained neural network on a test set
def sim(X_test, Y_test, parameters):
    (n_x, m) = X_test.shape # (n_x: input size, m : number of examples in the test set)
    n_y = Y_test.shape[0] # n_y : output size
    
    X, Y = create_placeholders(n_x, n_y)
    Y_pred = forward_propagation_deep(X, parameters)
    labs = [0, 1]
    
    with tf.Session() as sess:
        correct_prediction = tf.equal(tf.argmax(Y_pred), tf.argmax(Y))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print ("Test Accuracy:", str(round(accuracy.eval({X: X_test, Y: Y_test}), 3)))

        test_tensor = tf.argmax(Y_pred)
        cm = confusion_matrix(np.argmax(Y_test, 0), test_tensor.eval({X: X_test}), labels=labs)
        plot_confusion_matrix(cm, labs)
        tp, fn, fp, tn = cm.ravel()
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        print("Test class 0 recall: " + str(round(recall, 3)))
        print("Test class 1 recall: " + str(round(tn / (tn + fp), 3)))
        #print("Test precision: " + str(precision))
        #print("Test F-measure: " + str(2 * precision * recall / (precision + recall)))

In [54]:
# script to run
plt.close("all")
%matplotlib qt
X_train, Y_train_orig, X_test, Y_test_orig = read_data("../wine", "winequality-red.csv", "winequality-white.csv")
n_classes = 2
Y_train = convert_to_one_hot(Y_train_orig.T, n_classes)
Y_test = convert_to_one_hot(Y_test_orig.T, n_classes)

print("Shape of X train: " + str(X_train.shape))
print("Shape of Y train: " + str(X_test.shape))
print("Shape of X test: " + str(Y_train.shape))
print("Shape of Y test: " + str(Y_test.shape))

print("\nTraining started:")
parameters = train(X_train, Y_train, num_epochs = 1000, learning_rate = 0.01, print_cost = False)

print("\nEvaluating results on the test set:")
sim(X_test, Y_test, parameters)

Shape of X train: (11, 5139)
Shape of Y train: (11, 1358)
Shape of X test: (2, 5139)
Shape of Y test: (2, 1358)

Training started:
Train Accuracy: 0.796
Train class 0 recall: 0.781
Train class 1 recall: 0.822

Evaluating results on the test set:
Test Accuracy: 0.771
Test class 0 recall: 0.799
Test class 1 recall: 0.723
