# Section 1: k-Nearest Neighbour

### Initializing random input data

Defining Data (x) and Target (y) variables.

Target is modelled as:
$$ y = \sin (x) + 0.1 x^2 + 0.5 \epsilon, \qquad \epsilon \sim \mathcal{N}(0, 1) $$

In [1]:
import tensorflow as tf
import numpy as np

import matplotlib.pyplot as plt

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

Generate data:

In [2]:
# Compact array printing and a fixed seed so every run yields the same data.
np.set_printoptions(precision=3)
np.random.seed(521)

# Synthetic 1-D regression problem: 100 evenly spaced inputs on [1, 10] held
# as a column vector, with targets sin(x) + 0.1 * x^2 + 0.5 * Gaussian noise.
Data = np.linspace(1.0, 10.0, num=100).reshape(-1, 1)
Target = (np.sin(Data) + 0.1 * np.power(Data, 2)
          + 0.5 * np.random.randn(100, 1))

# Shuffled row indices that decide which partition each point lands in.
randIdx = np.arange(100)
np.random.shuffle(randIdx)

# Split the 100 shuffled indices 80 / 10 / 10 into training, validation
# and test partitions.
trainIdx, validIdx, testIdx = np.split(randIdx, [80, 90])
trainData, trainTarget = Data[trainIdx], Target[trainIdx]
validData, validTarget = Data[validIdx], Target[validIdx]
testData, testTarget = Data[testIdx], Target[testIdx]

Plotting datasets:

In [3]:
def plotPreliminaryData(trainData, trainTarget, validData, validTarget, testData, testTarget):
    """Scatter-plot the three data partitions on one matplotlib figure.

    Training points are drawn as blue circles, validation points as red
    circles and test points as green circles. A legend labels the series in
    that order and the figure is then shown.
    """
    partitions = (
        (trainData, trainTarget, 'bo'),
        (validData, validTarget, 'ro'),
        (testData, testTarget, 'go'),
    )
    for xs, ys, fmt in partitions:
        plt.plot(xs, ys, fmt)

    plt.legend(('train', 'validation', 'test'))
    plt.show()

Defining the TensorFlow graph-building functions and the kNN helpers:

In [4]:
def euclideanDistance(X, Z):
    """Pairwise squared Euclidean distance between the rows of X and Z.

    Args:
        X: rank-2 tensor of shape (N, d), one data point per row.
        Z: rank-2 tensor of shape (M, d), one reference point per row.

    Returns:
        Tensor of shape (N, M) whose entry (i, j) is the squared Euclidean
        distance between X[i] and Z[j], summed over the d features.
    """
    # Broadcast (N, 1, d) against (1, M, d) to get every pairwise feature
    # deviation, then sum over the feature axis. With a single feature this
    # reduces to square(X - transpose(Z)).
    deviations = tf.expand_dims(X, 1) - tf.expand_dims(Z, 0)
    return tf.reduce_sum(tf.square(deviations), axis=2,
                         name='Euclidean_Distance_Matrix')

def responsibilityVector(D, k):
    """Build the k-nearest-neighbour responsibility matrix from distances.

    Args:
        D: distance tensor. It is transposed before the neighbour search, so
            each column of D becomes one row of the result.
        k: number of neighbours (buildGraph passes a scalar int tensor).

    Returns:
        Dense tensor with the shape of transpose(D). In every row, the k
        columns with the smallest distances hold 1/k and all others hold 0.
    """
    # top_k picks the largest values, so negate the distances to select the
    # k smallest ones in each row of the transposed matrix.
    _, nearest = tf.nn.top_k(tf.transpose(-D), k)

    # Row index for every selected neighbour: each of the M row numbers
    # repeated k times (0,..,0, 1,..,1, ...).
    M = tf.shape(D)[1]
    tiled_rows = tf.reshape(tf.tile(tf.range(0, M), [k]), [k, M])
    row_ids = tf.reshape(tf.transpose(tiled_rows), [-1])
    col_ids = tf.reshape(nearest, [-1])

    # (row, column) pairs in the layout sparse_to_dense expects.
    sparse_idx = tf.stack([row_ids, col_ids], axis=1, name='Sparse_Indices')

    indices = tf.cast(sparse_idx, tf.int32)
    dense_shape = tf.cast(tf.shape(tf.transpose(D)), tf.int32)
    weights = tf.fill([tf.shape(sparse_idx)[0]],
                      tf.divide(1.0, tf.cast(k, tf.float32)))

    # Index validation is switched off: within a row the column indices
    # arrive in top_k order rather than sorted order.
    return tf.sparse_to_dense(indices, dense_shape, weights,
                              validate_indices=False,
                              name='Responsibility_Vector')

def buildGraph(k_):
    """Assemble the TensorFlow graph for k-NN regression.

    Args:
        k_: number of neighbours, stored in the graph as the constant 'k'.

    Returns:
        Tuple (X, Y, Z, T, Y_hat, MSE): the four float32 placeholders
        (training data, training target, input data, input target), the
        prediction tensor and the loss tensor. The loss is the sum of squared
        errors divided by twice the number of rows of Z.
    """
    k = tf.constant(k_, name='k')  # Hyperparameter

    def matrix_placeholder(label):
        # Every feed is a float32 matrix whose two dimensions are left open.
        return tf.placeholder(tf.float32, shape=[None, None], name=label)

    X = matrix_placeholder('Training_Data')
    Y = matrix_placeholder('Training_Target')
    Z = matrix_placeholder('Input_Data')
    T = matrix_placeholder('Input_Target')

    D = euclideanDistance(X, Z)
    R = responsibilityVector(D, k)

    # Prediction: responsibility-weighted average of the training targets.
    Y_hat = tf.matmul(R, Y, name='Y_hat')

    squared_error = tf.reduce_sum(tf.square(T - Y_hat))
    num_inputs = tf.cast(tf.shape(Z)[0], tf.float32)
    # Half of the theoretical MSE.
    MSE = tf.divide(squared_error, tf.scalar_mul(2, num_inputs),
                    name='Mean_Squared_Error')

    return X, Y, Z, T, Y_hat, MSE

def selectDataPartition(mode):
    """Choose the reference data and the evaluation data for a kNN run.

    The reference (neighbour) set is always the training partition. The
    evaluation set depends on `mode`:

    * 'train', 'validation', 'test': the corresponding partition.
    * 'full': 1000 freshly generated points on [0, 11] drawn from the same
      noisy model as the original data (consumes the global NumPy RNG).

    Args:
        mode: one of 'train', 'validation', 'test', 'full'.

    Returns:
        Tuple (inputData, inputTarget, dataSource, targetSource).

    Raises:
        ValueError: if `mode` is not one of the four supported names.
    """
    validModes = ('train', 'validation', 'test', 'full')
    # Reject unknown modes up front instead of failing later on an unbound
    # local variable.
    if mode not in validModes:
        raise ValueError("Unknown mode %r; expected one of %s"
                         % (mode, ', '.join(validModes)))

    inputData = trainData
    inputTarget = trainTarget

    if mode == 'train':
        dataSource = trainData
        targetSource = trainTarget
    elif mode == 'validation':
        dataSource = validData
        targetSource = validTarget
    elif mode == 'test':
        dataSource = testData
        targetSource = testTarget
    else:  # mode == 'full'
        dataSource = np.linspace(0.0, 11.0, num=1000)[:, np.newaxis]
        targetSource = np.sin(dataSource) + 0.1 * np.power(dataSource, 2) \
                       + 0.5 * np.random.randn(1000, 1)

    return inputData, inputTarget, dataSource, targetSource
    
def kNN(k, mode):
    """Run k-NN regression for one value of k on one data partition.

    Args:
        k: number of neighbours.
        mode: partition name understood by selectDataPartition
            ('train', 'validation', 'test' or 'full').

    Returns:
        Tuple (error, reference, prediction):
        error is the evaluated loss tensor from buildGraph;
        reference is the training inputs and targets appended column-wise and
        transposed (row 0 inputs, row 1 targets for single-feature data);
        prediction is the evaluated inputs and predicted targets in the same
        layout.
    """
    inputData, inputTarget, dataSource, targetSource = selectDataPartition(mode)

    # Build each model in its own graph so repeated calls do not keep adding
    # duplicate nodes to the global default graph.
    with tf.Graph().as_default():
        X, Y, Z, T, Y_hat, MSE = buildGraph(k)

        sessionConfig = tf.ConfigProto(log_device_placement=False,
                                       allow_soft_placement=True)
        with tf.Session(config=sessionConfig) as sess:
            error, y_pred = sess.run([MSE, Y_hat],
                                     feed_dict={X: inputData, Y: inputTarget,
                                                Z: dataSource, T: targetSource})

    reference = np.transpose(np.append(inputData, inputTarget, axis=1))
    prediction = np.transpose(np.append(dataSource, y_pred, axis=1))
    return error, reference, prediction

Main Function:

In [5]:
k_list = [1, 3, 5, 50]
kNN_mode = ['train', 'validation', 'test', 'full']

MSE_list = []
targetSeries = []
predictionSeries = []

# Runs kNN for every (k, mode) combination. Results are appended k-major,
# mode-minor, so entry `i * len(kNN_mode) + j` belongs to k_list[i] and
# kNN_mode[j].
for k_value in k_list:
    for mode_name in kNN_mode:
        MSE, target, prediction = kNN(k_value, mode_name)
        MSE_list.append(MSE)
        targetSeries.append(target)
        predictionSeries.append(prediction)

# One row per k, one column per mode. Sizes come from the lists so the cell
# keeps working when either list is edited.
MSE_list = np.reshape(MSE_list, (len(k_list), len(kNN_mode)))

## Generates interactive Plotly graph:

In [6]:
def generateVisualisation(targetSeries, predictionSeries, k_list):
    """Draw a 2x2 Plotly grid comparing the data with kNN predictions.

    Args:
        targetSeries: list of 2-row arrays (x row, y row), ordered k-major
            with one entry per (k, mode) run. The first entry of each k group
            is plotted as blue markers.
        predictionSeries: list with the same layout. The last entry of each k
            group is plotted as a green line.
        k_list: the k values, one subplot each (the grid holds at most four).

    Returns:
        Whatever `pyo.iplot` returns for the assembled figure.
    """
    subplotTitleString = ['k = %s' % str(k) for k in k_list]

    fig = tools.make_subplots(rows=2, cols=2, subplot_titles=(subplotTitleString))

    # Number of runs stored per k value (four with the notebook's mode list).
    runsPerK = len(targetSeries) // len(k_list) if k_list else 0

    for i, k in enumerate(k_list):
        dataRun = targetSeries[i * runsPerK]
        predRun = predictionSeries[(i + 1) * runsPerK - 1]

        traceData = go.Scatter(
            x = dataRun[0],
            y = dataRun[1],
            marker = {'color': 'blue',
                      'symbol': 200},
            mode = 'markers',
            name = 'data_k=' + str(k)
        )
        tracePred = go.Scatter(
            x = predRun[0],
            y = predRun[1],
            marker = {'color': 'green'},
            mode = 'lines',
            name = 'prediction_k=' + str(k)
        )

        # divmod keeps the grid position an integer under both Python 2 and
        # Python 3 division rules.
        row, col = divmod(i, 2)
        fig.append_trace(traceData, row + 1, col + 1)
        fig.append_trace(tracePred, row + 1, col + 1)

        fig['layout']['xaxis'+str(i+1)].update(title='x')
        fig['layout']['yaxis'+str(i+1)].update(title='y')

    fig['layout'].update(height=900, width=950, title='k-NN Regression on data1D', showlegend=False)
    return pyo.iplot(fig, filename='A1Q1_kNN_subplot2x2')

In [7]:
# Output summary table: one row per k with the training, validation and test
# errors (the first three columns of MSE_list). Single-argument print(...)
# produces the same text under Python 2 and Python 3.
print('Mean Squared Error Summary:')
print("%3s %10s %10s %6s" % ('k', 'Training', 'Validation', 'Test'))
for k_value, mse_row in zip(k_list, MSE_list):
    print("%3d %10.3f %10.3f %6.3f" % (k_value, mse_row[0], mse_row[1], mse_row[2]))

print("\n\n\n")
kNN_visuals = generateVisualisation(targetSeries, predictionSeries, k_list)
kNN_visuals

Mean Squared Error Summary:
  k   Training Validation   Test
  1      0.000      0.272  0.311
  3      0.105      0.326  0.145
  5      0.119      0.310  0.178
 50      1.248      1.229  0.707




This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]

