In [1]:
from math import sqrt

from scipy.stats import pearsonr, spearmanr
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'scipy'

In [2]:
iris_data = load_iris(return_X_y=True, as_frame=True)
iris = iris_data[0]
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris['type'] = iris_data[1]
del iris_data

NameError: name 'load_iris' is not defined

### Sample data: Iris dataset

The Iris Dataset we just loaded with the above code consists of 150 records describing the measurements of iris flowers. Then we have an output named "type" which specifies the type of the iris flower (for simplicity, the names of those types were replaced by the numbers 0, 1 and 2). There are 50 records of each type of flower.

Using the similarity functions we saw in the session, we are going to develop our own K-nearest neighbours algorithm to predict the flower types, given just their measurements. For that, we are going to take a random sample of 75 records (test dataset) and try to predict them by looking at their most similar measures from the remaining 75 records (train dataset):

In [209]:
train, test = train_test_split(iris, test_size=0.5)

In [210]:
print('The test data has', len(test), 'records')
print('The train data has', len(train), 'records')

The test data has 75 records
The train data has 75 records


In [211]:
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
56,6.3,3.3,4.7,1.6,1
37,4.9,3.6,1.4,0.1,0
70,5.9,3.2,4.8,1.8,1
21,5.1,3.7,1.5,0.4,0
33,5.5,4.2,1.4,0.2,0
...,...,...,...,...,...
82,5.8,2.7,3.9,1.2,1
87,6.3,2.3,4.4,1.3,1
12,4.8,3.0,1.4,0.1,0
111,6.4,2.7,5.3,1.9,2


### Defining the similarity functions

As you can see, the 4 input parameters (sepal and petal lengths and widths) are numeric. Therefore, in order to compare our test records and find their most similar records in the train data, we will need a similarity function able to compare those numeric arrays of 4 elements each. In the next section we define some of the similarity functions we saw in our session:

In [212]:
def euclidean_distance(array1, array2):
    sq_sum_i = 0
    for i, j in zip(array1, array2):
        sq_sum_i += (i - j) ** 2
    return sqrt(sq_sum_i)

def manhattan_distance(array1, array2):
    sum_i = 0
    for i, j in zip(array1, array2):
        sum_i += abs(i - j)
    return sum_i

def cosine_similarity(array1, array2):
    mul_i = 0
    array1_sq = 0
    array2_sq = 0
    for i, j in zip(array1, array2):
        mul_i += i*j
        array1_sq += i**2
        array2_sq += j**2
    return mul_i / (sqrt(array1_sq) * sqrt(array2_sq))


# We will try some correlation between the numeric arrays as possible similarities also:
def pearson(array1, array2):
    corr, _ = pearsonr(array1, array2)
    return corr if corr > 0 else 0

def spearman(array1, array2):
    corr, _ = spearmanr(array1, array2)
    return corr if corr > 0 else 0

# For the next similarity functions, we can establish a threshold to assess whether 2 values are considered the
# same. In our case, we will set it by default to 0.2 (if the difference between 2 values is equals or less than
# 0.2, then we consider them the same)

def discrete_similarity(array1, array2, allowed_diff=0.2):
    dist = 0
    for i, j in zip(array1, array2):
        # Discrete comparison, either 1 or 0. If one pais of elements is different, then it's zero, otherwise 1
        if abs(i-j) > allowed_diff:
            return 1  
    return 0

def hamming_distance(array1, array2, allowed_diff=0.2):
    dist = 0
    for i, j in zip(array1, array2):
        if abs(i-j) > allowed_diff:
            dist += 1
    return dist

def jaccard_similarity(array1, array2, allowed_diff=0.2):
    intersection_set_size = 0
    union_set_size = 0
    for i, j in zip(array1, array2):
        if abs(i-j) <= allowed_diff:
            intersection_set_size += 1
            union_set_size += 1
        else:
            union_set_size += 2
    return intersection_set_size / union_set_size

In [213]:
# Now we can use those functions. Using the examples from the lesson:
a = [4, 6, 3]
b = [3, 2, 8]
c = [5, 7, 3]

print('Euclidean (a, b):', euclidean_distance(a, b))
print('Euclidean (a, c):', euclidean_distance(a, c))
print('Euclidean (b, c):', euclidean_distance(b, c))
print('\n')
print('Manhattan (a, b):', manhattan_distance(a, b))
print('Manhattan (a, c):', manhattan_distance(a, c))
print('Manhattan (b, c):', manhattan_distance(b, c))
print('\n')
print('Cosine_similarity (a, b):', cosine_similarity(a, b))
print('Cosine_similarity (a, c):', cosine_similarity(a, c))
print('Cosine_similarity (b, c):', cosine_similarity(b, c))
print('\n')
print('* We can see that correlations might work in some cases, but as with Cosine, careful with the magnitudes!')
print('Pearson_correlation (a, b):', pearson(a, b))
print('Pearson_correlation (a, c):', pearson(a, c))
print('Pearson_correlation (b, c):', pearson(b, c))
print('\n')
print('Spearman_correlation (a, b):', spearman(a, b))
print('Spearman_correlation (a, c):', spearman(a, c))
print('Spearman_correlation (b, c):', spearman(b, c))
print('\n')
print('* And other similarities that also work with strings, and for which we put differences < 0.2 to be equal:')
print('discrete_similarity (a, b):', discrete_similarity(a, b))
print('discrete_similarity (a, c):', discrete_similarity(a, c))
print('discrete_similarity (b, c):', discrete_similarity(b, c))
print('\n')
print('hamming_distance (a, b):', hamming_distance(a, b))
print('hamming_distance (a, c):', hamming_distance(a, c))
print('hamming_distance (b, c):', hamming_distance(b, c))
print('\n')
print('jaccard_similarity (a, b):', jaccard_similarity(a, b))
print('jaccard_similarity (a, c):', jaccard_similarity(a, c))
print('jaccard_similarity (b, c):', jaccard_similarity(b, c))

Euclidean (a, b): 6.48074069840786
Euclidean (a, c): 1.4142135623730951
Euclidean (b, c): 7.3484692283495345


Manhattan (a, b): 10
Manhattan (a, c): 2
Manhattan (b, c): 12


Cosine_similarity (a, b): 0.7003755189718213
Cosine_similarity (a, c): 0.9978250097828444
Cosine_similarity (b, c): 0.6629663121838063


* We can see that correlations might work in some cases, but as with Cosine, careful with the magnitudes!
Pearson_correlation (a, b): 0
Pearson_correlation (a, c): 0.9819805060619655
Pearson_correlation (b, c): 0


Spearman_correlation (a, b): 0
Spearman_correlation (a, c): 1.0
Spearman_correlation (b, c): 0


* And other similarities that also work with strings, and for which we put differences < 0.2 to be equal:
discrete_similarity (a, b): 1
discrete_similarity (a, c): 1
discrete_similarity (b, c): 1


hamming_distance (a, b): 3
hamming_distance (a, c): 2
hamming_distance (b, c): 3


jaccard_similarity (a, b): 0.0
jaccard_similarity (a, c): 0.2
jaccard_similarity (b, c): 0.0


In [214]:
# Now let's apply these similarities to find the K nearest neighbours of an array of input values.
# Then, we will take the type of flower that appears the most in the K nearest neighbours as our prediction

''' 
Our algorithm has 4 inputs:
 * test_array: In our case, the array of 4 values describing an iris flower, plus its real type, so we can compare 
 * train_data: In our case, all our 120 records from which we will find the K most similar
 * comparative_function: the similarity or distance function we will use to know which train records are the nearest
 * similarity: a flag to know if comparative_function is a similarity or distance function. 
      If similarity, the higher the better; if distance, the lower the better
 * k: number of nearest neighbours we are going to use to make our prediction
 
 * RETURNS: Whether our prediction was correct or not.
'''
def predict_using_k_nearest_neighbours(test_array, train_data, comparative_function, similarity=True, k=10, print_results=False):
    # We copy the input train_data, so we modify its copy, not the original
    train_dataset = train_data.copy()
    # The last value in the test array is the real flower type. We will use it to compare with our prediction
    real_flower_type = test_array[-1] 
    # The other 4 values are the measure of the flower that we will use to find its K nearest neighbours
    test_flower_measures = test_array[:4]
    # And these 4 columns are the inputs we need to use to predict the flower type (we remove flower type from there)
    input_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    train_dataset['measure'] = train_dataset.apply(
        lambda row: comparative_function(test_flower_measures, row[input_columns]),
        axis=1
    )
    if similarity: # If similarity, we sort the 120 records by their measure in descending order, and that 1st K
        k_nearest = train_dataset.sort_values('measure', axis=0, ascending=False)[:k]
    else: # Otherwise, it is a distance function, so the less the better, we take the smallest K
        k_nearest = train_dataset.sort_values('measure', axis=0, ascending=True)[:k]
    
    # Now we select the flower type that is repeated the most in those K selected row. That is the mode.
    predicted_flower_type = k_nearest['type'].mode()[0]
    if print_results:
        print('Real:',int(real_flower_type), ', Prediction:', predicted_flower_type)
    
    return predicted_flower_type == int(real_flower_type)

### Now let's use that function to test one of the meaures, the Euclidean distance:

Feel free to change the K_NEAREST_NEIGHBOURS value. 

* What do you get if it is set to 120? Why do you think that is happening?
* Can you find the best performing K_NEAREST_NEIGHBOURS value? What is it?

In [215]:
test_arrays = test.values.tolist()

K_NEAREST_NEIGHBOURS = 5

num_correct_predictions = 0

for test_array in test_arrays:
    correct_prediction = predict_using_k_nearest_neighbours(
        test_array, 
        train, 
        euclidean_distance, 
        similarity=False, 
        k=K_NEAREST_NEIGHBOURS,
        print_results=True
    )
    if correct_prediction:
        num_correct_predictions += 1
        
print('Using Euclidean distance, we correctly predicted', num_correct_predictions,'/',len(test_arrays),
     'test samples (', round(100*num_correct_predictions/len(test_arrays), 2), '%)')

Real: 1 , Prediction: 2
Real: 0 , Prediction: 0
Real: 0 , Prediction: 0
Real: 0 , Prediction: 0
Real: 0 , Prediction: 0
Real: 2 , Prediction: 2
Real: 1 , Prediction: 1
Real: 0 , Prediction: 0
Real: 0 , Prediction: 0
Real: 1 , Prediction: 1
Real: 2 , Prediction: 2
Real: 0 , Prediction: 0
Real: 2 , Prediction: 2
Real: 2 , Prediction: 2
Real: 2 , Prediction: 2
Real: 0 , Prediction: 0
Real: 2 , Prediction: 2
Real: 2 , Prediction: 2
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 0 , Prediction: 0
Real: 0 , Prediction: 0
Real: 1 , Prediction: 1
Real: 2 , Prediction: 2
Real: 2 , Prediction: 2
Real: 1 , Prediction: 1
Real: 2 , Prediction: 2
Real: 0 , Prediction: 0
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 1 , Prediction: 1
Real: 2 , Prediction: 2
Real: 0 , Prediction: 0
Real: 2 , Prediction: 2
Real: 1 , Prediction: 1
Real: 2 , Prediction: 2
Real: 2 , Predic

### We can make a function from the above piece of code to test all of the functions:

We will need to specify whether each meaure is a distance or a similarity 
(if they were normalised in the [0, 1] interval we won't need this...

In [216]:
def test_dataset_using_knn_with_our_own_measure(test_dataframe, train_dataframe, measure, similarity, k=10):
    test_arrays = test_dataframe.values.tolist()
    num_correct_predictions = 0
    
    if similarity == 'similarity':
        is_similarity = True # it is a similarity, the higher it is, the better
    else:
        is_similarity = False # it is a distance, the lower it is, the better

    for test_array in test_arrays:
        correct_prediction = predict_using_k_nearest_neighbours(
            test_array, 
            train, 
            measure, 
            similarity=is_similarity, 
            k=k
        )
        if correct_prediction:
            num_correct_predictions += 1

    print('Using', measure.__name__, ', we correctly predicted', num_correct_predictions,'/',len(test_arrays),
         'test samples (', round(100*num_correct_predictions/len(test_arrays), 2), '%)')

In [217]:
# MODIFY THIS CODE TO TEST YOUR OWN K_NEAREST_NEIGHBOURS PARAMETERS AND SEE HOW THAT CHANGES PERFORMANCES

K_NEAREST_NEIGHBOURS = 1

measures = (
    (euclidean_distance, 'distance'),
    (manhattan_distance, 'distance'),
    (cosine_similarity, 'similarity'),
    (pearson, 'similarity'),
    (spearman, 'similarity'),
    (discrete_similarity, 'similarity'),
    (hamming_distance, 'distance'),
    (jaccard_similarity, 'similarity'),
)

for measure, similarity in measures:
    test_dataset_using_knn_with_our_own_measure(
        test,
        train,
        measure,
        similarity,
        k=K_NEAREST_NEIGHBOURS
    )

Using euclidean_distance , we correctly predicted 72 / 75 test samples ( 96.0 %)
Using manhattan_distance , we correctly predicted 72 / 75 test samples ( 96.0 %)
Using cosine_similarity , we correctly predicted 72 / 75 test samples ( 96.0 %)
Using pearson , we correctly predicted 70 / 75 test samples ( 93.33 %)
Using spearman , we correctly predicted 52 / 75 test samples ( 69.33 %)
Using discrete_similarity , we correctly predicted 23 / 75 test samples ( 30.67 %)
Using hamming_distance , we correctly predicted 71 / 75 test samples ( 94.67 %)
Using jaccard_similarity , we correctly predicted 71 / 75 test samples ( 94.67 %)


In [218]:
# And why not defining our own similarity function based on other similarity functions? 
# As mentioned in the session, our imagination is the only limit.

# After seeing that pearson and cosine were two of the best performing similarities, we
# can use both of them and average them out... in some data permutations this produces 
# 100% accuracy. 

# MODIFY THIS CODE TO TEST YOUR OWN SIMILARITY FUNCTIONS IN THIS SIMULATION IF YOU WANT

# See here how we transformed the Euclidean distance into Euclidean similarity as we
# pointed out in the session slides.
def combined_similarity(array1, array2):
    total = (1/(1+euclidean_distance(array1, array2))) + cosine_similarity(array1, array2) + pearson(array1, array2)
    return total/3

test_dataset_using_knn_with_our_own_measure(
        test,
        train,
        combined_similarity,
        similarity='similarity',
        k=K_NEAREST_NEIGHBOURS
    )

Using combined_similarity , we correctly predicted 72 / 75 test samples ( 96.0 %)


## Learning Exercises


* If you decrease the size of the train dataset, how does it affect to the overall accuracy? (at the beginning, in the function `train_test_split(iris, test_size=0.5)`, what if test_size is just 0.05, or 0.95?)
* Does using more nearest neighbours yield better results? Can you check by tuning the K_NEAREST_NEIGHBOURS parameter?