# Distance Matrix

In [None]:
# imports
from sklearn import datasets
import numpy as np

In [None]:
# load the Iris dataset through scikit-learn's datasets module
dataset = datasets.load_iris()

In [None]:
# the loaded dataset is accessed like a dictionary; list the keys it provides
dataset.keys()

In [None]:
# names of the features; used further down as axis labels for the columns of the data
dataset["feature_names"]

In [None]:
# keep the dataset's "data" entry under a short name for the cells below
data = dataset["data"]
# data  # data is a numpy array data structure. Think of it as a matrix of data (or as an excel spreadsheet)

In [None]:
# dimensions of the data array; shape[0] is used below as the number of observations
data.shape

In [None]:
# Euclidean distance between two observations:
# square the per-feature differences, add them up, take the square root
p1, p2 = data[50], data[100]
squared_diff = (p1 - p2) ** 2
sum(squared_diff) ** 0.5

In [None]:
# initialize distance matrix. What will be its final shape?
# It starts as an empty list; the loop in the next cell appends one list of distances per observation.
dist = []

In [None]:
# Build the distance matrix. Use 2 for loops, the append list method and the euclidean distance formula
# Each row i of `dist` holds the distance from observation i to every observation j.
n_obs = data.shape[0]
for i in range(n_obs):
    dist_row = []
    for j in range(n_obs):
        # The exponent must be parenthesised: `** 1/2` parses as (x ** 1) / 2,
        # which halves the squared distance instead of taking its square root.
        single_dist = sum((data[i] - data[j]) ** 2) ** (1/2)
        dist_row.append(single_dist)
    dist.append(dist_row)

In [None]:
# dist

In [None]:
# another import (usually all imports are done at the top of the script/ notebook)
import seaborn as sns

In [None]:
# draw the distance matrix as a heatmap
sns.heatmap(dist)

# Plotting data: 
### How can we represent an observation in an N-dimensional space?

In [None]:
# another import (usually all imports are done at the top of the script/ notebook)
import matplotlib.pyplot as plt

In [None]:
# 2D scatter plot: first feature on the x axis, second feature on the y axis
x_name, y_name = dataset["feature_names"][:2]
x_values = data[:, 0]
y_values = data[:, 1]
plt.scatter(x_values, y_values)
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.show()

In [None]:
# 1D scatter plot: every observation is drawn at y = 0, so only the first feature varies
zeros = [0] * data.shape[0]
plt.scatter(data[:, 0], zeros)
plt.xlabel(dataset["feature_names"][0])
plt.show()

In [None]:
# 3D scatter plot: the first three features become the x, y and z coordinates
fig = plt.figure(figsize=(14, 7))  # an explicit figure is needed so a 3d subplot can be attached
ax = fig.add_subplot(1, 1, 1, projection="3d")

xs, ys, zs = data[:, 0], data[:, 1], data[:, 2]
ax.scatter(xs, ys, zs)

x_name, y_name, z_name = dataset["feature_names"][:3]
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
plt.show()

## Finding nearest neighbors

In [None]:
# Search the part of the distance matrix left of the diagonal for the closest pair.
# min_args will hold the pair as (column index, row index); min_dist its distance.
min_args, min_dist = None, 9e99
for row_id, row in enumerate(dist):
    below_diag = row.copy()[:row_id]  # entries left of the diagonal only
    row_min = 9e99 if len(below_diag) == 0 else min(below_diag)

    # skip rows whose best distance neither ties nor beats the current minimum
    if not row_min <= min_dist:
        continue

    min_dist = row_min
    # record the first column in this row that holds the row minimum
    for col_id, value in enumerate(below_diag):
        if value == row_min:
            min_args = (col_id, row_id)
            break

In [None]:
# indices of the two closest observations found by the search above
min_args

In [None]:
# show the two closest observations and the distance between them
first_id, second_id = min_args
print(data[first_id])
print(data[second_id])
print('minimum distance:\t', min_dist)

## Define functions
Why do we want to define functions in this case?

In [None]:
def distance_matrix(data):
    """Return the pairwise Euclidean distances between the rows of ``data``.

    Parameters
    ----------
    data
        Array-like object exposing ``.shape`` whose rows support element-wise
        subtraction and exponentiation (the notebook passes numpy arrays).

    Returns
    -------
    list of list
        Square nested list in which entry ``[i][j]`` is the Euclidean
        distance between row ``i`` and row ``j`` of ``data``.
    """
    n_obs = data.shape[0]
    dist = []
    for i in range(n_obs):
        dist_row = []
        for j in range(n_obs):
            # The exponent must be parenthesised: `** 1/2` parses as
            # (x ** 1) / 2, i.e. half the squared distance, not its square root.
            single_dist = sum((data[i] - data[j]) ** 2) ** (1/2)
            dist_row.append(single_dist)
        dist.append(dist_row)
    return dist

def closest_points(dist_matrix):
    """Find the pair of points with the smallest distance in ``dist_matrix``.

    Only the entries left of the diagonal are inspected, so each pair is
    considered once and the diagonal itself is ignored.

    Returns a tuple ``(pair, distance)`` where ``pair`` is
    ``(column index, row index)`` of the smallest entry, or ``None`` when no
    row has any entry left of the diagonal. ``distance`` starts at the
    placeholder ``9e99`` and keeps that value when nothing smaller is found.
    On ties the later row wins; within a row the first matching column wins.
    """
    best_pair, best_dist = None, 9e99
    for row_id, row in enumerate(dist_matrix):
        below_diag = row.copy()[:row_id]
        row_min = 9e99 if len(below_diag) == 0 else min(below_diag)

        # skip rows whose best distance neither ties nor beats the current minimum
        if not row_min <= best_dist:
            continue

        best_dist = row_min
        # record the first column in this row that holds the row minimum
        for col_id, value in enumerate(below_diag):
            if value == row_min:
                best_pair = (col_id, row_id)
                break
    return best_pair, best_dist

## Finding the `n` shortest distances

In [None]:
# Collect the n smallest pairwise distances, one closest pair at a time.
dist_matrix = distance_matrix(data)
n_distances = 10

distances = []
for _ in range(n_distances):
    c_points = closest_points(dist_matrix)
    (col_id, row_id), _pair_dist = c_points
    # overwrite the entry just found with the placeholder so the next search skips it
    dist_matrix[row_id][col_id] = 9e99
    distances.append(c_points)

distances

## Programming a nearest neighbors algorithm (not covered in class)

In [None]:
# Update the dataset to include the cluster centroid and drop the closest points
# we're going to take this code and define as a function later
data_ = data.copy()
cluster = list(range(data_.shape[0]))

dist_matrix = distance_matrix(data_)
min_args, _ = closest_points(dist_matrix)

# update clusters
# A merged cluster's label joins its members' labels with '_', so the number of
# '_'-separated parts is the number of original observations it represents.
nobs_points = []
for i in min_args:
    nobs_points.append(len(str(cluster[i]).split('_')))

# centroid: average of the two points, weighted by how many observations each represents
centroid = (
    data_[min_args[0]]*nobs_points[0] +
    data_[min_args[1]]*nobs_points[1]
) / sum(nobs_points)

# add new cluster label to centroid, append centroid to the dataset and delete joined observations
cluster.append('_'.join([str(cluster[min_args[0]]), str(cluster[min_args[1]])]))
data_ = list(data_)
data_.append(centroid)

# closest_points returns (lower index, higher index). Delete the higher index first:
# deleting the lower one first would shift the higher one and remove the wrong entry.
del data_[min_args[1]], data_[min_args[0]]
del cluster[min_args[1]], cluster[min_args[0]]
data_ = np.array(data_)

In [None]:
# the new label and its centroid were appended last, so both sit at index -1
print("Added cluster label:", cluster[-1], "with the respective values:", data_[-1])

### Define a function

In [None]:
def nearest_neighbors_clustering(data, k=10):
    """Merge the two closest points repeatedly until at most ``k`` clusters remain.

    Each iteration recomputes the distance matrix, finds the closest pair,
    replaces the pair with its weighted centroid and labels the new cluster by
    joining the labels of the two merged clusters with ``'_'``.

    Parameters
    ----------
    data
        Array exposing ``.copy()`` and ``.shape`` with one observation per row
        (the notebook passes a numpy array). It is not modified.
    k : int, default 10
        Number of clusters at which merging stops.

    Returns
    -------
    tuple
        ``(cluster, data_)``: the list of cluster labels (original row indices
        for unmerged points, ``'_'``-joined strings for merged clusters) and
        the array of matching points/centroids, in the same order.

    Notes
    -----
    Merging also stops when no pair is left to merge (fewer than two points
    remain), so ``k < 1`` returns a single cluster instead of raising.
    """
    data_ = data.copy()
    cluster = list(range(data_.shape[0]))

    while len(cluster) > k:
        dist_matrix = distance_matrix(data_)
        min_args, _ = closest_points(dist_matrix)
        if min_args is None:
            # closest_points found no pair to merge; stop instead of iterating over None
            break

        # update clusters
        # the number of '_'-separated parts in a label is the number of
        # original observations that cluster represents
        nobs_points = []
        for i in min_args:
            nobs_points.append(len(str(cluster[i]).split('_')))

        # centroid: average of the two points weighted by their observation counts
        centroid = (
            data_[min_args[0]]*nobs_points[0] +
            data_[min_args[1]]*nobs_points[1]
        ) / sum(nobs_points)

        # add new cluster label to centroid and append data to the dataset
        cluster.append('_'.join([str(cluster[min_args[0]]), str(cluster[min_args[1]])]))
        data_ = list(data_)
        data_.append(centroid)

        # delete the higher index first so the lower index still points at the right entry
        del data_[min_args[1]], data_[min_args[0]]
        del cluster[min_args[1]], cluster[min_args[0]]

        data_ = np.array(data_)

    return cluster, data_


In [None]:
# run the clustering on the full dataset with the default k=10
nearest_neighbors_clustering(data)