***
# <font color=blue>UNSUPERVISED LEARNING</font>
# <font color=blue>Practice with K-means</font>
# <font color=blue>(student version)</font>
<div style="text-align: right"><font color=magenta>Andrea De Simone</font></div>
***

In [None]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt  
%matplotlib inline
from sklearn import datasets
from sklearn.cluster import KMeans

***
# 1.  2-D Clustering

## 1.1 Load Dataset 1

In [None]:
# Load the comma-separated dataset into a NumPy array and print its shape
X = np.loadtxt('dataset1.csv', delimiter=',')
print(X.shape)

In [None]:
# Scatter plot of the raw data: column 0 on the x axis, column 1 on the y axis
fig, ax = plt.subplots()
ax.set_xlabel('$x_1$',size=16)
ax.set_ylabel('$x_2$',size=16)

# semi-transparent markers so that overlapping points remain visible
ax.scatter(X[:,0], X[:,1], alpha=0.5)
plt.show() 

## 1.2 Plot data and initial centroids

In [None]:
# Hand-picked starting positions of the three centroids, one (x_1, x_2) pair per row
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

In [None]:
def scatter_plot(data, centroids):
    """
    Draw the data points together with the centroid positions.

    The first two columns of `data` are plotted as semi-transparent
    markers; the first two columns of `centroids` are overlaid as
    large red crosses.
    """
    _, axes = plt.subplots()
    axes.set_xlabel('$x_1$', size=16)
    axes.set_ylabel('$x_2$', size=16)

    # data points first, so that the centroid markers are drawn on top
    points_x, points_y = data[:, 0], data[:, 1]
    axes.scatter(points_x, points_y, alpha=0.5)

    centers_x, centers_y = centroids[:, 0], centroids[:, 1]
    axes.scatter(centers_x, centers_y, alpha=1, c='r', marker='x', s=100)

    plt.show()

In [None]:
# Show the data together with the initial centroid positions
scatter_plot(X,initial_centroids)

## 1.3 Assign points to clusters

### <font color='magenta'>>>> Q1: Assign points to clusters. What is the cluster assignment of the 150th point? </font>

In [None]:
def cluster_assignment(X, centroids):  
    """
    Assign each point of X to the cluster of its closest centroid.

    Parameters
    ----------
    X : array, shape (N, D)
        Data points, one per row.
    centroids : array, shape (k, D)
        Current centroid positions, one per row.

    Returns
    -------
    tags : float array, shape (N,)
        tags[i] is the row index (0 .. k-1) of the centroid closest to
        X[i] in squared Euclidean distance.
    """
    
    N = X.shape[0]
    tags = np.zeros(N)

    # loop over dataset
    for i in range(N):
        
        # squared Euclidean distance from point i to every centroid
        dist = np.sum(np.square( X[i,:] - centroids ), axis=1)
        
        # Start Edit
        # index of the nearest centroid (argmin returns the lowest index on ties)
        tags[i] = np.argmin(dist)
        # End Edit
        
    return(tags)

In [None]:
# Assign every point to its closest initial centroid
tags = cluster_assignment(X, initial_centroids)  
# the 150th point has index 149 (indices start at 0)
print(tags[149])

## 1.4 Update centroids

In [None]:
def move_centroids(X, tags, k):  
    """
    Recompute the centroid positions.

    Each centroid is placed at the mean of the points whose tag equals
    its index. A cluster that currently has no points gets a centroid
    of all zeros.
    """
    n_features = X.shape[1]
    new_centroids = np.zeros((k, n_features))

    for label in range(k):

        # rows of X currently tagged with this cluster
        members = X[tags == label]
        count = members.shape[0]

        # empty cluster: leave the zero row untouched
        if count == 0:
            continue

        new_centroids[label] = members.sum(axis=0) / count

    return new_centroids

In [None]:
# One centroid update from the current tags (k = 3);
# the bare name on the last line displays the result in the notebook
centroids = move_centroids(X,tags,3)
centroids

In [None]:
# Show the data together with the updated centroid positions
scatter_plot(X, centroids)

## 1.5 K-means algorithm

In [None]:
def run_k_means(X, initial_centroids):  
    """
    Run K-means algorithm. 

    Alternate cluster assignment and centroid update until the summed
    squared displacement of the centroids between two consecutive
    iterations falls below a tolerance, or until the iteration cap is
    reached. A message is printed in both cases.

    Parameters
    ----------
    X : array, shape (N, D)
        Data points, one per row.
    initial_centroids : array, shape (k, D)
        Starting centroid positions; the number of rows sets k.

    Returns
    -------
    tags : array, shape (N,)
        Cluster assignments from the last iteration.
    centroids : array, shape (k, D)
        Centroid positions from the last iteration.
    """
    k = initial_centroids.shape[0]
    centroids = initial_centroids
    
    tolerance = 1e-5
    max_iterations = 10000    # avoid infinite loop

    # Loop until centroid positions are stable
    for iteration in range(1, max_iterations + 1):
        
        centroids_prev = centroids
        
        # Assign points to clusters
        tags = cluster_assignment(X, centroids)
        
        # Update centroid positions
        centroids = move_centroids(X, tags, k)
        
        # check distance between current and previous centroid positions
        dist = np.sum(np.square( centroids_prev - centroids ))        
        if dist < tolerance:            
            print("Converged after {:d} iterations".format(iteration))
            break
    else:
        # the loop ran out of iterations without meeting the tolerance
        print("Stopped after {:d} iterations without converging".format(max_iterations))

    return(tags, centroids)

In [None]:
# Run K-means starting from the hand-picked initial centroids
tags, centroids = run_k_means(X, initial_centroids)  

In [None]:
# Show the data together with the centroids returned by K-means
scatter_plot(X, centroids)

### <font color='magenta'>>>> Q2: Group coordinates of points in each cluster</font>

In [None]:
# Start Edit
# boolean masks on the tags select the rows of X belonging to each cluster
cluster1 = X[tags == 0, :]   # coordinates of points tagged in first cluster 
cluster2 = X[tags == 1, :]   # coordinates of points tagged in second cluster 
cluster3 = X[tags == 2, :]   # coordinates of points tagged in third cluster 
# End Edit

In [None]:
# Plot colored clusters: one scatter call per cluster so that each gets
# its own color and legend entry
fig, ax = plt.subplots()  
ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1')  
ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2')  
ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3')  
leg = ax.legend(frameon=True, loc='upper right')  
ax.set_xlabel('$X_1$')  
ax.set_ylabel('$X_2$')  
plt.show()

## 1.6 Random initialization of centroids

### <font color='magenta'>>>> Q3: Complete the function 'initialize_centroids'. Run K-means with random initialization and find centroid coordinates. </font>

In [None]:
def initialize_centroids(X, k): 
    """ 
    Random Initialization of k centroids.
    Choose k random points among the data points.

    The k row indices are drawn without replacement, so the same data
    row is never used for two centroids.

    Parameters
    ----------
    X : array, shape (N, D)
        Data points, one per row.
    k : int
        Number of centroids to draw.

    Returns
    -------
    centroids : float array, shape (k, D)
        Copies of the k selected rows of X.

    Raises
    ------
    ValueError
        Raised by np.random.choice when k is larger than N.
    """
    
    N, D = X.shape
    centroids = np.zeros((k, D))
    
    # Start Edit    
    # k distinct row indices in [0, N)
    chosen = np.random.choice(N, size=k, replace=False)
    centroids[:] = X[chosen, :]
    # End Edit
    
    return(centroids)

In [None]:
# 1-D array of 3 random integers between 0 and 299 (the upper bound 300 is excluded)
np.random.randint(0, 300, 3) 

In [None]:
# Draw three random starting centroids; the notebook displays the returned array
initialize_centroids(X, 3)  

In [None]:
tags, centroids = run_k_means(X, initialize_centroids(X, 3))
print(centroids)
%time

In [None]:
# Compare with SKlearn KMeans
km = KMeans(n_clusters=3, init='random')
km.fit(X)
print(km.cluster_centers_)
%time

## 1.7  Choose K

In [None]:
# Loss function
def Loss(X, tags, centroids):  
    """
    K-means loss: for every cluster, add up the squared differences
    between its centroid and the points tagged with that cluster.
    """
    n_clusters = centroids.shape[0]

    # one term per cluster, accumulated in cluster order
    per_cluster = (
        np.sum(np.square(X[tags == label] - centroids[label]))
        for label in range(n_clusters)
    )

    return sum(per_cluster)

In [None]:
# Loss for the current tags and centroids; the notebook displays the value
Loss(X, tags, centroids)

### <font color='magenta'>>>> Q4: compute loss function for K = 1 to 10 (values in 'K_list'). Fill list 'Loss_list' with the values of the loss function.</font>

In [None]:
K_list = range(1,11)
Loss_list = []

# one K-means run from a random start for every candidate number of clusters
for K in K_list:
    tags, centroids = run_k_means(X, initialize_centroids(X, K))
    Loss_list.append(Loss(X, tags, centroids))


In [None]:
def Plot_Loss_K(Loss_list,K_list):
    """
    Plot the loss values against the corresponding numbers of clusters.

    One tick is placed on the x axis for every value in K_list.
    """
    _, axes = plt.subplots()

    # line plot with a round marker at every (K, loss) pair
    axes.plot(K_list, Loss_list, marker="o")

    axes.tick_params(labelsize=12)
    axes.set_xticks(K_list)

    axes.set_xlabel("K", fontsize=14)
    axes.set_ylabel("Loss", fontsize=14)

    plt.show()

In [None]:
# Plot loss versus K for the runs collected above
Plot_Loss_K(Loss_list,K_list)

***
# 2. IRIS dataset

## 2.1 Load IRIS Dataset

In [None]:
# Load the iris dataset shipped with scikit-learn
iris = datasets.load_iris()
X_iris = iris.data      # feature matrix
Y_iris = iris.target    # class labels

In [None]:
# set number of clusters
K=3

# run the algorithm from a random initialization of the K centroids
tags, centroids = run_k_means(X_iris, initialize_centroids(X_iris, K))

In [None]:
# Rows of X_iris grouped by predicted cluster tag
cluster1 = X_iris[tags == 0,:]  
cluster2 = X_iris[tags == 1,:]  
cluster3 = X_iris[tags == 2,:]  

# Rows of X_iris grouped by true class label
true1 = X_iris[Y_iris == 0,:]  
true2 = X_iris[Y_iris == 1,:]  
true3 = X_iris[Y_iris == 2,:]  

# Compare group sizes: predicted clusters first, then true classes
print(cluster1.shape)
print(cluster2.shape)
print(cluster3.shape)
print("")
print(true1.shape)
print(true2.shape)
print(true3.shape)

In [None]:
def plot_predicted_vs_true(cluster1,cluster2,cluster3, true1, true2, true3):
    """
    Draw a 2x2 grid comparing predicted clusters with true classes.

    Top row: columns 0 and 1 (sepal length / width).
    Bottom row: columns 2 and 3 (petal length / width).
    Left column: predicted clusters; right column: true classes.
    """
    # Plot clusters
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,10))

    # (points, color, legend label) for each group shown in a panel
    predicted_groups = (
        (cluster1, 'b', 'cluster 1'),
        (cluster2, 'r', 'cluster 2'),
        (cluster3, 'g', 'cluster 3'),
    )
    true_groups = (
        (true1, 'orange', 'setosa'),
        (true2, 'purple', 'versicolor'),
        (true3, 'pink', 'virginica'),
    )

    # (x column, y column, x label, y label) for each row of the grid
    feature_pairs = (
        (0, 1, 'sepal length [cm]', 'sepal width [cm]'),
        (2, 3, 'petal length [cm]', 'petal width [cm]'),
    )

    for row, (col_x, col_y, x_label, y_label) in enumerate(feature_pairs):
        panels = (
            (axes[row, 0], "Predicted", predicted_groups),
            (axes[row, 1], "True", true_groups),
        )
        for panel, title, groups in panels:
            panel.set_title(title)
            for points, color, label in groups:
                panel.scatter(points[:, col_x], points[:, col_y],
                              s=30, color=color, label=label)
            panel.legend(frameon=True, loc='lower right')
            panel.set_xlabel(x_label)
            panel.set_ylabel(y_label)

    plt.show()

In [None]:
# Compare the predicted clusters with the true classes
plot_predicted_vs_true(cluster1,cluster2,cluster3,true1,true2,true3)

## 2.2 Assign Clusters to Classes

In [None]:
true_centroids = np.zeros((3, 4))  # 3 clusters, 4 features

# centroid of each true class = feature-wise mean of the rows carrying that label
for i in range(3):
    true_centroids[i] = np.mean(X_iris[Y_iris==i],axis=0)
    
# print true-class centroids first, then the K-means centroids
print(true_centroids)
print("")
print(centroids)

In [None]:
# For each K-means centroid, print its squared distance to every true-class centroid
print(np.sum(np.square(centroids[0]-true_centroids), axis=1))
print(np.sum(np.square(centroids[1]-true_centroids), axis=1))
print(np.sum(np.square(centroids[2]-true_centroids), axis=1))

### <font color='magenta'>>>> Q5: Assign clusters to true classes. </font>

In [None]:
tags_new = np.copy(tags)

# assign clusters to classes
for i in range(K):
    
    # Start Edit
    # squared distance from centroid i to the centroid of every true class
    dist_to_classes = np.sum(np.square(centroids[i] - true_centroids), axis=1)
    # relabel the points of cluster i with the index of the closest class;
    # the mask uses the original tags so earlier relabelings are not overwritten
    tags_new[tags == i] = np.argmin(dist_to_classes)
    # End Edit

In [None]:
# Rows of X_iris grouped by the relabelled cluster tags
cluster1 = X_iris[tags_new == 0,:]  
cluster2 = X_iris[tags_new == 1,:]  
cluster3 = X_iris[tags_new == 2,:]  

# Rows of X_iris grouped by true class label
true1 = X_iris[Y_iris == 0,:]  
true2 = X_iris[Y_iris == 1,:]  
true3 = X_iris[Y_iris == 2,:]  

plot_predicted_vs_true(cluster1,cluster2,cluster3,true1,true2,true3)

## 2.3 Evaluate Performance

### <font color='magenta'>>>> Q6: find fraction of points correctly assigned to true clusters </font>

In [None]:
correct = 0 
tot = Y_iris.shape[0]

for i in range(3):
    
    # Start Edit
    # points whose relabelled tag and true label are both class i
    correct += np.sum((tags_new == i) & (Y_iris == i))
    # End Edit
    
print("Correct assignments = {:.1f}%".format(correct/tot * 100))

***
# 3. Image compression (bonus, no questions)

<img src="parrot.jpg" height="150" width="200">

## 3.1 Load Dataset 'Parrot'

In [None]:
# Load the flattened image and reshape it to (150, 200, 3); the last axis holds RGB values
A = np.loadtxt('dataset_parrot.csv', delimiter=',').reshape((150,200,3))
print(A[:3])
# shape of the image array just loaded
print(A.shape)

In [None]:
# normalize value ranges (divide every entry by 255)
A_norm = A / 255.

# reshape the array: one row per pixel, one column per entry of the last axis
X_p = np.reshape(A_norm, (A_norm.shape[0] * A_norm.shape[1], A_norm.shape[2]))

print(A_norm.shape)
print(X_p.shape)

## 3.2 Reconstruct image from centroids

In [None]:
# Set number of colors (used as the number of K-means clusters)
Ncolors = 8

# randomly initialize the centroids
initial_centroids = initialize_centroids(X_p, Ncolors)

# run the algorithm
tags, centroids = run_k_means(X_p, initial_centroids)

# map each pixel to the centroid value
# (tags are cast to int so they can be used as row indices)
X_recovered = centroids[tags.astype(int),:]

# reshape to the original dimensions
X_recovered = np.reshape(X_recovered, (A_norm.shape[0], A_norm.shape[1], A_norm.shape[2]))

plt.imshow(X_recovered) 
plt.show()