<h1 align="center"> Clustering </h1>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import matplotlib as mpl
mpl.rc("savefig", dpi=100) # Adjust for higher-resolution figures

<h2> Lloyd's algorithm from scratch </h2>

<h3> Initializing the <i>k</i> centers </h3>

In [2]:
def init_centers(X, k):
    """
    Pick k distinct rows of X at random to serve as the initial centers.

    X is indexed as a 2-D array (``X[rows, :]``) and k is the number of rows
    to draw. Row indices are sampled without replacement, so no observation
    is selected twice. The selected rows are returned as a (k x d) array.

    (X,k) --> centers(numpy.array)
    """
    n_observations = len(X)
    # Sample row indices (not rows) so the same observation cannot repeat.
    chosen_rows = np.random.choice(n_observations, size=k, replace=False)
    initial_centers = X[chosen_rows, :]
    return initial_centers

In [3]:
#demo run
k=2
X=np.array([ [1,2],[2,3],[1,1],[2,4],[1,2],[1,5],[7,1]])
centers= init_centers(X,k)
centers


array([[2, 4],
       [1, 5]])

In [4]:
k2=3
X2=np.array([[1,1,2],[1,1,9],[2,2,0],[3,2,1],[2,2,1],[2,2,4],[1,1,3]])
centers2=init_centers(X2,k2)
centers2

array([[2, 2, 4],
       [2, 2, 0],
       [1, 1, 2]])

<p><span style="color:purple">define a function "initC" from scratch</span>

In [8]:
#def initC(X,k):
#   uncomment to check
#Centers=initC(X2,k2);Centers

array([[2, 2, 1],
       [3, 2, 1],
       [1, 1, 2]])

<h3>2. Computing the distances</h3>

 the `np.linalg.norm()` function 

In [9]:
v1 = np.array([2,3])
v2 = np.array([1,3])
v3= np.array([-2,-3])
print("norm v1")
print(np.linalg.norm(v1, ord=2, axis=0))

print("norm v1, v3")
np.linalg.norm(v3 , ord=2 )

norm v1
3.605551275463989
norm v1, v3


3.605551275463989

In [46]:
a=16
b=36
np.sqrt(16+36)

7.211102550927978

In [5]:
def compute_d2(X, centers):
    """
    Squared Euclidean distance from every observation to every center.

    (X, centers) --> S (distance matrix)

    Parameters
    ----------
    X : array-like, shape (m, d)
        One observation per row.
    centers : array-like, shape (k, d)
        One cluster center per row.

    Returns
    -------
    S : numpy.ndarray of float, shape (m, k)
        S[i, j] is the squared Euclidean distance between X[i] and centers[j].
    """
    X = np.asarray(X, dtype=float)
    centers = np.asarray(centers, dtype=float)
    m = len(X)
    k = len(centers)

    S = np.empty((m, k))
    if m == 0:
        # Nothing to measure; keep the (0, k) shape callers expect.
        return S

    # Sum the squared differences directly instead of taking a norm and
    # squaring it: the sqrt/square round trip is wasted work and introduces
    # rounding error (e.g. 28.999999999999996 instead of 29).
    # Looping over the k centers keeps the Python-level loop short, since
    # each step is vectorised over all m observations.
    for j in range(k):
        diff = X - centers[j]
        S[:, j] = np.sum(diff * diff, axis=1)
    return S

In [9]:
#demo run
print(centers)
S=compute_d2(X, centers)
S

[[7 1]
 [2 3]]


array([[37.,  2.],
       [29.,  0.],
       [36.,  5.],
       [34.,  1.],
       [37.,  2.],
       [52.,  5.],
       [ 0., 29.]])

In [48]:
S2=compute_d2(X2,centers2)
S2

array([[ 0., 49.,  1.],
       [49.,  0., 36.],
       [ 6., 83., 11.],
       [ 6., 69.,  9.],
       [ 3., 66.,  6.],
       [ 6., 27.,  3.],
       [ 1., 36.,  0.]])

In [6]:
a=np.array([[2,3,4]])
b=np.array([[6,6,6]])
aa=np.array([2,3,4])
bb=np.array([6,6,6])


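`numpy.linalg.norm(array1 - array2, ord=2, axis=1) ** 2` returns the squared Euclidean distance for each row of the difference (when `axis` is given, `ord=2` is the ordinary vector 2-norm). <br> TODO: revisit and review the documentation:
https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html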

<p><span style="color:purple">compute distance between vectors a and b</span>

In [10]:
normab = np.linalg.norm(a - b, ord=2, axis=1 )**2
print(normab)
normabb = np.linalg.norm(aa - bb, ord=2, axis=0 )**2
normabb

[29.]


28.999999999999996

<h3>3. Assign cluster labels </h3>
<p>(to distance matrix 'S')

 the `np.argmin()` function


In [24]:
vec= np.array([[2,10],[11,2],[65,1]])
print(vec)
print("axis =1, argmin()")
print(np.argmin(vec, axis=1         ))
print("axis =0, argmin()")
np.argmin(vec, axis=0         )

[[ 2 10]
 [11  2]
 [65  1]]
axis =1, argmin()
[0 1 1]
axis =0, argmin()


array([0, 2])

In [32]:
def assign_cluster_labels(S):
    '''
    S --> labels

    For every row of S, report the index of the column that holds the
    smallest entry of that row.
    '''
    # axis=1 scans along each row, producing one column index per row
    closest = np.argmin(S, axis=1)
    return closest

In [45]:
#demo run
labels = assign_cluster_labels(S)
labels

array([0, 0, 0, 1, 0, 1, 0])

<h3>4. Compute the centers of each cluster </h3>

In [30]:
def update_centers(X, y):
    '''
    (X , labels) --> centers

    X holds m points of dimension d (one per row); y holds the m cluster
    labels. Row j of the result is the mean of the points labelled j, for
    every j from 0 up to the largest label found in y.
    '''
    n_points, n_dims = X.shape
    n_clusters = max(y) + 1
    assert n_points == len(y)
    assert (min(y) >= 0)

    new_centers = np.empty((n_clusters, n_dims))
    for label in range(n_clusters):
        # The boolean mask keeps the rows of X assigned to this cluster;
        # averaging down the rows gives that cluster's center.
        members = X[y == label]
        new_centers[label] = members.mean(axis=0)
    return new_centers

In [42]:

centers = update_centers(X, labels)
centers

array([[7.        , 1.        ],
       [1.33333333, 2.83333333]])

<h4>HELPERS: return the within-cluster sum of squares, check convergence </h4>

In [34]:
def WCSS(S):
    """Within-cluster sum of squares: add up the smallest entry of each row of S."""
    row_minima = np.min(S, axis=1)  # smallest value in every row
    return row_minima.sum()
    


In [35]:
def has_converged(old_centers, centers):
    """Return True when both collections contain the same set of center points.

    Each center is turned into a tuple so the two collections can be compared
    as sets; the order of the centers therefore does not matter.
    """
    previous = {tuple(row) for row in old_centers}
    current = {tuple(row) for row in centers}
    return previous == current

<h3> The main `kmeans` function </h3>

In [25]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,x_1,x_2,label
0,0,-0.234443,-1.07596,1
1,1,0.730359,-0.918093,0
2,2,1.43227,-0.439449,0
3,3,0.026733,1.0503,0
4,4,1.87965,0.207743,0


In [44]:
points = df[['x_1', 'x_2']].values
labels = df['label'].values
n, d = points.shape
k = 2     #2

In [47]:
def kmeans(X, k,
           starting_centers=None,
           max_steps=np.inf):
    """
    Run Lloyd's algorithm on X and return the cluster label of every point.

    X                : the observations, one per row
    k                : number of clusters; only used to draw random initial
                       centers when starting_centers is None
    starting_centers : optional initial centers used instead of a random draw
    max_steps        : upper bound on the number of iterations

    Each iteration prints its number and the WCSS of the distance matrix
    computed against the centers the iteration started from. If no iteration
    runs at all, the returned labels are the initial array of zeros.
    """
    # 1. choose the initial centers
    centers = init_centers(X, k) if starting_centers is None else starting_centers

    labels = np.zeros(len(X))
    converged = False  # becomes True once an update leaves the centers unchanged
    step = 1
    while (not converged) and (step <= max_steps):
        previous_centers = centers

        # 2. squared distances from every point to every current center
        S = compute_d2(X, previous_centers)

        # 3. attach each point to its nearest center
        labels = assign_cluster_labels(S)

        # 4. move each center to the mean of the points attached to it
        centers = update_centers(X, labels)

        # 5. stop when the set of centers did not change
        converged = has_converged(previous_centers, centers)

        print ("iteration", step, "WCSS = ", WCSS (S))
        step += 1

    return labels

clustering = kmeans(points, k, starting_centers=points[[0, 187], :])
print(clustering)

iteration 1 WCSS =  549.9175535488309
iteration 2 WCSS =  339.80066330255096
iteration 3 WCSS =  300.330112922328
iteration 4 WCSS =  289.80700777322045
iteration 5 WCSS =  286.0745591062787
iteration 6 WCSS =  284.1907705579879
iteration 7 WCSS =  283.22732249939105
iteration 8 WCSS =  282.456491302569
iteration 9 WCSS =  281.84838225337074
iteration 10 WCSS =  281.57242082723724
iteration 11 WCSS =  281.5315627987326
[0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1
 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1
 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0
 0 1 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1
 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1
 1 1 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0
 1 0 0 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 0 1

In [27]:
clustering = kmeans(points, k)

iteration 1 WCSS =  342.42571680019324
iteration 2 WCSS =  289.9323284147895
iteration 3 WCSS =  286.7538516279051
iteration 4 WCSS =  284.09771994025346
iteration 5 WCSS =  283.36215702221284
iteration 6 WCSS =  282.54826291334257
iteration 7 WCSS =  282.10232370657434
iteration 8 WCSS =  281.57242082723724
iteration 9 WCSS =  281.5315627987326


<h2> A Clustering Class </h2>

In [13]:

from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt


In [None]:


class KMeans:
    """k-means clustering over a list of points.

    Distances and means are delegated to ``squared_distance`` and
    ``vector_mean``, which this notebook imports from ``linear_algebra``.
    """

    def __init__(self, k):
        self.k = k              # number of clusters
        self.means = None       # cluster means, filled in by train()
        self.assignments = []   # cluster index per input, filled in by train()
        self.inputs = []        # the points last handed to train()

    def classify(self, input):
        """return the INDEX of the cluster whose mean is closest to the input"""
        def distance_to_mean(index):
            return squared_distance(input, self.means[index])

        return min(range(self.k), key=distance_to_mean)

    def train(self, inputs):
        """Cluster ``inputs``, storing the result in ``means`` and ``assignments``.

        The initial means are k points sampled from ``inputs``. Points are
        then repeatedly assigned to their closest mean and the means
        recomputed, until an iteration reproduces the previous assignments.
        """
        self.means = random.sample(inputs, self.k)
        self.inputs = inputs
        previous = None

        while True:
            # Assign every point to its closest current mean.
            current = [self.classify(point) for point in inputs]

            # Same assignments as last time: nothing left to update.
            if previous == current:
                self.assignments = previous
                break

            # Recompute each mean from the points now assigned to it.
            for index in range(self.k):
                members = [point
                           for point, cluster in zip(inputs, current)
                           if cluster == index]
                # An empty cluster keeps its old mean (avoids dividing by zero).
                if members:
                    self.means[index] = vector_mean(members)

            previous = current

    def show_assignments(self):
        """Return a dict mapping ``str(point)`` to that point's cluster index."""
        return {str(point): group
                for point, group in zip(self.inputs, self.assignments)}


<h4><i>acquaintance with lambda  - key </i> </h4>

In [5]:
x= [4,1,2,3]
#compare the results with a function you specify with key
x = sorted([-4,1,-2,3], key=abs, reverse=True) # is [-4,3,-2,1]


[-4, 3, -2, 1]

In [24]:
d={"C":9.5,"B":7.5,"N":8,"U":8.5,"A":7, "R":17}
Sorted = sorted(d.items(),key=lambda kv: kv[1])
Sorted

[('A', 7), ('B', 7.5), ('N', 8), ('U', 8.5), ('C', 9.5), ('R', 17)]

In [32]:
nums = [-2, 8, -16, 3]
numssq= sorted( nums, key= lambda x:  x**2, reverse=True)
numssq

[-16, 8, 3, -2]

In [66]:
#application 1
random.seed(0)
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]
clusterer = KMeans(3)
clusterer.train(inputs)
print (clusterer.means)
print(clusterer.assignments)
print(clusterer.show_assignments())

[[-25.857142857142854, -4.714285714285714], [20.0, 26.0], [16.666666666666664, 13.666666666666666]]
[0, 2, 1, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0]
{'[-14, -5]': 0, '[13, 13]': 2, '[20, 23]': 1, '[-19, -11]': 0, '[-9, -16]': 0, '[21, 27]': 1, '[-49, 15]': 0, '[26, 13]': 2, '[-46, 5]': 0, '[-34, -1]': 0, '[11, 15]': 2, '[-49, 0]': 0, '[-22, -16]': 0, '[19, 28]': 1, '[-12, -8]': 0, '[-13, -19]': 0, '[-41, 8]': 0, '[-11, -6]': 0, '[-25, -9]': 0, '[-18, -3]': 0}


<h4>short squared distance function</h4>

In [39]:

a=[2,3,4]
b=[5,5,6]
def sqdist(v,w):
    """Sum of the squared element-wise differences between v and w."""
    # zip pairs the elements by position and stops at the shorter sequence
    return sum((v_i - w_i)**2 for v_i, w_i in zip(v, w))
sqdist(a,b)

17

<h2> Trying a "from scratch" approach </h2>

In [115]:
import numpy as np
def init_centers(X, k=2, seed=20):
    """
    Pick k distinct points of X as initial centers, reproducibly.

    Parameters
    ----------
    X : sequence of equal-length points (e.g. a list of lists)
    k : number of centers to draw (without replacement)
    seed : seed for the sampling generator; the default keeps the draw
        reproducible, pass None for a fresh random draw each call

    Returns
    -------
    list of k numpy arrays, each one a row of X
    """
    points = np.array(X)
    # Seed the generator that actually does the sampling. Seeding Python's
    # `random` module has no effect on NumPy's sampling, so a dedicated
    # RandomState is used; it also leaves the global random state untouched.
    rng = np.random.RandomState(seed)
    chosen = rng.choice(len(points), size=k, replace=False)
    return list(points[chosen, :])

In [184]:
#k=2
inputs=[[10],[20],[30],[40],[50],[60]]

means= [[10],[50]]

inputs2=[[1,2],[3,4],[10,20],[1,1],[7,18]]
means2= [[1,2],[10,20]]
k2= len(means2)
k= len(means)


def classify (inputs, means):
    """Return, for each point in inputs, the index of its closest entry in means.

    Closeness is measured with ``squared_distance`` (imported from
    ``linear_algebra`` elsewhere in this notebook).
    """
    n_means = len(means)

    def nearest_mean(point):
        return min(range(n_means),
                   key=lambda i: squared_distance(point, means[i]))

    return [nearest_mean(point) for point in inputs]

print(classify(inputs,means))
print(classify(inputs2, means2))
clusters= classify (inputs2, means2)
clusters

[0, 0, 0, 1, 1, 1]
[0, 0, 1, 0, 1]


[0, 0, 1, 0, 1]

In [157]:
from numpy import mean


def get_centroids(inputs, clusters):
    """
    Compute the centroid of every cluster label that occurs in `clusters`.

    Parameters
    ----------
    inputs : sequence of points (each a sequence of numbers)
    clusters : sequence of cluster labels, one per point in `inputs`

    Returns
    -------
    list of centroids (each a list of coordinates), ordered by ascending
    cluster label. Labels with no points do not appear in the result.
    """
    # Group the points by label in a single pass over the data.
    groups = {}
    for point, label in zip(inputs, clusters):
        groups.setdefault(label, []).append(point)

    # Emit centroids in sorted label order: iterating a plain set gives no
    # ordering guarantee, so centroid positions could otherwise be shuffled
    # relative to their labels.
    return [list(mean(groups[label], 0)) for label in sorted(groups)]
            
get_centroids(inputs2, clusters)
        
        

[[1.6666666666666667, 2.3333333333333335], [8.5, 19.0]]

In [74]:
np.mean([3,4])
list(np.mean([[3,5],[3,6]],0))

[3.0, 5.5]

In [191]:
#trying a run 
lbl=["shg","shr","nr","lm","rt","al","rn","alc"]
tm=["hr","st","ms"]
inputs=[]
inputs= [[7,10,7],[4,8,8],[10,4,1],[2,6,8],[5,6,2],[8,5,4],[4,3,7],[8,8,8]]

centr= (init_centers(inputs,3))

print(centr)
centroids=centr
turns = 6
for i in range (0,turns):
    
    clusters=classify(inputs, centroids)
    print(lbl)
    print(clusters)
    
    centroids = get_centroids(inputs, clusters)
    print(centroids)


[array([4, 8, 8]), array([5, 6, 2]), array([10,  4,  1])]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]
['shg', 'shr', 'nr', 'lm', 'rt', 'al', 'rn', 'alc']
[0, 0, 2, 0, 1, 1, 0, 0]
[[5.0, 7.0, 7.6], [6.5, 5.5, 3.0], [10.0, 4.0, 1.0]]


In [None]:
01010010

<h3>Trying the img file </h3>


In [None]:
imgfile = "imexample.png"
import matplotlib.image as mpimg
img = mpimg.imread(imgfile)

In [None]:
#pass img to a list of lists
top_row = img[0]
#print(top_row[0])
print(top_row.shape)
top_left_pixel = top_row[0]
red, green, blue, _ = top_left_pixel

In [None]:
#flattened list of all the pixels
pixels = [pixel for row in img for pixel in row]

In [None]:
clusterer= KMeans(5)
clusterer.train(pixels)

In [52]:

def recolor_image(input_file, k=5):
    """
    Display the image stored in `input_file` redrawn with only k colors.

    The image's pixels are clustered with KMeans(k); every pixel is then
    replaced by the mean of the cluster it is closest to and the result is
    shown with matplotlib (axes hidden). Nothing is returned.

    Parameters
    ----------
    input_file : path of the image file to read
    k : number of clusters, i.e. number of colors in the recolored image
    """
    # Read the file named by the parameter (the previous code referenced an
    # undefined name here, so every call raised NameError).
    img = mpimg.imread(input_file)
    # Flatten the rows of the image into one list of pixels.
    pixels = [pixel for row in img for pixel in row]
    clusterer = KMeans(k)
    clusterer.train(pixels) # this might take a while

    def recolor(pixel):
        cluster = clusterer.classify(pixel) # index of the closest cluster
        return clusterer.means[cluster]     # mean of the closest cluster

    new_img = [[recolor(pixel) for pixel in row]
               for row in img]

    plt.imshow(new_img)
    plt.axis('off')
    plt.show()
