In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Managing Data

When the given image is converted to an array, it yields a 3-D array whose last axis holds the four RGBA channels. Since the image is black and white, we can reduce it to a binary array in which 0 represents the black pixels `[0, 0, 0, 255]` and 1 represents the white pixels `[255, 255, 255, 255]` (strictly, every pixel that is not pure opaque black is treated as white).

In [None]:
def img_to_binary_arr(img_path):
    """Return the coordinates of every non-black pixel of an image.

    The image is loaded as RGBA. A pixel is classed as black (0) only when it
    is exactly opaque black ``[0, 0, 0, 255]``; every other pixel is classed
    as white (1). As a side effect, the white pixels are drawn as a scatter
    plot on the current matplotlib figure.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_white_pixels, 2)``; each row is
        ``[row_index, column_index]`` as produced by ``np.argwhere``.
    """
    # The context manager closes the file handle; convert("RGBA") guarantees
    # four channels so the comparison below also works for RGB, greyscale or
    # palette images instead of failing on a shape mismatch.
    with Image.open(img_path) as img:
        img_array = np.array(img.convert("RGBA"))

    is_black = np.all(img_array == [0, 0, 0, 255], axis=-1)
    # Black pixels are 0 and all other pixels are 1.
    binary_img = np.where(is_black, 0, 1)

    points = np.argwhere(binary_img == 1)

    # argwhere yields (row, col) pairs, so the horizontal plot axis is the
    # row index and the vertical axis is the column index.
    plt.scatter(points[:, 0], points[:, 1], color='Blue', marker='o')

    return points
    

In [None]:
# Path to the input image under /kaggle/input.
img_path='/kaggle/input/image-new/1 (1).png'
# z holds the coordinates returned by img_to_binary_arr (the call also draws a scatter plot).
z=img_to_binary_arr(img_path)

# K Means Clustering

We pick the initial centroids by alternately taking points from the start and from the end of the `img_data` array.

In [None]:
def initial_centroids(img_data, k):
    """Pick ``k`` starting centroids deterministically from ``img_data``.

    Even positions ``i`` take the element at index ``i`` (counted from the
    start); odd positions take the element at index ``len(img_data) - i``
    (counted from the end).

    Returns a list with ``k`` entries of ``img_data``.
    """
    size = len(img_data)
    return [
        img_data[pos] if pos % 2 == 0 else img_data[size - pos]
        for pos in range(k)
    ]

In [None]:
def distance(x1, x2):
    """Return the Euclidean distance between two points.

    Both arguments are converted to NumPy arrays first, so lists, tuples and
    arrays are all accepted.
    """
    delta = np.array(x1) - np.array(x2)
    squared_sum = np.sum(np.square(delta))
    return np.sqrt(squared_sum)

In [None]:

# Sanity check: distance between the fourth point of z and the first of four initial centroids.
distance(z[3], initial_centroids(z, 4)[0])

In [None]:
def average(arr):
    """Return the mean of ``arr`` taken along axis 0 (one value per column)."""
    column_means = np.mean(arr, axis=0)
    return column_means

Here we start with the initialised centroids and then repeatedly update each centroid to the average of the points that are closest to it.

In [None]:
def _nearest_centroid_labels(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid."""
    centroid_arr = np.array(centroids, dtype=float)
    # Broadcast to shape (n_points, k, n_dims) and reduce to Euclidean distances.
    diffs = data[:, np.newaxis, :] - centroid_arr[np.newaxis, :, :]
    dists = np.sqrt(np.sum(diffs ** 2, axis=2))
    # argmin returns the first minimum, i.e. ties go to the lowest centroid
    # index, exactly like a strict `<` comparison loop.
    return np.argmin(dists, axis=1)


def algorithm(data, k, iters):
    """Cluster ``data`` with k-means (Lloyd's algorithm).

    Each iteration assigns every point to its nearest centroid and then moves
    every non-empty cluster's centroid to the mean of its points. Centroids
    of empty clusters are left where they are.

    Parameters
    ----------
    data : array-like of shape (n_points, n_dims)
        Points to cluster.
    k : int
        Number of clusters (must be at least 1).
    iters : int
        Maximum number of assign/update iterations. Iteration stops early
        once the assignment no longer changes, because every further
        iteration would reproduce the same labels and centroids.

    Returns
    -------
    master_data : numpy.ndarray of shape (n_points, n_dims + 1)
        ``data`` with the cluster index appended as the last column. The
        labels are those of the last assignment step.
    centroids : list of numpy.ndarray
        The ``k`` centroids.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    data = np.asarray(data)
    centroids = initial_centroids(data, k)
    labels = None

    for _ in range(iters):
        new_labels = _nearest_centroid_labels(data, centroids)
        if labels is not None and np.array_equal(new_labels, labels):
            # Converged: the centroids were already computed from these labels.
            break
        labels = new_labels

        for cluster in range(k):
            members = data[labels == cluster]
            if len(members) > 0:  # keep the old centroid for empty clusters
                centroids[cluster] = average(members)

    if labels is None:
        # iters <= 0: still return a valid labelling for the initial centroids
        # instead of failing on an unassigned result.
        labels = _nearest_centroid_labels(data, centroids)

    master_data = np.hstack((data, labels.reshape(-1, 1)))

    return master_data, centroids

In [None]:
# Note: len(z) is not the last expression of the cell, so its value is not displayed.
len(z)
# k-means with k=3 and an iteration budget of 100: a is the labelled data, b the centroids.
a, b=algorithm(z, 3,100)
b

WCSS (within-cluster sum of squares) acts as the cost of a set of centroids: it is the sum of the squared distances between each data point and the centroid of the cluster it is assigned to.

In [None]:
def wcss(data, centroids, k):
    """Return the within-cluster sum of squares of a labelled data set.

    Parameters
    ----------
    data : array-like of shape (n_points, n_dims + 1)
        Point coordinates with the cluster index in the last column (the
        layout returned by ``algorithm``).
    centroids : sequence of array-like
        ``centroids[i]`` is the centre of cluster ``i``.
    k : int
        Number of clusters; labels ``0 .. k-1`` are summed.

    Returns
    -------
    float
        Sum over all points of the squared Euclidean distance to the centroid
        of the cluster the point is assigned to. Clusters without points
        contribute nothing.
    """
    data = np.asarray(data)
    labels = data[:, -1]
    coords = data[:, :-1]

    total = 0.0
    for cluster in range(k):
        # Select the cluster's points once instead of once per point.
        members = coords[labels == cluster]
        if len(members) > 0:
            centre = np.asarray(centroids[cluster], dtype=float)
            total += np.sum((members - centre) ** 2)
    return total


In [None]:
def elbow_method(data, max_k, max_iters):
    """Estimate the number of clusters with the elbow heuristic.

    k-means is run for every ``k`` in ``1 .. max_k``, the WCSS of each result
    is recorded, and the ``k`` at which the WCSS curve bends most sharply
    (largest second difference) is returned.

    Parameters
    ----------
    data : array-like of shape (n_points, n_dims)
        Points to cluster.
    max_k : int
        Largest number of clusters to try; must be at least 3 so that a
        second difference exists.
    max_iters : int
        Iteration budget passed to ``algorithm`` for every ``k``.

    Returns
    -------
    int
        The estimated optimal number of clusters.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 3.
    """
    if max_k < 3:
        raise ValueError("max_k must be at least 3 to compute a second difference")

    k_values = np.arange(1, max_k + 1)

    wcss_values = []
    for k in range(1, max_k + 1):
        labelled, centroids = algorithm(data, k, max_iters)
        wcss_values.append(wcss(labelled, centroids, k))
    wcss_array = np.array(wcss_values)

    second_derivative = np.diff(wcss_array, n=2)

    # second_derivative[i] = w[i] - 2*w[i+1] + w[i+2] measures the bend at
    # w[i+1], so the elbow sits at index argmax + 1 of k_values, i.e. at
    # k = argmax + 2. Indexing k_values with argmax + 2 would count the
    # index-to-k offset twice and report elbow + 1.
    k_index = np.argmax(second_derivative) + 1

    return int(k_values[k_index])

In [None]:
# Estimate the number of clusters, trying k = 1..10 with an iteration budget of 10 per k.
elbow_method(z, 10, 10)

In [None]:
def centroid_distance(data, max_k=10, max_iters=100):
    """Cluster ``data`` and tabulate the distances between successive centroids.

    The number of clusters is chosen by ``elbow_method``; the points and the
    resulting centroids are drawn on a new matplotlib figure. The returned
    table has one row per consecutive centroid pair (1 to 2, 2 to 3, ...) plus a
    closing row from the last centroid back to the first.

    Parameters
    ----------
    data : numpy.ndarray
        Points to cluster; columns 0 and 1 are plotted.
    max_k : int, optional
        Largest number of clusters tried by the elbow search.
    max_iters : int, optional
        Iteration budget for each k-means run.

    Returns
    -------
    pandas.DataFrame
        Single column ``'Distance'`` indexed by a label for each centroid pair.
    """
    k = elbow_method(data, max_k, max_iters)
    _, centroid_list = algorithm(data, k, max_iters)
    centroids = np.array(centroid_list)

    plt.figure()
    plt.scatter(data[:, 0], data[:, 1], color='yellow', marker='o')
    plt.scatter(centroids[:, 0], centroids[:, 1], color='black', marker='x')

    # Consecutive pairs first, then the pair that closes the loop.
    gaps = [distance(centroids[i - 1], centroids[i]) for i in range(1, k)]
    gaps.append(distance(centroids[0], centroids[k - 1]))

    row_labels = [f"Centroid({j}) to Centroid({j + 1})" for j in range(1, k)]
    row_labels.append(f"Centroid({k}) to Centroid({1})")

    return pd.DataFrame({'Distance': gaps}, index=row_labels)

In [None]:
# Run the full pipeline on z with the defaults max_k=10 and max_iters=100.
centroid_distance(z)