In [1]:
import pandas as pd 
import numpy as np 


In [2]:
# Load the geometry feature table; the first CSV column becomes the row index.
pieces = pd.read_csv("geometry.csv",index_col=0)


In [3]:
# Work on a copy so the frame loaded into `pieces` is left unmodified.
data = pieces.copy()

In [4]:
# Min-max rescale every column so that its values span the range 1 to 10.
# Note: a column whose max equals its min divides by zero here.
data = data.sub(data.min()).div(data.max() - data.min()).mul(9).add(1)

In [5]:
# initialize random centroids
def random_centroids(data, k, random_state=None):
    """Build ``k`` starting centroids by sampling each feature independently.

    For every centroid, one value is drawn at random from each column of
    ``data``. A centroid is therefore generally a mix of values taken from
    different rows rather than a copy of one row.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table to sample from (one column per feature).
    k : int
        Number of centroids to create.
    random_state : int, numpy.random.Generator or None, optional
        Seed or generator for reproducible draws. ``None`` (the default)
        leaves the choice of random source to ``Series.sample``, as before.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` and one column per centroid,
        with centroid columns labelled ``0 .. k-1``.
    """
    # A single shared generator makes successive draws differ; passing the
    # same integer seed to every ``sample`` call would pick the same position
    # each time and yield identical centroids.
    rng = None if random_state is None else np.random.default_rng(random_state)
    centroids = [
        # ``.iloc[0]`` extracts the scalar explicitly: calling ``float()`` on a
        # one-element Series is deprecated in recent pandas releases.
        data.apply(lambda col: float(col.sample(random_state=rng).iloc[0]))
        for _ in range(k)
    ]
    return pd.concat(centroids, axis=1)



In [6]:
# Clustering settings: how many centroids to start from and the iteration cap
# used by the clustering loop further down.
centroid_count = 15
max_iterations = 100
# Draw an initial set of centroids from the scaled data.
centroids = random_centroids(data, centroid_count)

In [7]:
# Euclidean distance from every row of `data` to every centroid
# (result rows: data points, result columns: centroids).
distances = centroids.apply(lambda x: np.sqrt(((data-x)** 2).sum(axis=1)))

In [8]:
def get_labels(data, centroids):
    """Assign each row of ``data`` to its nearest centroid.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per point, one column per feature.
    centroids : pandas.DataFrame
        One column per centroid, indexed by the feature names of ``data``.

    Returns
    -------
    pandas.Series
        Indexed like ``data``; each value is the column label of the centroid
        with the smallest Euclidean distance to that row.
    """
    def distance_to(centroid):
        # The centroid's index is aligned with the columns of ``data``.
        offsets = data.sub(centroid, axis="columns")
        return np.sqrt(offsets.pow(2).sum(axis=1))

    distance_table = centroids.apply(distance_to)
    return distance_table.idxmin(axis=1)

In [9]:
# Assign every row of the scaled data to the centroid it is closest to.
labels = get_labels(data, centroids)

In [10]:
# Number of rows assigned to each centroid label; labels that received no rows are not listed.
labels.value_counts()

8     48
13    18
7     15
11    12
6     11
14     8
2      7
3      7
10     6
4      6
12     1
dtype: int64

In [11]:
# Per-cluster geometric mean of each feature, transposed so that each column is one cluster.
data.groupby(labels).apply(lambda x: np.exp(np.log(x).mean())).T

Unnamed: 0,2,3,4,6,7,8,10,11,12,13,14
Col1,4.730996,8.593446,7.405004,8.133538,2.935196,5.071483,4.051574,5.541303,10.0,3.319456,2.72376
Col2,6.712421,4.329264,5.706542,5.64133,3.618437,3.684175,2.564273,3.8802,3.237963,3.006871,6.325296
Col3,5.325795,8.564126,7.45557,8.124323,2.972501,4.953725,3.902311,5.370678,10.0,3.29324,3.251144
Col4,1.452609,9.921367,1.184057,9.875888,1.345974,1.161552,1.197074,1.131506,9.994399,4.195823,1.53139
Col5,6.028559,5.318428,5.825846,4.319587,5.204815,4.593397,5.609232,5.825641,6.359264,4.62705,7.542127
Col6,4.651564,8.0179,7.72387,7.889954,3.008849,4.534552,3.594601,4.832717,10.0,2.990916,2.908794
Col7,6.466976,4.980658,6.240226,5.929356,3.888017,4.007267,2.687513,4.153956,4.681818,2.66874,5.022954
Col8,5.010065,8.011319,5.580267,6.691521,2.82716,4.911601,4.250606,6.329768,9.0,3.333183,3.123417
Col9,4.616081,7.277556,5.601946,5.877447,2.336495,4.583105,3.701424,5.149038,8.56,3.14469,2.755272
Col10,4.899758,5.125039,6.178098,4.136546,3.341344,4.939172,4.161109,2.786327,6.75,4.464453,3.881764


In [12]:
def new_centroids(data, labels, k):
    """Recompute centroids as the per-cluster geometric mean of every feature.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per point, one column per feature. The logarithm is taken,
        so the values are expected to be positive.
    labels : pandas.Series
        Cluster label of each row of ``data``.
    k : int
        Accepted for interface compatibility; not used by the computation.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per label that occurs in
        ``labels``. Labels with no assigned rows produce no column.
    """
    def geometric_mean(members):
        # exp(mean(log(x))) is the geometric mean of each column.
        return np.exp(np.log(members).mean())

    per_cluster = data.groupby(labels).apply(geometric_mean)
    return per_cluster.T

In [13]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [14]:
def plot_clusters(data, labels, centroids, iteration):
    """Draw a 2-D PCA projection of the points and the current centroids.

    The PCA is fitted on ``data`` and the same projection is applied to the
    transposed ``centroids``. The previous cell output is cleared before the
    new figure is shown, so repeated calls replace the plot in place.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per point, one column per feature.
    labels : pandas.Series
        Cluster label per row; used to colour the points.
    centroids : pandas.DataFrame
        One column per centroid (transposed before projection).
    iteration : int
        Shown in the figure title.
    """
    projector = PCA(n_components=2)
    points_2d = projector.fit_transform(data)
    centers_2d = projector.transform(centroids.T)

    point_x, point_y = points_2d[:, 0], points_2d[:, 1]
    center_x, center_y = centers_2d[:, 0], centers_2d[:, 1]

    clear_output(wait=True)
    plt.title(f'Iteration {iteration}')
    plt.scatter(x=point_x, y=point_y, c=labels)
    plt.scatter(x=center_x, y=center_y)
    plt.show()

In [15]:



# Iterative clustering: start from fresh random centroids, then alternate
# between assigning labels and recomputing centroids until they stop changing.
centroids = random_centroids(data, centroid_count)
# Starts as an empty frame so the first `equals` comparison below fails
# and the loop body runs at least once.
old_centroids = pd.DataFrame()
# The counter starts at 1 and the loop tests `<`, so at most
# max_iterations - 1 passes are executed.
iteration = 1

# Stop at the iteration cap or as soon as a pass leaves the centroids exactly unchanged.
while iteration < max_iterations and not centroids.equals(old_centroids):
    old_centroids = centroids
    
    labels = get_labels(data, centroids)
    centroids = new_centroids(data, labels, centroid_count)
    # plot_clusters(data, labels, centroids, iteration)
    iteration += 1

In [16]:
# Centroids produced by the last pass of the loop (one column per cluster label).
centroids

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,11,12,14
Col1,4.465753,8.867932,3.463163,5.157253,2.87746,4.035418,8.051857,4.34339,1.643146,7.869266,5.41205,8.007257,7.185734
Col2,3.139726,3.81466,2.75253,2.833726,4.6486,5.794405,6.595188,7.439512,2.702942,6.39681,3.824006,5.514968,3.786342
Col3,4.262623,8.821925,3.346583,4.90937,3.169172,4.20467,8.075745,5.020502,1.672092,7.896344,5.16982,8.087439,7.095817
Col4,1.135122,9.953282,1.251034,9.949814,1.440076,1.32686,9.841155,1.461388,1.409161,1.136836,9.978382,1.173136,1.102882
Col5,4.674214,5.874549,5.204872,3.921924,7.245586,3.231092,3.886644,6.466244,5.192953,5.13866,3.500595,6.442281,5.604079
Col6,3.596974,8.361145,3.605482,4.197857,3.036858,3.61747,7.838396,4.537723,1.610352,8.113808,4.678832,7.480508,6.770357
Col7,3.192257,4.846517,3.181302,2.748682,3.802042,6.223054,6.398873,6.63664,2.255144,7.278983,4.886364,6.056027,4.590576
Col8,5.042553,8.234655,3.081742,5.716616,2.933983,3.772193,6.457916,4.447222,1.50457,5.562151,5.333333,8.375897,6.910668
Col9,4.602863,7.647292,2.502764,5.096347,2.580028,3.678086,5.567095,4.029677,1.600979,5.869964,4.744,7.394369,6.096302
Col10,4.668128,5.673921,3.232077,4.418145,4.031595,4.996669,3.775907,4.391884,4.70617,7.149381,4.5,3.884565,4.269404


In [17]:
# Cluster label assigned to each row by the last pass of the loop.
print(labels)

Name
Page_0001_1_1_1_out    14
Page_0001_1_1_2_in      0
Page_0001_1_1_3_out    12
Page_0001_1_1_4_in      5
Page_0001_1_2_1_in      2
                       ..
Page_0001_5_6_4_out     3
Page_0001_5_7_1_out     2
Page_0001_5_7_2_in      3
Page_0001_5_7_3_out    12
Page_0001_5_7_4_in      0
Length: 139, dtype: int64


In [19]:
# Show all pieces whose cluster label is 2 (rows come from the unscaled `pieces` frame).
pieces[labels == 2]

Unnamed: 0_level_0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,...,Col31,Col32,Col33,Col35,Col36,Col37,Col39,Col40,Col41,Col42
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Page_0001_1_2_1_in,149.892694,30.958904,148.896334,179.59938,-4.440883,291,83,145,171,26,...,184,79,79,203,2,2,203,47,62,15
Page_0001_3_3_3_out,160.041575,31.363239,159.04285,179.770604,-2.774132,312,85,159,187,28,...,190,84,84,209,5,5,209,56,69,13
Page_0001_3_5_3_out,157.775229,38.321101,156.917431,177.560575,-3.723571,301,90,165,181,16,...,195,87,87,210,6,6,210,59,72,13
Page_0001_4_1_3_out,153.506944,34.122685,153.049643,175.173533,-2.443264,297,88,150,177,27,...,181,86,86,198,3,3,198,45,66,21
Page_0001_4_3_3_out,144.240601,35.989975,143.464387,176.799349,-7.378316,260,87,161,178,17,...,195,82,82,212,10,10,212,52,65,13
Page_0001_4_7_3_out,151.523595,32.047191,150.526611,179.637325,-1.167857,301,86,142,166,24,...,178,80,80,194,1,1,194,51,64,13
Page_0001_5_4_3_out,147.848341,32.869668,146.863793,179.168867,-9.041456,279,85,146,178,32,...,197,76,76,216,4,4,216,52,57,5
Page_0001_5_5_3_out,140.170426,32.077694,139.225687,178.385633,-6.756318,259,85,148,168,20,...,181,80,80,198,10,10,198,57,64,7
Page_0001_5_7_1_out,149.687927,32.38041,148.883034,177.066414,-6.041919,295,84,136,165,29,...,175,81,81,193,1,1,193,58,66,8


In [20]:
from sklearn.cluster import KMeans

In [21]:
# Fit scikit-learn's KMeans on the scaled data with the same number of clusters
# (presumably as a reference for the hand-written loop — confirm intent).
kmeans = KMeans(centroid_count)
kmeans.fit(data)

In [22]:
# Fitted cluster centres, transposed so that each column is one cluster and each row one feature.
pd.DataFrame(kmeans.cluster_centers_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2.682592,7.312561,3.674238,7.861978,7.433424,1.979072,5.236903,4.445043,6.571039,4.996796,4.299254,9.08076,3.635602,9.313213,8.788774
1,5.087218,5.450643,3.094793,4.402264,6.763116,2.648893,3.783835,9.336192,6.478364,2.865833,3.070791,10.0,6.392985,6.840988,3.676473
2,2.986228,7.225109,3.591957,7.822664,7.456623,1.954046,5.113683,5.333798,6.877821,4.748154,4.093671,9.432049,3.875164,9.422453,8.740171
3,1.472602,9.900864,1.264108,1.111541,1.180855,1.373496,1.15452,1.576088,1.325158,9.952068,1.143159,9.704652,1.387744,9.836674,9.95513
4,7.490349,4.225949,5.272109,6.16361,4.847295,5.286019,4.329094,6.153874,6.774462,4.023155,5.31743,3.551595,3.377265,4.140223,5.868272
5,2.849523,7.249088,3.890511,7.495438,7.727007,2.087591,4.656934,4.908759,6.555787,4.058394,3.540146,9.277372,3.282847,8.291971,8.341241
6,4.005245,5.678977,3.490642,5.333807,7.504545,2.477273,3.809091,7.647727,6.668831,2.704545,3.102273,10.0,6.599432,6.113636,4.758523
7,2.814103,5.947917,3.333333,7.796875,4.933333,1.694444,5.266667,4.958333,6.583333,5.666667,5.069444,5.833333,3.614583,9.083333,8.114583
8,2.506462,5.158,2.728,6.8365,5.2912,1.696,5.1568,4.384,5.824,5.048,4.408,5.176,3.376,7.744,7.606
9,4.288462,4.09375,3.5,4.375,7.6,4.833333,6.133333,4.375,4.5,4.555556,4.125,4.5,4.90625,3.666667,6.09375
