In [44]:
import numpy as np
import pandas as pd
import random


def get_data(fname):
    with open(fname, 'r') as f:
        data= list(f.readlines())

    for i in range(len(data)):
        data[i]= data[i][:-1]
        data[i]= list(map(float, data[i].split(',')))
    
    features=['Target', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash' , 'Magnesium', 'Total phenols', 
        'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 
        'OD280/OD315 of diluted wines','Proline']

    dataf= pd.DataFrame(data, columns=features)

    dataf.drop(['Target'], axis = 1, inplace= True)

    return dataf



def normalize(data):
    normalized= data.copy()
    summary= normalized.describe()
    for col in normalized.columns:
        normalized[col]= (normalized[col]-summary[col]['mean'])/summary[col]['std']
    return normalized


def euclidean_distance(x, y):
    '''Euclidean distance between two points'''
    return np.sqrt(((x - y)**2).sum())



def choose_centroids(normalized, k):
    ''' In this way we pick at random k centers from the dataset'''
    lista=[i for i in range(178)]
    ind=[]
    for i in range(k):
        ind.append(random.choice(lista))
    centroids=[]
    for el in range(len(ind)):
        centroids.append(list(normalized.loc[el]))
    return centroids


def difference(list1, list2):
    diff=[]
    for i in range(len(list1)):
        diff.append(list1[i]-list2[i])
    return diff


def square(list):
    return [i ** 2 for i in list]


def division(lista, el):
    for i in range(len(lista)):
        lista[i]= lista[i]/el
    return lista

def division2(lista, lista2):
    # division of lists 
    # using zip() + list comprehension 
    for i in range(len(lista)):
        for j in range(len(lista2)):
          res = [i / j for i, j in zip(lista, lista2)] 
    return res


def move_centers(clus1):
    b=dict()
    for el in clus1.keys():
        summ=[sum(x) for x in zip(*clus1[el])]
        b[el]=division(summ, len(clus1[el]))
    return b

def move_centers2(clus1):
    b=dict()
    for el in clus1.keys():
        summ=[sum(x) for x in zip(*clus1[el])]
        b[el]=division(summ, len(clus1[el]))
    return b

def update_clusters(new_centers, normalized):
    distances=dict()
    for k in range(178):
    # d is the distance of each point from the three clusters (d=[distance_from_cluster1, distance_from_cluster2, distance_from_cluster3])
        d=[]
        for i in range(3):      
            X=list(normalized.loc[k])
            dist_point_center=sum(square(difference(X, new_centers[i])))
        
            d.append(dist_point_center)
        distances[k]=d
    clusters={i:[] for i in range(3)}
    
    for el in distances.keys():
        ii = distances[el].index(min(distances[el]))
        clusters[ii].append(list(normalized.loc[el]))
        
    return clusters

def update_clusters2(new_centers, normalized):
    distances=dict()
    for k in range(178):
    # d is the distance of each point from the three clusters (d=[distance_from_cluster1, distance_from_cluster2, distance_from_cluster3])
        d=[]
        for i in range(3):      
            X=list(normalized.loc[k])
            dist_point_center=sum(min(X, new_centers[i]))//sum(max(X, new_centers[i]))
        
            d.append(dist_point_center)
        distances[k]=d
    clusters={i:[] for i in range(3)}
    
    for el in distances.keys():
        ii = distances[el].index(min(distances[el]))
        clusters[ii].append(list(normalized.loc[el]))
        
    return clusters

def kMeans(k, n_iter, normalized, centroids):
    start = time.time()
    EuclidianDistance=np.array([]).reshape(178,0)

    distances=dict()
    for k in range(178):
        # d is the distance of each point from the three clusters (d=[distance_from_cluster1, distance_from_cluster2, distance_from_cluster3])
        d=[]
        for i in range(3):      
            X=list(normalized.loc[k])
            dist_point_center=sum(square(difference(X, centroids[i])))

            d.append(dist_point_center)
        distances[k]=d
    clusters={i:[] for i in range(3)}

    for el in distances.keys():
        ii=distances[el].index(min(distances[el]))
        clusters[ii].append(list(normalized.loc[el]))
    
    for _ in range(n_iter):
        new_centers= move_centers(clusters)
        clusters= update_clusters(new_centers, normalized)
    end = time.time()
    print('Execution time: ', end-start)
    return (clusters, new_centers)

def kMeans2(k, n_iter, normalized, centroids):
    start = time.time()
    EuclidianDistance=np.array([]).reshape(178,0)
    
    distances=dict()
    for k in range(178):
        # d is the distance of each point from the three clusters (d=[distance_from_cluster1, distance_from_cluster2, distance_from_cluster3])
        d=[]
        for i in range(3):      
            X=list(normalized.loc[k])
            
            dist_point_center=sum(min(X, centroids[i]))//sum(max(X, centroids[i]))
            d.append(dist_point_center)
        distances[k]=d
    clusters={i:[] for i in range(3)}

    for el in distances.keys():
        ii=distances[el].index(min(distances[el]))
        clusters[ii].append(list(normalized.loc[el]))
    
    for _ in range(n_iter):
        new_centers= move_centers(clusters)
        clusters= update_clusters2(new_centers, normalized)
    end = time.time()
    print('Execution time: ', end-start)
    return (clusters, new_centers)

In [46]:
import random
import pandas as pd
import numpy as np
import clustering_lib
import time
#importlib.reload(clustering_lib)

dataf = clustering_lib.get_data('C:/Users/Ela/Desktop/wine.data')
K=3  #K to be decided with the elbow method
normalized= clustering_lib.normalize(dataf)
centroids= clustering_lib.choose_centroids(normalized, K) 
kMeans(3,1000,normalized, centroids)
kMeans2(3,1000,normalized, centroids)

Execution time:  171.1336793899536
Execution time:  163.94756889343262


({0: [[-0.776789065158072,
    -1.2499245332266593,
    -3.6688129526803963,
    -2.6635047091054953,
    -0.8220960326075893,
    -0.503494178267563,
    -1.4609370522683993,
    -0.6577077994784339,
    -2.045742545040052,
    -1.340684477181924,
    0.4049084647154195,
    -1.1150648814174497,
    -0.7205076949920044],
   [-0.8260606744580756,
    -1.1067024422739524,
    -0.3153590057561894,
    -1.0465270508018245,
    0.08810981402110045,
    -0.3916464788342626,
    -0.940342890351313,
    2.154591164660647,
    -2.063214101177521,
    -0.7712982698793953,
    1.2799079392734423,
    -1.3263353351063938,
    -0.2124219457813108],
   [-0.44420570238303897,
    -0.8739665444758039,
    -1.2630742516260733,
    -0.8069748051272064,
    0.01809397966504739,
    -0.43958120716281973,
    -0.619977252248491,
    1.351077174906624,
    -1.6963114222906719,
    0.29845763474959813,
    0.09865864862011123,
    -1.4390129104071636,
    -0.9427952102716828],
   [-0.776789065158072,
    -1

In [48]:
X=list(normalized.loc[2])
print(min(X, centroids[2]))
print(sum(min(X, centroids[2])))
kMeans(3,1,normalized, centroids)
kMeans2(3,1,normalized, centroids)

[0.19632521851702103, 0.02117152397861313, 1.1062138630486382, -0.2679822523593159, 0.08810981402110045, 0.8067217293796732, 1.212113740652024, -0.4970050015276296, 2.1299593718150396, 0.26826291163506993, 0.31740851725961716, 0.7863692017830448, 1.3912237001649408]
7.558892338367835
Execution time:  0.38001465797424316
Execution time:  0.32811713218688965


({0: [[1.5143407672921458,
    -0.560668220516758,
    0.23139978993797622,
    -1.1663031736391336,
    1.90852150727848,
    0.8067217293796732,
    1.0319080692191864,
    -0.6577077994784339,
    1.2214384526666515,
    0.25100878414105393,
    0.36115849098751834,
    1.8427214702277643,
    1.0101593882569206],
   [1.6867913998421606,
    -0.34583508408769786,
    0.4865538945952523,
    -0.8069748051272064,
    0.9282998262937372,
    2.4844372208791836,
    1.462399395419854,
    -0.9791133953800435,
    1.0292513351544927,
    1.1827316688179192,
    -0.4263410361147025,
    1.1807407153357405,
    2.3280068002721572],
   [1.477387060317141,
    -0.5159113170940371,
    0.3043009626971982,
    -1.2860792964764427,
    0.8582839919376841,
    1.5576991398604065,
    1.3622851335127217,
    -0.17559940562602003,
    0.6623486562676434,
    0.7298108220999987,
    0.4049084647154195,
    0.33565890057996484,
    2.2327407222951523],
   [1.7114272044921648,
    -0.4174461295640511