In [1]:
import numpy as np
from os import listdir, getcwd
from os.path import isfile, join
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def get_and_save_train_data(datapath, output_file):
    '''Collect all training instances below ``datapath`` and save them.

    The directory layout read by this function is
    ``datapath/<reader>/komutlar/*.mfc``: every entry of ``datapath`` that is
    not a regular file is treated as a reader directory, and every ``.mfc``
    file in its ``komutlar`` sub-directory is loaded with ``get_mfc_file``.

    Parameters
    ----------
    datapath : str
        Root directory holding one sub-directory per reader.
    output_file : str
        Target handed to ``np.save`` (NumPy appends ``.npy`` when the name
        does not already end with it).

    Returns
    -------
    numpy.ndarray
        The loaded matrices concatenated along axis 0.

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when no ``.mfc`` file was found.
    '''
    # listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result (and any index-based sampling on it) reproducible.
    reader_dirs = sorted(
        join(datapath, entry)
        for entry in listdir(datapath)
        if not isfile(join(datapath, entry))
    )
    matrices = []
    for reader_dir in reader_dirs:
        commands_dir = join(reader_dir, 'komutlar')
        for filename in sorted(listdir(commands_dir)):
            if filename.endswith('.mfc'):
                matrices.append(get_mfc_file(join(commands_dir, filename)))
    # A single concatenate is enough; wrapping each matrix in its own
    # one-element concatenate only produced an extra copy.
    instances = np.concatenate(matrices)
    np.save(output_file, instances)
    return instances

In [3]:
def load_datapoints(filepath):
    '''Load an array that was written with ``np.save``.

    ``np.save`` appends ``.npy`` to file names that lack the extension, so a
    name such as ``'train-datapoints'`` ends up on disk as
    ``'train-datapoints.npy'``. To let callers reuse the exact name they
    saved under, a string path that does not exist is retried with ``.npy``
    appended.

    Parameters
    ----------
    filepath : str or anything accepted by ``np.load``
        Location of the saved array. Non-string arguments are forwarded to
        ``np.load`` untouched.

    Returns
    -------
    The object returned by ``np.load``.
    '''
    if isinstance(filepath, str) and not isfile(filepath):
        with_extension = filepath + '.npy'
        if isfile(with_extension):
            filepath = with_extension
    return np.load(filepath)

In [4]:
def get_mfc_file(path):
    '''Load an ``.mfc`` text file as a 2-D matrix.

    Parameters
    ----------
    path : str
        File readable by ``np.loadtxt`` (numeric text, one row per line).

    Returns
    -------
    numpy.ndarray
        Always two-dimensional. ``ndmin=2`` keeps a file with a single row
        as shape ``(1, n)`` instead of collapsing it to ``(n,)``, so the
        result can be concatenated row-wise with the other files.
    '''
    return np.loadtxt(path, ndmin=2)


In [5]:
def get_and_save_test_data(datapath, output_file):
    '''Collect all test instances in ``datapath`` and save them.

    Every file directly inside ``datapath`` whose name ends with ``.mfc`` is
    loaded with ``get_mfc_file``.

    Parameters
    ----------
    datapath : str
        Directory containing the ``.mfc`` files.
    output_file : str
        Target handed to ``np.save`` (NumPy appends ``.npy`` when the name
        does not already end with it).

    Returns
    -------
    numpy.ndarray
        The loaded matrices concatenated along axis 0.

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when no ``.mfc`` file was found.
    '''
    # Sorted so the row order does not depend on the arbitrary order in
    # which listdir() reports directory entries.
    mfc_names = sorted(name for name in listdir(datapath) if name.endswith('.mfc'))
    # One concatenate over all files; the per-file one-element concatenate
    # of the earlier version only duplicated the data.
    test_instances = np.concatenate([get_mfc_file(join(datapath, name)) for name in mfc_names])
    np.save(output_file, test_instances)
    return test_instances

In [6]:
def find_cluster(point, clusters, selection='centroid'):
    '''Find the cluster that is closest to ``point``.

    Parameters
    ----------
    point :
        The point to assign; passed to ``distance`` as first argument.
    clusters : iterable of ``(centroid, points)`` pairs
        Consumed exactly once, so a one-shot iterator such as ``zip(...)``
        is acceptable.
    selection : {'centroid', 'point', 'average'}
        How the distance between ``point`` and a cluster is measured:
        ``'centroid'`` uses the distance to the cluster centroid,
        ``'point'`` the distance to the nearest cluster member and
        ``'average'`` the mean distance to all cluster members.

    Returns
    -------
    tuple
        ``(cluster_index, distance)`` of the nearest cluster; on ties the
        lowest index wins.

    Raises
    ------
    ValueError
        If ``selection`` is not one of the supported values or ``clusters``
        yields no cluster.
    '''
    if selection == 'centroid':
        distances = [distance(point, centroid) for centroid, _ in clusters]
    elif selection == 'point':
        distances = [
            min(distance(point, member) for member in members)
            for _, members in clusters
        ]
    elif selection == 'average':
        distances = []
        for _, members in clusters:
            to_members = [distance(point, member) for member in members]
            distances.append(sum(to_members) / float(len(to_members)))
    else:
        # Previously an unknown value fell through with an empty list and
        # surfaced as an unrelated IndexError.
        raise ValueError('{selection} wrong, use "centroid", "point" or "average"'.format(selection=repr(selection)))
    if not distances:
        raise ValueError('find_cluster needs at least one cluster')
    # min() returns the first smallest pair, matching a stable sort's head,
    # without sorting the whole list.
    return min(enumerate(distances), key=lambda pair: pair[1])

In [7]:
def distance(point1, point2):
    '''Distance between two points under the globally configured norm.

    The norm order is read from the module-level ``distance_method`` and
    passed as ``ord`` to ``np.linalg.norm``.

    Parameters
    ----------
    point1, point2 : array-like
        Points of the same length. Either may be an ``np.matrix`` of shape
        ``(1, n)`` (``calc_centroids`` returns one for ``'mean'``).

    Returns
    -------
    float
        Vector norm of ``point1 - point2``.
    '''
    # Flatten first: np.linalg.norm applies *matrix* norms to 2-D input, so
    # a (1, n) difference with ord=1 would yield the largest absolute entry
    # instead of the sum of absolute entries.
    difference = np.ravel(np.asarray(point1) - np.asarray(point2))
    return np.linalg.norm(difference, ord=distance_method)

In [8]:
def cluster_distortion(cluster, centroid):
    '''Sum of the distances between ``centroid`` and each point of ``cluster``.

    Returns ``0`` for an empty cluster.
    '''
    accumulated = 0
    for member in cluster:
        accumulated = accumulated + distance(member, centroid)
    return accumulated

In [9]:
def total_distortion(clusters):
    '''Sum of ``cluster_distortion`` over all ``(centroid, points)`` pairs.

    ``clusters`` is iterated once, so a one-shot iterator is acceptable.
    Returns ``0`` when there are no clusters.
    '''
    grand_total = 0
    for centre, members in clusters:
        grand_total = grand_total + cluster_distortion(members, centre)
    return grand_total

In [10]:
def calc_centroids(points):
    '''Compute the centroid of ``points`` along axis 0.

    The strategy is taken from the module-level ``centroid_method``:

    * ``'mean'``   -- column-wise mean, returned as an ``np.matrix``.
    * ``'median'`` -- column-wise median, as returned by ``np.median``.

    Raises
    ------
    ValueError
        If ``centroid_method`` is neither ``'mean'`` nor ``'median'``.
    '''
    # Guard-clause layout: each supported method returns immediately.
    if centroid_method == 'median':
        return np.median(points, axis=0)
    if centroid_method == 'mean':
        as_matrix = np.matrix(points)
        return as_matrix.mean(0)
    message = '{centroid_method} wrong, use "mean" or "median"'
    raise ValueError(message.format(centroid_method=repr(centroid_method)))

In [11]:
def kmeans_clustering(points, number_of_clusters, max_iterations=1000):
    '''Split ``points`` into ``number_of_clusters`` clusters.

    Starting from ``number_of_clusters`` randomly sampled points as initial
    centroids, the function alternates between assigning every point to its
    nearest centroid (``find_cluster``) and recomputing the centroids
    (``calc_centroids``) until the total distortion no longer changes.

    Parameters
    ----------
    points : sequence
        Indexable collection of points (``points[i]`` and ``len(points)``
        are used).
    number_of_clusters : int
        Number of clusters to produce.
    max_iterations : int or None, optional
        Upper bound on assign/update rounds. Convergence is detected by
        exact equality of consecutive distortions, which is not guaranteed
        to occur for every distance/centroid combination, so the bound
        prevents an endless loop. ``None`` disables the bound.

    Returns
    -------
    list of tuple
        ``(centroid, members)`` pairs, where ``members`` is a list of the
        points assigned to that centroid.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``number_of_clusters`` exceeds
        ``len(points)``.
    '''
    print('KMeans')
    seed_indices = random.sample(range(len(points)), number_of_clusters)
    centroids = [points[index] for index in seed_indices]
    clusters = [[] for _ in range(number_of_clusters)]
    # Baseline for the first progress line: distortion of the data treated
    # as one single cluster.
    last_distortion = total_distortion([(calc_centroids(points), points)])
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        iteration += 1
        clusters = [[] for _ in range(number_of_clusters)]
        for point in points:
            cluster_index, _ = find_cluster(point, zip(centroids, clusters))
            clusters[cluster_index].append(point)
        # A cluster that received no points keeps its previous centroid;
        # recomputing a centroid from zero points yields an unusable value.
        centroids = [
            calc_centroids(members) if members else previous
            for previous, members in zip(centroids, clusters)
        ]
        distortion = total_distortion(zip(centroids, clusters))
        print('Last distortion: ' + str(last_distortion) + ' New distortion: '  + str(distortion))
        if last_distortion == distortion:
            break
        last_distortion = distortion
    else:
        # Only reached when the loop ran out of iterations without a break.
        print('KMeans stopped after ' + str(max_iterations) + ' iterations without converging')
    return list(zip(centroids, clusters))


In [12]:
# Clustering settings. distance_method is used as the `ord` argument of
# np.linalg.norm inside distance(); centroid_method selects the branch taken
# in calc_centroids().
centroid_method = 'mean'
distance_method = 2
# Target number of clusters after each stage of the splitting loop.
cluster_schedule = [1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64]

# Parse the .mfc files and store the concatenated arrays via np.save.
train_data = get_and_save_train_data('../assignment-1/ProjectData/TrainData', 'train-datapoints')
test_data = get_and_save_test_data('../assignment-1/ProjectData/EvalData', 'test-datapoints')

In [15]:
# Seed the generator used by kmeans_clustering (random.sample) so that
# re-running this cell reproduces the same clustering.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

clusters = []               # current list of (centroid, points) pairs
split_type = 'binary-recursive'
train_distortions = []      # one entry per value in cluster_schedule
test_distortions = []
for number_of_cluster in cluster_schedule:
    print('Schedule ' + str(number_of_cluster))
    if number_of_cluster == 1:
        clusters = [(calc_centroids(train_data), train_data)]
    else:
        # How many clusters must be gained to reach the scheduled count.
        clusters_to_add = number_of_cluster - len(clusters)
        if split_type == 'binary-recursive':
            # Repeatedly split the single worst cluster in two; each round
            # adds exactly one cluster.
            split_size = 2
            cluster_to_split = 1
            number_of_iteration = clusters_to_add
        elif split_type == 'binary':
            # Split the `clusters_to_add` worst clusters in two, all in one round.
            split_size = 2
            cluster_to_split = clusters_to_add
            number_of_iteration = 1
        elif split_type == 'multiple':
            # Split the worst cluster once. It is removed afterwards, so it
            # has to be replaced by clusters_to_add + 1 parts for a net gain
            # of clusters_to_add.
            split_size = clusters_to_add + 1
            cluster_to_split = 1
            number_of_iteration = 1
        else:
            raise ValueError('{split_type} wrong, use "binary", "binary-recursive" or "multiple"'.format(split_type=repr(split_type)))
        for iteration in range(number_of_iteration):
            distortions = [
                cluster_distortion(cluster_points, centroid)
                for centroid, cluster_points in clusters
            ]
            # Indices of the clusters ordered from highest to lowest distortion.
            worst_first = sorted(range(len(distortions)), key=lambda i: distortions[i], reverse=True)
            cluster_indices_to_split = worst_first[:cluster_to_split]
            for cluster_index in cluster_indices_to_split:
                centroid, datapoints = clusters[cluster_index]
                # New sub-clusters are appended, so the indices collected
                # above stay valid until the deletions below.
                clusters = clusters + kmeans_clustering(datapoints, split_size)
            # Delete from the highest index down; deleting a lower index
            # first would shift the remaining ones and remove the wrong
            # clusters.
            for cluster_index in sorted(cluster_indices_to_split, reverse=True):
                del clusters[cluster_index]

    train_distortion = total_distortion(clusters)
    train_distortions.append(train_distortion)
    # Test distortion: distance of every test point to its nearest centroid.
    test_distortion = sum([find_cluster(point, clusters)[1] for point in test_data])
    test_distortions.append(test_distortion)
    print('Train distortion ' + str(train_distortion))
    print('Test distortion ' + str(test_distortion))

Schedule 1
Train distortion 1955590.9585292945
Test distortion 1328576.7273475064
Schedule 2
KMeans
Last distortion: 1955590.9585292945 New distortion: 1858036.3239538386
Last distortion: 1858036.3239538386 New distortion: 1787602.9145696317
Last distortion: 1787602.9145696317 New distortion: 1748676.863865994
Last distortion: 1748676.863865994 New distortion: 1733653.9536909452
Last distortion: 1733653.9536909452 New distortion: 1730222.2680414882
Last distortion: 1730222.2680414882 New distortion: 1729412.1277123801
Last distortion: 1729412.1277123801 New distortion: 1729202.1661551166
Last distortion: 1729202.1661551166 New distortion: 1729134.2934934166
Last distortion: 1729134.2934934166 New distortion: 1729108.7727920092
Last distortion: 1729108.7727920092 New distortion: 1729099.048215327
Last distortion: 1729099.048215327 New distortion: 1729096.022363575
Last distortion: 1729096.022363575 New distortion: 1729095.1644539344
Last distortion: 1729095.1644539344 New distortion: 17

Last distortion: 414955.7844817778 New distortion: 414939.29606652167
Last distortion: 414939.29606652167 New distortion: 414929.4843800133
Last distortion: 414929.4843800133 New distortion: 414922.7415689245
Last distortion: 414922.7415689245 New distortion: 414919.05530037807
Last distortion: 414919.05530037807 New distortion: 414916.38469621714
Last distortion: 414916.38469621714 New distortion: 414914.4316401541
Last distortion: 414914.4316401541 New distortion: 414913.39827991935
Last distortion: 414913.39827991935 New distortion: 414913.1106485968
Last distortion: 414913.1106485968 New distortion: 414913.2332801891
Last distortion: 414913.2332801891 New distortion: 414913.3543080755
Last distortion: 414913.3543080755 New distortion: 414913.85270391265
Last distortion: 414913.85270391265 New distortion: 414913.6120979901
Last distortion: 414913.6120979901 New distortion: 414913.6171757906
Last distortion: 414913.6171757906 New distortion: 414913.27103957895
Last distortion: 414913

Last distortion: 204573.76045388382 New distortion: 204517.52123678886
Last distortion: 204517.52123678886 New distortion: 204487.14968488392
Last distortion: 204487.14968488392 New distortion: 204468.61953881392
Last distortion: 204468.61953881392 New distortion: 204458.6327344126
Last distortion: 204458.6327344126 New distortion: 204453.61156345857
Last distortion: 204453.61156345857 New distortion: 204449.6594548042
Last distortion: 204449.6594548042 New distortion: 204446.12467769356
Last distortion: 204446.12467769356 New distortion: 204443.55759683147
Last distortion: 204443.55759683147 New distortion: 204442.2593078121
Last distortion: 204442.2593078121 New distortion: 204440.52526921075
Last distortion: 204440.52526921075 New distortion: 204440.334731945
Last distortion: 204440.334731945 New distortion: 204439.69287957327
Last distortion: 204439.69287957327 New distortion: 204438.75767626427
Last distortion: 204438.75767626427 New distortion: 204438.24918489973
Last distortion:

Last distortion: 125194.19491290845 New distortion: 125138.59859226194
Last distortion: 125138.59859226194 New distortion: 125093.37192215023
Last distortion: 125093.37192215023 New distortion: 125052.10222675085
Last distortion: 125052.10222675085 New distortion: 124995.17958682013
Last distortion: 124995.17958682013 New distortion: 124931.36508436891
Last distortion: 124931.36508436891 New distortion: 124846.17188576015
Last distortion: 124846.17188576015 New distortion: 124713.10280491362
Last distortion: 124713.10280491362 New distortion: 124548.02019465787
Last distortion: 124548.02019465787 New distortion: 124377.42611186585
Last distortion: 124377.42611186585 New distortion: 124241.59340454364
Last distortion: 124241.59340454364 New distortion: 124169.36542342257
Last distortion: 124169.36542342257 New distortion: 124140.24218364886
Last distortion: 124140.24218364886 New distortion: 124121.27173099224
Last distortion: 124121.27173099224 New distortion: 124111.26708634477
Last d

Last distortion: 89231.16589055979 New distortion: 89166.88058445166
Last distortion: 89166.88058445166 New distortion: 89116.51091458937
Last distortion: 89116.51091458937 New distortion: 89087.11650741671
Last distortion: 89087.11650741671 New distortion: 89073.43328107057
Last distortion: 89073.43328107057 New distortion: 89062.64286620283
Last distortion: 89062.64286620283 New distortion: 89051.34997247503
Last distortion: 89051.34997247503 New distortion: 89037.67178387412
Last distortion: 89037.67178387412 New distortion: 89028.66107592857
Last distortion: 89028.66107592857 New distortion: 89024.32113096995
Last distortion: 89024.32113096995 New distortion: 89021.81984249382
Last distortion: 89021.81984249382 New distortion: 89019.45986515716
Last distortion: 89019.45986515716 New distortion: 89015.97168907509
Last distortion: 89015.97168907509 New distortion: 89011.59217215993
Last distortion: 89011.59217215993 New distortion: 89004.39634112152
Last distortion: 89004.39634112152

Last distortion: 79071.86636305644 New distortion: 79070.4061688207
Last distortion: 79070.4061688207 New distortion: 79069.8670832175
Last distortion: 79069.8670832175 New distortion: 79068.92635158285
Last distortion: 79068.92635158285 New distortion: 79067.3216232969
Last distortion: 79067.3216232969 New distortion: 79066.08376354244
Last distortion: 79066.08376354244 New distortion: 79065.48685994497
Last distortion: 79065.48685994497 New distortion: 79065.20086300606
Last distortion: 79065.20086300606 New distortion: 79065.15495790658
Last distortion: 79065.15495790658 New distortion: 79064.84792102844
Last distortion: 79064.84792102844 New distortion: 79064.66911901985
Last distortion: 79064.66911901985 New distortion: 79063.97441263801
Last distortion: 79063.97441263801 New distortion: 79063.38065702312
Last distortion: 79063.38065702312 New distortion: 79062.64475321578
Last distortion: 79062.64475321578 New distortion: 79062.57708836885
Last distortion: 79062.57708836885 New d

Last distortion: 62327.16559975649 New distortion: 62278.74484590291
Last distortion: 62278.74484590291 New distortion: 62255.854483628566
Last distortion: 62255.854483628566 New distortion: 62244.26501144801
Last distortion: 62244.26501144801 New distortion: 62236.55973969778
Last distortion: 62236.55973969778 New distortion: 62230.6180050173
Last distortion: 62230.6180050173 New distortion: 62227.7626184093
Last distortion: 62227.7626184093 New distortion: 62227.122585608326
Last distortion: 62227.122585608326 New distortion: 62225.93310213128
Last distortion: 62225.93310213128 New distortion: 62226.2175760471
Last distortion: 62226.2175760471 New distortion: 62225.19134967779
Last distortion: 62225.19134967779 New distortion: 62225.37990479587
Last distortion: 62225.37990479587 New distortion: 62225.53825803389
Last distortion: 62225.53825803389 New distortion: 62225.36526796226
Last distortion: 62225.36526796226 New distortion: 62224.839433690795
Last distortion: 62224.839433690795

Last distortion: 51983.86841029237 New distortion: 51943.60280123788
Last distortion: 51943.60280123788 New distortion: 51913.24881719246
Last distortion: 51913.24881719246 New distortion: 51889.734311398526
Last distortion: 51889.734311398526 New distortion: 51870.41737207396
Last distortion: 51870.41737207396 New distortion: 51853.5848696453
Last distortion: 51853.5848696453 New distortion: 51837.89444955556
Last distortion: 51837.89444955556 New distortion: 51828.85207163707
Last distortion: 51828.85207163707 New distortion: 51824.227414214496
Last distortion: 51824.227414214496 New distortion: 51820.56144069346
Last distortion: 51820.56144069346 New distortion: 51818.81492777349
Last distortion: 51818.81492777349 New distortion: 51816.16031846662
Last distortion: 51816.16031846662 New distortion: 51813.06624311711
Last distortion: 51813.06624311711 New distortion: 51809.88233766134
Last distortion: 51809.88233766134 New distortion: 51806.84874269985
Last distortion: 51806.848742699

Last distortion: 47038.218866308205 New distortion: 47018.171501061384
Last distortion: 47018.171501061384 New distortion: 47008.45142016468
Last distortion: 47008.45142016468 New distortion: 47003.61753424608
Last distortion: 47003.61753424608 New distortion: 47001.8815742867
Last distortion: 47001.8815742867 New distortion: 47000.92201456923
Last distortion: 47000.92201456923 New distortion: 46999.99408902477
Last distortion: 46999.99408902477 New distortion: 46999.054483433836
Last distortion: 46999.054483433836 New distortion: 46998.67793969065
Last distortion: 46998.67793969065 New distortion: 46998.25599587051
Last distortion: 46998.25599587051 New distortion: 46997.97016012327
Last distortion: 46997.97016012327 New distortion: 46997.28206039012
Last distortion: 46997.28206039012 New distortion: 46997.00624223536
Last distortion: 46997.00624223536 New distortion: 46996.17657410314
Last distortion: 46996.17657410314 New distortion: 46995.79245692732
Last distortion: 46995.79245692

Last distortion: 38612.776204081696 New distortion: 38613.2415472374
Last distortion: 38613.2415472374 New distortion: 38613.604179611866
Last distortion: 38613.604179611866 New distortion: 38613.604179611866
KMeans
Last distortion: 40963.88946254916 New distortion: 39965.25564928404
Last distortion: 39965.25564928404 New distortion: 39734.74547960846
Last distortion: 39734.74547960846 New distortion: 39644.509774219725
Last distortion: 39644.509774219725 New distortion: 39583.32319507151
Last distortion: 39583.32319507151 New distortion: 39534.188615059014
Last distortion: 39534.188615059014 New distortion: 39491.08246139073
Last distortion: 39491.08246139073 New distortion: 39457.050125691065
Last distortion: 39457.050125691065 New distortion: 39427.977277907834
Last distortion: 39427.977277907834 New distortion: 39406.812567689754
Last distortion: 39406.812567689754 New distortion: 39389.32840919573
Last distortion: 39389.32840919573 New distortion: 39373.46794898377
Last distortion

Last distortion: 38005.669680876796 New distortion: 38005.585020178645
Last distortion: 38005.585020178645 New distortion: 38005.585020178645
KMeans
Last distortion: 39399.67681302704 New distortion: 38464.66582656854
Last distortion: 38464.66582656854 New distortion: 38170.11922025416
Last distortion: 38170.11922025416 New distortion: 37754.07357069043
Last distortion: 37754.07357069043 New distortion: 37340.491650056065
Last distortion: 37340.491650056065 New distortion: 37067.03323800524
Last distortion: 37067.03323800524 New distortion: 36897.98092382045
Last distortion: 36897.98092382045 New distortion: 36815.30200642627
Last distortion: 36815.30200642627 New distortion: 36795.50343133959
Last distortion: 36795.50343133959 New distortion: 36789.54291471068
Last distortion: 36789.54291471068 New distortion: 36787.13059510246
Last distortion: 36787.13059510246 New distortion: 36786.25716523845
Last distortion: 36786.25716523845 New distortion: 36785.83898869091
Last distortion: 3678

Last distortion: 33015.76321782731 New distortion: 33011.24870350765
Last distortion: 33011.24870350765 New distortion: 33010.45074202399
Last distortion: 33010.45074202399 New distortion: 33009.61215641527
Last distortion: 33009.61215641527 New distortion: 33008.99116439978
Last distortion: 33008.99116439978 New distortion: 33008.642018405364
Last distortion: 33008.642018405364 New distortion: 33008.49912712904
Last distortion: 33008.49912712904 New distortion: 33008.54662566384
Last distortion: 33008.54662566384 New distortion: 33008.597857046225
Last distortion: 33008.597857046225 New distortion: 33008.3858346395
Last distortion: 33008.3858346395 New distortion: 33008.3487099696
Last distortion: 33008.3487099696 New distortion: 33008.28368316947
Last distortion: 33008.28368316947 New distortion: 33007.871865438516
Last distortion: 33007.871865438516 New distortion: 33007.5982088258
Last distortion: 33007.5982088258 New distortion: 33007.31929980621
Last distortion: 33007.31929980621

Last distortion: 30421.270880592674 New distortion: 30418.022854023588
Last distortion: 30418.022854023588 New distortion: 30416.24859763096
Last distortion: 30416.24859763096 New distortion: 30414.924805829083
Last distortion: 30414.924805829083 New distortion: 30414.08980900564
Last distortion: 30414.08980900564 New distortion: 30413.59021746417
Last distortion: 30413.59021746417 New distortion: 30413.26654913664
Last distortion: 30413.26654913664 New distortion: 30413.274245194705
Last distortion: 30413.274245194705 New distortion: 30412.75131435049
Last distortion: 30412.75131435049 New distortion: 30411.820706728096
Last distortion: 30411.820706728096 New distortion: 30411.658882416617
Last distortion: 30411.658882416617 New distortion: 30411.598118858168
Last distortion: 30411.598118858168 New distortion: 30411.624276330498
Last distortion: 30411.624276330498 New distortion: 30411.507203245314
Last distortion: 30411.507203245314 New distortion: 30411.532178045258
Last distortion:

Last distortion: 28377.70083330737 New distortion: 28378.258907118576
Last distortion: 28378.258907118576 New distortion: 28378.818938649878
Last distortion: 28378.818938649878 New distortion: 28379.406812401605
Last distortion: 28379.406812401605 New distortion: 28379.797062019286
Last distortion: 28379.797062019286 New distortion: 28380.820456602538
Last distortion: 28380.820456602538 New distortion: 28382.073307672094
Last distortion: 28382.073307672094 New distortion: 28382.355036063298
Last distortion: 28382.355036063298 New distortion: 28382.447344272074
Last distortion: 28382.447344272074 New distortion: 28382.447344272074
Train distortion 1231630.76748831
Test distortion 881722.8900608235


In [None]:
# Total distortion of the training and test sets for every scheduled
# cluster count, drawn on the current axes.
axes = plt.gca()
axes.plot(cluster_schedule, train_distortions, label='Training distortion')
axes.plot(cluster_schedule, test_distortions, label='Test distortion')
axes.set_xlabel("Schedule")
axes.set_ylabel("Total Distortion")
axes.legend()
plt.show()

In [None]:
# Same curves as the total-distortion plot, normalised by the number of
# points in each data set; drawn on figure number 2.
axes = plt.figure(2).gca()
train_count = len(train_data)
test_count = len(test_data)
train_per_point = [distortion/train_count for distortion in train_distortions]
test_per_point = [distortion/test_count for distortion in test_distortions]
axes.plot(cluster_schedule, train_per_point, label='Training distortions per point')
axes.plot(cluster_schedule, test_per_point, label='Test distortions per point')
axes.set_xlabel("Schedule")
axes.set_ylabel("Distortions per point")
axes.legend()
plt.show()