In [None]:
import csv
import random as r
import time as t

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.impute import SimpleImputer
from tqdm import tqdm

In [None]:
# Load the unlabeled activity-monitoring samples; the relative path is resolved
# against the notebook's working directory.
trainCsv = pd.read_csv('Physical_Activity_Monitoring_unlabeled.csv')

In [None]:
# Fill every NaN cell with the mean of its column so the distance computations
# used later never see missing values.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputedTable = imputer.fit_transform(trainCsv)

# Split off the final column from the features.
# NOTE(review): the last column is presumably a row identifier — confirm against the CSV header.
indexes = imputedTable[:, -1]
X = imputedTable[:, :-1]



In [None]:
class KMeansClustering:
    """Plain NumPy k-means (Lloyd's algorithm) over a 2-D array of samples.

    Centroids are drawn at construction time, uniformly at random inside the
    per-feature bounding box of the data; call fit() to refine them in place.

    Parameters
    ----------
    points : array-like of shape (nSamples, nFeatures)
        Finite numeric samples; converted to a float ndarray.
    centroidAmount : int
        Number of clusters, at least 1.
    maxIterations : int, optional
        Upper bound on the refinement passes performed by fit().
    seed : int or None, optional
        Seed for a private random generator. When None, the global
        ``np.random`` state is used, so ``np.random.seed`` still applies.

    Attributes
    ----------
    centroids : ndarray of shape (centroidAmount, nFeatures)
        Current centroid coordinates.

    Raises
    ------
    ValueError
        If points is not a non-empty 2-D array of finite values, or if
        centroidAmount or maxIterations is smaller than 1.
    """

    def __init__(self, points, centroidAmount, maxIterations=300, seed=None):
        points = np.asarray(points, dtype=float)
        if points.ndim != 2 or points.shape[0] == 0:
            raise ValueError(
                "points must be a non-empty 2-D array of shape (nSamples, nFeatures)"
            )
        if not np.isfinite(points).all():
            raise ValueError("points must not contain NaN or infinite values")
        if centroidAmount < 1:
            raise ValueError("centroidAmount must be at least 1")
        if maxIterations < 1:
            raise ValueError("maxIterations must be at least 1")

        self.points = points
        self.centroidAmount = centroidAmount
        self.maxIterations = maxIterations
        # Both the np.random module and RandomState expose uniform(), so either
        # can serve as the sampling source.
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.createCentroids()

    def createCentroids(self):
        """(Re)draw every centroid uniformly inside the data's bounding box.

        Sampling real values (rather than integers) keeps centroids distinct
        even for features whose whole range is narrower than 1.
        """
        low = self.points.min(axis=0)
        high = self.points.max(axis=0)
        self.centroids = self._rng.uniform(
            low, high, size=(self.centroidAmount, self.points.shape[1])
        )

    def computeClosestCentroids(self):
        """Return, for each sample, the index of its nearest centroid.

        Returns
        -------
        ndarray of shape (nSamples,)
            Integer centroid indices; ties resolve to the lowest index.
        """
        # Fill one column per centroid so the largest temporary is
        # (nSamples, nFeatures) instead of (nSamples, nCentroids, nFeatures).
        dists = np.empty((self.points.shape[0], self.centroids.shape[0]))
        for idx, centroid in enumerate(self.centroids):
            dists[:, idx] = np.linalg.norm(self.points - centroid, axis=1)
        return np.argmin(dists, axis=1)

    def fit(self):
        """Alternate assignment and mean-update steps until centroids settle.

        Stops as soon as a pass leaves the centroids unchanged (within
        np.allclose tolerance) or after maxIterations passes, whichever comes
        first. A centroid with no assigned samples keeps its previous position.
        """
        for _ in range(self.maxIterations):
            closestCentroids = self.computeClosestCentroids()
            oldCentroids = self.centroids.copy()

            for centroid in range(self.centroidAmount):
                assignedPoints = self.points[closestCentroids == centroid]
                if len(assignedPoints) > 0:
                    self.centroids[centroid] = assignedPoints.mean(axis=0)

            if np.allclose(oldCentroids, self.centroids):
                break

In [None]:
class ElbowMethod:
    """Plot clustering error against k to help choose a cluster count.

    Parameters
    ----------
    points : ndarray of shape (nSamples, nFeatures)
        Samples passed unchanged to KMeansClustering.
    kMax : int
        Largest k to try; k runs from 1 to kMax inclusive.
    """

    def __init__(self, points, kMax):
        self.points = points
        self.kMax = kMax

    def computeScores(self):
        """Fit one model per k and score it.

        Returns
        -------
        list of float
            For k = 1..kMax, the mean over all samples of the squared
            distance between a sample and its assigned centroid.
        """
        scores = []
        for k in tqdm(range(1, self.kMax + 1)):
            cluster = KMeansClustering(points=self.points, centroidAmount=k)
            cluster.fit()
            labels = cluster.computeClosestCentroids()
            # Average over samples, not over clusters: summing per-cluster
            # means would weight a tiny cluster as much as a huge one and add
            # one more term for every extra cluster.
            residuals = self.points - cluster.centroids[labels]
            scores.append(np.mean(np.sum(residuals ** 2, axis=1)))
        return scores

    def plotGraph(self):
        """Draw the error-versus-k curve for k = 1..kMax and show it."""
        allKs = list(range(1, self.kMax + 1))
        scores = self.computeScores()

        plt.plot(allKs, scores, marker='o')
        plt.xlabel('Number of clusters (k)')
        plt.ylabel('Mean squared distance to assigned centroid')
        plt.title('Elbow method')
        plt.show()


# Try every k from 1 to 10 on the imputed feature matrix and show the
# resulting error-versus-k curve.
elbow = ElbowMethod(points=X, kMax=10)
elbow.plotGraph()

In [None]:
# NOTE(review): 6 clusters is presumably the elbow read off the plot above — confirm.
cluster = KMeansClustering(points=X, centroidAmount=6)
cluster.fit()

# One cluster index per row of X, in row order.
labels = cluster.computeClosestCentroids()

# NOTE(review): rows are numbered by position (0..n-1); the `indexes` column
# split off the CSV earlier is not used here — confirm that is intended.
with open('predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Index', 'activityID'])
    writer.writerows(enumerate(labels.tolist()))
