In [None]:
from pyspark import SparkContext
import numpy as np
import findspark
import random as rd
import time
import matplotlib.pyplot as plt

In [None]:
# Initialize Spark
# NOTE(review): `from pyspark import SparkContext` already ran in the imports cell,
# i.e. before this call. Confirm whether findspark.init() was meant to run before
# that import, or whether it is still needed at this point.
findspark.init()

In [None]:
def parallelReadFile(filename):
    '''
    Loads a csv dataset into an RDD of numpy arrays.

    Receives a String filename with the name of the csv file of the dataset.
    The first line of the file is taken as the header and every line equal to it
    is discarded. For each remaining line the first comma-separated column is
    dropped and the other values are converted to float.

    Uses the notebook-level SparkContext `sc`, which must exist when this is called.

    Returns: a RDD with one numpy array per data row
    '''
    def to_vector(row):
        # Skip the first column, then cast every remaining field to float
        fields = row.split(',')
        return np.array([float(field) for field in fields[1:]])

    raw_lines = sc.textFile(filename)

    # Whatever the first line contains is considered the header
    header_line = raw_lines.first()

    # Keep only the rows that differ from the header
    body_lines = raw_lines.filter(lambda row: row != header_line)

    return body_lines.map(to_vector)

In [None]:
def parallelAssign2cluster(x, centroids):
    '''
    Finds the centroid closest (euclidean distance) to a datum.

    Receives:
        x: a d-dimensional sequence (tuple, list or numpy array) with the datum
           to be assigned to a cluster.
        centroids: a non-empty sequence of d-dimensional sequences representing
           the current state of the centroids.

    Returns: an integer with the index in centroids of the closest centroid to x.
    When several centroids are equally close, the lowest index is returned.

    Raises: ValueError if centroids is empty.
    '''
    if len(centroids) == 0:
        raise ValueError("centroids must contain at least one centroid")

    point = np.asarray(x, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    # One row of coordinate differences per centroid; the dimension is taken from
    # the inputs themselves instead of being fixed to a particular dataset.
    offsets = centers - point

    # The square root is monotonic, so the nearest centroid can be chosen directly
    # from the squared distances.
    squared_distances = (offsets ** 2).sum(axis=1)

    # argmin returns the first minimum; int() keeps the result a plain Python int
    return int(np.argmin(squared_distances))

In [None]:
def _plotCentroids(centroids, n_cols):
    '''Draws every centroid as a 28x28 grayscale image, side by side in a single row.'''
    fig = plt.figure(figsize=(20, 5))
    for position, centroid in enumerate(centroids, start=1):
        ax = plt.subplot(1, n_cols, position)
        plt.imshow(np.asarray(centroid).reshape(28, 28), cmap='gray')
        ax.set_axis_off()
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures
    plt.close(fig)


# Following the implementation described in the slides
def parallelKMeans(data, K, n_iter, plot=True):
    '''
    Performs the parallel K-Means algorithm on the RDD data, grouping the instances
    into K different clusters.

    Receives:
        data: a RDD whose elements are 784-dimensional numpy arrays.
        K: the number of clusters.
        n_iter: the number of iterations of the method to be executed.
        plot: when True (default), the centroids are drawn as 28x28 images after
              every iteration.

    The initialization of the centroids is random, sampled from a standard normal
    distribution. A cluster that receives no instance during an iteration keeps its
    previous centroid, so the result always contains exactly K centroids and the
    position of a centroid in the list is its cluster index.

    Returns: a list of length K with the d-dimensional centroids computed (as tuples)
    '''
    # Number of features of each instance (28x28 images)
    n_features = 784

    # Generate K random tuples with n_features elements each (len of each x)
    centroids = [tuple(np.random.randn(n_features)) for _ in range(K)]

    for it in range(n_iter):
        print("it : ", it)

        # Centroids used for the assignment step of this iteration
        previous = centroids

        # Each instance becomes (cluster index, instance with a trailing 1): summing
        # these per cluster gives the coordinate sums and, in the last slot, the count.
        clustered_data = data.map(
            lambda x: (parallelAssign2cluster(tuple(x), previous), np.append(x, 1)))

        cluster_sums = clustered_data.reduceByKey(lambda a, b: a + b).collectAsMap()

        # reduceByKey only yields the clusters that received at least one instance;
        # the remaining ones keep their previous centroid instead of disappearing.
        centroids = [
            tuple(cluster_sums[k][:-1] / cluster_sums[k][-1]) if k in cluster_sums
            else previous[k]
            for k in range(K)
        ]

        if plot:
            _plotCentroids(centroids, K)

    return centroids

In [None]:
def main(filename="tot_mnist_shuf.csv", K=10, n_iter=10):
    '''
    Loads the dataset and runs the parallel K-Means on it.

    Receives:
        filename: name of the csv file of the dataset.
        K: the number of clusters.
        n_iter: the number of K-Means iterations (default 10).

    Returns: the centroids computed by parallelKMeans, so the result of the run
    is available to the caller instead of being discarded.
    '''
    data = parallelReadFile(filename)
    return parallelKMeans(data, K, n_iter)

In [None]:
# Dedicated context for this single timed run, using every local core
sc = SparkContext("local[*]", "KMeansParallel")

try:
    # perf_counter is a monotonic clock meant for measuring intervals
    start_time = time.perf_counter()
    main()
    end_time = time.perf_counter()

    print("Execution time: ", end_time - start_time)
finally:
    # Always release the context, even if the run fails: the following cells
    # create their own SparkContext and cannot do so while this one is active.
    sc.stop()

In [None]:
# Measures the execution time of the whole pipeline for an increasing number of
# local workers, then plots the times and the speedup relative to one worker.
num_cores = [1, 2, 3, 4, 5, 6, 7, 8]
execution_times = []
K = 10

for cores in num_cores:
    sc = SparkContext(master=f"local[{cores}]", appName="Kmeans PerformanceTest")
    try:
        start_time = time.perf_counter()

        # Running
        main("tot_mnist_shuf.csv", K)

        end_time = time.perf_counter()
        execution_times.append(end_time - start_time)
    finally:
        # The context must be stopped before the next one can be created,
        # including when the run above fails.
        sc.stop()

# Compute speedup
base_time = execution_times[0]  # ExecTime with 1 worker
speedups = [base_time / elapsed for elapsed in execution_times]

print(speedups)
print(execution_times)

fig, (ax_time, ax_speedup) = plt.subplots(1, 2, figsize=(10, 5))

# Ploting Performance curve
ax_time.plot(num_cores, execution_times, marker='o')
ax_time.set_title("Performance curve for the number of worker")
ax_time.set_xlabel("Number of workers")
ax_time.set_ylabel("Execution time (seconds)")

# Ploting Speedup curve
ax_speedup.plot(num_cores, speedups, marker='o', color='green')
ax_speedup.set_title("Speedup curve for the number of worker")
ax_speedup.set_xlabel("Number of workers")
ax_speedup.set_ylabel("Time with one worker/time with n workers")

fig.tight_layout()
plt.show()

In [None]:
# Measures the execution time of the whole pipeline for different numbers of
# clusters, then plots each time relative to the run with a single cluster.
num_cluster = [1, 3, 5, 7, 8, 9, 10, 11]
execution_times = []
sc = SparkContext(master="local[*]", appName="Kmeans PerformanceTest")

try:
    for K in num_cluster:
        start_time = time.perf_counter()
        # Running
        main("tot_mnist_shuf.csv", K)

        end_time = time.perf_counter()
        execution_times.append(end_time - start_time)
finally:
    # Release the context even if one of the runs fails
    sc.stop()

# Compute speedup
base_time = execution_times[0]  # ExecTime with 1 cluster
speedups = [base_time / elapsed for elapsed in execution_times[1:]]

# Ploting Speedup curve (a single plot, so it gets the whole figure)
fig, ax = plt.subplots()
ax.plot(num_cluster[1:], speedups, marker='o', color='green')
ax.set_title("Speedup curve for K clusters")
ax.set_xlabel("Number of cluster")
ax.set_ylabel("Time with one cluster/time with K cluster")

fig.tight_layout()
plt.show()