In [139]:
import sys
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import sklearn.cluster
%matplotlib notebook

In [140]:
# Locations of the cleaned dataset: the values and the column titles
# are stored in two separate CSV files.
dataset_path = '../datasets/clean_data/clean_data_no_title.csv'
titles_path = '../datasets/clean_data/clean_data_only_title.csv'

# Parse the values with genfromtxt's default dtype and the titles as strings.
all_data = np.genfromtxt(dataset_path, delimiter=',')
all_titles = np.genfromtxt(titles_path, dtype='str', delimiter=',')

# Show both shapes, one per line, as a quick sanity check of the load.
print(all_data.shape, all_titles.shape, sep='\n')

(2658, 42)
(42,)


In [141]:
# Min-max normalize every column to the range [0, 1] so that no feature
# biases the clustering merely because of its scale:
#     normalized = (x - min) / (max - min)

# Per-column extrema (axis=0 reduces over the rows).
max_values = np.amax(all_data, axis=0)
min_values = np.amin(all_data, axis=0)

max_minus_min = max_values - min_values

# A column whose values are all identical has max - min == 0, and dividing by
# it would yield NaN (0 / 0). Divide such columns by 1 instead so they
# normalize to 0; every other column keeps its true range.
safe_range = np.where(max_minus_min == 0, 1, max_minus_min)

# One copy of the per-column minimum for every row of the data.
broadcasted_min_values = np.broadcast_to(min_values, (all_data.shape[0], max_minus_min.shape[0]))

x_minus_min = all_data - broadcasted_min_values

normalized_data = x_minus_min / safe_range

# Persist the normalized matrix so it can be reloaded with np.load.
np.save('normalized_clean_data.npy', normalized_data)
print(all_data.shape)
print(all_titles.shape)

(2658, 42)
(42,)


In [144]:
# Reload the normalized matrix from disk (this rebinds all_data).
all_data = np.load('normalized_clean_data.npy')

# The target value is the last column: drop it from the data and drop the
# matching last entry from the titles.
data = all_data[:, :-1]
titles = all_titles[:-1]

In [145]:
# Fit k-means once for every k in 1..max_num_clusters and record the final
# loss (inertia_) of each fit, so the loss can be studied as a function of k.
max_num_clusters = 20

# KMeans initialization is random; a fixed seed makes the recorded losses
# identical on every run of this cell.
RANDOM_SEED = 0

loss_arr = np.zeros(max_num_clusters)
for k in range(1, max_num_clusters+1):
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; 10 restarts per k, best run kept.
    kmeans = sklearn.cluster.KMeans(n_clusters=k, n_init=10, random_state=RANDOM_SEED).fit(data)
    # After the loop these hold the values of the last fit (k == max_num_clusters).
    labels = kmeans.labels_
    clusters = kmeans.cluster_centers_
    loss = kmeans.inertia_
    n_iter = kmeans.n_iter_
    # k starts at 1, so the loss for k clusters is stored at index k-1.
    loss_arr[k-1] = loss

In [143]:
# plot the loss as a function of the number of cluster centers (k)

# Derive the k values from the number of recorded losses instead of a
# hard-coded 20, so x and y always have matching lengths.
x = np.arange(1, len(loss_arr) + 1)
y = loss_arr
fig, ax = plt.subplots()
ax.set_title("KMeans Loss for K Different Clusters")
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Loss (kmeans objective function)')

ax.plot(x,y)
# Current x-axis limits; not used further in this cell.
start, end = ax.get_xlim()
# One tick per evaluated k.
ax.xaxis.set_ticks(x)
plt.show()

<IPython.core.display.Javascript object>