Skip to content

Commit

Permalink
Added kmeans integration test that compares results to those returned…
Browse files Browse the repository at this point in the history
… by MATLAB.
  • Loading branch information
drusk committed Jan 16, 2013
1 parent a175e3e commit 16b16ec
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 7 deletions.
11 changes: 7 additions & 4 deletions pml/unsupervised/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ def create_random_centroids(dataset, k):
The random value chosen for each feature will always be limited to the
range of values found in the dataset. For example, if a certain feature
has a minimum value of 0 in the dataset, and maximum value of 9, the
has a minimum value of 0 in the dataset, and maximum value of 9, the
value chosen will be between 0 and 9.
Args:
dataset: DataSet
Expand All @@ -189,8 +190,10 @@ def rand_range(range_tuple):
"""
return random.uniform(range_tuple[0], range_tuple[1])

return [pd.Series(map(rand_range, min_maxs), index=dataset.feature_list(),
name = i) for i in range(k)]
return [pd.Series(map(rand_range, min_maxs),
index=dataset.feature_list(),
name=i)
for i in range(k)]

def kmeans(dataset, k=2, create_centroids=create_random_centroids):
"""
Expand Down Expand Up @@ -218,7 +221,7 @@ def kmeans(dataset, k=2, create_centroids=create_random_centroids):

# Initialize k centroids
centroids = create_centroids(dataset, k)

# Iteratively compute best clusters until they stabilize
assignments = None
clusters_changed = True
Expand Down
150 changes: 150 additions & 0 deletions test/datasets/mlab_iris_clusters.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
2
0
0
0
2
2
0
2
0
0
2
0
0
0
2
2
2
2
2
2
2
2
0
2
2
0
2
2
2
0
0
2
2
2
0
0
2
0
0
2
2
0
0
2
2
0
2
0
2
0
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
43 changes: 40 additions & 3 deletions test/integration_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import unittest

import pandas as pd
from hamcrest import assert_that, contains
from hamcrest import assert_that

from pml.api import *

Expand Down Expand Up @@ -113,7 +113,7 @@ def test_pca_ingredients(self):
[0.6785, 0.0200, 0.5440, 0.4933],
[-0.0290, -0.7553, -0.4036, 0.5156],
[-0.7309, 0.1085, 0.4684, 0.4844]],
places=4)
places=4)
)

# TODO: generic sequence almost equals matcher
Expand All @@ -122,7 +122,44 @@ def test_pca_ingredients(self):
for i, expected_eigenvalue in enumerate(expected_eigenvalues):
self.assertAlmostEqual(eigenvalues[i], expected_eigenvalue,
places=4)


def create_iris_preset_centroids(self, dataset, k):
    """
    Centroid factory that returns three fixed starting centroids.

    It has the same (dataset, k) call shape as the default centroid
    creator so that it can be passed to kmeans as create_centroids.

    Args:
      dataset: object exposing feature_list(); the returned labels are
        used as the index of every centroid.
      k: number of centroids requested.  Only 3 is supported.

    Returns:
      A list of three pd.Series, one per centroid, indexed by feature.
    """
    assert k == 3

    # Expected feature order:
    # ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    features = dataset.feature_list()
    preset_coordinates = (
        [4.5, 3, 1.5, 1],
        [6.8, 3.5, 5.5, 2],
        [5.2, 2.8, 1.5, 0.5],
    )
    return [pd.Series(coordinates, index=features)
            for coordinates in preset_coordinates]

def test_kmeans_vs_matlab(self):
    """
    Checks kmeans clusters against the results achieved by MATLAB's
    kmeans function.

    The only modification made to the MATLAB output is reducing the
    cluster numbers by 1 (they started at 1 instead of 0).

    Note that the initial centroids have a fixed location to eliminate
    the random factor that would make comparison difficult.
    """
    data = load(self.relative_to_base("datasets/iris.data"),
                has_ids=False)
    ml_clusters = pd.read_csv(
        self.relative_to_base("datasets/mlab_iris_clusters.data"),
        header=None)
    # The reference file has one assignment per line and no header row, so
    # read_csv produces a single column labelled 0.  Select that column by
    # label to get a Series.  (The former `.ix[:, 0]` lookup relied on the
    # `.ix` indexer, which is deprecated and absent from current pandas.)
    ml_clusters = ml_clusters[0]

    clustered = kmeans(data, 3,
                       create_centroids=self.create_iris_preset_centroids)

    # Compare sample by sample so a failure reports the offending index.
    pml_clusters = clustered.get_cluster_assignments()
    for index in pml_clusters.index:
        self.assertEqual(pml_clusters[index], ml_clusters[index],
                         msg="Cluster discrepancy at index %d" % index)


if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
Expand Down

0 comments on commit 16b16ec

Please sign in to comment.