
add predict method for new data & test batch/streaming using method

JesseKolb committed May 4, 2018
1 parent 504fd64 commit 52f98dac5f85433b9e31eb7058d1819ae3974bed
Showing with 62 additions and 14 deletions.
  1. +39 −12 TICC_solver.py
  2. +23 −2 UnitTest.py
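
The gist: fit() now caches its learned parameters in self.trained_model, and a new predict_clusters() method reuses them to label data that was not seen during training. A minimal usage sketch, assuming the class is named TICC and guessing constructor keyword names from the attributes printed by log_parameters (the file name and settings below are hypothetical):

    import numpy as np
    from TICC_solver import TICC  # class name assumed

    # Hypothetical settings; keyword names guessed from log_parameters in this diff.
    ticc = TICC(window_size=1, number_of_clusters=8, lambda_parameter=11e-2, switch_penalty=600)
    cluster_assignment, cluster_MRFs = ticc.fit("example_data.txt")  # return values assumed

    # New in this commit: label unseen data with the already-fitted model.
    new_labels = ticc.predict_clusters(ticc.trained_model['complete_D_train'][0:100, ])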
TICC_solver.py
@@ -115,16 +115,12 @@ def fit(self, input_file):
             print("UPDATED THE OLD COVARIANCE")
-            # SMOOTHENING
-            lle_all_points_clusters = self.smoothen_clusters(cluster_mean_info,
-                                                             computed_covariance,
-                                                             cluster_mean_stacked_info,
-                                                             len(clustered_points),
-                                                             complete_D_train,
-                                                             time_series_col_size)
-            # Update cluster points - using NEW smoothening
-            clustered_points = updateClusters(lle_all_points_clusters, switch_penalty=self.switch_penalty)
+            self.trained_model = {'cluster_mean_info': cluster_mean_info,
+                                  'computed_covariance': computed_covariance,
+                                  'cluster_mean_stacked_info': cluster_mean_stacked_info,
+                                  'complete_D_train': complete_D_train,
+                                  'time_series_col_size': time_series_col_size}
+            clustered_points = self.predict_clusters()
             # recalculate lengths
             new_train_clusters = collections.defaultdict(list)  # {cluster: [point indices]}
@@ -264,7 +260,8 @@ def write_plot(self, clustered_points, str_NULL, training_indices):
         print("Done writing the figure")

     def smoothen_clusters(self, cluster_mean_info, computed_covariance,
-                          cluster_mean_stacked_info, clustered_points_len, complete_D_train, n):
+                          cluster_mean_stacked_info, complete_D_train, n):
+        clustered_points_len = len(complete_D_train)
         inv_cov_dict = {}  # cluster to inv_cov
         log_det_dict = {}  # cluster to log_det
         for cluster in range(self.number_of_clusters):
@@ -371,4 +368,34 @@ def log_parameters(self):
print("lam_sparse", self.lambda_parameter)
print("switch_penalty", self.switch_penalty)
print("num_cluster", self.number_of_clusters)
print("num stacked", self.window_size)
print("num stacked", self.window_size)
def predict_clusters(self, test_data = None):
'''
Given the current trained model, predict clusters. If the cluster segmentation has not been optimized yet,
than this will be part of the interative process.
Args:
numpy array of data for which to predict clusters. Columns are dimensions of the data, each row is
a different timestamp
Returns:
vector of predicted cluster for the points
'''
if test_data is not None:
if not isinstance(test_data, np.ndarray):
raise TypeError("input must be a numpy array!")
else:
test_data = self.trained_model['complete_D_train']
# SMOOTHENING
lle_all_points_clusters = self.smoothen_clusters(self.trained_model['cluster_mean_info'],
self.trained_model['computed_covariance'],
self.trained_model['cluster_mean_stacked_info'],
test_data,
self.trained_model['time_series_col_size'])
# Update cluster points - using NEW smoothening
clustered_points = updateClusters(lle_all_points_clusters, switch_penalty=self.switch_penalty)
return(clustered_points)
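
A hedged usage example for the new method (ticc is a fitted solver as above; new_points is a hypothetical numpy array whose rows are timestamps and whose column count matches the training data, since smoothen_clusters reuses the trained covariances):

    labels = ticc.predict_clusters(new_points)  # one cluster id per row
    relabeled = ticc.predict_clusters()         # no argument: re-labels the training data
    # Anything that is not a numpy array raises TypeError("input must be a numpy array!").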
UnitTest.py
@@ -4,8 +4,6 @@
 import sys

 class TestStringMethods(unittest.TestCase):

     def test_example(self):
@@ -17,6 +15,29 @@ def test_example(self):
         val = abs(assign - cluster_assignment)
         self.assertEqual(sum(val), 0)

+        # Test that prediction works on a batch of data outside of the `fit` method. Perhaps there is
+        # a better way to test this in parallel, so these are more like unit tests than integration tests?
+        test_batch = ticc.predict_clusters(ticc.trained_model['complete_D_train'][0:1000, ])
+        batch_val = abs(test_batch - cluster_assignment[0:1000])
+        self.assertEqual(sum(batch_val), 0)
+
+        # Test streaming by passing in 5-row blocks at a time (the current timestamp and the previous 4).
+        # Training on the whole set and then using the trained model while streaming causes data leakage,
+        # but this is only for testing the code, so it is acceptable.
+        # TODO: figure out why larger blocks don't improve predictions more. Reference:
+        # https://github.com/davidhallac/TICC/issues/18#issuecomment-384514116
+        def test_streaming(block_size):
+            test_stream = np.zeros(1000)
+            test_stream[0:block_size] = cluster_assignment[0:block_size]
+            for i in range(block_size, 1000):
+                point = ticc.trained_model['complete_D_train'][i - block_size:i, ]
+                test_stream[i] = ticc.predict_clusters(point)[block_size - 1]
+            percent_correct_streaming = 100 * sum(cluster_assignment[0:1000] == test_stream) / 1000.0
+            self.assertGreater(percent_correct_streaming, 90)  # percent scale, so 90 rather than 0.9
+        test_streaming(5)
+
         for i in range(8):
             mrf = np.loadtxt("UnitTest_Data/cluster_"+str(i)+".txt",delimiter=',')
             try:
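
The streaming test labels each new timestamp by passing the most recent block_size stacked rows and keeping the prediction for the last one. The same pattern for live data, as a sketch (incoming_rows is a hypothetical stream; the block size of 5 mirrors the test):

    import collections
    import numpy as np

    window = collections.deque(maxlen=5)  # current timestamp plus the previous 4
    for row in incoming_rows:             # hypothetical stream of stacked observations
        window.append(row)
        if len(window) == window.maxlen:
            block = np.vstack(window)
            label = ticc.predict_clusters(block)[-1]  # cluster for the newest row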
