-
Notifications
You must be signed in to change notification settings - Fork 0
/
CBKM.py
266 lines (223 loc) · 11.1 KB
/
CBKM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
from BKM import *
class CBKM:
def __init__(self, n_clusters, bin_threshold):
self.data = None
self.k = n_clusters
self.bin_threshold = bin_threshold
self.cluster_centers_ = []
self.labels_ = []
def fit(self, data):
self.data = data
mode = False
final_subclusters = []
for l in range(2, self.k+1):
print('\n(l = ' + str(l) + ')')
# PHASE 1: GLOBAL CLUSTERING
s_cooperative = self.global_clustering(l, final_subclusters, mode)
# PHASE 2: COOPERATIVE CLUSTERING
subclusters = self.cooperative_clustering(s_cooperative, l)
# PHASE 3: MERGING
final_subclusters = self.merging(subclusters, l)
mode = True
self.labels_ = clusters_to_labels(final_subclusters, self.data)
self.cluster_centers_ = recalculate_centroids(final_subclusters)
return self
def predict(self, data_test):
prediction = []
if not any(isinstance(l, np.ndarray) for l in data_test):
data_test = [data_test]
for new_sample in data_test:
sample_distances = []
for c in self.cluster_centers_:
sample_distances.append(euclidean_distances(new_sample.reshape(1, -1), c.reshape(1, -1)))
prediction.append(np.argmin(sample_distances))
return prediction
def get_params(self):
return {'n_clusters': self.k, 'bin_threshold': self.bin_threshold}
def set_params(self, dict_params):
items = dict_params.items()
self.k = items[0][1]
self.bin_threshold = items[1][1]
return self
# Phase 1: Global clustering
def global_clustering(self, l, final_subclusters = None, mode = False):
# K-Means
kmeans = KMeans(n_clusters=l, n_init = 1).fit(self.data)
km_clusters = labels_to_clusters(self.data, kmeans.labels_)
# Bisecting K-Means
bkmeans = BKM(n_clusters=l, mode=mode)
bkmeans = bkmeans.fit(self.data, final_subclusters)
bkm_clusters = labels_to_clusters(self.data, bkmeans.labels_)
return km_clusters, bkm_clusters
# Phase 2: Cooperative Clustering
def cooperative_clustering(self, s_coop, l):
km_clusters, bkm_clusters = s_coop
subclusters = []
# Generating Cooperative Contingency Matrix (CCM)
# Obtaining disjoint sub-clusters from CCM
contingency_matrix = []
for idx, km_c in enumerate(km_clusters):
bkm_list = []
for idy, bkm_c in enumerate(bkm_clusters):
intersection = 0
subclusters_local = []
for km_sample in km_c:
for bkm_sample in bkm_c:
if (km_sample == bkm_sample).all():
intersection += 1
subclusters_local.append(km_sample)
break
bkm_list.append(intersection)
if subclusters_local:
subclusters.append(subclusters_local)
contingency_matrix.append(bkm_list)
# Plot CCM Matrix
ccm_matrix_numpy = np.array(contingency_matrix)
rowcolnames = ['S'+str(i) for i in range(l)]
df = pd.DataFrame(ccm_matrix_numpy, index=rowcolnames, columns=rowcolnames)
print("Cooperative Contingency Matrix (CCM)")
print(df)
return subclusters
# Phasse 3: Merging
def merging(self, subclusters, l):
# STEP 1: For each subcluster, build similarity histogram
histograms_subclusters = []
for id, sc in enumerate(subclusters):
histograms_subclusters.append(self.similarity_histogram(sc, ids=id))
# STEP 2: Using the subclusters histograms, a merging cohesiveness factor of each pair of subclusters
# is computed. This factor represents the coherency (quality) of merging two subclusters into
# a new coherent cluster.
# This factor is calculated by the coherency of merging the corresponding histograms
# The process of merging two subclusters reveals a new histogram; this histogram is constructed by adding the
# corresponding counts of each bin from the two merged histogram and also by adding the additional pair-wise
# similarities that are generated as a result of merging the two subclusters together
merging_matrix = []
for idx, sx in enumerate(subclusters):
cmm_list = []
for idy, sy in enumerate(subclusters):
if idx == idy:
cmm_list.append(-1)
continue
new_hist = self.merge_histograms(histograms_subclusters[idx], histograms_subclusters[idy], sx, sy)
mcf = self.compute_mcf(sx, sy, new_hist)
cmm_list.append(mcf)
merging_matrix.append(cmm_list)
# Plot CMM Matrix
cmm_matrix_numpy = np.array(merging_matrix)
rowcolnames = ['Sb' + str(i) for i in range(len(subclusters))]
df = pd.DataFrame(cmm_matrix_numpy, index=rowcolnames, columns=rowcolnames)
print("\nCooperative Merging Matrix (CMM)")
print(df)
# STEP 3: Merge two sub-clusters with the highest mcf in CMM; reduce the number of subclusters and update CMM
# Until len(subclusters) == l
subclusters_ids = [str(i) for i in range(len(subclusters))]
while len(subclusters) != l:
# Get indices of the pair of subclusters with maximum mcf
row, col = np.unravel_index(np.argmax(cmm_matrix_numpy), cmm_matrix_numpy.shape)
#rows, cols = np.unravel_index(np.argsort(-cmm_matrix_numpy, axis = None)[:2], cmm_matrix_numpy.shape)
# Merge resulting subclusters and updating structures
merged_cluster = subclusters[row] + subclusters[col]
subclusters.pop(row)
histograms_subclusters.pop(row)
cmm_matrix_numpy = np.delete(cmm_matrix_numpy, row, axis=0)
cmm_matrix_numpy = np.delete(cmm_matrix_numpy, row, axis=1)
pop_row = subclusters_ids.pop(row)
if row < col:
col_used = col-1
else:
col_used = col
subclusters.pop(col_used)
histograms_subclusters.pop(col_used)
cmm_matrix_numpy = np.delete(cmm_matrix_numpy, col_used, axis=0)
cmm_matrix_numpy = np.delete(cmm_matrix_numpy, col_used, axis=1)
pop_col = subclusters_ids.pop(col_used)
subclusters.append(merged_cluster)
histograms_subclusters.append(self.similarity_histogram(merged_cluster, ids=str(pop_row)+'+'+str(pop_col),color='purple'))
subclusters_ids.append(str(pop_row)+'+'+str(pop_col))
print("\nSub-clusters merged: Sb" + str(pop_row)+ " and Sb" + str(pop_col))
# Compute mcf of new merged subcluster with the rest of subclusters
cmm_list = []
idx = len(subclusters) - 1
for idy, sy in enumerate(subclusters):
if idx == idy:
cmm_list.append(-1)
continue
new_hist = self.merge_histograms(histograms_subclusters[idx],
histograms_subclusters[idy], subclusters[idx], sy)
mcf = self.compute_mcf(subclusters[idx], sy, new_hist)
cmm_list.append(mcf)
# Updating CMM
cmm_matrix_numpy = np.append(cmm_matrix_numpy, [cmm_list[:-1]], axis=0)
cmm_matrix_numpy = np.insert(cmm_matrix_numpy, len(cmm_list[:-1]), cmm_list, axis=1)
rowcolnames = ['Sb' + str(i) for i in subclusters_ids]
df = pd.DataFrame(cmm_matrix_numpy, index=rowcolnames, columns=rowcolnames)
print("Cooperative Merging Matrix (CMM)")
print(df)
print("\nMerge phase finished with l={} sub-clusters".format(len(subclusters)))
return subclusters
# Compute similarity histogram of a subcluster
def similarity_histogram(self, sc, ids, nbins=21, color='red'):
histogram = np.zeros(nbins, dtype=int)
binsize = 2/nbins
for x in sc:
for y in sc:
if (x == y).all():
continue
# Calculate cosine similarity between samples
sim = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0]
# Decide the binid of the histogram in which the similarity falls
if sim == -1:
binid = 0
elif sim == 1:
binid = nbins-1
else:
binid = -1 + int(sim/binsize) + (nbins/2)
# Increment histogram[binid]
histogram[int(binid)] += 1
# Plot histogram
# COMMENT THE FOLLOWING LINES IF YOU DON'T WANT THE HISTOGRAMS TO BE PLOTTED
plt.figure(figsize = (9,5))
plt.bar(np.arange(-1.0, 1.1, 0.1), histogram, width=0.1, align='edge', color=color)
xmin, xmax = plt.xlim()
plt.xlim(xmin * 0.8, xmax * 0.8)
plt.xticks(np.arange(-1.0, 1.1, 0.1))
plt.ylabel('Number of pair-wise similarities')
plt.title("Similarity Histogram S"+str(ids))
plt.show()
return histogram
# Create new merging histogram
def merge_histograms(self, histx, histy, sx, sy):
additional_similarities_histogram = self.get_additional_similarities_histogram(sx, sy)
new_hist = [(histx[i] + histy[i] + additional_similarities_histogram[i]) for i in range(len(histx))]
return new_hist
# Computing merging cohesiveness factor
def compute_mcf(self, sx, sy, new_hist):
binsize = 2 / len(new_hist)
bin_threshold = self.bin_threshold
nsim = (len(sx) + len(sy)) * (len(sx) + len(sy) - 1) / 2
num_mcf = sum([(((binid * binsize) - 1 + (binsize / 2)) * new_hist[binid])
for binid in range(bin_threshold, len(new_hist))])
# An mcf between two sub-clusters is computed by calculating the ration of the count of similarities weighted
# by the bin similarity above aa certain similarity threshold to the total count of similarities in the new
# merged histogram. The higher this ration, the more cohesive the new generated cluster
mcf = num_mcf / nsim
return mcf
# Compute histogram of the additional similarities resulting from the merging of two subclusters
def get_additional_similarities_histogram(self, sx, sy, nbins=21):
histogram = np.zeros(nbins, dtype=int)
binsize = 2/nbins
# Compute new similarities and its histogram
for x in sx:
for y in sy:
# Calculate cosine similarity between samples
sim = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0]
# Decide the binid of the histogram in which the similarity falls
if sim == -1:
binid = 0
elif sim == 1:
binid = nbins-1
else:
binid = -1 + int(sim/binsize) + (nbins/2)
# Increment histogram[binid]
histogram[int(binid)] += 1
return histogram