## This demonstrates scikit-learn clustering for comparison with Tribuo clustering

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import normalized_mutual_info_score

In [2]:
# Load the Gaussian-blob dataset; the CSV is generated by the companion
# notebook "scikit-learn Clustering - Data Setup".
df = pd.read_csv('../../data/gaussianBlobs.csv')
# Uncomment to inspect the loaded frame:
# print(df)

In [3]:
# Separate the feature columns from the ground-truth 'Cluster' label column.
df_X = df.drop(columns=['Cluster'])
df_y = df[['Cluster']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_X.values,
    df_y.values,
    test_size=0.2,
    random_state=1,
)
print('Training data size = %d, number of features = %d' % (X_train.shape[0], df_X.shape[1]))
print('Testing data size = %d, number of features = %d' % (X_test.shape[0], df_X.shape[1]))

Training data size = 4800000, number of features = 5
Testing data size = 1200000, number of features = 5


In [4]:
def evaluate(actual, predicted):
    """Print mutual-information scores comparing two cluster labelings.

    Writes a header line followed by the normalized and the adjusted mutual
    information, each formatted to two decimal places. Nothing is returned.

    Args:
        actual: Reference cluster labels.
        predicted: Cluster assignments to score against ``actual``.
    """
    print('Clustering Evaluation')
    # Each report line pairs its format string with the scorer that fills it;
    # scores are computed one at a time, right before they are printed.
    report_lines = (
        ('Normalized MI = %.2f', normalized_mutual_info_score),
        ('Adjusted MI = %.2f', adjusted_mutual_info_score),
    )
    for template, scorer in report_lines:
        print(template % scorer(actual, predicted))

In [5]:
# Two k-means configurations that differ only in their `init` strategy:
# 'random' versus 'k-means++'. Everything else (6 clusters, 100 iterations max,
# 4 jobs, seed 1) is shared so the two runs are directly comparable.
# NOTE(review): KMeans' `n_jobs` argument was deprecated in scikit-learn 0.23
# and removed in 1.0 -- confirm the installed version before re-running.
km = KMeans(
    init='random',
    n_clusters=6,
    max_iter=100,
    random_state=1,
    n_jobs=4,
)
km_plus_plus = KMeans(
    init='k-means++',
    n_clusters=6,
    max_iter=100,
    random_state=1,
    n_jobs=4,
)
# The remaining estimators are kept disabled for the reasons noted on each.
# This crashes the kernel every time
# ag = AgglomerativeClustering(n_clusters=6)
# This doesn't finish after a reasonable amount of time
# ms = MeanShift(n_jobs=4)
# This also crashes the kernel every time
# dbscan = DBSCAN(eps=3, min_samples=50, n_jobs=4)

In [6]:
# Show both k-means configurations, one repr per line.
print(km, km_plus_plus, sep='\n')
# The prints below belong to the estimators that are currently disabled.
# print(ag)
# print(ms)
# print(dbscan)

KMeans(init='random', max_iter=100, n_clusters=6, n_jobs=4, random_state=1)
KMeans(max_iter=100, n_clusters=6, n_jobs=4, random_state=1)


In [7]:
%time km.fit(X_train)
# Fit times noted for three runs of this cell (random initialisation); the
# third matches the wall time shown in the saved output.
# run 1
# time:  17.4 s

# run 2
# time:  17.7 s

# run 3
# time:  16.0 s



CPU times: user 41.5 s, sys: 1.52 s, total: 43.1 s
Wall time: 16 s


KMeans(init='random', max_iter=100, n_clusters=6, n_jobs=4, random_state=1)

In [8]:
# Cluster the held-out rows with the random-init model and score the result
# against the true labels. y_test comes from a single-column frame, so
# ravel() flattens it to a 1-D label vector before scoring.
predicted = km.predict(X_test)
evaluate(y_test.ravel(), predicted)
# Scores noted for three runs of this cell:
# run 1
# Normalized MI = 1.00
# Adjusted MI = 1.00

# run 2
# Normalized MI = 1.00
# Adjusted MI = 1.00

# run 3
# Normalized MI = 1.00
# Adjusted MI = 1.00

Clustering Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [9]:
%time km_plus_plus.fit(X_train)
# Fit times noted for three runs of this cell (k-means++ initialisation); the
# third matches the wall time shown in the saved output.
# run 1
# time:  19.2 s

# run 2
# time:  18.6 s

# run 3
# time:  17.9 s



CPU times: user 33.3 s, sys: 3.03 s, total: 36.3 s
Wall time: 17.9 s


KMeans(max_iter=100, n_clusters=6, n_jobs=4, random_state=1)

In [10]:
# Cluster the held-out rows with the k-means++ model and score the result
# against the true labels. This rebinds `predicted`, replacing the
# assignments produced by the random-init model.
predicted = km_plus_plus.predict(X_test)
evaluate(y_test.ravel(), predicted)

# Scores noted for three runs of this cell:
# run 1
# Normalized MI = 1.00
# Adjusted MI = 1.00

# run 2
# Normalized MI = 1.00
# Adjusted MI = 1.00

# run 3
# Normalized MI = 1.00
# Adjusted MI = 1.00

Clustering Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [11]:
# %time ag.fit(X_train)

In [12]:
# predicted = ag.predict(X_test)
# evaluate(y_test.ravel(), predicted)

In [13]:
# %time ms.fit(X_train)

In [14]:
# predicted = ms.predict(X_test)
# evaluate(y_test.ravel(), predicted)

In [15]:
# %time dbscan.fit(X_train)

In [16]:
# predicted = dbscan.predict(X_test)
# evaluate(y_test.ravel(), predicted)