# In [110]:
import pandas as pd
import numpy as np
import random
from helpers import cluster_acc, get_abspath
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM
from collections import defaultdict
from sklearn.metrics import adjusted_mutual_info_score as ami
import sys

# Locate the experiment CSVs and read them as comma-delimited numeric arrays.
winepath = get_abspath('winequality.csv', 'data/experiments')
seismicpath = get_abspath('seismic_bumps.csv', 'data/experiments')
wine = np.loadtxt(winepath, delimiter=',')
seismic = np.loadtxt(seismicpath, delimiter=',')

# Feature matrix = every column but the last; target = the last column.
# The experiments in this section run on the wine data, and `name` is the
# dataset label used as a key in the result tables and in output file names.
X, y = wine[:, :-1], wine[:, -1]
name = 'wine'

# run clustering experiments
# Cluster / mixture-component counts swept by the experiment loop.
clusters = [2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 25, 30, 40, 50, 80, 125]

# Per-dataset result tables, filled as table[k][dataset]:
#   sse  - sum of squared errors (K-Means)
#   logl - log-likelihood (EM)
#   bic  - BIC (EM)
sse, logl, bic = (defaultdict(dict) for _ in range(3))

# Per-algorithm result tables, filled as table[k][dataset][algorithm]:
#   acc        - accuracy scores
#   adjmi      - adjusted mutual info
#   silhouette - K-Means silhouette score
acc, adjmi, silhouette = (
    defaultdict(lambda: defaultdict(dict)) for _ in range(3)
)

# One estimator of each kind is created up front with a fixed seed; the loop
# only changes the number of clusters/components via set_params().
km = KMeans(random_state=0)  # K-Means
gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

# Fit K-Means and the Gaussian mixture for every k in `clusters` and record
# each scoring metric in its result table under the current dataset `name`.
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(X)
    gmm.fit(X)

    # Assign labels once per model and reuse them for every metric below,
    # instead of re-running predict() for each individual score.
    km_labels = km.predict(X)
    gmm_labels = gmm.predict(X)

    # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
    # (the raw km.score value is stored here; its sign is flipped later when
    # the SSE output frame is built)
    sse[k][name] = km.score(X)
    logl[k][name] = gmm.score(X)
    acc[k][name]['K-Means'] = cluster_acc(y, km_labels)
    acc[k][name]['GMM'] = cluster_acc(y, gmm_labels)
    adjmi[k][name]['K-Means'] = ami(y, km_labels)
    adjmi[k][name]['GMM'] = ami(y, gmm_labels)

    # calculate silhouette score for K-Means
    # NOTE(review): the key here is 'Kmeans', while acc/adjmi use 'K-Means';
    # left unchanged because consumers of `silhouette` may rely on it.
    km_silhouette = silhouette_score(X, km_labels)
    silhouette[k][name]['Kmeans'] = km_silhouette

    # calculate BIC for EM (previously announced but never computed, which
    # left the `bic` table empty)
    bic[k][name] = gmm.bic(X)

# Turn the per-k result dicts into DataFrames with one row per k.
# The SSE values are sign-flipped here, and the single dataset column is
# renamed to the metric it holds.
sse = (-pd.DataFrame(sse).T).rename(columns={name: 'SSE'})
logl = pd.DataFrame(logl).T.rename(columns={name: 'Log-likelihood'})

# Write each metric to its own CSV under the clustering results directory,
# labelling the index column 'k'.
outdir = 'results/clustering'
ssefile = get_abspath('{}_sse.csv'.format(name), outdir)
sse.to_csv(ssefile, index_label='k')
loglfile = get_abspath('{}_logl.csv'.format(name), outdir)
logl.to_csv(loglfile, index_label='k')

def _scores_by_k(results, dataset):
    """Return a DataFrame of scores for one dataset, one row per k.

    ``results`` is a nested mapping ``{k: {dataset: {algorithm: score}}}``.
    The returned frame has k as its index and one column per algorithm.
    Rows and columns are sorted so the CSV layout does not depend on dict
    ordering.
    """
    per_k = {k: by_dataset[dataset] for k, by_dataset in results.items()}
    frame = pd.DataFrame.from_dict(per_k, orient='index')
    return frame.sort_index().sort_index(axis=1)


# pd.Panel has been removed from pandas, so the k-by-algorithm tables are
# built directly as DataFrames instead of slicing a Panel.

# save accuracy results
acc = _scores_by_k(acc, name)
accfile = get_abspath('{}_acc.csv'.format(name), outdir)
acc.to_csv(accfile, index_label='k')

# save adjusted mutual info results
adjmi = _scores_by_k(adjmi, name)
adjmifile = get_abspath('{}_adjmi.csv'.format(name), outdir)
adjmi.to_csv(adjmifile, index_label='k')