<hr/>

# Dependencies

<hr/>

In [1]:
# Calculation Dependencies

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering

# Plotting Dependencies
from sklearn import datasets
import matplotlib.pyplot as plt

# Preprocessing dependencies
import csv
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# # maybe useful in future
# from sklearn.metrics import silhouette_samples, silhouette_score
# from mpl_toolkits.mplot3d import Axes3D

<hr/>

#  Helper Functions



1.   plot_features - 2D scatter plots of every pair of features for a data array with shape (num_examples, num_features)
2.   PCA_iterative - PCA dimensionality reduction, either to the fewest components that reach a desired explained-variance threshold or to a fixed number of components
3.   kmeans_clustering - performs k-means clustering with the desired number of clusters



<hr/>

In [2]:
# function that plots every feature against each other
def plot_features(data, labels):
  """
  Scatter-plot every pair of features against each other on a single figure.

  input args:
    data = numpy array where rows are number of examples and columns are number of features
    labels = per-example values handed to plt.scatter as the point colors (c=labels)

  The figure is a num_features x num_features grid of subplots. Grid row i
  holds feature i plotted against every later feature:
  e.g. row 1 is feature 1 versus 2, 1 versus 3, ..., 1 versus num_features
  e.g. row 5 is feature 5 versus 6, 5 versus 7, ..., 5 versus num_features
  The first column of every row and the whole last row stay empty. With fewer
  than two features no scatter plot is drawn.
  """
  num_feature = data.shape[1]
  plt.figure(figsize=(num_feature*4, num_feature*4))
  for i in range(num_feature):
    for j in range(i+1, num_feature):
      # 1-based subplot index for grid row i, grid column (j - i)
      plt.subplot(num_feature, num_feature, i*num_feature + (j - i) + 1)
      # the label must be a plain string: a tuple would appear in the legend as its repr
      pair_name = 'feature {} versus {}'.format(i + 1, j + 1)
      plt.scatter(data[:, i], data[:, j], c=labels, label=pair_name)
      plt.legend()
  plt.show()


In [None]:
# Preprocessing, note whitening is set to be true
# PCA and whitening
def PCA_iterative(data, threshold, mode = 'threshold', whiten_bool = True, n = 0):
  """
  PCA dimensionality reduction followed by a cumulative explained-variance plot.

  input args:
    data = array with shape (num_examples, num_features)
    threshold = fraction of the total variance (value 0 to 1) that the kept
                components must exceed; only used when mode == 'threshold'
    mode = 'threshold' keeps the fewest components whose cumulative
           explained-variance ratio is greater than threshold, but never fewer
           than 2 when at least 2 are available (the original search also
           started at 2 components)
           any other value (e.g. 'components') keeps exactly n components
    whiten_bool = truthy to whiten the projected data. The former default was
                  the string 'False', which is truthy, so whitening has always
                  been on by default; the default is now a real boolean
    n = number of components to keep when mode is not 'threshold'

  returns:
    (B, pca) where B is the transformed data and pca is the fitted PCA object

  raises:
    ValueError if mode is not 'threshold' and n is smaller than 1

  prints out how many components the data was reduced to and the fraction of
  the total variance they explain
  """
  # scikit-learn expects a real boolean here; bool() keeps the old truthiness rule
  whiten = bool(whiten_bool)

  if mode == 'threshold':
    # a single full fit gives every component's share of the TOTAL variance
    full_ratio = np.cumsum(PCA().fit(data).explained_variance_ratio_)
    num_available = full_ratio.shape[0]
    above = np.flatnonzero(full_ratio > threshold)
    # if nothing exceeds the threshold (e.g. rounding with threshold = 1) keep everything
    num_keep = int(above[0]) + 1 if above.size else num_available
    num_keep = max(num_keep, min(2, num_available))
  else:
    if n < 1:
      raise ValueError("n must be at least 1 when mode is not 'threshold'")
    num_keep = n

  pca = PCA(n_components = num_keep, whiten = whiten)
  B = pca.fit_transform(data)
  # cumulative fraction of the total variance explained by the kept components
  cl = np.cumsum(pca.explained_variance_ratio_)
  print("PCA reduction to ", num_keep, " components with ", cl[-1], " explained variance")

  # PCA scree plot
  plt.subplot(111)
  plt.ylabel('Total Variance')
  plt.xlabel('Principal component')
  plt.plot(np.arange(1, cl.shape[0]+1), cl, 'o-r')
  plt.ylim(0, None)
  return B, pca


In [4]:
def pseudolabel_unscrambler(label, cluster_label):
  """
  pseudolabels of classes get randomly assigned. This function best matches
  the pseudolabels to the original ground truth labels.

  input args:
    label = integer ground truth class per example, classes numbered from 0
    cluster_label = integer cluster id per example, same length as label

  returns:
    a new array in which every cluster id has been replaced by the class it
    agrees with best. The input cluster_label is left unmodified.

  For each cluster id present in cluster_label, the chosen class j is the one
  maximizing the fraction of examples where "is in this cluster" and
  "has class j" agree; ties go to the lowest class index. Several clusters may
  map to the same class.
  """
  label = np.asarray(label)
  original = np.asarray(cluster_label)
  num_class = int(np.max(label)) + 1

  # write into a copy so the caller's array is not changed, and so a freshly
  # written class value is never mistaken for a not-yet-processed cluster id
  remapped = original.copy()
  for cluster_id in np.unique(original):
    in_cluster = (original == cluster_id)
    agreement = [(in_cluster == (label == j)).mean() for j in range(num_class)]
    # np.argmax returns the first maximum, i.e. the lowest class index on ties
    remapped[in_cluster] = int(np.argmax(agreement))
  return remapped


In [5]:
def kmeans_clustering(num_cluster, data, label, label_bool = False):
  """
  Cluster data with k-means and plot the resulting clusters.

  input args:
    num_cluster = number of clusters given to KMeans
    data = array passed to KMeans.fit_predict and to plot_features
    label = ground truth labels, only read when label_bool is True
    label_bool = when True the cluster ids are matched to label with
                 pseudolabel_unscrambler, the true labels are plotted and the
                 accuracy is printed

  Nothing is returned; the results are only printed and plotted.
  """
  print('\n\ninitializing kmeans clustering')
  model = KMeans(n_clusters=num_cluster, init='random', n_init=10)
  assignments = model.fit_predict(data)

  if label_bool == True:
    # relabel the cluster ids so they can be compared with the ground truth
    assignments = pseudolabel_unscrambler(label, assignments)
    print('plots of clusters with true labels')
    plot_features(data, label)
    print('kmeans clustering accuracy is ', (assignments == label).mean())

  print('plots of clusters with pseudolabels')
  plot_features(data, assignments)


In [6]:
def gmm_clustering(num_cluster, data, label, label_bool = False):
  """
  Cluster data with a Gaussian mixture model and plot the resulting clusters.

  input args:
    num_cluster = number of mixture components given to GaussianMixture
    data = array passed to GaussianMixture.fit_predict and to plot_features
    label = ground truth labels, only read when label_bool is True
    label_bool = when True the cluster ids are matched to label with
                 pseudolabel_unscrambler, the true labels are plotted and the
                 accuracy is printed

  Nothing is returned; the results are only printed and plotted.
  """
  print('\n\ninitializing gmm clustering')
  mixture = GaussianMixture(n_init=10, n_components=num_cluster)
  assignments = mixture.fit_predict(data)

  if label_bool == True:
    # relabel the cluster ids so they can be compared with the ground truth
    assignments = pseudolabel_unscrambler(label, assignments)
    print('plots of clusters with true labels')
    # 2D slices of the features, colored by the true labels
    plot_features(data, label)
    print('gmm clustering accuracy is ', (assignments == label).mean())

  print('plots of clusters with pseudolabels')
  # 2D slices of the features, colored by the cluster assignment
  plot_features(data, assignments)

In [7]:
def spec_clustering(num_cluster, data, label, mode, nn = 11, label_bool = False):
  """
  Cluster data with spectral clustering and plot the resulting clusters.

  input args:
    num_cluster = number of clusters given to SpectralClustering
    data = array passed to SpectralClustering.fit_predict and to plot_features
    label = ground truth labels, only read when label_bool is True
    mode = 'gaussian' uses the 'rbf' affinity
           'knn' uses the 'nearest_neighbors' affinity with nn neighbors
    nn = number of neighbors, only used when mode == 'knn'
    label_bool = when True the cluster ids are matched to label with
                 pseudolabel_unscrambler, the true labels are plotted and the
                 accuracy is printed

  returns:
    None; the results are only printed and plotted

  raises:
    ValueError if mode is neither 'gaussian' nor 'knn'
  """
  # validate mode before doing any work, so an unknown mode fails with a clear error
  if mode == 'gaussian':
    affinity_args = {'affinity': 'rbf'}
  elif mode == 'knn':
    affinity_args = {'affinity': 'nearest_neighbors', 'n_neighbors': nn}
  else:
    raise ValueError("mode must be 'gaussian' or 'knn', got {!r}".format(mode))

  print('\n\ninitializing spectral clustering with ',mode)
  # fixed random_state, as in both original branches
  spectral = SpectralClustering(random_state = 3, n_clusters=num_cluster, **affinity_args)
  cluster_label = spectral.fit_predict(data)

  # plots 2D slices of the features
  if label_bool == True:
    cluster_label = pseudolabel_unscrambler(label, cluster_label)
    print('plots of clusters with true labels')
    plot_features(data, label)
    accuracy = (cluster_label == label).mean()
    print(mode, 'spectral clustering accuracy is ', accuracy)
  print('plots of clusters with pseudolabels')
  plot_features(data, cluster_label)
  return None
  

In [8]:
# main function that performs preprocessing, kmeans clustering, gaussian mixture clustering, and spectral clustering
def experiment_main(num_cluster, data, label, threshold, preprocess=False, label_bool = False):
  """
  Optionally reduce data with PCA, then run every clustering method on it.

  input args:
    num_cluster = number of clusters used by every clustering method
    data = array with shape (num_examples, num_features)
    label = ground truth labels, forwarded to the clustering functions
    threshold = explained-variance threshold forwarded to PCA_iterative
    preprocess = when True, data is first reduced with PCA_iterative in 'threshold' mode
    label_bool = forwarded to the clustering functions

  Runs kmeans, gaussian mixture, and spectral clustering (gaussian affinity,
  then knn affinity with 50 neighbors). Nothing is returned.
  """
  # PCA preprocessing
  if preprocess == True:
    # PCA_iterative returns (reduced_data, fitted_pca); only the reduced data is clustered
    data, _ = PCA_iterative(data, threshold, mode = 'threshold')

  # clustering functions
  kmeans_clustering(num_cluster, data, label, label_bool = label_bool)
  gmm_clustering(num_cluster, data, label, label_bool = label_bool)
  spec_clustering(num_cluster, data, label, mode = 'gaussian', label_bool = label_bool)
  spec_clustering(num_cluster, data, label, mode = 'knn', nn = 50, label_bool = label_bool)

In [9]:
### load data

import csv
import numpy as np


def _read_float_rows(path):
  """Read a space-delimited file and return its rows as lists of floats."""
  with open(path) as csvfile:
    return [[float(n) for n in row] for row in csv.reader(csvfile, delimiter = ' ')]


training = np.asarray(_read_float_rows('embedded_data_training.csv'))

# x keeps the raw rows of the second file, as before
x = _read_float_rows('embedded_data.csv')
test = np.asarray(x)

In [None]:
### hyperparameters
num_cluster = 3
threshold = .99 # variance explained threshold for PCA preprocessing value 0 to 1
# distance_threshold = 
# nearest_neighbors =

# placeholder value: the clustering helpers only read label when label_bool is True
label = 0

experiment_main(
  num_cluster=num_cluster,
  data=training,
  label=label,
  threshold=threshold,
  preprocess=True,
  label_bool=False,
)

In [None]:
# `data` was never defined at module level (its assignment was commented out),
# so the fit below could not run. PCA_iterative returns (reduced_data, fitted_pca);
# only the reduced data is used for the mixture fit.
data, _pca_model = PCA_iterative(training, 0.99, mode = 'threshold')
gmm = GaussianMixture(n_components=3, n_init=10).fit(data)




In [None]:
#   unfinished code for finding optimal clusters 
#   def kmeans_clustering(data):
#   # the numbers of clusters to check
#   cluster_array = np.arange(1,20)
#   # list of kmeans models for each # of cluster in cluster_array
#   score = []
#   for i in cluster_array:
#     kmeans = KMeans(init='random', n_clusters=i, n_init=100) 
#     # list of scores from kmeans method (negative sum of distance squared from cluster center)
#     cluster_labels = kmeans.fit_predict(X)
#     score.append(kmeans.fit(data).score(data))

#   plt.plot(cluster_array, score)

#   # figure(figsize=(6,6)); ax=subplot(aspect='equal')
#   # scatter(X[:,0],X[:,1],c=kmeans.labels_,cmap=cm.rainbow);

#   # C = kmeans.cluster_centers_
#   # scatter(C[:,0],C[:,1],c='k',marker='o',s=300,alpha=0.5,edgecolor='none')







# # uses KFold to get a 80/20 train/test split
# k_fold = KFold(n_splits=5, shuffle=True) 
# for k, (train, test) in enumerate(k_fold.split(X)):
#   train_data = data[train]
#   test_data = data[test]
#   train_label = label[train]
#   test_label = label[test]
#   break














# # standardization with 0 mean and 1 stdev
# scaler = StandardScaler()
# scaler.fit(data)
# data = scaler.transform(data)