# Import and load data

In [1]:
import time
import numpy as np
import pandas as pd
from scipy.io import mmread
from sklearn.metrics import silhouette_score
import matplotlib
import csv
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
%matplotlib inline

In [2]:
# Load each precomputed embedding straight into a NumPy array.
# Every file is read without a header row; the three .tsv files are
# tab-separated, the PCA file uses the default comma separator.
data_DF = np.asarray(pd.read_csv('diffusion_map_20_eigen.tsv', sep='\t', header=None))
data_TSNE = np.asarray(pd.read_csv('tsne_3.tsv', sep='\t', header=None))
data_SVD = np.asarray(pd.read_csv('truncatedsvd_20.tsv', sep='\t', header=None))
data_PCA = np.asarray(pd.read_csv('pca_20.csv', header=None))

# Spectral clustering for diffusion map

## Run K=[5,15,25] each for 3 trials

In [None]:
# Sweep the number of clusters on the diffusion-map embedding.
# For every K: run one trial per subplot panel (three in total), record the
# fit time and silhouette score of each trial, scatter the first two embedding
# columns coloured by the predicted label, save the figure, and keep the
# per-K means in `runtime_all` / `score_all`.
runtime_all = []
score_all = []
for K in [5, 15, 25]:
    time_trial = []
    score_trial = []
    fig, axes = plt.subplots(nrows=1, ncols=3)
    fig.set_size_inches(18.5, 5.5)
    # Close the figure in pyplot immediately (presumably to keep it from being
    # rendered inline); the Figure object is still drawn on and saved below.
    plt.close(fig)
    fig.suptitle('Spectral clustering for K = ' + str(K))
    for trial, ax in enumerate(axes):
        # perf_counter is the clock intended for measuring elapsed intervals;
        # time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        # Seed each trial with its index: trials still differ from one
        # another, but a re-run of the notebook reproduces the same results.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_DF)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_DF, y_pred)
        ax.scatter(data_DF[:, 0], data_DF[:, 1], c=y_pred)
        ax.set_xlabel('Diffusion Map 1')
        ax.set_ylabel('Diffusion Map 2')
        time_trial.append(runtime)
        score_trial.append(score)
    # Output filename is kept exactly as before so existing consumers still find it.
    name = 'DF_spectural' + str(K) + '.jpg'
    fig.savefig(name)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)

In [None]:
# Persist the per-K mean silhouette scores and mean runtimes of the
# diffusion-map sweep, one file each.
for out_path, values in (
    ("DF_spectural_silhouette.csv", score_all),
    ("DF_spectural_runtime.csv", runtime_all),
):
    np.savetxt(out_path, values, delimiter=",")

## Run K = 25 for 15 trials and gather the predicted labels

In [None]:
# Run spectral clustering with K = 25 on the diffusion-map embedding for 15
# trials. Keeps the mean runtime and mean silhouette score in `runtime_all` /
# `score_all`, and every trial's label vector in `y_pred_all`
# (integer array, one row per sample, one column per trial).
runtime_all = []
score_all = []
n_trials = 15
for K in [25]:
    time_trial = []
    score_trial = []
    # Collect label vectors in a list and stack once at the end, instead of
    # growing an array from a dummy zero column and trimming it with a slice
    # that has to be kept in sync with the trial count.
    labels_per_trial = []

    for trial in range(n_trials):
        # perf_counter is the clock intended for measuring elapsed intervals.
        start = time.perf_counter()
        # Seed each trial with its index so trials differ but re-runs reproduce.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_DF)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_DF, y_pred)
        time_trial.append(runtime)
        score_trial.append(score)
        labels_per_trial.append(y_pred)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)
    y_pred_all = np.column_stack(labels_per_trial).astype(int)

In [None]:
# Write the diffusion-map K = 25 results to disk: the per-trial label matrix,
# then the mean silhouette score and the mean runtime.
for out_path, values in (
    ("DF_spectural_ypred25.csv", y_pred_all),
    ("DF_spectural_silhouette25.csv", score_all),
    ("DF_spectural_runtime25.csv", runtime_all),
):
    np.savetxt(out_path, values, delimiter=",")

# Spectral clustering for PCA

## Run K=[5,15,25] each for 3 trials

In [None]:
# Sweep the number of clusters on the PCA embedding.
# For every K: run one trial per subplot panel (three in total), record the
# fit time and silhouette score of each trial, scatter the first two PCA
# columns coloured by the predicted label, save the figure, and keep the
# per-K means in `runtime_all` / `score_all`.
runtime_all = []
score_all = []
for K in [5, 15, 25]:
    time_trial = []
    score_trial = []
    fig, axes = plt.subplots(nrows=1, ncols=3)
    fig.set_size_inches(18.5, 5.5)
    # Close the figure in pyplot immediately (presumably to keep it from being
    # rendered inline); the Figure object is still drawn on and saved below.
    plt.close(fig)

    fig.suptitle('Spectral clustering for K = ' + str(K))
    for trial, ax in enumerate(axes):
        # perf_counter is the clock intended for measuring elapsed intervals;
        # time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        # Seed each trial with its index: trials still differ from one
        # another, but a re-run of the notebook reproduces the same results.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_PCA)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_PCA, y_pred)
        ax.scatter(data_PCA[:, 0], data_PCA[:, 1], c=y_pred)
        ax.set_xlabel('PCA 1')
        ax.set_ylabel('PCA 2')
        time_trial.append(runtime)
        score_trial.append(score)
    # Output filename is kept exactly as before so existing consumers still find it.
    name = 'PCA_spectural' + str(K) + '.jpg'
    fig.savefig(name)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)

In [None]:
# Persist the per-K mean silhouette scores and mean runtimes of the PCA
# sweep, one file each.
for out_path, values in (
    ("PCA_spectural_silhouette.csv", score_all),
    ("PCA_spectural_runtime.csv", runtime_all),
):
    np.savetxt(out_path, values, delimiter=",")

## Run K = 25 for 15 trials and gather the predicted labels

In [None]:
# Run spectral clustering with K = 25 on the PCA embedding for 15 trials.
# Keeps the mean runtime and mean silhouette score in `runtime_all` /
# `score_all`, and every trial's label vector in `y_pred_all`
# (integer array, one row per sample, one column per trial).
#
# The label matrix is now built from the PCA predictions themselves; the
# previous version sized it from data_DF, which only works when both
# embeddings happen to have the same number of rows.
runtime_all = []
score_all = []
n_trials = 15
for K in [25]:
    time_trial = []
    score_trial = []
    # Collect label vectors in a list and stack once at the end, instead of
    # growing an array from a dummy zero column and trimming it with a slice
    # that has to be kept in sync with the trial count.
    labels_per_trial = []

    for trial in range(n_trials):
        # perf_counter is the clock intended for measuring elapsed intervals.
        start = time.perf_counter()
        # Seed each trial with its index so trials differ but re-runs reproduce.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_PCA)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_PCA, y_pred)
        time_trial.append(runtime)
        score_trial.append(score)
        labels_per_trial.append(y_pred)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)
    y_pred_all = np.column_stack(labels_per_trial).astype(int)

In [None]:
# Write the PCA K = 25 results to disk: the per-trial label matrix, then the
# mean silhouette score and the mean runtime.
for out_path, values in (
    ("PCA_spectural_ypred25.csv", y_pred_all),
    ("PCA_spectural_silhouette25.csv", score_all),
    ("PCA_spectural_runtime25.csv", runtime_all),
):
    np.savetxt(out_path, values, delimiter=",")

# Spectral clustering for TSNE

In [None]:
# Sweep the number of clusters on the t-SNE embedding.
# For every K: run one trial per subplot panel (three in total), record the
# fit time and silhouette score of each trial, scatter the first two t-SNE
# columns coloured by the predicted label, and save the figure. The per-K
# mean scores and runtimes are written to CSV at the end.
runtime_all = []
score_all = []
for K in [5, 15, 25]:
    time_trial = []
    score_trial = []
    fig, axes = plt.subplots(nrows=1, ncols=3)
    fig.set_size_inches(18.5, 5.5)
    # Close the figure in pyplot immediately (presumably to keep it from being
    # rendered inline); the Figure object is still drawn on and saved below.
    plt.close(fig)

    fig.suptitle('Spectral clustering for K = ' + str(K))
    for trial, ax in enumerate(axes):
        # perf_counter is the clock intended for measuring elapsed intervals;
        # time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        # Seed each trial with its index: trials still differ from one
        # another, but a re-run of the notebook reproduces the same results.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_TSNE)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_TSNE, y_pred)
        ax.scatter(data_TSNE[:, 0], data_TSNE[:, 1], c=y_pred)
        ax.set_xlabel('TSNE 1')
        ax.set_ylabel('TSNE 2')
        time_trial.append(runtime)
        score_trial.append(score)
    # Output filename is kept exactly as before so existing consumers still find it.
    name = 'TSNE_spectural' + str(K) + '.jpg'
    fig.savefig(name)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)

np.savetxt("TSNE_spectural_silhouette.csv", score_all, delimiter=",")
np.savetxt("TSNE_spectural_runtime.csv", runtime_all, delimiter=",")

In [None]:
# Run spectral clustering with K = 25 on the t-SNE embedding for 15 trials.
# Keeps the mean runtime and mean silhouette score in `runtime_all` /
# `score_all`, and every trial's label vector in `y_pred_all`
# (integer array, one row per sample, one column per trial), then writes all
# three to CSV.
runtime_all = []
score_all = []
n_trials = 15
for K in [25]:
    time_trial = []
    score_trial = []
    # Collect label vectors in a list and stack once at the end, instead of
    # growing an array from a dummy zero column and trimming it with a slice
    # that has to be kept in sync with the trial count.
    labels_per_trial = []

    for trial in range(n_trials):
        # perf_counter is the clock intended for measuring elapsed intervals.
        start = time.perf_counter()
        # Seed each trial with its index so trials differ but re-runs reproduce.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_TSNE)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_TSNE, y_pred)
        time_trial.append(runtime)
        score_trial.append(score)
        labels_per_trial.append(y_pred)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)
    y_pred_all = np.column_stack(labels_per_trial).astype(int)

np.savetxt("TSNE_spectural_ypred25.csv", y_pred_all, delimiter=",")
np.savetxt("TSNE_spectural_silhouette25.csv", score_all, delimiter=",")
np.savetxt("TSNE_spectural_runtime25.csv", runtime_all, delimiter=",")

# Spectral clustering for SVD

In [None]:
# Sweep the number of clusters on the truncated-SVD embedding.
# For every K: run one trial per subplot panel (three in total), record the
# fit time and silhouette score of each trial, scatter the first two SVD
# columns coloured by the predicted label, and save the figure. The per-K
# mean scores and runtimes are written to CSV at the end.
runtime_all = []
score_all = []
for K in [5, 15, 25]:
    time_trial = []
    score_trial = []
    fig, axes = plt.subplots(nrows=1, ncols=3)
    fig.set_size_inches(18.5, 5.5)
    # Close the figure in pyplot immediately (presumably to keep it from being
    # rendered inline); the Figure object is still drawn on and saved below.
    plt.close(fig)

    fig.suptitle('Spectral clustering for K = ' + str(K))
    for trial, ax in enumerate(axes):
        # perf_counter is the clock intended for measuring elapsed intervals;
        # time.time() can jump if the system clock is adjusted.
        start = time.perf_counter()
        # Seed each trial with its index: trials still differ from one
        # another, but a re-run of the notebook reproduces the same results.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_SVD)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_SVD, y_pred)
        ax.scatter(data_SVD[:, 0], data_SVD[:, 1], c=y_pred)
        ax.set_xlabel('SVD 1')
        ax.set_ylabel('SVD 2')
        time_trial.append(runtime)
        score_trial.append(score)
    # Output filename is kept exactly as before so existing consumers still find it.
    name = 'SVD_spectural' + str(K) + '.jpg'
    fig.savefig(name)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)

np.savetxt("SVD_spectural_silhouette.csv", score_all, delimiter=",")
np.savetxt("SVD_spectural_runtime.csv", runtime_all, delimiter=",")

In [None]:
# Run spectral clustering with K = 25 on the truncated-SVD embedding for 15
# trials. Keeps the mean runtime and mean silhouette score in `runtime_all` /
# `score_all`, and every trial's label vector in `y_pred_all`
# (integer array, one row per sample, one column per trial), then writes all
# three to CSV.
runtime_all = []
score_all = []
n_trials = 15
for K in [25]:
    time_trial = []
    score_trial = []
    # Collect label vectors in a list and stack once at the end, instead of
    # growing an array from a dummy zero column and trimming it with a slice
    # that has to be kept in sync with the trial count.
    labels_per_trial = []

    for trial in range(n_trials):
        # perf_counter is the clock intended for measuring elapsed intervals.
        start = time.perf_counter()
        # Seed each trial with its index so trials differ but re-runs reproduce.
        y_pred = SpectralClustering(
            n_clusters=K, assign_labels='kmeans', random_state=trial
        ).fit_predict(data_SVD)
        runtime = time.perf_counter() - start

        score = silhouette_score(data_SVD, y_pred)
        time_trial.append(runtime)
        score_trial.append(score)
        labels_per_trial.append(y_pred)
    runtime_K = np.mean(time_trial)
    runtime_all.append(runtime_K)
    score_K = np.mean(score_trial)
    score_all.append(score_K)
    y_pred_all = np.column_stack(labels_per_trial).astype(int)

np.savetxt("SVD_spectural_ypred25.csv", y_pred_all, delimiter=",")
np.savetxt("SVD_spectural_silhouette25.csv", score_all, delimiter=",")
np.savetxt("SVD_spectural_runtime25.csv", runtime_all, delimiter=",")