In [34]:
import numpy
import pandas
import sklearn
import matplotlib.pyplot as plt
import os
import seaborn
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.cluster import DBSCAN, Birch, KMeans
import math
from scipy.linalg import svdvals
from umap import UMAP
import plotly.express as px
# Global plot styling: seaborn's "ticks" style with the "talk" context.
seaborn.set(style="ticks", context="talk")

# Data setup

In [35]:
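# Template: these are the variables the rest of the notebook expects.
#expname = experiment name (string; used as the prefix of saved figure file names)
#n_clusters = 12
#df = None #Embeddings: Pandas dataframe or 2-D NumPy array
#labels = None #Single column Pandas dataframe
#moreLabels = None #Pandas dataframe
#labelnames = None #Dict that maps each label value to its display string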

In [36]:
# Experiment configuration and data loading.
expname = 'Bank_Classification'  # prefix of every figure file saved by this notebook
n_clusters = 2  # cluster count handed to KMeans and Birch
df = numpy.load('embeddings.npz')['embeddings']
source = pandas.read_csv('bank-additional-full.csv', sep=';')
moreLabels = source.drop('y', axis=1)  # every column except the target

# Encode the target with an explicit mapping. Series.replace with list
# arguments relies on implicit object -> int downcasting, which pandas 2.2
# deprecates; map + astype states the resulting dtype directly.
label_codes = {'no': 0, 'yes': 1}
unexpected = set(source['y'].unique()) - set(label_codes)
if unexpected:
    raise ValueError(
        f"Unexpected values in target column 'y': {sorted(map(str, unexpected))}"
    )
labels = source['y'].map(label_codes).astype('int64')
labelnames = {0: 'No', 1: 'Yes'}

# Later cells pair embedding rows with CSV rows by position, so the row
# counts have to agree.
assert len(df) == len(source), (
    f"embeddings have {len(df)} rows but the CSV has {len(source)}"
)

In [37]:
# Inspect the encoded target column.
labels

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int64

# Visualisation

In [None]:
# Project the embeddings to 2-D with four different methods.
# MDS, t-SNE and UMAP are stochastic (and PCA may pick a randomized solver),
# so every reducer gets the same seed to keep the figures stable across
# Restart & Run All.
RANDOM_SEED = 42

# PCA section
pca = PCA(n_components=2, random_state=RANDOM_SEED)
pcaed_df = pca.fit_transform(df)

# MDS section
# NOTE(review): MDS operates on the full pairwise dissimilarity matrix, so
# memory grows quadratically with the number of rows; this may be infeasible
# for the full dataset -- confirm, or fit on a subsample.
mds = MDS(n_components=2, random_state=RANDOM_SEED)
mdsed_df = mds.fit_transform(df)

# T-SNE section
tsne = TSNE(n_components=2, random_state=RANDOM_SEED)
tsned_df = tsne.fit_transform(df)

# UMAP section
# A fixed random_state makes UMAP run single-threaded; that is the price of
# reproducible output.
umap = UMAP(n_components=2, random_state=RANDOM_SEED)
umaped_df = umap.fit_transform(df)

In [None]:
# Scatter-plot every 2-D projection, coloured by the true label, and save
# each figure as a PNG.
output_dir = 'AnalysisOutput'
os.makedirs(output_dir, exist_ok=True)  # write_image fails if the folder is missing

# Identical for every projection, so build them once outside the loop.
point_colors = [labelnames[i] for i in labels]
traces = dict(size=20, opacity=0.5, line=dict(color='White', width=2))

for dataf, methodName in zip([pcaed_df, mdsed_df, tsned_df, umaped_df], ['PCA', 'MDS', 'TSNE', 'UMAP']):
    # Projection coordinates side by side with the metadata shown on hover.
    plot_frame = pandas.concat(
        [pandas.DataFrame(dataf, columns=['Dim 1', 'Dim 2']), moreLabels],
        axis=1,
    )
    fig = px.scatter(plot_frame, x='Dim 1', y='Dim 2', width=1000, height=1000,
                     hover_data=moreLabels.columns,
                     color=point_colors,
                     title=f"2D Data Projection using {methodName}")

    fig.update_traces(marker=traces)
    fig.write_image(f'{output_dir}/{expname}_Visualisation_{methodName}.png')
    fig.show()

# Clustering

In [None]:
# Fit three clustering models on the full-dimensional embeddings.
RANDOM_SEED = 42  # fixes KMeans' random centroid initialisation across re-runs

# KMeans section
kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED)
kmeans.fit(df)

# DBSCAN section (default parameters; it takes no n_clusters argument)
dbscan = DBSCAN()
dbscan.fit(df)

# Birch section
birch = Birch(n_clusters=n_clusters)
birch.fit(df)

In [None]:
# Show the cluster assignments of each model on top of one chosen 2-D
# projection and save each figure as a PNG.
visual_models = [pcaed_df, mdsed_df, tsned_df, umaped_df]
model_names = ['PCA', 'MDS', 'TSNE', 'UMAP']
chosen_visual_model = 2  # index into visual_models / model_names (2 -> TSNE)

output_dir = 'AnalysisOutput'
os.makedirs(output_dir, exist_ok=True)  # write_image fails if the folder is missing

# The frame does not depend on the clustering model, so build it once.
# The labels are taken by position and stored under 'moa'. Building a
# DataFrame from the Series with columns=['moa'] would look up a column
# called 'moa' in data that is named 'y', and so never carry the labels.
plot_frame = pandas.concat(
    [
        pandas.DataFrame(visual_models[chosen_visual_model], columns=['Dim 1', 'Dim 2']),
        pandas.DataFrame({'moa': numpy.asarray(labels).ravel()}),
        moreLabels,
    ],
    axis=1,
)
traces = dict(size=20, opacity=0.5, line=dict(color='White', width=2))

for cluster_model, cluster_model_name in zip([kmeans, dbscan, birch], ['KMeans', 'DBScan', 'Birch']):
    fig = px.scatter(
        plot_frame,
        x='Dim 1', y='Dim 2', width=1000, height=1000, hover_data=moreLabels.columns,
        # Cast to str so plotly treats cluster ids as discrete categories.
        color=cluster_model.labels_.astype(str),
        title=f"Clustering using {cluster_model_name}",
        color_discrete_sequence=px.colors.qualitative.Dark24,
        template='plotly_dark')

    fig.update_traces(marker=traces)
    fig.write_image(f'{output_dir}/{expname}_Clustering_{cluster_model_name}.png')
    fig.show()

# Collapse

In [None]:
# Dimensional-collapse check: plot the log of the singular values of the
# embedding covariance matrix.
covmat = numpy.cov(df, rowvar=False)  # rowvar=False: columns are the variables
decomp = svdvals(covmat)

# numpy.log maps an exactly-zero singular value to -inf instead of raising
# like math.log does; the divide warning is silenced because that case is
# expected for collapsed dimensions.
with numpy.errstate(divide='ignore'):
    log_singular_values = numpy.log(decomp)

# The x axis is sized from the spectrum itself, so this works whether df is
# a NumPy array (which has no .columns) or a DataFrame.
plt.plot(range(len(decomp)), log_singular_values)
plt.ylim((-10,0))
plt.xlabel('Dimension')
plt.ylabel('Log of Singular Values')
plt.title('Dimensional Collapse')
os.makedirs('AnalysisOutput', exist_ok=True)  # savefig fails if the folder is missing
plt.savefig(f'AnalysisOutput/{expname}_Collapse.png')