In [None]:
import pandas as pd
import pickle
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram, cophenet
from scipy.spatial.distance import pdist
from matplotlib import pyplot as plt

In [None]:
# Read the semicolon-separated input table: the first CSV column becomes the
# index and only the first 30 data rows are loaded.
file = 'interpolated_no_na_no_noise.csv'
df = pd.read_csv(
    file,
    sep=";",
    index_col=0,
    nrows=30,
)

In [None]:
# Restore the previously pickled matrix from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('euclidean_similarity_matrix.pkl', mode='rb') as f:
    sim_matrix = pickle.load(f)

In [None]:
# Condensed (1-D) vector of pairwise Euclidean distances between the rows of
# sim_matrix.
# NOTE(review): if sim_matrix is already a square pairwise-distance matrix,
# scipy.spatial.distance.squareform may be the intended conversion rather than
# pdist -- confirm against how the pickle was produced.
sim_vec = pdist(sim_matrix)
sim_vec.shape  # pdist output is 1-D, so transposing it would change nothing

In [None]:
# Build the hierarchical-clustering linkage matrix with the centroid method.
# sim_vec is already a condensed distance vector, and linkage() only consults
# its `metric` argument for 2-D observation input. The former third argument
# 'precomputed' is not a scipy metric name and had no effect, so it is omitted.
Z = linkage(sim_vec, method='centroid')

In [None]:
# c is the cophenetic correlation coefficient between the distances implied by
# the linkage Z and the original pairwise distances in sim_vec; coph_dists is
# the condensed cophenetic distance matrix. A value of c closer to 1 means the
# hierarchy preserves the original distances better, which makes it a handy
# score for comparing linkage methods (such as 'centroid').
c, coph_dists = cophenet(Z, Y=sim_vec)
c

In [None]:
# Draw the full (untruncated) dendrogram for the linkage Z and save it as a PNG.
figure = plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
# Save before showing: with a blocking backend plt.show() closes the figure
# when it returns, so writing the file first is the reliable order.
figure.savefig('dendrogram.png')
plt.show()

In [None]:
# Distance threshold at which the dendrogram is cut into flat clusters.
max_d = 5 * 10**5

# Disabled plotting snippet, kept for reference only. It was previously held in
# a bare triple-quoted string, which Jupyter echoed as the cell's output.
# `fancy_dendrogram` is not defined in the visible part of this notebook.
#
# fancy_dendrogram(
#     Z,
#     truncate_mode='lastp',
#     p=30,
#     leaf_rotation=90.,
#     leaf_font_size=12.,
#     show_contracted=True,
#     annotate_above=10,  # useful in small plots so annotations don't overlap
#     max_d=max_d
# )
# plt.show()

In [None]:
# Flat cluster label for every observation: observations end up in the same
# cluster when their cophenetic distance does not exceed max_d.
T = fcluster(Z, t=max_d, criterion='distance')

In [None]:
# Generates a list of lists, one per cluster. Each nested list contains the
# column names that belong to the given cluster.
def generate_cluster_list(multi_item_clusters, all_clusters, df, write_to_file=False,
                          output_path='clusters_list.pkl'):
    """Group the column names of ``df`` by their flat-cluster label.

    Position ``i`` of ``all_clusters`` is taken to be the label of the ``i``-th
    column of ``df``.

    Parameters
    ----------
    multi_item_clusters : iterable
        Cluster labels to list first, in the order given (the callers pass
        the labels of clusters that have more than one member).
    all_clusters : array-like
        Cluster label of every column.
    df : pandas.DataFrame
        Frame whose column names are grouped.
    write_to_file : bool, optional
        When True, pickle the resulting list to ``output_path``.
    output_path : str, optional
        Destination of the pickle file; defaults to ``'clusters_list.pkl'``.

    Returns
    -------
    list of list
        One list of column names per cluster: first the clusters named in
        ``multi_item_clusters``, then every remaining label found in
        ``all_clusters`` in ascending label order.
    """
    labels = np.asarray(all_clusters)
    column_names = df.columns.values

    def _members(cluster_id):
        # Column names whose label equals cluster_id, in column order.
        return column_names[np.where(labels == cluster_id)].tolist()

    clusters_list = [_members(cluster) for cluster in multi_item_clusters]

    # Remaining labels are derived from the arguments only; the function no
    # longer depends on notebook globals being defined by other cells.
    listed = set(multi_item_clusters)
    remaining = [label for label in np.unique(labels) if label not in listed]
    clusters_list.extend(_members(label) for label in remaining)

    if write_to_file:
        with open(output_path, 'wb') as f:
            pickle.dump(clusters_list, f, pickle.HIGHEST_PROTOCOL)
    return clusters_list

In [None]:
# Find the clusters that contain more than one item.
# x collects every distinct label of T in order of first appearance;
# y collects the labels that occur at least twice (multi-item clusters).
x = []
y = []
for label in T:
    if label in x:
        if label not in y:
            y.append(label)
    else:
        x.append(label)

In [None]:
# Group the column names by cluster and also pickle the result to disk.
cluster_list = generate_cluster_list(y, T, df, write_to_file=True)