In [None]:
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

from scipy.spatial.distance import cdist, euclidean

import sys
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the header-less CSV of feature vectors (presumably word2vec embeddings,
# judging by the file name -- confirm against the data source).
df = pd.read_csv('./health-dataset/word2vec.csv', header=None)
# Copy the frame's values into a plain NumPy array for the scikit-learn cells.
dataset = np.array(df.to_numpy())

# PCA

In [None]:
# Scan component counts from 128 downwards and report the first (i.e. largest)
# count whose retained variance ratio is at or below 0.91.
#
# PCA components are ordered by decreasing explained variance, so a single fit
# at the largest count yields the total for every smaller count as a prefix
# sum. This replaces refitting the decomposition once per candidate.
pca = PCA(n_components=128)
dataset_PCA = pca.fit_transform(dataset)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

for i in range(128, 0, -1):
    # Variance ratio retained by the first i components.
    total_variance = cumulative_variance[i - 1]
    if i % 50 == 0:
        # Occasional progress line (fires at 100 and 50 components).
        print('{} components. Total variance: {}'.format(i, total_variance))
    if total_variance <= 0.91:
        print(i)
        print(total_variance)
        break

In [None]:
# Scan component counts 105, 95, ..., 5 and report the first count whose
# retained variance ratio is at or below 0.96.
#
# One fit at the largest count is enough: components are ordered by decreasing
# explained variance, so the total for any smaller count is a prefix sum.
pca = PCA(n_components=105)
dataset_PCA = pca.fit_transform(dataset)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

for i in range(105, 0, -10):
    # Variance ratio retained by the first i components.
    total_variance = cumulative_variance[i - 1]
    # Progress every 20 components, measured from the starting count. The
    # previous test `i % 20 == 0` could never be true because every candidate
    # (105, 95, ..., 5) is odd, so no progress line was ever printed.
    if (105 - i) % 20 == 0:
        print('{} components. Total variance: {}'.format(i, total_variance))
    if total_variance <= 0.96:
        print(i)
        print(total_variance)
        break

In [None]:
# Final projection used by the clustering cells below: fit PCA with 105
# components on the full dataset and keep the transformed samples.
pca = PCA(n_components=105)
dataset_PCA = pca.fit_transform(dataset)

In [None]:
def kmeans_func(n_clusters=5, data=None, verbose=False):
    """Fit a k-means model and return the fitted estimator.

    Prints the estimator configuration before fitting and the elapsed
    wall-clock time afterwards.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters to form.
    data : array-like or None, default None
        Samples to cluster. When None, the notebook-level ``dataset_PCA``
        is used, which matches the original zero-argument behaviour.
    verbose : bool, default False
        Forwarded to ``KMeans(verbose=...)``.

    Returns
    -------
    sklearn.cluster.KMeans
        The estimator after ``fit`` has been called.
    """
    if data is None:
        data = dataset_PCA

    # `n_jobs` is deliberately not passed: it was deprecated in scikit-learn
    # 0.23 and removed in 1.0, where supplying it raises a TypeError.
    km_PCA = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=1200, n_init=3,
                    verbose=verbose)

    print("Clustering sparse data with {}".format(km_PCA))
    t0 = time()

    km_PCA.fit(data)
    print("done in {}s".format(time() - t0))
    return km_PCA

In [None]:
# Fit k-means, then look up the sample nearest to each centroid. Keep refitting
# until every centroid has a distinct nearest sample, printing the indices and
# distances after each attempt.
while True:
    km_PCA = kmeans_func()
    closest_PCA, distances_PCA = pairwise_distances_argmin_min(
        km_PCA.cluster_centers_, dataset_PCA
    )
    print(closest_PCA, distances_PCA)

    # Stop once no sample index is repeated among the nearest-sample indices.
    if np.unique(closest_PCA).size == closest_PCA.size:
        break

In [None]:
# Silhouette score of the current k-means labelling on the PCA-reduced data.
silhouette_PCA = metrics.silhouette_score(X=dataset_PCA, labels=km_PCA.labels_)

# Bare expression so the notebook displays the value.
silhouette_PCA

In [None]:
# Collect the last '|'-separated field of every line after the first one.
health_lines = []
with open('health-dataset/health.txt', 'r') as health_txt:
    health_aux = health_txt.readlines()
    # Drop the first line (presumably a header row -- confirm against the file).
    del health_aux[0]
    health_lines.extend(row.split('|')[-1] for row in health_aux)
len(health_lines)

In [None]:
# For each cluster, print the text at the index stored in closest_PCA.
# Assumes health_lines is row-aligned with the clustered dataset -- TODO confirm.
for cluster_id, sample_idx in enumerate(closest_PCA):
    print('Cluster {}: {}'.format(cluster_id, health_lines[sample_idx]))

In [None]:
# Print the text of every sample that k-means assigned to cluster 3.
# Assumes health_lines is row-aligned with the clustered dataset -- TODO confirm.
labels = km_PCA.labels_
for sample_idx, label in enumerate(labels):
    if label != 3:
        continue
    print("Cluster {}: {}".format(label, health_lines[sample_idx]))

In [None]:
# Sweep the number of clusters and record, for each fitted model, its inertia
# (sum of squared distances to the closest centroid) and its silhouette score.
# Set `run` to False to skip this sweep and the plots that depend on it.
run = True
if run:
    Nc = range(2, 20)

    kmeans = []
    score = []
    silhouettes = []

    for n_clusters in Nc:
        print('Fitting kmeans with {} clusters'.format(n_clusters))
        # `n_jobs` is deliberately not passed: it was deprecated in
        # scikit-learn 0.23 and removed in 1.0 (passing it raises TypeError),
        # and it was previously tied to the cluster count for no reason.
        model = KMeans(n_clusters=n_clusters, n_init=3, max_iter=1200)
        model.fit(dataset_PCA)

        kmeans.append(model)
        score.append(model.inertia_)
        silhouettes.append(metrics.silhouette_score(dataset_PCA, model.labels_))

In [None]:
if run:
    # One figure per metric collected in the sweep: (y-values, y-label, title).
    curves = (
        (score, 'Sum of squared distances', 'Elbow Curve'),
        (silhouettes, 'Silhouettes', 'Silhouettes x Number of Clusters'),
    )

    for values, y_label, heading in curves:
        plt.plot(Nc, values)
        plt.xlabel('Number of Clusters')
        plt.ylabel(y_label)
        plt.title(heading)
        # Render this figure before starting the next one.
        plt.show()