In [67]:
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

from scipy.spatial.distance import cdist, euclidean

import sys
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [68]:
df = pd.read_csv('./health-dataset/word2vec.csv', header=None)
dataset = np.array(df.values)

# PCA

In [69]:
pca = PCA(n_components=0.90)
dataset_PCA = pca.fit_transform(dataset)
pca.n_components_

104

In [70]:
def kmeans_func():
    N = 5
    verbose = False

    km_PCA = KMeans(n_clusters=N, init='k-means++', max_iter=1200, n_init=3, n_jobs=6,
                    verbose=verbose)

    print("Clustering sparse data with {}".format(km_PCA))
    t0 = time()

    km_PCA.fit(dataset_PCA)
    print("done in {}s".format(time() - t0))
    return km_PCA

In [None]:
km_PCA = kmeans_func()
closest_PCA, distances_PCA = metrics.pairwise_distances_argmin_min(km_PCA.cluster_centers_, dataset_PCA)
print(closest_PCA, distances_PCA)

while not (np.unique(closest_PCA).shape == closest_PCA.shape):
    km_PCA = kmeans_func()
    closest_PCA, distances_PCA = metrics.pairwise_distances_argmin_min(km_PCA.cluster_centers_, dataset_PCA)

    print(closest_PCA, distances_PCA)

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1200,
    n_clusters=5, n_init=3, n_jobs=6, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)
done in 0.7212989330291748s
[7987  455 1386 7194  554] [0.43774125 0.44351807 0.39665816 0.42949568 0.40590746]


In [None]:
silhouette_PCA = metrics.silhouette_score(dataset_PCA, km_PCA.labels_)

silhouette_PCA

0.004978121840402525

In [None]:
health_lines = []
with open('health-dataset/health.txt', 'r') as health_txt:
    health_aux = health_txt.readlines()
    health_aux.pop(0)
    for i in health_aux:
        health_lines.append(i.split('|')[-1])
len(health_lines)

13229

In [None]:
for i in enumerate(closest_PCA):
#     print(i)
    print('Cluster {}: {}'.format(i[0], health_lines[i[1]]))

Cluster 0: Blood sugar is NOT what matters most for preventing heart disease in people with #diabetes, new study finds

Cluster 1: RT @CDCgov: CDC #Ebola experts have advised @EmoryUniversity Hospital there is no public health concern w/ release of these patients.

Cluster 2: RT @kellywallacetv: What's the advice when your child wants to stop a physical activity &amp; you know it's good for them to keep doing it? #fi…

Cluster 3: RT @RachelBegunRD: A1 Fact: Heart disease is THE leading cause of death for both men AND women. It is responsible for 1 in 4 deaths in America #healthtalk

Cluster 4: RT @drsanjaygupta: meet the doctor and get a first look at the room where ebola patients will be treated in the US.



In [None]:
labels = km_PCA.labels_
count = np.zeros(2).astype('int')

for i in np.unique(labels):
    count[0] = 0
    for j in labels:
        if j == i:
            print("Cluster {}: {}".format(j, health_lines[count[1]]))
            count[0] += 1
        count[1] += 1
        if count[0] == 5:
            break

Cluster 0: A plant-based diet that incorporates fish may be the key to preventing colorectal cancers:

Cluster 0: How women can wipe out Alzheimer's, from @mariashriver. 

Cluster 0: RT @CNNOpinion: Women can defeat #Alzheimers, says @mariashriver. #WipeOutAlz challenge will make it happen.

Cluster 0: CDC: Misuse of garments may have led to release of bioterror bacteria at Tulane monkey lab.

Cluster 0: Losing a brain tumor, gaining perspective: CNN's Jessica Moskowitz's #FirstPerson experience.

Cluster 1: Ear trouble? Save yourself a trip to the ER. One startup has created an iPhone attachment to diagnose ear infections.

Cluster 1: "Vaccines prevent 6 million deaths every year and fundamentally changed modern medicine" Opinion @drsanjaygupta

Cluster 1: That cell phone is suffocating you, @drsanjaygupta shows you how

Cluster 1: Do any of your resolutions involve eating better?  Here's some easy ways to do just that

Cluster 1: Those NewYear resolutions starting to slide? Try these

In [None]:
run = True
if run:
    Nc = range(2, 20)

    kmeans = []
    score = []
    silhouettes = []

    for i in Nc:
        kmeans.append(KMeans(n_clusters=i, n_init=3, n_jobs=i, max_iter=1200))

    for i in range(len(kmeans)):
        print('Fitting kmeans with {} clusters'.format(kmeans[i].n_clusters))
        score.append(kmeans[i].fit(dataset_PCA).inertia_)
        silhouettes.append(metrics.silhouette_score(dataset_PCA, kmeans[i].labels_))

Fitting kmeans with 2 clusters
Fitting kmeans with 3 clusters
Fitting kmeans with 4 clusters
Fitting kmeans with 5 clusters


In [None]:
if run:
    plt.plot(Nc,score)

    plt.xlabel('Number of Clusters')

    plt.ylabel('Sum of squared distances')

    plt.title('Elbow Curve')

    plt.show()
    
    
    plt.plot(Nc,silhouettes)

    plt.xlabel('Number of Clusters')

    plt.ylabel('Silhouettes')

    plt.title('Silhouettes x Number of Clusters')

    plt.show()