In [1]:
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

from scipy.spatial.distance import cdist, euclidean

import sys
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the word2vec CSV; header=None makes pandas treat the first line as data
# rather than as column names.
word2vec_path = './health-dataset/word2vec.csv'
df = pd.read_csv(word2vec_path, header=None)
# Independent NumPy copy of the frame's values, used by the cells below.
dataset = np.array(df.values)

# PCA

In [20]:
# Project `dataset` with PCA. A float n_components in (0, 1) asks scikit-learn to keep
# as many components as are needed for the cumulative explained variance to exceed
# that fraction (0.90 here).
pca = PCA(n_components=0.90)
dataset_PCA = pca.fit_transform(dataset)
# Show how many components were actually retained by the fit.
pca.n_components_

104

In [5]:
# Re-fit PCA with a fixed 105 components.
# NOTE(review): this rebinds `pca` and `dataset_PCA`, replacing the 0.90-variance
# projection computed in the cell above. The execution counts (In [5] here, In [20]
# above) suggest the saved outputs further down came from the other fit, so a fresh
# "Run All" would cluster a different matrix -- confirm which projection is intended.
pca = PCA(n_components=105)
dataset_PCA = pca.fit_transform(dataset)

In [21]:
def kmeans_func(n_clusters=5, data=None, random_state=None, verbose=False):
    """Fit a k-means++ initialised KMeans model and print how long the fit took.

    Calling ``kmeans_func()`` with no arguments behaves as before: 5 clusters
    fitted on the notebook-level ``dataset_PCA``.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters to fit.
    data : array-like or None, default None
        Samples to cluster. ``None`` means "use the global ``dataset_PCA``".
    random_state : int or None, default None
        Forwarded to ``KMeans``. Leave as ``None`` when the caller re-fits in a
        loop and needs a different initialisation on every call.
    verbose : bool, default False
        Forwarded to ``KMeans``.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.
    """
    if data is None:
        data = dataset_PCA

    # `n_jobs` is deliberately not passed: KMeans deprecated it in scikit-learn 0.23
    # and removed it in 1.0, where passing it raises TypeError.
    km_PCA = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=1200, n_init=3,
                    random_state=random_state, verbose=verbose)

    # The input is a dense matrix, so the message no longer claims it is sparse.
    print("Clustering data with {}".format(km_PCA))
    t0 = time()

    km_PCA.fit(data)
    print("done in {}s".format(time() - t0))
    return km_PCA

In [22]:
# Keep re-fitting until every centroid has a different nearest sample, so that
# each cluster ends up with its own representative row.
while True:
    km_PCA = kmeans_func()
    # For each centroid: index of the closest row of dataset_PCA and the distance to it.
    closest_PCA, distances_PCA = pairwise_distances_argmin_min(
        km_PCA.cluster_centers_, dataset_PCA)
    print(closest_PCA, distances_PCA)

    all_distinct = np.unique(closest_PCA).size == closest_PCA.size
    if all_distinct:
        break

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1200,
    n_clusters=5, n_init=3, n_jobs=6, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)
done in 0.5981457233428955s
[7341  455  809 1386  554] [0.42467707 0.44370444 0.43682639 0.40018636 0.40241977]


In [23]:
# Silhouette score of the fitted clustering on the PCA-projected samples.
silhouette_PCA = metrics.silhouette_score(X=dataset_PCA, labels=km_PCA.labels_)

silhouette_PCA

0.004249867882716707

In [24]:
# Read every line of the tweets file, then discard the first one
# (presumably a header row -- verify against the file).
with open('health-dataset/health.txt', 'r') as health_txt:
    health_aux = health_txt.readlines()
del health_aux[0]

# Keep only the text after the last '|' of each line (the trailing newline is retained).
# NOTE(review): if the text itself can contain '|', everything before its last '|' is lost.
health_lines = [row.split('|')[-1] for row in health_aux]
len(health_lines)

13229

In [25]:
# Print, for each cluster, the text line whose row index is closest to its centroid.
for cluster_id, nearest_row in enumerate(closest_PCA):
    print('Cluster {}: {}'.format(cluster_id, health_lines[nearest_row]))

Cluster 0: RT @EinsteinMed: A7: It has been used for over 40 years and is safe for most people with diabetes.-DrC #healthtalk

Cluster 1: RT @CDCgov: CDC #Ebola experts have advised @EmoryUniversity Hospital there is no public health concern w/ release of these patients.

Cluster 2: 1 in 8 U.S. babies is born pre-term. @LIFE takes a look at how hospitals saved these tiny humans 75 yrs ago

Cluster 3: RT @kellywallacetv: What's the advice when your child wants to stop a physical activity &amp; you know it's good for them to keep doing it? #fi…

Cluster 4: RT @drsanjaygupta: meet the doctor and get a first look at the room where ebola patients will be treated in the US.



In [26]:
# List every text line that k-means assigned to cluster 3.
labels = km_PCA.labels_
for row_idx, cluster_id in enumerate(labels):
    if cluster_id != 3:
        continue
    print("Cluster {}: {}".format(cluster_id, health_lines[row_idx]))

Cluster 3: It doesn't take much to damage your hearing at a sports bar or nightclub. That's why a billion people are at risk.

Cluster 3: RT @CNN: Forever young? Discover this island’s secrets to longevity on #TheWonderList w/ @BillWeirCNN 

Cluster 3: RT @CNN: Is post-traumatic stress disorder in your genes? A simple blood test may one day help tell you

Cluster 3: How women can wipe out Alzheimer's, from @mariashriver. 

Cluster 3: RT @CNNOpinion: Women can defeat #Alzheimers, says @mariashriver. #WipeOutAlz challenge will make it happen.

Cluster 3: You may be your germs: Microbe genes slipped into human DNA, study says.

Cluster 3: #FitNation: Finding the right life balance between family, work and getting fit.

Cluster 3: RT @drsanjaygupta: what are you having for dinner? a lot more #sugar thank you think..

Cluster 3: RT @cnni: Eat yourself healthy with these amazing superfoods:

Cluster 3: RT @CNN: .@RobertDowneyJr presented a child with his own 'Iron Man' robotic arm:

Cluster 


Cluster 3: To help prevent cancer, be sure to mind your ABCs. Here are the nutrients that are vital.

Cluster 3: ATTN Parents: How much do you know about #autism? Join us for a #HealthTalk tomorrow @ 1 pm ET to learn the facts.

Cluster 3: RT @foundersheart: @EverydayHealth @RMichlerMD @MontefioreNYC Thanks to everyone who joined us and for having us! #HeartHealth #HealthTalk

Cluster 3: RT @RMichlerMD: A10: Better and less invasive treatments for those who have #HeartDisease, incld #StemCells. #1 focus needs to be on prevention! #healthtalk

Cluster 3: Looks like we're out of time! Thank you @RMichlerMD @montefioreNYC @foundersheart &amp; everyone who participated! #healthtalk

Cluster 3: RT @chi_stylish: I want to know if taking red wine when on anti hypertensive medications is dangerous #HealthTalk

Cluster 3: RT @Ashley_LizWelch: Check out these 10 fast (and fun) #Heart facts from @everydayhealth

Cluster 3: RT @ChristysChomp: @EverydayHealth A8: lifestyle mod like exercise, plant

Cluster 3: Fitness 'rubs off on your partner'

Cluster 3: 'Designer baby debate should start'

Cluster 3: Unusual activities to get you moving

Cluster 3: Exercise helps with ME, study says

Cluster 3: Australians to pay more to see GPs

Cluster 3: Computers 'good judge of character'

Cluster 3: MP: Pregnant women 'should not drink'

Cluster 3: 'Unique' life of having two penises

Cluster 3: Why are so many drinks flavoured with honey?

Cluster 3: Ebola: How does it compare?

Cluster 3: Christmas closures - plan ahead

Cluster 3: E-readers 'damage sleep and health'

Cluster 3: VIDEO: Fixing bones with a household drill

Cluster 3: Car smoke ban 'to start in October'

Cluster 3: NHS winter: Your questions answered

Cluster 3: Fat 'breathed out' of body via lungs

Cluster 3: Me, my friend Pru, and our memories

Cluster 3: VIDEO: 'Why I'm having my breasts removed'

Cluster 3: Shift workers 'sicker and fatter'

Cluster 3: 'Sugar worse than salt' row erupts

Cluster 3: Scientists make 'fee

In [None]:
# Sweep the number of clusters and record, for each fit, the inertia (used for the
# elbow curve) and the silhouette score. Set `run` to False to skip this sweep and
# the plotting cell that reads its results.
run = True
if run:
    Nc = range(2, 20)

    kmeans = []
    score = []
    silhouettes = []

    for n_clusters in Nc:
        # `n_jobs` is not passed: the original tied the worker count to the cluster
        # count (n_jobs=i), and the parameter was deprecated in scikit-learn 0.23
        # and removed in 1.0.
        model = KMeans(n_clusters=n_clusters, n_init=3, max_iter=1200)
        print('Fitting kmeans with {} clusters'.format(model.n_clusters))
        model.fit(dataset_PCA)

        kmeans.append(model)
        score.append(model.inertia_)
        silhouettes.append(metrics.silhouette_score(dataset_PCA, model.labels_))

Fitting kmeans with 2 clusters


In [None]:
# Plot both sweep metrics against the number of clusters, one figure per metric.
if run:
    curves = (
        (score, 'Sum of squared distances', 'Elbow Curve'),
        (silhouettes, 'Silhouettes', 'Silhouettes x Number of Clusters'),
    )
    for values, y_label, title in curves:
        plt.plot(Nc, values)
        plt.xlabel('Number of Clusters')
        plt.ylabel(y_label)
        plt.title(title)
        plt.show()