In [1]:
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

from scipy.spatial.distance import cdist, euclidean

import sys
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./health-dataset/word2vec.csv', header=None)
dataset = np.array(df.values)

# PCA

In [3]:
for i in range(128, 0, -1):
    pca = PCA(n_components=i)
    dataset_PCA = pca.fit_transform(dataset)
    total_variance = np.sum(pca.explained_variance_ratio_)
    if i % 50 == 0:
        print('{} components. Total variance: {}'.format(i, total_variance))
    if total_variance <= 0.91:
        print(i)
        print(total_variance)
        break

105
0.9085951153438397


In [4]:
for i in range(105, 0, -10):
    pca = PCA(n_components=i)
    dataset_PCA = pca.fit_transform(dataset)
    total_variance = np.sum(pca.explained_variance_ratio_)
    if i % 20 == 0:
        print('{} components. Total variance: {}'.format(i, total_variance))
    if total_variance <= 0.96:
        print(i)
        print(total_variance)
        break

105
0.9085951153438397


In [5]:
pca = PCA(n_components=105)
dataset_PCA = pca.fit_transform(dataset)

In [6]:
def kmeans_func():
    N = 5
    verbose = False

    km_PCA = KMeans(n_clusters=N, init='k-means++', max_iter=1200, n_init=3, n_jobs=6,
                    verbose=verbose)

    print("Clustering sparse data with {}".format(km_PCA))
    t0 = time()

    km_PCA.fit(dataset_PCA)
    print("done in {}s".format(time() - t0))
    return km_PCA

In [7]:
km_PCA = kmeans_func()
closest_PCA, distances_PCA = pairwise_distances_argmin_min(km_PCA.cluster_centers_, dataset_PCA)
print(closest_PCA, distances_PCA)

while not (np.unique(closest_PCA).shape == closest_PCA.shape):
    km_PCA = kmeans_func()
    closest_PCA, distances_PCA = pairwise_distances_argmin_min(km_PCA.cluster_centers_, dataset_PCA)

    print(closest_PCA, distances_PCA)

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1200,
    n_clusters=5, n_init=3, n_jobs=6, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)
done in 1.022132396697998s
[7194  455  554 1386 7987] [0.43075846 0.44713053 0.40509507 0.39655488 0.43721841]


In [8]:
silhouette_PCA = metrics.silhouette_score(dataset_PCA, km_PCA.labels_)

silhouette_PCA

0.00576790836998088

In [10]:
health_lines = []
with open('health-dataset/health.txt', 'r') as health_txt:
    health_aux = health_txt.readlines()
    health_aux.pop(0)
    for i in health_aux:
        health_lines.append(i.split('|')[-1])
len(health_lines)

13229

In [11]:
for i in enumerate(closest_PCA):
#     print(i)
    print('Cluster {}: {}'.format(i[0], health_lines[i[1]]))

Cluster 0: RT @RachelBegunRD: A1 Fact: Heart disease is THE leading cause of death for both men AND women. It is responsible for 1 in 4 deaths in America #healthtalk

Cluster 1: RT @CDCgov: CDC #Ebola experts have advised @EmoryUniversity Hospital there is no public health concern w/ release of these patients.

Cluster 2: RT @drsanjaygupta: meet the doctor and get a first look at the room where ebola patients will be treated in the US.

Cluster 3: RT @kellywallacetv: What's the advice when your child wants to stop a physical activity &amp; you know it's good for them to keep doing it? #fi…

Cluster 4: Blood sugar is NOT what matters most for preventing heart disease in people with #diabetes, new study finds



In [12]:
labels = km_PCA.labels_
count = 0
for i in labels:
    if i == 3:
        print("Cluster {}: {}".format(i, health_lines[count]))
    count += 1

Cluster 3: It doesn't take much to damage your hearing at a sports bar or nightclub. That's why a billion people are at risk.

Cluster 3: RT @CNN: Forever young? Discover this island’s secrets to longevity on #TheWonderList w/ @BillWeirCNN 

Cluster 3: RT @CNN: Is post-traumatic stress disorder in your genes? A simple blood test may one day help tell you

Cluster 3: #FitNation: Finding the right life balance between family, work and getting fit.

Cluster 3: RT @drsanjaygupta: what are you having for dinner? a lot more #sugar thank you think..

Cluster 3: RT @cnni: Eat yourself healthy with these amazing superfoods:

Cluster 3: RT @CNN: .@RobertDowneyJr presented a child with his own 'Iron Man' robotic arm:

Cluster 3: Are you an e-hypochondriac? Searching symptoms online can be dangerous to your (mental) health

Cluster 3: Training for a triathlon? These yoga poses can help, via @MobilityMaker.

Cluster 3: Preparing for a triathlon? You might want to try this.

Cluster 3: Be careful wh

Cluster 3: 'Abuse-deterrent' OxyContin? Addicts find ways around it  

Cluster 3: Daily weigh-ins may encourage weight loss 

Cluster 3: On a night out, more friends means more drinks 

Cluster 3: 6 worthless foods to cut from your diet 

Cluster 3: How much coffee should you really be drinking? 

Cluster 3: 8 healthy foods that are also inexpensive  

Cluster 3: Chemo doesn't have to mean going bald anymore  

Cluster 3: NJ mom freezes eggs for her daughter

Cluster 3: Modified Mediterranean diets also good for weight loss 

Cluster 3: How do people spread stress to each other?  

Cluster 3: How to add an inch to your arms without ever picking up a weight   

Cluster 3: Why head transplants won't happen anytime soon 

Cluster 3: Online yoga apps, classes take off  

Cluster 3: For many marathon runners pacing may be a matter of gender 

Cluster 3: Why drinking in a large group could get you drunker 

Cluster 3: 3 steps to making stress work for you  

Cluster 3: 14 surprising foods fo


Cluster 3: RT @anniehauser: Take charge of your health this year! Your first step: Joining me + @SELFmagazine on Google+ today @ 4! #13for13

Cluster 3: How much water you should be drinking every day:

Cluster 3: Quiz: How well do you know your favorite fruits and veggies?

Cluster 3: More and more U.S. children are being diagnosed with attention-deficit/hyperactivity disorder #ADHD:

Cluster 3: Does drinking water before your meal help you eat less? Find out!

Cluster 3: Did you know flossing can help prevent heart disease? 7 health problems that start in your mouth:

Cluster 3: 10 foods w/ trans fat to cut out of your diet starting now:

Cluster 3: 15 ways to burn 150 calories:

Cluster 3: Having a no good terrible bad day? Eat this:

Cluster 3: 7 ways to have a little fun (and make a little money!) while you lose weight:

Cluster 3: What's really to blame for your weight gain:

Cluster 3: Did you hit snooze this morning? And then again 10 minutes later? What that says about your p

In [None]:
run = True
if run:
    Nc = range(2, 20)

    kmeans = []
    score = []
    silhouettes = []

    for i in Nc:
        kmeans.append(KMeans(n_clusters=i, n_init=3, n_jobs=i, max_iter=1200))

    for i in range(len(kmeans)):
        print('Fitting kmeans with {} clusters'.format(kmeans[i].n_clusters))
        score.append(kmeans[i].fit(dataset_PCA).inertia_)
        silhouettes.append(metrics.silhouette_score(dataset_PCA, kmeans[i].labels_))

Fitting kmeans with 2 clusters
Fitting kmeans with 3 clusters
Fitting kmeans with 4 clusters
Fitting kmeans with 5 clusters
Fitting kmeans with 6 clusters
Fitting kmeans with 7 clusters
Fitting kmeans with 8 clusters
Fitting kmeans with 9 clusters


In [None]:
if run:
    plt.plot(Nc,score)

    plt.xlabel('Number of Clusters')

    plt.ylabel('Sum of squared distances')

    plt.title('Elbow Curve')

    plt.show()
    
    
    plt.plot(Nc,silhouettes)

    plt.xlabel('Number of Clusters')

    plt.ylabel('Silhouettes')

    plt.title('Silhouettes x Number of Clusters')

    plt.show()