In [1]:
#########################################################################################################
#########################################################################################################
# import the packages that I need

import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import brewer2mpl
import os
import sys
import unicodecsv as csv
import matplotlib.dates as md
from pandas.tseries.offsets import *
from matplotlib.backends.backend_pdf import PdfPages
import codecs
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from time import time
from scipy import stats

#########################################################################################################
## IMPORT MY DATA

# DAD sleep data: https://console.treasuredata.com/jobs/54403720
# current sleep data: https://console.treasuredata.com/jobs/56191379
data = pd.read_csv('/Users/heather/DataDump/sleep_clustering.csv')

# DAD step data: https://console.treasuredata.com/jobs/54403748
# current step data: https://console.treasuredata.com/jobs/56191342
step_data = pd.read_csv('/Users/heather/DataDump/step_data.csv')

# DAD watch data: https://console.treasuredata.com/jobs/54403786
# current watch data: https://console.treasuredata.com/jobs/56191279
watch_data = pd.read_csv('/Users/heather/DataDump/clustering_watch_data.csv')

# DAD demo data: https://console.treasuredata.com/jobs/54403810
# current demo data: https://console.treasuredata.com/jobs/56191243
demo_data = pd.read_csv('/Users/heather/DataDump/clustering_demo_data.csv')

# hours on wrist data: https://console.treasuredata.com/jobs/56191063
tow_data = pd.read_csv('/Users/heather/DataDump/clustering_tow_data.csv')

# non-SMS notification sources: https://console.treasuredata.com/jobs/56217380
# (most common notifications right now) -- https://console.treasuredata.com/jobs/56214142
non_sms_notifs = pd.read_csv('/Users/heather/DataDump/non_sms_notifs.csv')

msg = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Messenger']
fb_msg = msg[msg.data_0_package_name == 'com.facebook.orca']
fb = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Facebook']
wa = non_sms_notifs[non_sms_notifs.data_0_app_name == 'WhatsApp']
hang = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Hangouts']
sc = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Snapchat']
insta = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Instagram']
maps = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Maps']
maps = non_sms_notifs[non_sms_notifs.data_0_package_name == 'com.google.android.apps.maps']
gmail = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Gmail']
gmail = non_sms_notifs[non_sms_notifs.data_0_package_name == 'com.google.android.gm']
inbox = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Inbox']
inbox = non_sms_notifs[non_sms_notifs.data_0_package_name == 'com.google.android.apps.inbox']
gplay = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Google Play Store']
gplay = non_sms_notifs[non_sms_notifs.data_0_package_name == 'com.android.vending']


# SMS notifications received by android users: https://console.treasuredata.com/jobs/56219442
# updated query: https://console.treasuredata.com/jobs/56223406
sms_notifs = pd.read_csv('/Users/heather/DataDump/sms_notifs.csv')


#########################################################################################################
# Set up some better defaults for matplotlib
from matplotlib import rcParams

#colorbrewer2 Dark2 qualitative color table
dark2_colors = brewer2mpl.get_map('Paired', 'Qualitative', 11).mpl_colors

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 18
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
        




In [2]:
df2 = data[data.peb_start_sleep < 660]
df2['peb_start_sleep'] = df2.peb_start_sleep + 1440

df3 = data[data.peb_start_sleep >= 660]

data = df2.append(df3)

avg_matrix = data.groupby('user_id').mean()
avg_matrix = avg_matrix.reset_index()
avg_matrix = avg_matrix.rename(columns={'peb_start_sleep':'avg_bedtime','peb_stop_sleep':'avg_waketime','peb_total_sleep':'avg_sleep','peb_deep_sleep':'avg_deep_sleep','num_sleep_sessions':'avg_sleep_sessions'})

std_matrix = data.groupby('user_id').std()
std_matrix = std_matrix.reset_index()
std_matrix = std_matrix.rename(columns={'peb_start_sleep':'std_bedtime','peb_stop_sleep':'std_waketime','peb_total_sleep':'std_sleep','peb_deep_sleep':'std_deep_sleep','num_sleep_sessions':'std_sleep_sessions'})

matrix = avg_matrix.merge(std_matrix,on='user_id')

matrix['avg_bedtime'] = matrix.avg_bedtime/60
matrix['avg_waketime'] = matrix.avg_waketime/60
matrix['avg_sleep'] = matrix.avg_sleep/60
matrix['avg_deep_sleep'] = matrix.avg_deep_sleep/60

# feature rescaling on the data prior to k-means clustering!
# here I will use the MinMaxScaler from scikit-learn to transform the features to values between 0 and 1

scaler = MinMaxScaler()

matrix['rescaled_bedtime'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.avg_bedtime)))
matrix['rescaled_waketime'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.avg_waketime)))
matrix['rescaled_sleep'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.avg_sleep)))
matrix['rescaled_deep_sleep'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.avg_deep_sleep)))
matrix['rescaled_sleep_sessions'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.avg_sleep_sessions)))
matrix['rescaled_std_bedtime'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.std_bedtime)))
matrix['rescaled_std_waketime'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.std_waketime)))
matrix['rescaled_std_sleep'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.std_sleep)))
matrix['rescaled_std_deep_sleep'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.std_deep_sleep)))
matrix['rescaled_std_sleep_sessions'] = pd.DataFrame(scaler.fit_transform(np.array(matrix.std_sleep_sessions)))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [3]:
# let's try some k-means clustering!

#% pylab inline

features = ['rescaled_bedtime','rescaled_waketime','rescaled_sleep','rescaled_deep_sleep']
num_clusters = 4

numfeat = len(features)

training = matrix[features]

# n_init = # of times the algorithm is initialized (i.e. how many times does it come up with clusters?)
# typically, for less than 10 clusters, multiple rounds of optimizations are optimal (typically 50-1000 are used, according to Andrew Ng)
# for our purposes, since we have less than 10 clusters, we will go with n_clusters=500

# init = method for initialization, or choosing the initial centroids
# common methods include random initialization or k-means++
# scikit-learn's default is k-means++, and it has been suggested that this method yields considerable improvement in the final error of k-means
# therefore, we will go with k-means++

km = KMeans(n_clusters=num_clusters,init='k-means++',n_init=500)

km.fit(training)

centroids = km.cluster_centers_
km_labels = km.labels_
inertia = km.inertia_


for i in range(num_clusters):
    ds = training[km_labels==i]
    plt.plot(ds.iloc[:,0],ds.iloc[:,1],'o', c = dark2_colors[i], alpha = 0.7)
    lines = plt.plot(centroids[i,0],'kx')
    
    
plt.ylabel('avg. wake time')
plt.xlabel('avg. bedtime')
#plt.axis([19.5, 30, 0, 11])

inertia
#plt.show()

#s_coeff = metrics.silhouette_score(training,km_labels,metric='euclidean')

446.6611521996814

In [9]:
matrix['cluster_label'] = pd.DataFrame(km_labels)

cluster_ids = matrix[['user_id','cluster_label']]


wa = non_sms_notifs[non_sms_notifs.data_0_app_name == 'WhatsApp']

var = wa

def notif_source(var):
    var['day_of_week'] = var['day_of_week'].astype(int)
    cluster_info = var.merge(cluster_ids,on='user_id')
    cluster_info = cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
    clusters = cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
    cluster_activity = clusters.groupby(['cluster_label']).mean().reset_index()
    clusters_daily = cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
    clusters_daily = clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()
    


In [13]:
notif_source(var)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
cluster_info

NameError: name 'cluster_info' is not defined

In [23]:
var = wa

var['day_of_week'] = var['day_of_week'].astype(int)

cluster_info = var.merge(cluster_ids,on='user_id')

cluster_info = cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]

clusters = cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
cluster_activity = clusters.groupby(['cluster_label']).mean().reset_index()
clusters_daily = cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
clusters_daily = clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()
clusters_daily

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,day_of_week,cluster_label,notif_sent_to_watch
0,1,0,35.35712
1,1,1,19.513228
2,1,2,47.046402
3,1,3,26.606178
4,2,0,37.333054
5,2,1,19.896823
6,2,2,45.959648
7,2,3,29.698125
8,3,0,38.993207
9,3,1,20.648094


In [None]:
wa = non_sms_notifs[non_sms_notifs.data_0_app_name == 'WhatsApp']
wa['day_of_week'] = wa['day_of_week'].astype(int)
wa_cluster_info = wa.merge(cluster_ids,on='user_id')
wa_cluster_info = wa_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
wa_clusters = wa_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
wa_cluster_activity = wa_clusters.groupby(['cluster_label']).mean().reset_index()
wa_clusters_daily = wa_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
wa_clusters_daily = wa_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()


In [4]:
# let's do some correlations of cluster group with other variables!!

matrix['cluster_label'] = pd.DataFrame(km_labels)

cluster_ids = matrix[['user_id','cluster_label']]

# activity (step) data

data_with_cluster_info = step_data.merge(cluster_ids,on='user_id')
step_group_analysis = data_with_cluster_info.groupby(['cluster_label']).mean().reset_index()
daily_cluster_activity = data_with_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
daily_cluster_activity = daily_cluster_activity.groupby(['day_of_week','cluster_label']).mean().reset_index()

# sleep data

sleep_data_with_cluster_info = data.merge(cluster_ids,on='user_id')
sleep_cluster_activity = sleep_data_with_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean()
sleep_cluster_activity = sleep_cluster_activity/60
sleep_cluster_activity = sleep_cluster_activity.reset_index()
sleep_cluster_activity = sleep_cluster_activity.groupby(['day_of_week','cluster_label']).mean().reset_index()

sleep_session_clusters = sleep_data_with_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
sleep_session_cluster_analysis = sleep_session_clusters.groupby(['cluster_label']).mean()
sleep_session_cluster_daily = sleep_data_with_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
sleep_session_cluster_daily = sleep_session_cluster_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# watch data

watch_cluster_info = watch_data.merge(cluster_ids,on='user_id')
watches = watch_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
watches['rescaled_notif_count'] = pd.DataFrame(scaler.fit_transform(np.array(watches.notif_count)))
watches['rescaled_incoming_call_count'] = pd.DataFrame(scaler.fit_transform(np.array(watches.incoming_call_count)))
watches['rescaled_outgoing_call_count'] = pd.DataFrame(scaler.fit_transform(np.array(watches.outgoing_call_count)))
watches['rescaled_phone_answer_count'] = pd.DataFrame(scaler.fit_transform(np.array(watches.phone_answer_count)))
watches['rescaled_phone_call_elapsed_time'] = pd.DataFrame(scaler.fit_transform(np.array(watches.phone_call_elapsed_time)))
watches['sociability_avg']= pd.DataFrame((watches.rescaled_notif_count+watches.rescaled_incoming_call_count+watches.rescaled_outgoing_call_count)/3)
watch_cluster_activity = watches.groupby(['cluster_label']).mean().reset_index()
watch_cluster_activity_daily = watch_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
watch_cluster_activity_daily = watch_cluster_activity_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# hours on wrist data

tow_data_with_cluster_info = tow_data.merge(cluster_ids,on='user_id')
tow_clusters = tow_data_with_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
tow_cluster_activity = tow_clusters.groupby(['cluster_label']).mean().reset_index()
tow_clusters_daily = tow_data_with_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
tow_clusters_daily = tow_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# notification source data ######################################################

# SMS

sms_cluster_info = sms_notifs.merge(cluster_ids,on='user_id')
#non_sms_cluster_info = non_sms_notifs.merge(cluster_ids,on='user_id')
sms_clusters = sms_cluster_info.groupby(['user_id','cluster_label']).median().reset_index()
sms_cluster_activity = sms_clusters.groupby(['cluster_label']).median().reset_index()
sms_clusters_daily = sms_cluster_info.groupby(['user_id','day_of_week','cluster_label']).median().reset_index()
sms_clusters_daily = sms_clusters_daily.groupby(['day_of_week','cluster_label']).median().reset_index()

# FB Messenger

msg = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Messenger']
fb_msg = msg[msg.data_0_package_name == 'com.facebook.orca']
fb_msg['day_of_week'] = fb_msg['day_of_week'].astype(int)
fb_msg_cluster_info = fb_msg.merge(cluster_ids,on='user_id')
fb_msg_cluster_info = fb_msg_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
fb_msg_clusters = fb_msg_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
fb_msg_cluster_activity = fb_msg_clusters.groupby(['cluster_label']).mean().reset_index()
fb_msg_clusters_daily = fb_msg_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
fb_msg_clusters_daily = fb_msg_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# WhatsApp

wa = non_sms_notifs[non_sms_notifs.data_0_app_name == 'WhatsApp']
wa['day_of_week'] = wa['day_of_week'].astype(int)
wa_cluster_info = wa.merge(cluster_ids,on='user_id')
wa_cluster_info = wa_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
wa_clusters = wa_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
wa_cluster_activity = wa_clusters.groupby(['cluster_label']).mean().reset_index()
wa_clusters_daily = wa_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
wa_clusters_daily = wa_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# google maps

maps['day_of_week'] = maps['day_of_week'].astype(int)
maps_cluster_info = maps.merge(cluster_ids,on='user_id')
maps_cluster_info = maps_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
maps_clusters = maps_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
maps_cluster_activity = maps_clusters.groupby(['cluster_label']).mean().reset_index()
maps_clusters_daily = maps_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
maps_clusters_daily = maps_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# inbox

inbox['day_of_week'] = inbox['day_of_week'].astype(int)
inbox_cluster_info = inbox.merge(cluster_ids,on='user_id')
inbox_cluster_info = inbox_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
inbox_clusters = inbox_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
inbox_cluster_activity = inbox_clusters.groupby(['cluster_label']).mean().reset_index()
inbox_clusters_daily = inbox_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
inbox_clusters_daily = inbox_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# gmail

gmail['day_of_week'] = gmail['day_of_week'].astype(int)
gmail_cluster_info = gmail.merge(cluster_ids,on='user_id')
gmail_cluster_info = gmail_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
gmail_clusters = gmail_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
gmail_cluster_activity = gmail_clusters.groupby(['cluster_label']).mean().reset_index()
gmail_clusters_daily = gmail_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
gmail_clusters_daily = gmail_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# Google Play

gplay['day_of_week'] = gplay['day_of_week'].astype(int)
gplay_cluster_info = gplay.merge(cluster_ids,on='user_id')
gplay_cluster_info = gplay_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
gplay_clusters = gplay_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
gplay_cluster_activity = gplay_clusters.groupby(['cluster_label']).mean().reset_index()
gplay_clusters_daily = gplay_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
gplay_clusters_daily = gplay_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()


# Hangouts

hang = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Hangouts']
hang['day_of_week'] = hang['day_of_week'].astype(int)
hang_cluster_info = hang.merge(cluster_ids,on='user_id')
hang_cluster_info = hang_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
hang_clusters = hang_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
hang_cluster_activity = hang_clusters.groupby(['cluster_label']).mean().reset_index()
hang_clusters_daily = hang_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
hang_clusters_daily = hang_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()


# instagram

insta = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Instagram']
insta['day_of_week'] = insta['day_of_week'].astype(int)
insta_cluster_info = insta.merge(cluster_ids,on='user_id')
insta_cluster_info = insta_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
insta_clusters = insta_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
insta_cluster_activity = insta_clusters.groupby(['cluster_label']).mean().reset_index()
insta_clusters_daily = insta_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
insta_clusters_daily = insta_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# snapchat

sc = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Snapchat']
sc['day_of_week'] = sc['day_of_week'].astype(int)
sc_cluster_info = sc.merge(cluster_ids,on='user_id')
sc_cluster_info = sc_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
sc_clusters = sc_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
sc_cluster_activity = sc_clusters.groupby(['cluster_label']).mean().reset_index()
sc_clusters_daily = sc_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
sc_clusters_daily = sc_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()

# facebook (NOT FB messenger, simply fB notifications of any kind)

fb = non_sms_notifs[non_sms_notifs.data_0_app_name == 'Facebook']
fb['day_of_week'] = fb['day_of_week'].astype(int)
fb_cluster_info = fb.merge(cluster_ids,on='user_id')
fb_cluster_info = fb_cluster_info[['user_id','day_of_week','cluster_label','notif_sent_to_watch']]
fb_clusters = fb_cluster_info.groupby(['user_id','cluster_label']).mean().reset_index()
fb_cluster_activity = fb_clusters.groupby(['cluster_label']).mean().reset_index()
fb_clusters_daily = fb_cluster_info.groupby(['user_id','day_of_week','cluster_label']).mean().reset_index()
fb_clusters_daily = fb_clusters_daily.groupby(['day_of_week','cluster_label']).mean().reset_index()



# gender data

demo_data_with_cluster_info = demo_data.merge(cluster_ids,on='user_id')
gender_info = demo_data_with_cluster_info.groupby(['gender','cluster_label']).count().reset_index()

# Let's look at the group averages of the groups separated via clustering:

group_summary = matrix.groupby(['cluster_label']).mean().reset_index()
group_summary = matrix[['avg_bedtime','avg_sleep','avg_waketime','avg_deep_sleep','cluster_label']]
grp_summary = group_summary.groupby(['cluster_label']).mean().reset_index()

a = grp_summary[grp_summary.avg_bedtime == min(grp_summary.avg_bedtime)].cluster_label
b = grp_summary[grp_summary.avg_bedtime == max(grp_summary.avg_bedtime)].cluster_label
c = grp_summary[grp_summary.avg_sleep == min(grp_summary.avg_sleep)].cluster_label
d = grp_summary[grp_summary.avg_sleep == max(grp_summary.avg_sleep)].cluster_label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [60]:
# graphing the platform breakdown of this group

platform_demo = step_data.groupby(['user_id','platform']).count().reset_index().groupby('platform').count().reset_index()


android = float(platform_demo[platform_demo.platform=='android'].user_id)
ios = float(platform_demo[platform_demo.platform=='ios'].user_id)

platforms = [android,ios]


labels = 'android','ios'

plt.figure()
plt.subplot(aspect=True)
plt.pie(platforms, labels=labels, colors = dark2_colors[4:8:2], autopct='%i%%')
plt.title("platform breakdown")
plt.show()

In [11]:
# graphing the demographic makeup of this group

gender_df = demo_data.groupby('gender').count().reset_index()

fem = float(gender_df[gender_df.gender=='female'].user_id)
male = float(gender_df[gender_df.gender=='male'].user_id)
unknown = float(gender_df[gender_df.gender=='unknown-andy'].user_id) + float(gender_df[gender_df.gender=='unknown-mostly_female'].user_id)+float(gender_df[gender_df.gender=='unknown-mostly_male'].user_id)

genders = [male,fem,unknown]

# below is a pie chart of the percentage of users who wore their pebble for 1, 2, 3, 4 ... 8 consecutive days

labels = 'male','female','unknown'

plt.figure()
plt.subplot(aspect=True)
plt.pie(genders, labels=labels, colors = dark2_colors[6:9], autopct='%i%%')
plt.title("gender breakdown")
plt.show()

In [12]:
# graphing k-means results with sleep vs. deep sleep

for i in range(num_clusters):
    ds = training[km_labels==i]
    plt.plot(ds.iloc[:,2],ds.iloc[:,3],'o', c = dark2_colors[i], alpha = 0.7)
    lines = plt.plot(centroids[i,0],'kx')
    
    
plt.ylabel('avg. deep sleep')
plt.xlabel('avg. sleep')
#plt.axis([19.5, 30, 0, 11])

plt.show()


In [13]:
# graphing k-means results with sleep vs. bedtime

for i in range(num_clusters):
    ds = training[km_labels==i]
    plt.plot(ds.iloc[:,0],ds.iloc[:,2],'o', c = dark2_colors[i], alpha = 0.7)
    lines = plt.plot(centroids[i,0],'kx')
    
    
plt.ylabel('avg. sleep')
plt.xlabel('avg. bedtime')
#plt.axis([19.5, 30, 0, 11])

plt.show()


In [14]:
# 3D graphing of k-means results

fig = plt.figure()
ax = fig.gca(projection='3d')

ax.set_title('Sleep Phenotypes')
ax.set_xlabel('Bedtime')
ax.set_ylabel('Wake time')
ax.set_zlabel('Total sleep')

#ax.set_xlim(19.5,30)
#ax.set_ylim(2,11)
#ax.set_zlim(4,12)

ax.view_init(elev=12,azim=40) # elevation and angle
ax.dist=12 # distance

#colors = ['r','c','m','b']



for i in range(num_clusters):
    ds = training[km_labels==i]
    ax.scatter(ds.iloc[:,0],ds.iloc[:,1],ds.iloc[:,2],'o',c=dark2_colors[i])
    
    
plt.show()

In [15]:
# Gender cluster #1

gender_grp1 = gender_info[gender_info.cluster_label==int(a)]

fem = float(gender_grp1[gender_grp1.gender=='female'].user_id)
male = float(gender_grp1[gender_grp1.gender=='male'].user_id)
unknown = float(gender_grp1[gender_grp1.gender=='unknown-andy'].user_id) + float(gender_grp1[gender_grp1.gender=='unknown-mostly_female'].user_id)+float(gender_grp1[gender_grp1.gender=='unknown-mostly_male'].user_id)

genders = [male,fem,unknown]

# below is a pie chart of the percentage of users who wore their pebble for 1, 2, 3, 4 ... 8 consecutive days

labels = 'male','female','unknown'

plt.figure()
plt.subplot(aspect=True)
plt.pie(genders, labels=labels, colors = dark2_colors[6:9], autopct='%i%%')
plt.title("gender breakdown cluster 1")
plt.show()

In [None]:
# Gender cluster #2

gender_grp2 = gender_info[gender_info.cluster_label==int(b)]

fem = float(gender_grp2[gender_grp2.gender=='female'].user_id)
male = float(gender_grp2[gender_grp2.gender=='male'].user_id)
unknown = float(gender_grp2[gender_grp2.gender=='unknown-andy'].user_id) + float(gender_grp2[gender_grp2.gender=='unknown-mostly_female'].user_id)+float(gender_grp2[gender_grp2.gender=='unknown-mostly_male'].user_id)

genders = [male,fem,unknown]

# below is a pie chart of the percentage of users who wore their pebble for 1, 2, 3, 4 ... 8 consecutive days

labels = 'male','female','unknown'

plt.figure()
plt.subplot(aspect=True)
plt.pie(genders, labels=labels, colors = dark2_colors[6:9], autopct='%i%%')
plt.title("gender breakdown cluster 2")
plt.show()

In [6]:
# Gender cluster #3

gender_grp3 = gender_info[gender_info.cluster_label==int(c)]

fem = float(gender_grp3[gender_grp3.gender=='female'].user_id)
male = float(gender_grp3[gender_grp3.gender=='male'].user_id)
unknown = float(gender_grp3[gender_grp3.gender=='unknown-andy'].user_id) + float(gender_grp3[gender_grp3.gender=='unknown-mostly_female'].user_id)+float(gender_grp3[gender_grp3.gender=='unknown-mostly_male'].user_id)

genders = [male,fem,unknown]

# below is a pie chart of the percentage of users who wore their pebble for 1, 2, 3, 4 ... 8 consecutive days

labels = 'male','female','unknown'

plt.figure()
plt.subplot(aspect=True)
plt.pie(genders, labels=labels, colors = dark2_colors[6:9], autopct='%i%%')
plt.title("gender breakdown cluster 3")
plt.show()

In [7]:
# Gender cluster #4

gender_grp4 = gender_info[gender_info.cluster_label==int(d)]

fem = float(gender_grp4[gender_grp4.gender=='female'].user_id)
male = float(gender_grp4[gender_grp4.gender=='male'].user_id)
unknown = float(gender_grp4[gender_grp4.gender=='unknown-andy'].user_id) + float(gender_grp4[gender_grp4.gender=='unknown-mostly_female'].user_id)+float(gender_grp4[gender_grp4.gender=='unknown-mostly_male'].user_id)

genders = [male,fem,unknown]

# below is a pie chart of the percentage of users who wore their pebble for 1, 2, 3, 4 ... 8 consecutive days

labels = 'male','female','unknown'

plt.figure()
plt.subplot(aspect=True)
plt.pie(genders, labels=labels, colors = dark2_colors[6:9], autopct='%i%%')
plt.title("gender breakdown cluster 4")
plt.show()

In [5]:
# make some functions for graphing

def bar_graph_scale(y,label,y_min,y_max):
    x = np.arange(1,5)
    y_val = pd.Series([y[int(a)],y[int(b)],y[int(c)],y[int(d)]])
    colors = [dark2_colors[1],dark2_colors[3],dark2_colors[0],dark2_colors[2]]
    plt.figure()
    ax = plt.gca()
    ax.tick_params(axis='x',labelsize=16)
    ax.tick_params(axis='y',labelsize=16)
    plt.xticks(np.arange(min(x),max(x)+1,1.0))

    plt.bar(x - 0.4,y_val,color=colors[0:4])

    plt.ylabel(label,size=20)
    plt.xlabel('Cluster',size=20)
    plt.title(label,size=20)

    plt.axis([0.5, 4.5, y_min, y_max])
    
    remove_border(left=False)
    plt.show()

def bar_graph(y,label):
    x = np.arange(1,5)
    y_val = pd.Series([y[int(a)],y[int(b)],y[int(c)],y[int(d)]])
    colors = [dark2_colors[1],dark2_colors[3],dark2_colors[0],dark2_colors[2]]
    plt.figure()
    ax = plt.gca()
    ax.tick_params(axis='x',labelsize=16)
    ax.tick_params(axis='y',labelsize=16)
    plt.xticks(np.arange(min(x),max(x)+1,1.0))

    plt.bar(x - 0.4,y_val,color=colors[0:4])

    plt.ylabel(label,size=20)
    plt.xlabel('Cluster',size=20)
    plt.title(label,size=20)

    remove_border(left=False)
    plt.show()


def line_graph(mat,var,label):
    x = range(1,8)
    plt.figure()

    cluster_code = pd.Series([int(a),int(b),int(c),int(d)])
    colors = [dark2_colors[1],dark2_colors[3],dark2_colors[0],dark2_colors[2]]

    for i in np.arange(4):
        plt.plot(x, mat[mat.cluster_label==cluster_code[i]][var],c=colors[i])
    
    plt.ylabel(label)
    plt.xlabel('day of week (1=Mon; 7=Sun)')
    plt.legend( x, loc = 'center left',bbox_to_anchor=(1,0.5))
    plt.title('%s by Cluster' %(label))

    remove_border(left=False)
    plt.show()

In [6]:
grp_summary

Unnamed: 0,cluster_label,avg_bedtime,avg_sleep,avg_waketime,avg_deep_sleep
0,0,23.874477,6.509093,6.682488,2.354206
1,1,22.781748,7.259396,6.253743,3.289089
2,2,25.009046,6.84955,8.128574,2.674557
3,3,23.611413,7.813337,7.625183,3.747682


In [7]:
# Group average for bedtime

y = grp_summary.avg_bedtime
y_min = 20
y_max = 26
label = 'Avg Bedtime'

bar_graph_scale(y,label,y_min,y_max)


In [19]:
# Group average for waketime

y = grp_summary.avg_waketime
y_min = 5
y_max = 9
label = 'Avg Waketime'

bar_graph_scale(y,label,y_min,y_max)


In [20]:
# Group average for total sleep

y = grp_summary.avg_sleep
y_min = 5
y_max = 9
label = 'Avg Sleep Duration'

bar_graph_scale(y,label,y_min,y_max)


In [21]:
# Group average for deep sleep

y = grp_summary.avg_deep_sleep
y_min = 0
y_max = 4
label = 'Avg Deep Sleep'

bar_graph_scale(y,label,y_min,y_max)


In [36]:
fb_msg_cluster_activity

Unnamed: 0,cluster_label,notif_sent_to_watch
0,0,12.841728
1,1,19.15284
2,2,8.20495
3,3,11.252861


In [6]:
# Avg Fb messages received on watch by android users, split by cluster

y = fb_msg_cluster_activity.notif_sent_to_watch

label = '# FB messenger notifs'

bar_graph(y,label)

In [88]:
# FB messages received by day

mat = fb_msg_clusters_daily
var = 'notif_sent_to_watch'
label = '# FB messenger notifs'

line_graph(mat,var,label)

In [7]:
# Avg Google Maps messages received on watch by android users, split by cluster

y = maps_cluster_activity.notif_sent_to_watch

label = '# G-Maps notifs'

bar_graph(y,label)

In [None]:
# Google Maps messages received by day

mat = maps_clusters_daily
var = 'notif_sent_to_watch'
label = '# G-Maps notifs'

line_graph(mat,var,label)

In [8]:
# Avg Google Mail messages received on watch by android users, split by cluster

y = gmail_cluster_activity.notif_sent_to_watch

label = '# GMail notifs'

bar_graph(y,label)

In [6]:
# Google Mail messages received by day

mat = gmail_clusters_daily
var = 'notif_sent_to_watch'
label = '# GMail notifs'

line_graph(mat,var,label)

In [1]:
# Avg Inbox messages received on watch by android users, split by cluster

y = inbox_cluster_activity.notif_sent_to_watch

label = '# Inbox notifs'

bar_graph(y,label)

NameError: name 'inbox_cluster_activity' is not defined

In [None]:
# Inbox messages received by day

mat = inbox_clusters_daily
var = 'notif_sent_to_watch'
label = '# Inbox notifs'

line_graph(mat,var,label)

In [17]:
gplay_cluster_activity

Unnamed: 0,cluster_label,day_of_week,notif_sent_to_watch
0,0,3.712004,9.461017
1,1,3.862744,9.753684
2,2,3.970465,11.686553
3,3,4.038819,10.538986


In [18]:
# Avg Google Play Store messages received on watch by android users, split by cluster

y = gplay_cluster_activity.notif_sent_to_watch

label = '# G-Play Store notifs'

bar_graph(y,label)

In [None]:
# Google Play Store messages received by day

mat = gplay_clusters_daily
var = 'notif_sent_to_watch'
label = '# G-Play Store notifs'

line_graph(mat,var,label)

In [21]:
# Avg Instagram messages received on watch by android users, split by cluster

y = insta_cluster_activity.notif_sent_to_watch

label = '# Instagram notifs'

bar_graph(y,label)

In [22]:
# Instagram messages received by day

mat = insta_clusters_daily
var = 'notif_sent_to_watch'
label = '# Instagram notifs'

line_graph(mat,var,label)

In [23]:
# Avg Snapchat messages received on watch by android users, split by cluster

y = sc_cluster_activity.notif_sent_to_watch

label = '# Snapchat notifs'

bar_graph(y,label)

In [24]:
# Snapchat messages received by day

mat = sc_clusters_daily
var = 'notif_sent_to_watch'
label = '# Snapchat notifs'

line_graph(mat,var,label)

In [25]:
# Avg Hangout notifs received on watch by android users, split by cluster

y = hang_cluster_activity.notif_sent_to_watch

label = '# Hangout notifs'

bar_graph(y,label)

In [None]:
# Hangout messages received by day

mat = hang_clusters_daily
var = 'notif_sent_to_watch'
label = '# Hangout notifs'

line_graph(mat,var,label)

In [7]:
# Avg FB notifs received on watch by android users, split by cluster

y = fb_cluster_activity.notif_sent_to_watch

label = '# FB notifs'

bar_graph(y,label)

In [8]:
# FB notifs received by day

mat = fb_clusters_daily
var = 'notif_sent_to_watch'
label = '# FB notifs'

line_graph(mat,var,label)

In [None]:
# Avg WhatsApp messages received on watch by android users, split by cluster

y = wa_cluster_activity.notif_sent_to_watch

label = '# WhatsApp notifs'

bar_graph(y,label)

In [None]:
# WhatsApp messages received by day

mat = wa_clusters_daily
var = 'notif_sent_to_watch'
label = '# WhatsApp notifs'

line_graph(mat,var,label)

In [16]:
# Avg SMS received on watch by android users, split by cluster

y = sms_cluster_activity.notif_sent_to_watch

label = '# SMS received'

bar_graph(y,label)

In [17]:
# SMS received by day

mat = sms_clusters_daily
var = 'notif_sent_to_watch'
label = '# SMS received'

line_graph(mat,var,label)

In [21]:
# Cluster groups avg for time on wrist

y = tow_cluster_activity.hours_on_wrist
y_min = 0
y_max = 24

label = 'Hours on Wrist'

bar_graph_scale(y,label,y_min,y_max)


In [25]:
# HOURS ON WRIST BY DAY

mat = tow_clusters_daily
var = 'hours_on_wrist'
label = 'Hours on Wrist'

line_graph(mat,var,label)

In [14]:
# Group average for total steps

y = step_group_analysis.total_step_count

label = 'Avg Steps'

bar_graph(y,label)


In [12]:
# CLUSTERS BY ACTIVITY BY DAY

mat = daily_cluster_activity
var = 'total_step_count'
label = 'Avg Steps'

line_graph(mat,var,label)

In [13]:
# CLUSTERS BY bedtime BY DAY

mat = sleep_cluster_activity
var = 'peb_start_sleep'
label = 'Bedtime'

line_graph(mat,var,label)

In [14]:
# CLUSTERS BY wake-time BY DAY

mat = sleep_cluster_activity
var = 'peb_stop_sleep'
label = 'Wake Time'

line_graph(mat,var,label)

In [15]:
# CLUSTERS BY deep sleep BY DAY

mat = sleep_cluster_activity
var = 'peb_deep_sleep'
label = 'Deep Sleep'

line_graph(mat,var,label)


In [16]:
# CLUSTERS BY sleep BY DAY

mat = sleep_cluster_activity
var = 'peb_total_sleep'
label = 'Total Sleep'

line_graph(mat,var,label)


In [17]:
# sleep session cluster analysis

y = sleep_session_cluster_analysis.num_sleep_sessions
label = '# sleep sessions'

bar_graph(y,label)


In [18]:
# sleep session cluster analysis BY DAY

mat = sleep_session_cluster_daily
var = 'num_sleep_sessions'
label = '# Sleep Sessions'

line_graph(mat,var,label)

In [19]:
# graphing the cluster compositions

cluster_counts = cluster_ids.groupby(['cluster_label']).count().reset_index()
cluster_nums = pd.Series([cluster_counts.user_id[int(a)],cluster_counts.user_id[int(b)],cluster_counts.user_id[int(c)],cluster_counts.user_id[int(d)]])

labels = '1','2','3','4'


plt.figure()
plt.subplot(aspect=True)
plt.pie(cluster_nums, labels=labels, colors = [dark2_colors[1],dark2_colors[3],dark2_colors[0],dark2_colors[2]], autopct='%i%%')
plt.title("cluster composition")
plt.show()

In [7]:
# watch cluster analysis: phone call elapsed time

y = watch_cluster_activity.phone_call_elapsed_time
label = 'Avg Phone Call Time'

bar_graph(y,label)

In [None]:
y = watch_cluster_activity.phone_answer_count
label = 'Avg Phone Answer #'

bar_graph(y,label)

In [None]:
y = watch_cluster_activity.outgoing_call_count
label = 'Avg Outgoing Call #'

bar_graph(y,label)

In [6]:
y = watch_cluster_activity.incoming_call_count
label = 'Avg Incoming Call #'

bar_graph(y,label)

In [None]:
y = watch_cluster_activity.notif_count
label = 'Avg Notif #'

bar_graph(y,label)

In [6]:
y = watch_cluster_activity.buttons_pressed
label = 'Avg Buttons Pressed'

bar_graph(y,label)

In [18]:
watch_cluster_activity.head(2)

Unnamed: 0,cluster_label,day_of_week,day_of_year,buttons_pressed,backlight_on_count,notif_count,notif_count_dnd,vibrate_count,alarm_count,incoming_call_count,...,phone_decline_count,phone_call_elapsed_time,future_scroll_count,past_scroll_count,rescaled_notif_count,rescaled_incoming_call_count,rescaled_outgoing_call_count,rescaled_phone_answer_count,rescaled_phone_call_elapsed_time,sociability_avg
0,0,4.020431,52.344181,79.442227,155.393386,50.730572,7.143663,84.987352,0.90446,1.747983,...,0.126018,20249497.146294,0.705815,0.379908,0.035438,0.050712,0.00041,0.003673,0.238384,0.028854
1,1,4.028132,52.314471,69.373931,130.280046,43.30934,6.456044,65.011523,0.677752,1.373939,...,0.101163,19904311.843522,0.617947,0.387928,0.030254,0.03986,0.000381,0.002474,0.23432,0.023498


In [19]:
y = watch_cluster_activity.sociability_avg
label = 'Avg Sociability Score'

bar_graph(y,label)

In [18]:
# let's run a t-test to see if there is a sig difference between the sociability score for groups 1 vs 2

group1 = watches[watches.cluster_label==int(a)]
group2 = watches[watches.cluster_label==int(b)]
group3 = watches[watches.cluster_label==int(c)]
group4 = watches[watches.cluster_label==int(d)]

stats.ttest_ind(group1['sociability_avg'],group2['sociability_avg'])

#stats.ttest_ind(group3['sociability_avg'],group4['sociability_avg'])

Ttest_indResult(statistic=-11.098898142615027, pvalue=2.1234481276775462e-28)

In [21]:
watches.head(5)

Unnamed: 0,user_id,cluster_label,day_of_week,day_of_year,buttons_pressed,backlight_on_count,notif_count,notif_count_dnd,vibrate_count,alarm_count,...,phone_decline_count,phone_call_elapsed_time,future_scroll_count,past_scroll_count,rescaled_notif_count,rescaled_incoming_call_count,rescaled_outgoing_call_count,rescaled_phone_answer_count,rescaled_phone_call_elapsed_time,sociability_avg
0,52711d4bdc50c571b8000029,2,4.117647,51.588235,29.852941,18.529412,9.529412,0.0,13.882353,0.0,...,0.0,17501.088235,,,0.006657,0.003413,0,0,0.000206,0.003357
1,52711d4cdc50c571b80000c8,1,4.064516,52.0,28.935484,74.451613,7.0,2.451613,16.548387,0.0,...,0.096774,17984432.096774,0.333333,0.0,0.00489,0.01591,0,0,0.211719,0.006933
2,52711d4cdc50c571b80000e2,3,3.882353,51.147059,71.235294,166.264706,31.794118,5.441176,39.529412,0.529412,...,0.088235,487574.970588,,,0.02221,0.066556,0,0,0.00574,0.029589
3,52711d4ddc50c571b8000453,1,3.690476,52.857143,30.261905,140.547619,37.619048,2.333333,105.357143,0.642857,...,0.071429,161499.285714,,,0.026279,0.055261,0,0,0.001901,0.02718
4,52711d4ddc50c571b80004c0,1,4.096774,52.032258,149.483871,139.387097,48.354839,0.0,68.064516,2.064516,...,0.0,100788.967742,,,0.033779,0.028076,0,0,0.001187,0.020618


In [11]:
##############################################################################
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=2).fit_transform(training)
kmeans = KMeans(init='k-means++', n_clusters=4, n_init=500)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on PCA-reduced data\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()


In [None]:
# let's try some k-means clustering!

#% pylab inline

features = ['rescaled_sleep','rescaled_std_bedtime','rescaled_deep_sleep','rescaled_std_waketime']
num_clusters = 4

numfeat = len(features)

training = matrix[features]

# n_init = # of times the algorithm is initialized (i.e. how many times does it come up with clusters?)
# typically, for less than 10 clusters, multiple rounds of optimizations are optimal (typically 50-1000 are used, according to Andrew Ng)
# for our purposes, since we have less than 10 clusters, we will go with n_clusters=500

# init = method for initialization, or choosing the initial centroids
# common methods include random initialization or k-means++
# scikit-learn's default is k-means++, and it has been suggested that this method yields considerable improvement in the final error of k-means
# therefore, we will go with k-means++

km = KMeans(n_clusters=num_clusters,init='k-means++',n_init=500)

km.fit(training)

centroids = km.cluster_centers_
km_labels = km.labels_
inertia = km.inertia_


for i in range(num_clusters):
    ds = training[km_labels==i]
    plt.plot(ds.iloc[:,0],ds.iloc[:,1],'o')
    lines = plt.plot(centroids[i,0],'kx')
    
    
plt.ylabel('st. dev. bedtime')
plt.xlabel('avg. sleep')
#plt.axis([19.5, 30, 0, 11])

inertia
plt.show()

s_coeff = metrics.silhouette_score(training,km_labels,metric='euclidean')

In [17]:
# homo = homogeneity score
# compl = completeness score
# v-means = V measure
# ARI = adjusted Rand index
# AMI = adjusted mutual information
# silhouette = silhouette coefficient

num_clusters = 3

print(79 * '_')
print('% 9s' % 'init'
      '       time     inertia  homo     compl    v-meas    ARI     AMI    silhouette')

def bench_k_means(estimator,name,data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i    %.3f    %.3f    %.3f   %.3f    %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(km_labels, estimator.labels_),
             metrics.completeness_score(km_labels, estimator.labels_),
             metrics.v_measure_score(km_labels, estimator.labels_),
             metrics.adjusted_rand_score(km_labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(km_labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean')))
bench_k_means(KMeans(init='k-means++', n_clusters=num_clusters, n_init=500),name="k-means++", data=training)

bench_k_means(KMeans(init='random', n_clusters=num_clusters, n_init=500),name="random", data=training)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=num_clusters).fit(training)
bench_k_means(KMeans(init=pca.components_, n_clusters=num_clusters, n_init=1),name="PCA-based",data=training)

print(79 * '_')



_______________________________________________________________________________
init       time     inertia  homo     compl    v-meas    ARI     AMI    silhouette
k-means++   15.04s    512    0.495    0.623    0.552   0.471    0.495    0.266
   random   14.09s    512    0.494    0.623    0.551   0.470    0.494    0.266


ValueError: The shape of the initial centers ((4, 4)) does not match the number of clusters 3

In [13]:
##############################################################################
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=4).fit_transform(training)
kmeans = KMeans(init='k-means++', n_clusters=2, n_init=500)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on PCA-reduced data\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()


In [19]:
# some follow-up analyses with sleep data

sleep_daily = data.groupby(['user_id','day_of_week']).mean().reset_index()
sleep_daily = sleep_daily[['day_of_week','peb_start_sleep','peb_stop_sleep','peb_total_sleep','peb_deep_sleep']]
daily_sleep_avg = sleep_daily.groupby(['day_of_week']).mean()
daily_sleep_avg = daily_sleep_avg/60

In [5]:
days = np.arange(1,8)

y = sleep_daily.groupby(['day_of_week']).count().peb_start_sleep

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)

plt.bar(days - 0.4,y,color='purple')
plt.ylabel('# events',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)
plt.title('you can see that there are roughly the same # events across all nights',size=20)

plt.axis([0.5, 7.5, 0, 20000])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%i" % y, (x,y + 200,), ha = 'center')


plt.show()

In [9]:
days = np.arange(1,8)

y = daily_sleep_avg.peb_deep_sleep

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)

plt.bar(days - 0.4,y,color='purple')
plt.ylabel('Deep Sleep (hrs)',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)

plt.axis([0.5, 7.5, 0, 4])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%.1f" % y, (x,y + .0500,), ha = 'center')


plt.show()

In [20]:
days = np.arange(1,8)

y = daily_sleep_avg.peb_total_sleep

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)


plt.bar(days - 0.4,y,color='purple')
plt.ylabel('Total Sleep (hrs)',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)

plt.axis([0.5, 7.5, 5, 8])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%.1f" % y, (x,y + .100,), ha = 'center')


plt.show()

In [14]:
days = np.arange(1,8)

y = daily_sleep_avg.peb_stop_sleep

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)

plt.bar(days - 0.4,y,color='purple')
plt.ylabel('Wake Time',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)

plt.axis([0.5, 7.5, 5, 9])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%.1f" % y, (x,y + .1,), ha = 'center')


plt.show()

In [18]:
daily_sleep_avg

Unnamed: 0_level_0,peb_start_sleep,peb_stop_sleep,peb_total_sleep,peb_deep_sleep
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,14.244744,6.92162,6.940511,2.879203
2,14.692271,6.82846,6.955955,2.93836
3,14.679927,6.829987,6.945139,2.95462
4,14.692179,6.838308,6.958657,2.975686
5,14.44082,6.880489,6.951325,2.971591
6,12.0408,7.85934,7.438294,3.146721
7,11.100467,8.077047,7.460334,3.104263


In [26]:
days = np.arange(1,8)

y = daily_sleep_avg.peb_start_sleep

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)


plt.bar(days - 0.4,y,color='purple')
plt.ylabel('Bedtime',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)

plt.axis([0.5, 7.5, 22, 25])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%.1f" % y, (x,y + .100,), ha = 'center')


plt.show()

In [27]:
# let's take a look at the step data!

steps_daily = step_data.groupby(['user_id','day_of_week']).mean().reset_index()
daily_step_avg = steps_daily.groupby(['day_of_week']).mean()


In [28]:
daily_step_avg

Unnamed: 0_level_0,day_of_year,total_step_count
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
1,53.023642,6510.063065
2,50.889247,6545.657075
3,50.955287,6733.487962
4,52.191909,6937.381909
5,53.278454,7093.422622
6,52.373934,6764.320353
7,51.808393,6032.114551


In [29]:
days = np.arange(1,8)

y = daily_step_avg.total_step_count

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)

plt.bar(days - 0.4,y,color='purple')
plt.ylabel('Avg Steps',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)
#plt.title('you can see that there are roughly the same # events across all nights',size=20)

#plt.axis([0.5, 7.5, 5000, 7000])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%i" % y, (x,y + 50,), ha = 'center')


plt.show()

In [146]:
days = np.arange(1,8)

y = steps_daily.groupby(['day_of_week']).count().total_step_count

#%pylab inline
plt.figure()
ax = plt.gca()
ax.tick_params(axis='x',labelsize=16)
ax.tick_params(axis='y',labelsize=16)

plt.bar(days - 0.4,y,color='purple')
plt.ylabel('# events',size=20)
plt.xlabel('day of week (1 Mon - 7 Sun)',size=20)
plt.title("you can see that there are roughly the same # events across all days\nhowever the watch might have still been worn for fewer hours over wkend",size=20)

plt.axis([0.5, 7.5, 0, 20000])

remove_border(left=False)

for x, y in zip(days, y):
    plt.annotate("%i" % y, (x,y + 200,), ha = 'center')


plt.show()

NameError: name 'steps_daily' is not defined