In [1]:
## Machine Learning with R by Brett Lantz
## Chapter 9: Finding Groups of Data - Clustering with k-means

In [2]:
#Example: Finding teen market segments using k-means clustering

In [5]:
#Step 1: Collecting data
#Step 2: Preparing and exploring data
teens <- read.csv("snsdata.csv")
head(teens)
str(teens)

Unnamed: 0,gradyear,gender,age,friends,basketball,football,soccer,softball,volleyball,swimming,ellip.h,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,2006,M,18.982,7,0,0,0,0,0,0,<8b>,0,0,0,0,0,0,0,0,0,0
2,2006,F,18.801,0,0,1,0,0,0,0,<8b>,0,1,0,0,0,0,0,0,0,0
3,2006,M,18.335,69,0,1,0,0,0,0,<8b>,0,0,0,0,0,0,0,1,0,0
4,2006,F,18.875,0,0,0,0,0,0,0,<8b>,0,0,0,0,0,0,0,0,0,0
5,2006,,18.995,10,0,0,0,0,0,0,<8b>,0,0,2,0,0,0,0,0,1,1
6,2006,F,,142,0,0,0,0,0,0,<8b>,0,0,1,0,0,0,0,0,1,0


'data.frame':	30000 obs. of  40 variables:
 $ gradyear    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
 $ gender      : Factor w/ 2 levels "F","M": 2 1 2 1 NA 1 1 2 1 1 ...
 $ age         : num  19 18.8 18.3 18.9 19 ...
 $ friends     : int  7 0 69 0 10 142 72 17 52 39 ...
 $ basketball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ football    : int  0 1 1 0 0 0 0 0 0 0 ...
 $ soccer      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ softball    : int  0 0 0 0 0 0 0 1 0 0 ...
 $ volleyball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ swimming    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cheerleading: int  0 0 0 0 0 0 0 0 0 0 ...
 $ baseball    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tennis      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sports      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cute        : int  0 1 0 1 0 0 0 0 0 1 ...
 $ sex         : int  0 0 0 0 1 1 0 2 0 0 ...
 $ sexy        : int  0 0 0 0 0 0 0 1 0 0 ...
 $ hot         : int  0 0 0 0 0 0 0 0 0 1 ...
 $ kissed      : int  0 0 0 0 5 0 0 0 0 0 ...
 $ dance       : int

In [9]:
#check if missing values on genre
table(teens$gender, useNA = "ifany")


    F     M  <NA> 
22054  5222  2724 

In [10]:
#Check if there are missing values on age
#Check max and min 
summary(teens$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  3.086  16.310  17.290  17.990  18.260 106.900    5086 

In [13]:
#We'll treat non-teenagers as missing values
teens$age <- ifelse(teens$age >= 13 & teens$age <20, teens$age, NA )
summary(teens$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  13.03   16.30   17.26   17.25   18.22   20.00    5523 

In [15]:
#Data preparation - dummy coding missing values
teens$female <- ifelse(teens$gender == "F" & !is.na(teens$gender), 1, 0)
teens$nogender <- ifelse(is.na(teens$gender), 1, 0)

In [18]:
table(teens$gender, useNA = "ifany")
table(teens$female, useNA = "ifany")
table(teens$nogender, useNA = "ifany")


    F     M  <NA> 
22054  5222  2724 


    0     1 
 7946 22054 


    0     1 
27276  2724 

In [21]:
#Imputing missing values
mean(teens$age)
mean(teens$age, na.rm = TRUE)

[1] NA

In [25]:
#Compute mean age by gradyear skipping NA values
aggregate(data = teens, age ~ gradyear, mean, na.rm = TRUE)

Unnamed: 0,gradyear,age
1,2006,18.65586
2,2007,17.70617
3,2008,16.7677
4,2009,15.81957


In [26]:
ave_age <- ave(teens$age, teens$gradyear, FUN = function(x) mean(x, na.rm = TRUE))
teens$age <- ifelse(is.na(teens$age), ave_age, teens$age)    

In [27]:
summary(teens$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.03   16.28   17.24   17.24   18.21   20.00 

In [28]:
#Step 3: Training a model on the data
install.packages("stats", repos = "https://cran.r-project.org")
library(stats)

: package 'stats' is not available (for R version 3.2.3)

In [30]:
#The dataset should be all numeric
#First let's just consider those features that denotes interests' teenagers
interests <- teens[5:40]
head(interests)

Unnamed: 0,basketball,football,soccer,softball,volleyball,swimming,cheerleading,baseball,tennis,sports,ellip.h,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,0,0,0,0,0,0,0,0,0,0,<8b>,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,<8b>,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,<8b>,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,<8b>,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,<8b>,0,0,2,0,0,0,0,0,1,1
6,0,0,0,0,0,0,0,0,0,0,<8b>,0,0,1,0,0,0,0,0,1,0


In [32]:
#We should scale by z-core technique using lapply and scale son larger values don't dominate
interests_z <- as.data.frame(lapply(interests, scale))
head(interests_z)

Unnamed: 0,basketball,football,soccer,softball,volleyball,swimming,cheerleading,baseball,tennis,sports,ellip.h,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,-0.3322117,-0.3576914,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,-0.3699086,-0.4873061,-0.3141926,-0.2014729,-0.1830287,-0.2947883,-0.2615258,-0.220399,-0.1749047
2,-0.3322117,1.060031,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,1.067374,-0.4873061,-0.3141926,-0.2014729,-0.1830287,-0.2947883,-0.2615258,-0.220399,-0.1749047
3,-0.3322117,1.060031,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,-0.3699086,-0.4873061,-0.3141926,-0.2014729,-0.1830287,-0.2947883,2.027874,-0.220399,-0.1749047
4,-0.3322117,-0.3576914,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,-0.3699086,-0.4873061,-0.3141926,-0.2014729,-0.1830287,-0.2947883,-0.2615258,-0.220399,-0.1749047
5,-0.3322117,-0.3576914,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,-0.3699086,2.273635,-0.3141926,-0.2014729,-0.1830287,-0.2947883,-0.2615258,2.285084,2.719271
6,-0.3322117,-0.3576914,-0.2428701,-0.2179242,-0.2236659,-0.2599662,-0.2073236,-0.2011273,-0.1689361,-0.2971184,<8b>,-0.05093567,-0.3699086,0.8931645,-0.3141926,-0.2014729,-0.1830287,-0.2947883,-0.2615258,2.285084,-0.1749047


In [33]:
teen_clusters <- kmeans(interests_z, 5)
summary(teen_clusters)

             Length Class  Mode   
cluster      30000  -none- numeric
centers        180  -none- numeric
totss            1  -none- numeric
withinss         5  -none- numeric
tot.withinss     1  -none- numeric
betweenss        1  -none- numeric
size             5  -none- numeric
iter             1  -none- numeric
ifault           1  -none- numeric

In [36]:
#Step 4: Evaluating model performance
teen_clusters$size
teen_clusters$centers

Unnamed: 0,basketball,football,soccer,softball,volleyball,swimming,cheerleading,baseball,tennis,sports,Unnamed: 11,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,0.332536768403742,0.359988250776662,0.125413375952348,0.165002661186523,0.109797032667434,0.265711672842304,0.185250787993477,0.275758695695738,0.0531939806729629,0.785600911744181,<8b>,0.366362497250728,0.614870528202528,0.264356815846442,1.22050377305789,0.154502547831156,0.258547318763069,1.69718501683948,0.923704293556115,1.8798908859951,2.7331187323661
2,0.159116645759225,0.235621393758306,0.103739357136903,0.0649562173730987,0.177381643102623,0.236459792169916,0.395061887167999,0.0271058177721962,0.0278566012063206,0.0906263436135761,<8b>,0.0609620166964236,0.613757496000652,0.789429688772518,0.566361279310996,4.14571209859379,3.96476243252421,0.0458223081629249,0.0984261183413327,0.034494076975784,0.0358849902801541
3,-0.164889178748075,-0.163246587322126,-0.0883791489449224,-0.11322795017053,-0.114691735639493,-0.107293591979164,-0.111184535987488,-0.108130381358217,-0.119179934225852,-0.127497099968252,<8b>,-0.0294697066946642,-0.187862588337058,-0.229966095476345,-0.189729492768666,-0.155281593021098,-0.149811977991784,-0.0968822220128942,-0.0860313499457542,-0.089296771965477,-0.113898287743507
4,0.10605744004794,0.120157051147463,0.112432726350475,-0.0640263345617661,0.0379368116558175,0.0983923015285166,-0.0814670904371048,-4.335426172373e-05,6.4330083165284,0.195669769631385,<8b>,0.0237633175061754,-0.0747523697175859,0.0981613196101578,0.0777925279985496,-0.0727373237483488,0.0165867091156146,-0.0696139129006557,0.0348697438548055,-0.0638062942359785,-0.0650810530097855
5,0.494549897964012,0.472177983480689,0.267422792722426,0.36677137609581,0.357972131415392,0.292366188501947,0.310595377909906,0.330491023431905,-0.0646746350419092,0.288176235636034,<8b>,0.0308800167885904,0.476119520515414,0.64710418411449,0.374543547388742,-0.0606905072476989,-0.078715910212632,0.0499085339996598,0.128917429677913,-0.0069567919461546,-0.0663256813150065


In [47]:
#Step 5: Improving model performance
#Merge cluster label to its indiviudal on original teens dataset
teens$cluster <- teen_clusters$cluste
head(teens[c("cluster", "gender", "age", "friends", "female")])

Unnamed: 0,cluster,gender,age,friends,female
1,3,M,18.982,7,0
2,5,F,18.801,0,1
3,3,M,18.335,69,0
4,3,F,18.875,0,1
5,1,,18.995,10,0
6,3,F,18.65586,142,1


In [46]:
#Check average age by cluster
aggregate(data = teens, age ~ cluster, mean)

Unnamed: 0,cluster,age
1,1,17.12754
2,2,16.86781
3,3,17.29867
4,4,17.22877
5,5,17.09266


In [48]:
#Check average female by cluster
aggregate(data = teens, female ~ cluster, mean)

Unnamed: 0,cluster,female
1,1,0.8038278
2,2,0.8369942
3,3,0.6991828
4,4,0.7008929
5,5,0.8383028


In [49]:
aggregate(data = teens, friends ~ cluster, mean)

Unnamed: 0,cluster,friends
1,1,30.64689
2,2,41.44046
3,3,27.72732
4,4,31.48438
5,5,37.06029
