# Lazy Learning - Classification using Nearest Neighbors

## Example - diagnosing breast cancer with the kNN algorithm
### Step 1 - collecting data

In [31]:
#' Attach a package quietly.
#'
#' Forwards its arguments unchanged to `base::library()` while muffling
#' package startup messages and any warnings raised during the call.
#'
#' @param ... Arguments passed straight through to `base::library()`
#'   (typically a bare package name).
#' @return Whatever `base::library()` returns.
shhh <- function(...) {
  suppressWarnings(suppressPackageStartupMessages(base::library(...)))
}

In [32]:
#load appropriate libraries
shhh(dplyr)
shhh(ggplot2)
shhh(psych)

In [33]:
# Read the WDBC dataset from CSV, keeping text columns as character vectors,
# then preview the first rows.
dat <- read.csv(
  file = "../data/wdbc_data.csv",
  stringsAsFactors = FALSE
)
head(dat)

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [3]:
# Remove the 'id' variable. It is selected by name rather than by position so
# that re-running this cell is a no-op instead of silently dropping the next
# column (which would be 'diagnosis', then a feature).
dat <- dat[, names(dat) != "id", drop = FALSE]

# Display a count table of the criterion (i.e., diagnosis)
table(dat$diagnosis)


  B   M 
357 212 

In [4]:
# Turn 'diagnosis' (the criterion) into a factor and relabel its levels:
# B -> Benign, M -> Malignant. Then preview the result.
dat <- mutate(
  dat,
  diagnosis = recode(factor(diagnosis), B = "Benign", M = "Malignant")
)
head(dat)

diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Malignant,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
Malignant,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
Malignant,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
Malignant,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
Malignant,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
Malignant,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [5]:
# Check the proportion of each criterion level, rounded to two decimals
round(prop.table(table(dat$diagnosis)), 2)


   Benign Malignant 
     0.63      0.37 

In [6]:
# Summarise three features of interest; their ranges differ widely
summary(
  dat[c("radius_mean", "area_mean", "smoothness_mean")]
)

  radius_mean       area_mean      smoothness_mean  
 Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
 1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
 Median :13.370   Median : 551.1   Median :0.09587  
 Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
 3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
 Max.   :28.110   Max.   :2501.0   Max.   :0.16340  

In [7]:
#' Min-max normalize a numeric vector to the range [0, 1].
#'
#' Rescaling puts every feature on the same scale so that no single feature
#' dominates a distance calculation.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the
#'   minimum and maximum (they stay `NA` in the result). The default `FALSE`
#'   keeps the original behaviour: any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. A constant vector maps to
#'   all zeros instead of `NaN`; an empty vector returns `numeric(0)`.
normalize <- function(x, na.rm = FALSE) {
  # Empty input: avoid the min()/max() warnings on zero-length vectors
  if (length(x) == 0L) {
    return(numeric(0))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # Constant input: 0 / 0 would produce NaN, so return zeros directly
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }

  (x - rng[1]) / span
}

In [29]:
# Normalize every feature column. Columns are chosen by name (everything
# except the label and the identifier) rather than by the fixed range 2:31,
# so the result does not depend on how many times 'id' has been dropped.
dat_norm <- as.data.frame(
  lapply(dat[setdiff(names(dat), c("id", "diagnosis"))], normalize)
)
head(dat_norm)[c("radius_mean", "area_mean", "smoothness_mean")]

radius_mean,area_mean,smoothness_mean
<dbl>,<dbl>,<dbl>
0.5210374,0.3637328,0.5937528
0.6431445,0.5015907,0.2898799
0.6014956,0.4494168,0.5143089
0.2100904,0.1029056,0.8113208
0.6298926,0.4892895,0.4303512
0.2588386,0.1415058,0.6786133


In [19]:
# Split the normalized features: rows 1-469 for training, rows 470-569 for testing
datTrain <- dat_norm[seq_len(469), ]
datTest <- dat_norm[seq(470, 569), ]

In [20]:
# Create the matching training and test labels. The label column is taken by
# name instead of position 1, so the labels stay correct whatever the column
# order of 'dat' is.
datTrain_labels <- dat$diagnosis[1:469]
datTest_labels <- dat$diagnosis[470:569]

In [21]:
# The 'class' package supplies one kNN implementation (there are many others);
# 'gmodels' is loaded for the CrossTable() evaluation that follows.
library(class)
library(gmodels)

# Predict a label for each test row from its k = 21 nearest training rows
datTest_pred <- knn(
  train = datTrain,
  test = datTest,
  cl = datTrain_labels,
  k = 21
)

In [22]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
CrossTable(x = datTest_labels, y = datTest_pred, prop.chisq = FALSE)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
               | datTest_pred 
datTest_labels |    Benign | Malignant | Row Total | 
---------------|-----------|-----------|-----------|
        Benign |        77 |         0 |        77 | 
               |     1.000 |     0.000 |     0.770 | 
               |     0.975 |     0.000 |           | 
               |     0.770 |     0.000 |           | 
---------------|-----------|-----------|-----------|
     Malignant |         2 |        21 |        23 | 
               |     0.087 |     0.913 |     0.230 | 
               |     0.025 |     1.000 |           | 
               |     0.020 |     0.210 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        79 |        21 |       100 | 
               |     0.790 |     0.210 | 

In [30]:
# Try a different transformation: z-score standardization of every column
# except the first (the label)
dat_z <- dat[, -1] %>%
  scale() %>%
  as.data.frame()
head(dat_z)[c("radius_mean", "area_mean", "smoothness_mean")]

radius_mean,area_mean,smoothness_mean
<dbl>,<dbl>,<dbl>
1.0960995,0.9835095,1.5670875
1.828212,1.9070303,-0.8262354
1.5784992,1.5575132,0.9413821
-0.7682333,-0.7637917,3.2806668
1.7487579,1.8246238,0.2801253
-0.4759559,-0.5052059,2.2354545


In [23]:
# Repeat the train/test split from above using the z-scored features
datTrain <- dat_z[1:469, ]
datTest <- dat_z[470:569, ]

# Labels come from the untransformed data; the column is selected by name
# rather than position 1 so it cannot pick up the wrong column.
datTrain_labels <- dat$diagnosis[1:469]
datTest_labels <- dat$diagnosis[470:569]

In [24]:
# Re-run kNN with the same parameters (k = 21) on the z-scored split
datTest_pred <- knn(
  train = datTrain,
  test = datTest,
  cl = datTrain_labels,
  k = 21
)

In [25]:
# No differences between the transformations: the table matches the
# min-max normalized run. FALSE is spelled out because F can be reassigned.
CrossTable(x = datTest_labels, y = datTest_pred, prop.chisq = FALSE)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
               | datTest_pred 
datTest_labels |    Benign | Malignant | Row Total | 
---------------|-----------|-----------|-----------|
        Benign |        77 |         0 |        77 | 
               |     1.000 |     0.000 |     0.770 | 
               |     0.975 |     0.000 |           | 
               |     0.770 |     0.000 |           | 
---------------|-----------|-----------|-----------|
     Malignant |         2 |        21 |        23 | 
               |     0.087 |     0.913 |     0.230 | 
               |     0.025 |     1.000 |           | 
               |     0.020 |     0.210 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        79 |        21 |       100 | 
               |     0.790 |     0.210 | 