## Objective:
Predict the diagnosis result of prostate cancer using the k-Nearest Neighbours (kNN) algorithm

In [16]:
library(tidyverse) # Provides the %>% pipe and count()
library(class) # Provides knn()
library(gmodels) # For the CrossTable() function

In [8]:
# Load the prostate cancer data set, keeping text columns as character
# vectors rather than converting them to factors.
prc <- read.csv(
  file = "Dataset/Prostate_Cancer.csv",
  stringsAsFactors = FALSE
)

# Inspect the column types, then preview the first rows.
str(prc)
head(prc)

'data.frame':	100 obs. of  10 variables:
 $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
 $ diagnosis_result : chr  "M" "B" "M" "M" ...
 $ radius           : int  23 9 21 14 9 25 16 15 19 25 ...
 $ texture          : int  12 13 27 16 19 25 26 18 24 11 ...
 $ perimeter        : int  151 133 130 78 135 83 120 90 88 84 ...
 $ area             : int  954 1326 1203 386 1297 477 1040 578 520 476 ...
 $ smoothness       : num  0.143 0.143 0.125 0.07 0.141 0.128 0.095 0.119 0.127 0.119 ...
 $ compactness      : num  0.278 0.079 0.16 0.284 0.133 0.17 0.109 0.165 0.193 0.24 ...
 $ symmetry         : num  0.242 0.181 0.207 0.26 0.181 0.209 0.179 0.22 0.235 0.203 ...
 $ fractal_dimension: num  0.079 0.057 0.06 0.097 0.059 0.076 0.057 0.075 0.074 0.082 ...


id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
1,M,23,12,151,954,0.143,0.278,0.242,0.079
2,B,9,13,133,1326,0.143,0.079,0.181,0.057
3,M,21,27,130,1203,0.125,0.16,0.207,0.06
4,M,14,16,78,386,0.07,0.284,0.26,0.097
5,M,9,19,135,1297,0.141,0.133,0.181,0.059
6,B,25,25,83,477,0.128,0.17,0.209,0.076


In [9]:
# Remove the `id` column, which is only a row identifier and not a feature.
# It is dropped by name rather than by position (`prc[-1]`) so that this cell
# is safe to re-run: a second run is a no-op instead of silently dropping
# `diagnosis_result` as well.
prc <- prc[setdiff(names(prc), "id")]

# Number of observations per diagnosis class.
prc %>%
  count(diagnosis_result)

diagnosis_result,n
B,38
M,62


In [11]:
# Min-max normalisation: rescale a numeric vector so that its values lie
# between 0 and 1.
#
# Args:
#   x:     A numeric vector.
#   na.rm: If TRUE (the default), missing values are ignored when computing
#          the minimum and maximum and stay NA in the result. If FALSE, any
#          NA in `x` makes the whole result NA.
#
# Returns:
#   A double vector of the same length as `x`. When all non-missing values
#   are equal the range is zero; those values are mapped to 0 rather than
#   the NaN that a 0 / 0 division would produce.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoids range() warnings on empty / all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # Constant vector: every non-missing value equals the minimum.
  if (!is.na(span) && span == 0) {
    return(as.numeric(x - rng[1]))
  }

  (x - rng[1]) / span
}
# Rescale the feature columns (positions 2 to 9 of `prc`) one at a time and
# gather the results into a new data frame.
prc_n <- prc[2:9] %>%
  lapply(normalize) %>%
  as.data.frame()

# Sanity check: `radius` should now span 0 to 1.
summary(prc_n$radius)
head(prc_n)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.1875  0.5000  0.4906  0.7500  1.0000 

radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0.875,0.0625,0.825,0.4486874,1.0,0.781759,0.6331361,0.59090909
0.0,0.125,0.675,0.6706444,1.0,0.1335505,0.2721893,0.09090909
0.75,1.0,0.65,0.5972554,0.7534247,0.3973941,0.4260355,0.15909091
0.3125,0.3125,0.2166667,0.1097852,0.0,0.8013029,0.739645,1.0
0.0,0.5,0.6916667,0.6533413,0.9726027,0.3094463,0.2721893,0.13636364
1.0,0.875,0.2583333,0.1640811,0.7945205,0.4299674,0.4378698,0.52272727


In [12]:
# Creating the train and test data sets.
# The first `n_train` rows are used for training and all remaining rows for
# testing. The test range is derived from the actual number of rows instead
# of a hard-coded 100: indexing past the end of a data frame yields all-NA
# rows, and a hard-coded upper bound would ignore any extra rows.
n_train <- 65L
stopifnot(
  nrow(prc_n) > n_train,
  nrow(prc) == nrow(prc_n)
)
train_idx <- seq_len(n_train)
test_idx <- seq(n_train + 1L, nrow(prc_n))

prc_train <- prc_n[train_idx, ]
prc_test <- prc_n[test_idx, ]

# Store the diagnosis labels separately for classification. The column is
# selected by name so the result does not depend on its position in `prc`.
prc_train_labels <- prc$diagnosis_result[train_idx]
prc_test_labels <- prc$diagnosis_result[test_idx]

In [13]:
# Run the kNN classifier: predict a label for every test row from the
# labelled training rows.
# Note: k is usually chosen as roughly the square root of the total number
# of observations.
prc_test_pred <- knn(
  train = prc_train,
  test = prc_test,
  cl = prc_train_labels,
  k = 10
)

In [17]:
# Evaluating model performance: cross-tabulate the true test labels (rows)
# against the predicted labels (columns).
CrossTable(
  x = prc_test_labels,
  y = prc_test_pred,
  chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  35 

 
                | prc_test_pred 
prc_test_labels |         B |         M | Row Total | 
----------------|-----------|-----------|-----------|
              B |         7 |        12 |        19 | 
                |     2.695 |     0.674 |           | 
                |     0.368 |     0.632 |     0.543 | 
                |     1.000 |     0.429 |           | 
                |     0.200 |     0.343 |           | 
----------------|-----------|-----------|-----------|
              M |         0 |        16 |        16 | 
                |     3.200 |     0.800 |           | 
                |     0.000 |     1.000 |     0.457 | 
                |     0.000 |     0.571 |           | 
                |     0.000 |     0.457 |           |