In [None]:
##### Chapter 3: Classification using Nearest Neighbors --------------------

## Example: Classifying Cancer Samples ----
## Step 2: Exploring and preparing the data ---- 

# import the CSV file
# (base R reader; stringsAsFactors = FALSE keeps diagnosis as character so it
# can be recoded explicitly as a factor below)
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)

# examine the structure of the wbcd data frame
str(wbcd)

# drop the id feature (the first column); the result has to be assigned back,
# otherwise the id column stays in wbcd
wbcd <- wbcd[-1]

# table of diagnosis
table(wbcd$diagnosis)

# recode diagnosis as a factor
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))

# table of proportions with more informative labels
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

# summarize three numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])

# create normalization function
#
# Rescale a numeric vector to the [0, 1] interval (min-max normalization).
#
# Args:
#   x:     a numeric vector.
#   na.rm: if TRUE, missing values are ignored when computing the range and
#          stay NA in the result; if FALSE (the default), any NA in x makes
#          the whole result NA.
#
# Returns a numeric vector of the same length as x. A vector whose values are
# all equal has zero range and is mapped to 0 instead of NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  # nothing to rescale; also avoids the min/max warnings on empty input
  if (length(x) == 0L) {
    return(numeric(0))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # constant input: x - min is already 0 everywhere (NA stays NA)
  if (!is.na(span) && span == 0) {
    return(as.numeric(x - rng[1]))
  }

  (x - rng[1]) / span
}

# sanity-check the normalization function: both calls should print the
# same rescaled values even though the inputs differ by a factor of 10
normalize(seq(1, 5, by = 1))
normalize(seq(10, 50, by = 10))

# apply normalize() to each of the columns 2 through 31 of wbcd and
# collect the rescaled columns into a new data frame
wbcd_n <- as.data.frame(Map(normalize, wbcd[, 2:31]))

# check the result on one column: its minimum should be 0 and maximum 1
summary(wbcd_n[["area_mean"]])