# KNN Classification

## load the data 

In [None]:
# Attach every package named in `list.of.packages`, first installing any
# that are not yet present in the local library.
load_or_install <- function(list.of.packages) {
    already_installed <- installed.packages()[, "Package"]
    is_missing <- !(list.of.packages %in% already_installed)
    to_install <- list.of.packages[is_missing]
    if (length(to_install)) {
        install.packages(to_install)
    }
    for (pkg_name in list.of.packages) {
        library(pkg_name, character.only = TRUE)
    }
}

In [None]:
load_or_install('caret')

In [None]:
# Load the penguins data and keep only rows with no missing values.
load_or_install("palmerpenguins")
penguins <- penguins[complete.cases(penguins), ]
head(penguins)

In [None]:
# Per-column summary of the complete-case penguins data.
summary(penguins)

In [None]:
# Number of rows and columns in penguins.
dim(penguins)

In [None]:
# Share of each species in the full data set.
prop.table(table(penguins$species))

In [None]:
# Check the class of the species column.
class(penguins$species)

# data splitting

Read more: [here](https://topepo.github.io/caret/data-splitting.html) or [here](https://topepo.github.io/caret/subsampling-for-class-imbalances.html)

The way we've previously created folds (from the row indices):

In [None]:
# Five folds built from the row indices, printed for inspection.
flds <- createFolds(seq_len(nrow(penguins)), k = 5)
flds

# Species proportions inside the first fold.
fld <- flds[[1]]
table(penguins[fld, "species"]) / length(fld)

Maybe a better way: create the folds from the species labels instead:

In [None]:
# Five folds built from the species labels, printed for inspection.
flds <- createFolds(penguins$species, k = 5)
flds

In [None]:
# Species proportions inside the first fold.
fld <- flds[[1]]
table(penguins[fld, "species"]) / length(fld)

In [None]:
# Species proportions in the first fold of a simple-random 5-fold split.
#
# `i` is ignored; it exists only so the function can be driven by sapply().
# Returns the table of species proportions for fold 1.
fldit0 = function(i){
    # createFolds() bins a numeric `y` by quantiles and samples within those
    # bins, so folding on the row index is not a simple random split (it is
    # partially stratified if the rows happen to be ordered by species).
    # Shuffling balanced fold labels gives a genuine SRS baseline.
    n_rows = nrow(penguins)
    fold_id = sample(rep_len(seq_len(5), n_rows))
    fld = which(fold_id == 1L)
    table(penguins[fld,'species'])/length(fld)
}

In [None]:
# Repeat the index-based split 500 times: one row of species proportions per
# repetition, tagged with the scheme name.
fld0 <- data.frame(t(sapply(seq_len(500), fldit0)))
fld0[["type"]] <- "SRS"

In [None]:
# Species proportions in the first of five folds created from the species
# labels. `i` is unused; it only lets the function be called from sapply().
fldit <- function(i) {
    species_folds <- createFolds(penguins$species, k = 5)
    first_fold <- species_folds[[1]]
    table(penguins[first_fold, "species"]) / length(first_fold)
}

In [None]:
# Repeat the species-based split 500 times: one row of species proportions
# per repetition, tagged with the scheme name.
fld1 <- data.frame(t(sapply(seq_len(500), fldit)))
fld1[["type"]] <- "Strat"

In [None]:
library('tidyr')
library('dplyr')

In [None]:
# Stack the two simulation results into a single data frame.
df <- bind_rows(fld0, fld1)

In [None]:
# Long format: one row per (repetition, species) with the proportion in `value`.
df_long <- pivot_longer(df, cols = c(Adelie, Chinstrap, Gentoo))

In [None]:
# Variance of the fold-1 proportion for each splitting scheme and species.
# `.groups = 'drop'` returns an ungrouped result instead of relying on the
# implicit "drop last group" behaviour (which also emits a message).
df_long %>%
    group_by(type,name) %>%
    summarize(var=var(value),.groups='drop')

## multifolds

In [None]:
# Repeated folds on the species labels: k = 2, repeated 2 times.
flds <- createMultiFolds(penguins$species, k = 2, times = 2)

In [None]:
# Print the index lists created above.
flds

In [None]:
# Three partitions, each taking a 0.9 share of the rows (without replacement).
flds <- createDataPartition(penguins$species, p = 0.9, times = 3)
flds

In [None]:
# Three resamples of the rows (with replacement).
flds <- createResample(penguins$species, times = 3)
flds

## fit the model

In [None]:
# Fit a 5-nearest-neighbour classifier for species on all other columns.
mod <- knn3(species ~ ., data = penguins, k = 5)

In [None]:
# Predicted class for every row of the data the model was fitted on.
preds <- predict(mod, newdata = penguins, type = "class")
head(preds, n = 10)

In [None]:
# First rows of the per-class predictions requested with type = "prob".
head(predict(mod, newdata = penguins, type = "prob"))

In [None]:
# Compare the predictions against the true species labels.
cm <- confusionMatrix(data = preds, reference = penguins$species)
cm

In [None]:
# Pull the raw count table out of the confusionMatrix result.
C <- as.matrix(cm[["table"]])
C

In [None]:
# Overall accuracy: counts on the diagonal divided by the total count.
sum(diag(C))/sum(C)

In [None]:
# Share of column 1 that lies on the diagonal.
# NOTE(review): presumably the recall for the first species, assuming the
# columns of the table index the reference labels -- confirm.
C[1,1]/sum(C[,1])

In [None]:
# Share of column 2 that lies on the diagonal.
# NOTE(review): presumably the recall for the second species, assuming the
# columns of the table index the reference labels -- confirm.
C[2,2]/sum(C[,2])

# plot the prediction space

In [None]:
# Column summaries of penguins, shown again ahead of the plotting code.
summary(penguins)

In [None]:
library('ggplot2')

In [None]:
# Plot the regions predicted by a k-nearest-neighbour classifier fitted on two
# predictors, with the training points drawn on top.
#
# Args:
#   v1, v2: names (strings) of the two predictor columns in `df`.
#   df:     data frame containing `species`, `v1` and `v2`.
#   N:      number of grid points along each axis (N * N predictions in total).
#   k:      number of neighbours passed to knn3().
#
# Returns the ggplot object, so further layers or coords can be added with `+`.
plot_fit = function(v1,v2,df=penguins,N=floor(sqrt(10000)),k=10){
    # Ignore missing values when working out the plotting range.
    r1 = range(df[[v1]],na.rm=TRUE)
    r2 = range(df[[v2]],na.rm=TRUE)

    train_df = df[,c('species',v1,v2)]

    mod = knn3(species~.,data=train_df,k=k)

    # Regular N x N grid spanning the observed range of both predictors.
    s1 = seq(r1[1],r1[2],length.out=N)
    s2 = seq(r2[1],r2[2],length.out=N)

    p_df = expand.grid(v1=s1,v2=s2)
    # Rename to the real column names so predict() can match them.
    colnames(p_df) = c(v1,v2)
    p_df$species = predict(mod,newdata=p_df,type='class')

    # aes_string() is deprecated; look the columns up through the `.data`
    # pronoun instead and label the axes with the plain column names.
    ggplot(data=p_df,
           mapping=aes(x=.data[[v1]],y=.data[[v2]],fill=species,shape=species))+
        geom_tile()+
        geom_point(data=train_df,size=5)+
        labs(x=v1,y=v2)
}

In [None]:
# Bill length vs. bill depth with k = 50 neighbours.
plot_fit(v1='bill_length_mm',v2='bill_depth_mm',df=penguins,k=50)

In [None]:
# Same predictors with k = 10 neighbours.
plot_fit(v1='bill_length_mm',v2='bill_depth_mm',df=penguins,k=10)

In [None]:
# Same predictors with k = 1 (single nearest neighbour).
plot_fit(v1='bill_length_mm',v2='bill_depth_mm',df=penguins,k=1)

In [None]:
# Same k = 1 fit, drawn with coord_fixed() added to the returned plot.
plot_fit(v1='bill_length_mm',v2='bill_depth_mm',df=penguins,k=1)+coord_fixed()

In [None]:
# Plot the regions predicted by a k-nearest-neighbour classifier fitted on two
# predictors, optionally standardising the predictors first.
#
# Args:
#   v1, v2:  names (strings) of the two predictor columns in `df`.
#   df:      data frame containing `species`, `v1` and `v2`.
#   N:       number of grid points along each axis (N * N predictions in total).
#   k:       number of neighbours passed to knn3().
#   scaleit: if TRUE, both predictors are passed through scale() before
#            fitting; the grid and the axes are then in those scaled units.
#
# Returns the ggplot object (drawn with coord_fixed()).
plot_fit = function(v1,v2,df=penguins,N=floor(sqrt(10000)),k=10,scaleit=TRUE){
    train_df = df[,c('species',v1,v2)]
    if(scaleit){
        train_df[,c(v1,v2)] = scale(train_df[,c(v1,v2)])
    }

    mod = knn3(species~.,data=train_df,k=k)

    # Ranges are taken after the optional scaling so the grid matches the
    # units the model was fitted in; missing values are ignored.
    r1 = range(train_df[[v1]],na.rm=TRUE)
    r2 = range(train_df[[v2]],na.rm=TRUE)

    s1 = seq(r1[1],r1[2],length.out=N)
    s2 = seq(r2[1],r2[2],length.out=N)

    p_df = expand.grid(v1=s1,v2=s2)
    # Rename to the real column names so predict() can match them.
    colnames(p_df) = c(v1,v2)
    p_df$species = predict(mod,newdata=p_df,type='class')

    # aes_string() is deprecated; look the columns up through the `.data`
    # pronoun instead and label the axes with the plain column names.
    ggplot(data=p_df,
           mapping=aes(x=.data[[v1]],y=.data[[v2]],fill=species,shape=species))+
        geom_tile()+
        geom_point(data=train_df,size=5)+
        labs(x=v1,y=v2)+
        coord_fixed()
}

In [None]:
# Bill length vs. bill depth, k = 1, using the plot_fit version with scaleit.
plot_fit(v1='bill_length_mm',v2='bill_depth_mm',df=penguins,k=1)

In [None]:
# Flipper length vs. bill depth with k = 1.
plot_fit(v1='flipper_length_mm',v2='bill_depth_mm',df=penguins,k=1)

In [None]:
# Flipper length vs. bill depth with k = 50.
plot_fit(v1='flipper_length_mm',v2='bill_depth_mm',df=penguins,k=50)