# Evaluation


In [None]:
library('caret')

# simulation

In [None]:
# Seed the RNG so the simulated data and the random fold assignments are reproducible.
set.seed(1)
# Number of simulated observations.
N=1000

In [None]:
# Draw N inputs uniformly on [0, 2*pi].
x <- runif(N, min = 0, max = 2 * pi)

# True regression function: a shallow parabola centred at pi plus a sine wave.
f <- function(x) {
    (x - pi)^2 / 10 + sin(x)
}

# Observed response = true function + Gaussian noise with sd 0.1.
y <- f(x)
e <- rnorm(N, mean = 0, sd = 1 / 10)
y <- y + e

options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 100)
plot(x, y)

In [None]:
# Bundle the simulated predictor and response into a single data frame.
df = data.frame(x=x,y=y)

In [None]:
# Split the row indices of df into 2 folds with caret's createFolds;
# flds[[i]] holds the row indices belonging to fold i.
flds = createFolds(1:nrow(df),k=2)
flds

# test/train split

In [None]:
# Use fold 1 as the held-out test rows and fold 2 as the training rows.
test_df = df[flds[[1]],]
train_df = df[flds[[2]],]

In [None]:
# Sanity check: row/column counts of each split.
dim(test_df)
dim(train_df)

In [None]:
# Fit a 5-nearest-neighbour regression on the training data only
# (the test rows stay held out for evaluation).
knn_mod = knnreg(y~.,data=train_df,k=5)

In [None]:
# Predict on the same rows the model was fit on (in-sample predictions).
train_preds = predict(knn_mod,train_df)

In [None]:
# Root-mean-squared error on the training rows.
RMSE_train = sqrt(mean((train_df$y-train_preds)^2))

In [None]:
RMSE_train

In [None]:
# Predict on the held-out test rows (out-of-sample predictions).
test_preds = predict(knn_mod,test_df)

In [None]:
# Root-mean-squared error on the held-out test rows.
RMSE_test = sqrt(mean((test_df$y-test_preds)^2))

In [None]:
RMSE_test

In [None]:
# Scatter of the simulated data with the true curve (blue) and the kNN fit (red).
plot(x, y)
# Evaluate both curves on 1000 sorted random points in [0, 2*pi] so that
# lines() draws them left to right.
xe <- data.frame(x = sort(runif(n = 1000, min = 0, max = 2 * pi)))
lines(x = xe$x, y = f(xe$x), col = 'blue', lwd = 5)
lines(x = xe$x, y = predict(knn_mod, xe), col = 'red', lwd = 5)

Notice how the training RMSE is typically lower than the testing RMSE: the model has already seen the training rows, so its in-sample error is optimistic.

# k fold cross validation

In [None]:
# Re-partition the row indices of df, this time into 5 folds.
flds = createFolds(1:nrow(df),k=5)
flds

In [None]:
# Number of row indices in each fold.
lengths(flds)

In [None]:
# Fold i is the test set; all remaining folds are pooled into the training set.
i = 1
test_df = df[flds[[i]],]
train_df = df[unlist(flds[-i]),]

In [None]:
# Sanity check: row/column counts of each split.
dim(test_df)
dim(train_df)

In [None]:
# Fit a 10-nearest-neighbour regression on the pooled training folds.
knn_mod <- knnreg(y ~ ., data = train_df, k = 10)

# In-sample error: predictions for the rows used to fit the model.
train_preds <- predict(knn_mod, train_df)
RMSE_train <- sqrt(mean((train_df$y - train_preds)^2))

# Out-of-sample error: predictions for the held-out fold.
test_preds <- predict(knn_mod, test_df)
RMSE_test <- sqrt(mean((test_df$y - test_preds)^2))

RMSE_train
RMSE_test

Let's put this in a function.

In [None]:
# Fit a kNN regression on one train/test split and report both RMSEs.
#
# Args:
#   train_idx: row indices of `data` used to fit the model.
#   test_idx:  row indices of `data` held out for evaluation.
#   k:         number of neighbours passed to knnreg (default 5, the value
#              that was previously hard-coded).
#   data:      data frame with a response column `y`; defaults to the
#              notebook-level `df`, which the function previously read
#              implicitly from the global environment.
#
# Returns:
#   A one-row data frame with columns `train` and `test` holding the RMSE
#   on the training rows and on the test rows respectively.
tt_split_eval = function(train_idx,test_idx,k=5,data=df){
    # Fail early with a clear message rather than deep inside knnreg/predict.
    stopifnot(length(train_idx) > 0, length(test_idx) > 0)

    # Root-mean-squared error between observed and predicted responses.
    rmse = function(obs,pred) sqrt(mean((obs-pred)^2))

    test_df = data[test_idx,]
    train_df = data[train_idx,]

    knn_mod = knnreg(y~.,data=train_df,k=k)

    RMSE_train = rmse(train_df$y,predict(knn_mod,train_df))
    RMSE_test = rmse(test_df$y,predict(knn_mod,test_df))

    return(data.frame(train=RMSE_train,
                      test=RMSE_test))
}

In [None]:
# Loop version: fill a vector with the squares of 1..10, one element at a time.
vec <- c()
for (i in seq_len(10)) {
    vec[i] <- i^2
}

vec

In [None]:
# Same computation without an explicit loop; sapply simplifies the result to a vector.
sapply(1:10,function(i)i^2)

In [None]:
# lapply does the same but always returns a list, one element per input.
lapply(1:10,function(i)i^2)

In [None]:
# 10-fold cross validation: each fold takes one turn as the test set while the
# remaining folds are pooled for training; collect one row of RMSEs per fold.
flds = createFolds(1:nrow(df),k=10)
# seq_along() stays correct even for an empty list, where 1:length(flds)
# would iterate over c(1, 0).
rmses = lapply(seq_along(flds),function(i){
    tdf = tt_split_eval(train_idx = unlist(flds[-i]),test_idx = flds[[i]])
    # Tag the row with the index of the fold that was held out.
    tdf$i = i
    return(tdf)
})

In [None]:
# Inspect the one-row result for the first held-out fold ...
rmses[[1]]

In [None]:
# ... and for the second.
rmses[[2]]

In [None]:
# Stack the per-fold one-row data frames into a single table (one row per fold).
RMSE <- do.call(rbind, rmses)

In [None]:
RMSE

In [None]:
library('reshape2')

In [None]:
# Reshape to long format, keeping the fold index i as the identifier; the
# plot below reads the resulting `variable` and `value` columns.
mRMSE = reshape2::melt(RMSE,id.vars='i')
mRMSE

In [None]:
library('ggplot2')

In [None]:
# Train vs. test RMSE for each held-out fold, coloured by which split the error came from.
ggplot(mRMSE, aes(x = i, y = value, color = variable)) +
    geom_point()

Overall, we can summarize the per-fold test RMSEs with a measure of center and a measure of spread:

In [None]:
# Typical test error across the folds.
median(RMSE$test)

In [None]:
# Fold-to-fold variability of the test error.
sd(RMSE$test)

# test/train/validate

How can we use this to choose a value of $k$ for KNN? Use a three-way train/validate/test split.

In [None]:
# Partition the row indices of df into 10 folds.
flds = createFolds(1:nrow(df),k=10)
flds

In [None]:
# Fold i is set aside as the test set; every other fold goes into the
# combined train+validation ("trainval") pool.
i = 1
test_idx = flds[[i]]
trainval_idx = unlist(flds[-i])

In [None]:
test_df = df[test_idx,]
trainval_df = df[trainval_idx,]

In [None]:
# Sanity check: row/column counts of each piece.
dim(test_df)
dim(trainval_df)

In [None]:
# Partition the trainval pool into 10 folds. These indices are row positions
# within trainval_df, not within the original df.
tv_flds = createFolds(1:nrow(trainval_df),k=10)
tv_flds

In [None]:
# Inner fold j is the validation set; the remaining inner folds form the training set.
j=1
val_idx = tv_flds[[j]]
train_idx = unlist(tv_flds[-j])

In [None]:
train_df = trainval_df[train_idx,]
val_df = trainval_df[val_idx,]

In [None]:
# Sanity check: row/column counts of each piece.
dim(train_df)
dim(val_df)

In [None]:
# Fit a kNN regression with a given k on one train/validation split and
# report both RMSEs.
#
# Args:
#   train_idx: row indices of `data` used to fit the model.
#   val_idx:   row indices of `data` used for validation.
#   k:         number of neighbours passed to knnreg (default 1).
#   data:      data frame with a response column `y`; defaults to the
#              notebook-level `trainval_df`, which the function previously
#              read implicitly from the global environment.
#
# Returns:
#   A one-row data frame with columns `train` and `val` holding the RMSE
#   on the training rows and on the validation rows respectively.
tt_split_eval_k = function(train_idx,val_idx,k=1,data=trainval_df){
    # Fail early with a clear message rather than deep inside knnreg/predict.
    stopifnot(length(train_idx) > 0, length(val_idx) > 0)

    # Root-mean-squared error between observed and predicted responses.
    rmse = function(obs,pred) sqrt(mean((obs-pred)^2))

    train_df = data[train_idx,]
    val_df = data[val_idx,]

    knn_mod = knnreg(y~.,data=train_df,k=k)

    RMSE_train = rmse(train_df$y,predict(knn_mod,train_df))
    RMSE_val = rmse(val_df$y,predict(knn_mod,val_df))

    return(data.frame(train=RMSE_train,
                      val=RMSE_val))
}

In [None]:
# Train/validation RMSE on the current split with 5 neighbours ...
tt_split_eval_k(train_idx,val_idx,k=5)

In [None]:
# ... and with 10 neighbours.
tt_split_eval_k(train_idx,val_idx,k=10)

In [None]:
# Sweep k from 1 to 75 on the single train/validation split and stack the
# one-row results into a table with columns train, val and k.
RMSE <- do.call(rbind, lapply(1:75, function(k) {
    row_k <- tt_split_eval_k(train_idx, val_idx, k = k)
    row_k$k <- k
    row_k
}))
head(RMSE)

In [None]:
# Long format keyed by k, for plotting train and validation error together.
mRMSE = melt(RMSE,id.vars='k')
head(mRMSE)

In [None]:
# Error as a function of k; the square-root x scale spreads out the small-k region.
ggplot(data=mRMSE,mapping=aes(x=k,y=value,color=variable))+
    geom_point()+
    scale_x_sqrt()

In [None]:
# Row of RMSE with the smallest validation error.
which.min(RMSE$val)

In [None]:
# Keep that whole row so the winning k is available as min_df$k.
min_df = RMSE[which.min(RMSE$val),]
min_df

In [None]:
# Refit on the whole trainval pool (train + validation rows) using the k
# that gave the lowest validation RMSE.
knn_mod = knnreg(y~.,data=trainval_df,k=min_df$k)

In [None]:
# Score the refit model on the held-out test rows.
test_preds = predict(knn_mod,test_df)
# This error is computed on test_df, so it is a test RMSE and is named
# accordingly (it was previously stored under the misleading name RMSE_val).
RMSE_test = sqrt(mean((test_df$y-test_preds)^2))
RMSE_test

# nested x-validation

Can I do this in a cross-validated way? Yes: use nested cross validation!

In [None]:
# outer loop = split into test and trainval datasets
# inner loop = model building process (MBP): split trainval into train/val and search over k

In [None]:
# Outer folds of df: each one is held out in turn as the test set.
flds = createFolds(1:nrow(df),k=10)

In [None]:
# Nested cross validation.
#   Outer loop: hold out fold i as the test set, keep the rest as trainval.
#   Inner loop: the model building process -- cross-validate over trainval to
#               pick the k with the lowest mean validation RMSE.
# TEST_RMSE[i] is the test error of the model refit on trainval with that k.
TEST_RMSE = rep(NA,length(flds))

# Candidate neighbourhood sizes; loop-invariant, so defined once up front.
K_seq = seq(1,75)

for(i in seq_along(flds)){
    
    # split testing from trainval
    test_idx = flds[[i]]
    trainval_idx = unlist(flds[-i])
    test_df = df[test_idx,]
    trainval_df = df[trainval_idx,]
     
    #MODEL BUILDING PROCESS
    # inner folds index rows of trainval_df, not rows of df
    tv_flds = createFolds(1:nrow(trainval_df),k=10)
    
    # rows = inner folds, columns = positions in K_seq;
    # re-created on every outer iteration
    VAL_MTX = array(NA,c(length(tv_flds),length(K_seq)))

    # normally wouldn't include these two lines
    TRAIN_MTX = array(NA,c(length(tv_flds),length(K_seq)))
    TEST_MTX = array(NA,c(length(tv_flds),length(K_seq)))
    
    for(j in seq_along(tv_flds)){
        
        val_idx = tv_flds[[j]]
        train_idx = unlist(tv_flds[-j])
        train_df = trainval_df[train_idx,]
        val_df = trainval_df[val_idx,]

        # Index the matrices by position in K_seq rather than by the value of k,
        # so the code stays correct if K_seq is ever changed from 1, 2, ..., 75.
        for(ki in seq_along(K_seq)){
            k = K_seq[ki]
            knn_mod = knnreg(y~.,data=train_df,k=k)
            val_preds = predict(knn_mod,val_df)
            VAL_MTX[j,ki] = sqrt(mean((val_df$y-val_preds)^2))
            
            # normally wouldn't include these lines
            train_preds = predict(knn_mod,train_df)
            test_preds = predict(knn_mod,test_df)
            TRAIN_MTX[j,ki] = sqrt(mean((train_df$y-train_preds)^2))
            TEST_MTX[j,ki] = sqrt(mean((test_df$y-test_preds)^2))
        }
    }
    
    # mean validation RMSE per candidate k, averaged over the inner folds
    VAL_K = apply(VAL_MTX,2,mean)
    K_hat = K_seq[which.min(VAL_K)]
    
    # refit on all of trainval with the selected k
    knn_mod = knnreg(y~.,data=trainval_df,k=K_hat)
    
    # eval on testing data
    test_preds = predict(knn_mod,test_df)
    TEST_RMSE[i] = sqrt(mean((test_df$y-test_preds)^2))
}

Consider the output of a single run of the **outer** loop (the matrices are re-created on every outer iteration, so what remains afterwards is the last run):

In [None]:
# melt() turns each matrix into long format: Var1 is the row (inner fold),
# Var2 the column (one per entry of K_seq), value the RMSE.
# Training RMSE per k, one colour per inner fold.
ggplot(data=melt(TRAIN_MTX),mapping=aes(x=Var2,y=value,color=as.factor(Var1)))+geom_point()

In [None]:
# Validation RMSE per k, one colour per inner fold.
ggplot(data=melt(VAL_MTX),mapping=aes(x=Var2,y=value,color=as.factor(Var1)))+geom_point()

In [None]:
# Test RMSE per k, one colour per inner fold.
ggplot(data=melt(TEST_MTX),mapping=aes(x=Var2,y=value,color=as.factor(Var1)))+geom_point()

To make this easier to visualize, let's average over the inner cross-validation folds.

In [None]:
library('tidyr')

In [None]:
# Average each RMSE matrix over its rows (the inner folds) to get one value
# per k, then reshape to long format with columns k, name and value.
ttv <- pivot_longer(
    data.frame(train = apply(TRAIN_MTX, 2, mean),
               test = apply(TEST_MTX, 2, mean),
               val = apply(VAL_MTX, 2, mean),
               k = K_seq),
    cols = c(train, test, val)
)

head(ttv)
ggplot(ttv, aes(x = k, y = value, color = name)) + geom_point()

In [None]:
# Mean validation RMSE per candidate k, as left behind by the last outer iteration.
plot(VAL_K)

In [None]:
# k with the lowest mean validation RMSE in that iteration.
K_hat = K_seq[which.min(VAL_K)]
K_hat

In [None]:
# overall: results across all outer folds

In [None]:
# One test RMSE per outer fold.
TEST_RMSE

In [None]:
# Average test RMSE over the outer folds.
mean(TEST_RMSE)

In [None]:
# Compare the inner validation curve with the average outer test error (red line).
plot(VAL_K)
abline(h=mean(TEST_RMSE),col='red')

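What was the point of this nested x-validation, really? It's *really* `TEST_RMSE`: an estimate of how well the whole model building process (including the choice of $k$) performs on unseen data.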

In [None]:
# One test RMSE per outer fold.
TEST_RMSE

In [None]:
# Average test RMSE over the outer folds.
mean(TEST_RMSE)

How do we build the final model for prediction? Basically pull out the inner loop but now use **all** of the data! (Don't hold out a test set in any simple or fancy way)

In [None]:
# Model building process on ALL of the data: cross-validate over df to pick
# the k with the lowest mean validation RMSE (no test set is held out here).
tv_flds = createFolds(1:nrow(df),k=10) # use all df

K_seq = seq(1,75)
# rows = folds, columns = positions in K_seq
VAL_MTX = array(NA,c(length(tv_flds),length(K_seq)))

for(j in seq_along(tv_flds)){

    val_idx = tv_flds[[j]]
    train_idx = unlist(tv_flds[-j])
    train_df = df[train_idx,]
    val_df = df[val_idx,]

    # Index the matrix by position in K_seq rather than by the value of k,
    # so the code stays correct if K_seq is ever changed from 1, 2, ..., 75.
    for(ki in seq_along(K_seq)){
        k = K_seq[ki]
        knn_mod = knnreg(y~.,data=train_df,k=k)
        val_preds = predict(knn_mod,val_df)
        VAL_MTX[j,ki] = sqrt(mean((val_df$y-val_preds)^2))
    }
}

# mean validation RMSE per candidate k, averaged over the folds
VAL_K = apply(VAL_MTX,2,mean)
K_hat = K_seq[which.min(VAL_K)]

In [None]:
# Mean validation RMSE per candidate k, now computed from all of df.
plot(VAL_K)

In [None]:
# Selected number of neighbours.
K_hat

Boss: OK, cool, how well will this do? Answer: `TEST_RMSE`, the estimate from the nested cross validation.

In [None]:
# Final model (red) against the true curve (blue), drawn over the raw data.
# xe is the sorted evaluation grid built for the earlier fit-vs-truth plot.
plot(x, y)
lines(x = xe$x, y = f(xe$x), col = 'blue', lwd = 5)
lines(x = xe$x, y = predict(knn_mod, xe), col = 'red', lwd = 5)