Commit
version 1.0
Asma Gul authored and gaborcsardi committed Sep 13, 2015
0 parents commit 2d83514
Showing 16 changed files with 739 additions and 0 deletions.
21 changes: 21 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,21 @@
Package: ESKNN
Type: Package
Title: Ensemble of Subset of K-Nearest Neighbours Classifiers for
Classification and Class Membership Probability Estimation
Version: 1.0
Date: 2015-09-13
Author: Asma Gul, Aris Perperoglou, Zardad Khan, Osama Mahmoud, Werner Adler, Miftahuddin Miftahuddin, and Berthold Lausen
Maintainer: Asma Gul <agul@essex.ac.uk>
Description: Functions for classification and group membership probability estimation.
The issue of non-informative features in the data is addressed by an ensemble method:
a few optimal models are selected for the ensemble from an initially large set of base k-nearest neighbours (kNN) models, each generated on a subset of features from the training data.
A two-stage assessment is applied when selecting the optimal models for the ensemble in the training functions.
The prediction functions for classification and class membership probability estimation return class outcomes and class membership probability estimates for the test data.
The package also includes measures of classification error and Brier score for the classification and probability estimation tasks, respectively.
Imports: caret, stats
LazyLoad: yes
License: GPL (>= 2)
NeedsCompilation: no
Packaged: 2015-09-13 05:41:37 UTC; Khan
Repository: CRAN
Date/Publication: 2015-09-13 09:22:47
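As a rough illustration of the workflow outlined in the Description field, the sketch below trains a classification ensemble on the bundled hepatitis data and scores it on a held-out split. It is a minimal sketch: it assumes the response column of hepatitis is named "Class", and the split proportion and parameter values are illustrative, not package defaults.

## minimal usage sketch (assumes hepatitis stores its response as "Class")
library(ESKNN)
data(hepatitis)
set.seed(42)
idx    <- sample(1:nrow(hepatitis), 0.7 * nrow(hepatitis))
xtrain <- hepatitis[idx,  names(hepatitis) != "Class"]
ytrain <- hepatitis$Class[idx]
xtest  <- hepatitis[-idx, names(hepatitis) != "Class"]
ytest  <- hepatitis$Class[-idx]
fit <- esknnClass(xtrain, ytrain, k = 3, q = 0.2, m = 51, ss = 5)
out <- Predict.esknnClass(fit, xtest, ytest, k = 3)
out$ClassError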
15 changes: 15 additions & 0 deletions MD5
@@ -0,0 +1,15 @@
df1e6ee1883c69cf2bff9c0f135b97c3 *DESCRIPTION
cb851e2167dabfa7a12c0a62a771bb26 *NAMESPACE
d7d6227a895b6e3006a79585fc84e127 *R/Predict.esknnClass.R
6470309f7128c8c1f4d240ee86582ab8 *R/Predict.esknnProb.R
1fbf2cc02a5214066ebfcb3adcca967d *R/esknnClass.R
61abf7ca61fb8e7c4bff8c7ae5964cd1 *R/esknnProb.R
25f67ec064173b37ec2dc3dc81988a85 *data/hepatitis.rda
6d852dc53ef863702a874ebee134ce33 *data/sonar.rda
5354e091329d37c4c17f04ae25069423 *man/ESKNN-package.Rd
0a0b0d7058b1f73932d9d3015a2e47a2 *man/Predict.esknnClass.Rd
f234e851aa9c516433974e6498c28ce7 *man/Predict.esknnProb.Rd
33ab664f689f0ae14e77bf548e941531 *man/esknnClass.Rd
a5516ce77b9b200e9c06681a7a98b7b5 *man/esknnProb.Rd
e518940af9fe919f9c1d5dc0982f745a *man/hepatitis.Rd
bd33b028a66468062d25fa563af7e8ec *man/sonar.Rd
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -0,0 +1,2 @@
exportPattern("^[[:alpha:]]+")
import(caret,stats)
37 changes: 37 additions & 0 deletions R/Predict.esknnClass.R
@@ -0,0 +1,37 @@
Predict.esknnClass <-
function(optModels, xtest, ytest = NULL, k = NULL)
{
  ## majority-vote helper: the most frequent label in x
  mod <- function(x) {
    vt <- table(x)
    as.numeric(names(vt[vt == max(vt)]))
  }

  k <- ifelse(is.null(k), 3, k)
  zpred <- list()
  ## keep an odd number of models in the final ensemble to break any ties in voting
  len <- length(optModels$fsfinal)
  if (len %% 2 == 0)
    len <- len - 1
  ## predict the test data with each model of the final ensemble
  for (z in 1:len) {
    ## subset the test features to the z-th model's feature set without
    ## overwriting xtest (reassigning it here would corrupt later iterations)
    fit <- knn3Train(optModels$trainfinal[[z]][, names(optModels$trainfinal[[z]]) != "Class"],
                     xtest[, optModels$fsfinal[[z]], drop = FALSE],
                     optModels$trainfinal[[z]]$Class, k = k)
    ## extract the class vector from knn3Train
    zpred[[z]] <- as.factor(fit[1:length(fit)])  ## class labels
  }
  ## bind the z selected models' predictions and take a row-wise majority vote
  mclass <- do.call("cbind", zpred)
  predClass <- apply(mclass, 1, mod)
  ## classification error, computed only when true labels are supplied
  if (is.null(ytest)) {
    return(list("PredClass" = predClass))
  }
  else {
    conf <- table("True.Class" = as.numeric(ytest) - 1,
                  "Predicted.Class" = as.numeric(predClass) - 1)
    err <- 1 - (sum(diag(conf)) / nrow(xtest))
    return(list("PredClass" = predClass, "ConfMatrix" = conf, "ClassError" = err))
  }
}
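A toy view of the majority vote performed by mod and apply above, on invented numbers: each column holds one model's 0/1 predictions for four test cases, and each row reduces to its most frequent label.

## three models' 0/1 votes for four test cases (columns = models)
votes <- cbind(c(1, 0, 1, 0), c(1, 1, 0, 0), c(0, 1, 1, 0))
mod <- function(x) { vt <- table(x); as.numeric(names(vt[vt == max(vt)])) }
apply(votes, 1, mod)   ## 1 1 1 0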
40 changes: 40 additions & 0 deletions R/Predict.esknnProb.R
@@ -0,0 +1,40 @@
Predict.esknnProb <-
function(optModels, xtest, ytest = NULL, k = NULL)
{
  k <- ifelse(is.null(k), 3, k)
  zprob <- list()
  ## keep an odd number of models, as in the classification ensemble
  len <- length(optModels$fsfinal)
  if (len %% 2 == 0)
    len <- len - 1
  ## predict the test data with each model of the final ensemble
  for (z in 1:len) {
    ## subset the test features to the z-th model's feature set without
    ## overwriting xtest (reassigning it here would corrupt later iterations)
    fit <- knn3Train(optModels$trainfinal[[z]][, names(optModels$trainfinal[[z]]) != "Class"],
                     xtest[, optModels$fsfinal[[z]], drop = FALSE],
                     optModels$trainfinal[[z]]$Class, k = k)
    ## extract the second-class probability vector from knn3Train
    zprob[[z]] <- attributes(fit)$prob[, 2]  ## class probabilities
  }
  ## bind the z selected models' probabilities and average them row-wise
  mprob <- do.call("cbind", zprob)
  predProb <- as.vector(apply(mprob, 1, mean))
  ## Brier score, computed only when true labels are supplied
  if (is.null(ytest)) {
    return(list("PredProb" = predProb))
  }
  else {
    ytest <- as.numeric(as.factor(ytest)) - 1
    BS <- mean((predProb - ytest)^2)
    return(list("PredProb" = predProb, "BrierScore" = BS))
  }
}
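On invented toy numbers, the averaging and Brier score steps above reduce to a few lines: per-model second-class probabilities are averaged row-wise, and the mean squared distance to the 0/1 recoded labels is the Brier score.

mprob <- cbind(c(0.8, 0.1, 0.5), c(0.6, 0.3, 0.7))  ## two models, three cases
predProb <- apply(mprob, 1, mean)                   ## 0.7 0.2 0.6
y01 <- as.numeric(as.factor(c("b", "a", "b"))) - 1  ## recoded labels: 1 0 1
mean((predProb - y01)^2)                            ## (0.09 + 0.04 + 0.16)/3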
122 changes: 122 additions & 0 deletions R/esknnClass.R
@@ -0,0 +1,122 @@
esknnClass <-
function(xtrain, ytrain, k=NULL, q=NULL, m=NULL, ss=NULL)
{
  k  <- ifelse(is.null(k), 3, k)
  q  <- ifelse(is.null(q), 0.2, q)
  m  <- ifelse(is.null(m), 3, m)
  ss <- ifelse(is.null(ss), 3, ss)

  d <- ncol(xtrain)
  ## combine the feature matrix and class vector into one data frame
  train <- as.data.frame(cbind(xtrain, ytrain))
  names(train)[names(train) == "ytrain"] <- "Class"
  ## out-of-bag accuracy of the m kNN models
  macc <- c()
  ## bootstrap training sets and feature-subset positions of the m base models
  trainboot <- list()
  fp <- list()
  ## training data and feature subsets of the stage-one selected models
  training <- list()
  fs <- list()
  knclass <- list()
  ## hold out 5% of the training data for assessing the models by Brier score
  BStestp <- sample(1:nrow(train), 0.05 * nrow(train))
  BStest <- train[BStestp, ]
  ## feature part of the held-out set
  xBStest <- BStest[, names(BStest) != "Class"]
  train <- train[-BStestp, ]
  for (r in 1:m) {
    bp <- sample(1:nrow(train), replace = TRUE)  ## bootstrap sample positions
    bs <- train[bp, ]                            ## bootstrap sample
    fp[[r]] <- sample(1:d, ss, replace = FALSE)  ## feature subset positions
    ## out-of-bag sample serves as the test set for each model
    oob <- train[-bp, ]
    xtrainboot <- train[bp, fp[[r]]]
    ytrainboot <- bs$Class
    trainboot[[r]] <- as.data.frame(cbind(xtrainboot, ytrainboot))
    names(trainboot[[r]])[names(trainboot[[r]]) == "ytrainboot"] <- "Class"
    xoob <- oob[, fp[[r]]]
    yoob <- oob$Class
    ## fit kNN on the bootstrap sample and predict the out-of-bag data
    knpred <- knn3Train(xtrainboot, xoob, ytrainboot, k = k)
    ## extract the class vector from knn3Train
    knclass[[r]] <- as.factor(knpred[1:length(knpred)])  ## class labels
    conf <- table(knclass[[r]], yoob)
    macc[r] <- sum(diag(conf)) / length(yoob)
  }

  ## stage one: rank models by out-of-bag accuracy and keep the top q proportion
  order1 <- order(macc, decreasing = TRUE)
  training <- trainboot[order1]
  fs <- fp[order1]
  NE <- round(q * length(order1)) + 1
  filtermodel <- list()
  training2 <- list()
  fs2 <- list()
  ## Brier score of the selected models
  fBS <- c()
  p <- list()

  for (l in 1:NE) {
    filtermodel[[l]] <- knn3Train(training[[l]][, names(training[[l]]) != "Class"],
                                  xBStest[, fs[[l]]], training[[l]]$Class, k = k)
    training2[[l]] <- as.data.frame(training[[l]])
    fs2[[l]] <- fs[[l]]
    ## extract class probabilities from knn3Train
    p[[l]] <- attributes(filtermodel[[l]])$prob[, 2]
    pred <- as.vector(p[[l]])
    ## Brier score of the l-th kNN model on the held-out set
    fBS[l] <- mean((pred - (as.numeric(BStest$Class) - 1))^2)
  }

  ## rank the models by their Brier score (lower is better)
  order2 <- order(fBS, decreasing = FALSE)
  fBS <- fBS[order2]
  trainfinal2 <- training2[order2]
  fs2final <- fs2[order2]
  p <- p[order2]
  fsfinal <- list()
  fsfinal[[1]] <- fs2final[[1]]
  bfresult <- list()
  wac <- c()
  wac[1] <- fBS[1]
  wclass <- p[[1]]
  trainfinal <- list()
  trainfinal[[1]] <- as.data.frame(trainfinal2[[1]])

  ## stage two: add a model (in Brier-score order) only if the aggregated
  ## probabilities improve on the Brier score achieved so far
  for (j in 1:(NE - 1)) {
    wclass <- matrix(c(as.vector(wclass), p[[j + 1]]), nrow = length(BStest$Class))
    ## aggregated result over the models considered so far
    bfresult[[j]] <- apply(wclass, 1, mean)
    wac[j + 1] <- mean((as.vector(bfresult[[j]]) - (as.numeric(BStest$Class) - 1))^2)
    if (wac[j + 1] < wac[j]) {
      trainfinal[[j + 1]] <- as.data.frame(trainfinal2[[j + 1]])
      fsfinal[[j + 1]] <- fs2final[[j + 1]]
    }
  }
  ## remove NULL entries from the lists of training sets and feature subsets
  fsfinal[sapply(fsfinal, is.null)] <- NULL
  trainfinal[sapply(trainfinal, is.null)] <- NULL
  ## return the selected training sets and feature subsets
  return(list("trainfinal" = trainfinal, "fsfinal" = fsfinal))
}
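Stage one above rests on the bootstrap/out-of-bag split; on a hypothetical 10-row training set, that split looks like this, independent of the function:

## rows drawn with replacement train the model; rows never drawn test it
set.seed(1)
bp  <- sample(1:10, replace = TRUE)  ## bootstrap positions (with duplicates)
oob <- (1:10)[-bp]                   ## out-of-bag positions, as in train[-bp, ]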
125 changes: 125 additions & 0 deletions R/esknnProb.R
@@ -0,0 +1,125 @@
esknnProb <-
function(xtrain, ytrain, k=NULL, q=NULL, m=NULL, ss=NULL)
{
  k  <- ifelse(is.null(k), 3, k)
  q  <- ifelse(is.null(q), 0.2, q)
  m  <- ifelse(is.null(m), 3, m)
  ss <- ifelse(is.null(ss), 3, ss)

  d <- ncol(xtrain)
  ## combine the feature matrix and class vector into one data frame
  train <- as.data.frame(cbind(xtrain, ytrain))
  names(train)[names(train) == "ytrain"] <- "Class"
  ## out-of-bag accuracy of the m kNN models
  macc <- c()
  ## bootstrap training sets and feature-subset positions of the m base models
  trainboot <- list()
  fp <- list()
  ## training data and feature subsets of the stage-one selected models
  training <- list()
  fs <- list()
  knclass <- list()
  ## hold out 5% of the training data for assessing the models by Brier score
  BStestp <- sample(1:nrow(train), 0.05 * nrow(train))
  BStest <- train[BStestp, ]
  ## feature part of the held-out set
  xBStest <- BStest[, names(BStest) != "Class"]
  train <- train[-BStestp, ]
  for (r in 1:m) {
    bp <- sample(1:nrow(train), replace = TRUE)  ## bootstrap sample positions
    bs <- train[bp, ]                            ## bootstrap sample
    fp[[r]] <- sample(1:d, ss, replace = FALSE)  ## feature subset positions
    ## out-of-bag sample serves as the test set for each model
    oob <- train[-bp, ]
    xtrainboot <- train[bp, fp[[r]]]
    ytrainboot <- bs$Class
    trainboot[[r]] <- as.data.frame(cbind(xtrainboot, ytrainboot))
    names(trainboot[[r]])[names(trainboot[[r]]) == "ytrainboot"] <- "Class"
    xoob <- oob[, fp[[r]]]
    yoob <- oob$Class
    ## fit kNN on the bootstrap sample and predict the out-of-bag data
    knpred <- knn3Train(xtrainboot, xoob, ytrainboot, k = k)
    ## extract the class vector from knn3Train
    knclass[[r]] <- as.factor(knpred[1:length(knpred)])  ## class labels
    conf <- table(knclass[[r]], yoob)
    macc[r] <- sum(diag(conf)) / length(yoob)
  }

  ## stage one: rank models by out-of-bag accuracy and keep the top q proportion
  order1 <- order(macc, decreasing = TRUE)
  training <- trainboot[order1]
  fs <- fp[order1]
  NE <- round(q * length(order1)) + 1
  filtermodel <- list()
  training2 <- list()
  fs2 <- list()
  ## Brier score of the selected models
  fBS <- c()
  p <- list()

  for (l in 1:NE) {
    filtermodel[[l]] <- knn3Train(training[[l]][, names(training[[l]]) != "Class"],
                                  xBStest[, fs[[l]]], training[[l]]$Class, k = k)
    training2[[l]] <- as.data.frame(training[[l]])
    fs2[[l]] <- fs[[l]]
    ## extract class probabilities from knn3Train
    p[[l]] <- attributes(filtermodel[[l]])$prob[, 2]
    pred <- as.vector(p[[l]])
    ## Brier score of the l-th kNN model on the held-out set
    fBS[l] <- mean((pred - (as.numeric(BStest$Class) - 1))^2)
  }

  ## rank the models by their Brier score (lower is better)
  order2 <- order(fBS, decreasing = FALSE)
  fBS <- fBS[order2]
  trainfinal2 <- training2[order2]
  fs2final <- fs2[order2]
  p <- p[order2]
  fsfinal <- list()
  fsfinal[[1]] <- fs2final[[1]]
  bfresult <- list()
  wac <- c()
  wac[1] <- fBS[1]
  wclass <- p[[1]]
  trainfinal <- list()
  trainfinal[[1]] <- as.data.frame(trainfinal2[[1]])

  ## stage two: add a model (in Brier-score order) only if the aggregated
  ## probabilities improve on the Brier score achieved so far
  for (j in 1:(NE - 1)) {
    wclass <- matrix(c(as.vector(wclass), p[[j + 1]]), nrow = length(BStest$Class))
    ## aggregated result over the models considered so far
    bfresult[[j]] <- apply(wclass, 1, mean)
    wac[j + 1] <- mean((as.vector(bfresult[[j]]) - (as.numeric(BStest$Class) - 1))^2)
    if (wac[j + 1] < wac[j]) {
      trainfinal[[j + 1]] <- as.data.frame(trainfinal2[[j + 1]])
      fsfinal[[j + 1]] <- fs2final[[j + 1]]
    }
  }
  ## remove NULL entries from the lists of training sets and feature subsets
  fsfinal[sapply(fsfinal, is.null)] <- NULL
  trainfinal[sapply(trainfinal, is.null)] <- NULL
  ## return the selected training sets and feature subsets
  return(list("trainfinal" = trainfinal, "fsfinal" = fsfinal))
}
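The stage-two rule in both training functions admits a model only when adding it lowers the held-out Brier score; a toy trace with invented numbers:

y  <- c(1, 0, 1)                               ## 0/1 recoded held-out labels
p1 <- c(0.7, 0.4, 0.6)                         ## best single model's probabilities
p2 <- c(0.9, 0.1, 0.8)                         ## candidate model's probabilities
bs1  <- mean((p1 - y)^2)                       ## ~0.137
bs12 <- mean((rowMeans(cbind(p1, p2)) - y)^2)  ## ~0.064
bs12 < bs1                                     ## TRUE, so the candidate is kept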
Binary file added data/hepatitis.rda
Binary file not shown.
Binary file added data/sonar.rda
Binary file not shown.
23 changes: 23 additions & 0 deletions man/ESKNN-package.Rd
@@ -0,0 +1,23 @@
\name{ESkNN-package}
\alias{ESkNN-package}
\alias{ESkNN}
\docType{package}
\title{ Ensemble of Subset of K-Nearest Neighbours Classifiers for Classification and Class Membership Probability Estimation
}
\description{Functions for building an ensemble of optimal k-nearest neighbours (kNN) models for classification and class membership probability estimation are provided. To address the issue of non-informative features in the data, a set of base kNN models is generated and a subset of these models is selected for the ensemble based on their individual and combined performance. Out-of-bag data and an independent training data set are used to assess the performance of the models individually and collectively. The prediction functions return class labels and class membership probability estimates, together with other measures such as the confusion matrix, classification error rate, and Brier score.
}
\details{
\tabular{ll}{
Package: \tab ESKNN\cr
Type: \tab Package\cr
Version: \tab 1.0\cr
Date: \tab 2015-09-13\cr
License: \tab GPL (>= 2)\cr
}

}
\author{Asma Gul, Aris Perperoglou, Zardad Khan, Osama Mahmoud, Miftahuddin, Werner Adler, and Berthold Lausen
Maintainer: Asma Gul <agul@essex.ac.uk>
}
\references{Gul, A., Perperoglou, A., Khan, Z., Mahmoud, O., Miftahuddin, M., Adler, W. and Lausen, B. (2014), \emph{Ensemble of subset of k-nearest neighbours classifiers}, Journal name to appear.}
\keyword{ package }
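% A minimal usage sketch that mirrors the workflow described above; it assumes
% the bundled sonar data stores its response in a column named "Class" (an
% assumption; check the sonar help page), and all parameter values are defaults.
\examples{
data(sonar)
idx <- sample(1:nrow(sonar), 0.7 * nrow(sonar))
pfit <- esknnProb(sonar[idx, names(sonar) != "Class"], sonar$Class[idx])
pout <- Predict.esknnProb(pfit, sonar[-idx, names(sonar) != "Class"],
                          sonar$Class[-idx])
pout$BrierScore
}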
