Commit
version 1.0
Asma Gul authored and gaborcsardi committed Sep 13, 2015
0 parents commit 2d83514
Showing 16 changed files with 739 additions and 0 deletions.
21 changes: 21 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,21 @@
Package: ESKNN
Type: Package
Title: Ensemble of Subset of K-Nearest Neighbours Classifiers for
Classification and Class Membership Probability Estimation
Version: 1.0
Date: 2015-09-13
Author: Asma Gul, Aris Perperoglou, Zardad Khan, Osama Mahmoud, Werner Adler, Miftahuddin Miftahuddin, and Berthold Lausen
Maintainer: Asma Gul <agul@essex.ac.uk>
Description: Functions for classification and group membership probability estimation.
The issue of non-informative features in the data is addressed by an ensemble method:
a few optimal models are selected for the ensemble from an initially large set of base k-nearest neighbours (kNN) models, each generated on a subset of features from the training data.
A two-stage assessment is applied when selecting the optimal models for the ensemble in the training functions.
The prediction functions for classification and class membership probability estimation return class outcomes and class membership probability estimates for the test data.
The package also includes measures of classification error and Brier score for the classification and probability estimation tasks, respectively.
Imports: caret, stats
LazyLoad: yes
License: GPL (>= 2)
NeedsCompilation: no
Packaged: 2015-09-13 05:41:37 UTC; Khan
Repository: CRAN
Date/Publication: 2015-09-13 09:22:47
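As a rough illustration of the workflow outlined in the Description field, the sketch below trains a classification ensemble on the bundled hepatitis data and scores it on a held-out split. It is a minimal sketch: it assumes the response column of hepatitis is named "Class", and the split proportion and parameter values are illustrative, not package defaults.

## minimal usage sketch (assumes hepatitis stores its response as "Class")
library(ESKNN)
data(hepatitis)
set.seed(42)
idx    <- sample(1:nrow(hepatitis), 0.7 * nrow(hepatitis))
xtrain <- hepatitis[idx,  names(hepatitis) != "Class"]
ytrain <- hepatitis$Class[idx]
xtest  <- hepatitis[-idx, names(hepatitis) != "Class"]
ytest  <- hepatitis$Class[-idx]
fit <- esknnClass(xtrain, ytrain, k = 3, q = 0.2, m = 51, ss = 5)
out <- Predict.esknnClass(fit, xtest, ytest, k = 3)
out$ClassError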
15 changes: 15 additions & 0 deletions MD5
@@ -0,0 +1,15 @@
df1e6ee1883c69cf2bff9c0f135b97c3 *DESCRIPTION
cb851e2167dabfa7a12c0a62a771bb26 *NAMESPACE
d7d6227a895b6e3006a79585fc84e127 *R/Predict.esknnClass.R
6470309f7128c8c1f4d240ee86582ab8 *R/Predict.esknnProb.R
1fbf2cc02a5214066ebfcb3adcca967d *R/esknnClass.R
61abf7ca61fb8e7c4bff8c7ae5964cd1 *R/esknnProb.R
25f67ec064173b37ec2dc3dc81988a85 *data/hepatitis.rda
6d852dc53ef863702a874ebee134ce33 *data/sonar.rda
5354e091329d37c4c17f04ae25069423 *man/ESKNN-package.Rd
0a0b0d7058b1f73932d9d3015a2e47a2 *man/Predict.esknnClass.Rd
f234e851aa9c516433974e6498c28ce7 *man/Predict.esknnProb.Rd
33ab664f689f0ae14e77bf548e941531 *man/esknnClass.Rd
a5516ce77b9b200e9c06681a7a98b7b5 *man/esknnProb.Rd
e518940af9fe919f9c1d5dc0982f745a *man/hepatitis.Rd
bd33b028a66468062d25fa563af7e8ec *man/sonar.Rd
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -0,0 +1,2 @@
exportPattern("^[[:alpha:]]+")
import(caret,stats)
37 changes: 37 additions & 0 deletions R/Predict.esknnClass.R
@@ -0,0 +1,37 @@
Predict.esknnClass <-
function(optModels, xtest, ytest = NULL, k = NULL)
{
  ## majority-vote helper: the most frequent label in x
  mod <- function(x) {
    vt <- table(x)
    as.numeric(names(vt[vt == max(vt)]))
  }

  k <- ifelse(is.null(k), 3, k)
  zpred <- list()
  ## keep an odd number of models in the final ensemble to break any ties in voting
  len <- length(optModels$fsfinal)
  if (len %% 2 == 0)
    len <- len - 1
  ## predict the test data with each model of the final ensemble
  for (z in 1:len) {
    ## subset the test features to the z-th model's feature set without
    ## overwriting xtest (reassigning it here would corrupt later iterations)
    fit <- knn3Train(optModels$trainfinal[[z]][, names(optModels$trainfinal[[z]]) != "Class"],
                     xtest[, optModels$fsfinal[[z]], drop = FALSE],
                     optModels$trainfinal[[z]]$Class, k = k)
    ## extract the class vector from knn3Train
    zpred[[z]] <- as.factor(fit[1:length(fit)])  ## class labels
  }
  ## bind the z selected models' predictions and take a row-wise majority vote
  mclass <- do.call("cbind", zpred)
  predClass <- apply(mclass, 1, mod)
  ## classification error, computed only when true labels are supplied
  if (is.null(ytest)) {
    return(list("PredClass" = predClass))
  }
  else {
    conf <- table("True.Class" = as.numeric(ytest) - 1,
                  "Predicted.Class" = as.numeric(predClass) - 1)
    err <- 1 - (sum(diag(conf)) / nrow(xtest))
    return(list("PredClass" = predClass, "ConfMatrix" = conf, "ClassError" = err))
  }
}
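A toy view of the majority vote performed by mod and apply above, on invented numbers: each column holds one model's 0/1 predictions for four test cases, and each row reduces to its most frequent label.

## three models' 0/1 votes for four test cases (columns = models)
votes <- cbind(c(1, 0, 1, 0), c(1, 1, 0, 0), c(0, 1, 1, 0))
mod <- function(x) { vt <- table(x); as.numeric(names(vt[vt == max(vt)])) }
apply(votes, 1, mod)   ## 1 1 1 0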
40 changes: 40 additions & 0 deletions R/Predict.esknnProb.R
@@ -0,0 +1,40 @@
Predict.esknnProb <-
function(optModels, xtest, ytest = NULL, k = NULL)
{
  k <- ifelse(is.null(k), 3, k)
  zprob <- list()
  ## keep an odd number of models, as in the classification ensemble
  len <- length(optModels$fsfinal)
  if (len %% 2 == 0)
    len <- len - 1
  ## predict the test data with each model of the final ensemble
  for (z in 1:len) {
    ## subset the test features to the z-th model's feature set without
    ## overwriting xtest (reassigning it here would corrupt later iterations)
    fit <- knn3Train(optModels$trainfinal[[z]][, names(optModels$trainfinal[[z]]) != "Class"],
                     xtest[, optModels$fsfinal[[z]], drop = FALSE],
                     optModels$trainfinal[[z]]$Class, k = k)
    ## extract the second-class probability vector from knn3Train
    zprob[[z]] <- attributes(fit)$prob[, 2]  ## class probabilities
  }
  ## bind the z selected models' probabilities and average them row-wise
  mprob <- do.call("cbind", zprob)
  predProb <- as.vector(apply(mprob, 1, mean))
  ## Brier score, computed only when true labels are supplied
  if (is.null(ytest)) {
    return(list("PredProb" = predProb))
  }
  else {
    ytest <- as.numeric(as.factor(ytest)) - 1
    BS <- mean((predProb - ytest)^2)
    return(list("PredProb" = predProb, "BrierScore" = BS))
  }
}
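On invented toy numbers, the averaging and Brier score steps above reduce to a few lines: per-model second-class probabilities are averaged row-wise, and the mean squared distance to the 0/1 recoded labels is the Brier score.

mprob <- cbind(c(0.8, 0.1, 0.5), c(0.6, 0.3, 0.7))  ## two models, three cases
predProb <- apply(mprob, 1, mean)                   ## 0.7 0.2 0.6
y01 <- as.numeric(as.factor(c("b", "a", "b"))) - 1  ## recoded labels: 1 0 1
mean((predProb - y01)^2)                            ## (0.09 + 0.04 + 0.16)/3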
122 changes: 122 additions & 0 deletions R/esknnClass.R
@@ -0,0 +1,122 @@
esknnClass <-
function(xtrain, ytrain, k=NULL, q=NULL, m=NULL, ss=NULL)
{
  k  <- ifelse(is.null(k), 3, k)
  q  <- ifelse(is.null(q), 0.2, q)
  m  <- ifelse(is.null(m), 3, m)
  ss <- ifelse(is.null(ss), 3, ss)

  d <- ncol(xtrain)
  ## combine the feature matrix and class vector into one data frame
  train <- as.data.frame(cbind(xtrain, ytrain))
  names(train)[names(train) == "ytrain"] <- "Class"
  ## out-of-bag accuracy of the m kNN models
  macc <- c()
  ## bootstrap training sets and feature-subset positions of the m base models
  trainboot <- list()
  fp <- list()
  ## training data and feature subsets of the stage-one selected models
  training <- list()
  fs <- list()
  knclass <- list()
  ## hold out 5% of the training data for assessing the models by Brier score
  BStestp <- sample(1:nrow(train), 0.05 * nrow(train))
  BStest <- train[BStestp, ]
  ## feature part of the held-out set
  xBStest <- BStest[, names(BStest) != "Class"]
  train <- train[-BStestp, ]
  for (r in 1:m) {
    bp <- sample(1:nrow(train), replace = TRUE)  ## bootstrap sample positions
    bs <- train[bp, ]                            ## bootstrap sample
    fp[[r]] <- sample(1:d, ss, replace = FALSE)  ## feature subset positions
    ## out-of-bag sample serves as the test set for each model
    oob <- train[-bp, ]
    xtrainboot <- train[bp, fp[[r]]]
    ytrainboot <- bs$Class
    trainboot[[r]] <- as.data.frame(cbind(xtrainboot, ytrainboot))
    names(trainboot[[r]])[names(trainboot[[r]]) == "ytrainboot"] <- "Class"
    xoob <- oob[, fp[[r]]]
    yoob <- oob$Class
    ## fit kNN on the bootstrap sample and predict the out-of-bag data
    knpred <- knn3Train(xtrainboot, xoob, ytrainboot, k = k)
    ## extract the class vector from knn3Train
    knclass[[r]] <- as.factor(knpred[1:length(knpred)])  ## class labels
    conf <- table(knclass[[r]], yoob)
    macc[r] <- sum(diag(conf)) / length(yoob)
  }

  ## stage one: rank models by out-of-bag accuracy and keep the top q proportion
  order1 <- order(macc, decreasing = TRUE)
  training <- trainboot[order1]
  fs <- fp[order1]
  NE <- round(q * length(order1)) + 1
  filtermodel <- list()
  training2 <- list()
  fs2 <- list()
  ## Brier score of the selected models
  fBS <- c()
  p <- list()

  for (l in 1:NE) {
    filtermodel[[l]] <- knn3Train(training[[l]][, names(training[[l]]) != "Class"],
                                  xBStest[, fs[[l]]], training[[l]]$Class, k = k)
    training2[[l]] <- as.data.frame(training[[l]])
    fs2[[l]] <- fs[[l]]
    ## extract class probabilities from knn3Train
    p[[l]] <- attributes(filtermodel[[l]])$prob[, 2]
    pred <- as.vector(p[[l]])
    ## Brier score of the l-th kNN model on the held-out set
    fBS[l] <- mean((pred - (as.numeric(BStest$Class) - 1))^2)
  }

  ## rank the models by their Brier score (lower is better)
  order2 <- order(fBS, decreasing = FALSE)
  fBS <- fBS[order2]
  trainfinal2 <- training2[order2]
  fs2final <- fs2[order2]
  p <- p[order2]
  fsfinal <- list()
  fsfinal[[1]] <- fs2final[[1]]
  bfresult <- list()
  wac <- c()
  wac[1] <- fBS[1]
  wclass <- p[[1]]
  trainfinal <- list()
  trainfinal[[1]] <- as.data.frame(trainfinal2[[1]])

  ## stage two: add a model (in Brier-score order) only if the aggregated
  ## probabilities improve on the Brier score achieved so far
  for (j in 1:(NE - 1)) {
    wclass <- matrix(c(as.vector(wclass), p[[j + 1]]), nrow = length(BStest$Class))
    ## aggregated result over the models considered so far
    bfresult[[j]] <- apply(wclass, 1, mean)
    wac[j + 1] <- mean((as.vector(bfresult[[j]]) - (as.numeric(BStest$Class) - 1))^2)
    if (wac[j + 1] < wac[j]) {
      trainfinal[[j + 1]] <- as.data.frame(trainfinal2[[j + 1]])
      fsfinal[[j + 1]] <- fs2final[[j + 1]]
    }
  }
  ## remove NULL entries from the lists of training sets and feature subsets
  fsfinal[sapply(fsfinal, is.null)] <- NULL
  trainfinal[sapply(trainfinal, is.null)] <- NULL
  ## return the selected training sets and feature subsets
  return(list("trainfinal" = trainfinal, "fsfinal" = fsfinal))
}
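Stage one above rests on the bootstrap/out-of-bag split; on a hypothetical 10-row training set, that split looks like this, independent of the function:

## rows drawn with replacement train the model; rows never drawn test it
set.seed(1)
bp  <- sample(1:10, replace = TRUE)  ## bootstrap positions (with duplicates)
oob <- (1:10)[-bp]                   ## out-of-bag positions, as in train[-bp, ]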
125 changes: 125 additions & 0 deletions R/esknnProb.R
@@ -0,0 +1,125 @@
esknnProb <-
function(xtrain, ytrain, k=NULL, q=NULL, m=NULL, ss=NULL)
{
  k  <- ifelse(is.null(k), 3, k)
  q  <- ifelse(is.null(q), 0.2, q)
  m  <- ifelse(is.null(m), 3, m)
  ss <- ifelse(is.null(ss), 3, ss)

  d <- ncol(xtrain)
  ## combine the feature matrix and class vector into one data frame
  train <- as.data.frame(cbind(xtrain, ytrain))
  names(train)[names(train) == "ytrain"] <- "Class"
  ## out-of-bag accuracy of the m kNN models
  macc <- c()
  ## bootstrap training sets and feature-subset positions of the m base models
  trainboot <- list()
  fp <- list()
  ## training data and feature subsets of the stage-one selected models
  training <- list()
  fs <- list()
  knclass <- list()
  ## hold out 5% of the training data for assessing the models by Brier score
  BStestp <- sample(1:nrow(train), 0.05 * nrow(train))
  BStest <- train[BStestp, ]
  ## feature part of the held-out set
  xBStest <- BStest[, names(BStest) != "Class"]
  train <- train[-BStestp, ]
  for (r in 1:m) {
    bp <- sample(1:nrow(train), replace = TRUE)  ## bootstrap sample positions
    bs <- train[bp, ]                            ## bootstrap sample
    fp[[r]] <- sample(1:d, ss, replace = FALSE)  ## feature subset positions
    ## out-of-bag sample serves as the test set for each model
    oob <- train[-bp, ]
    xtrainboot <- train[bp, fp[[r]]]
    ytrainboot <- bs$Class
    trainboot[[r]] <- as.data.frame(cbind(xtrainboot, ytrainboot))
    names(trainboot[[r]])[names(trainboot[[r]]) == "ytrainboot"] <- "Class"
    xoob <- oob[, fp[[r]]]
    yoob <- oob$Class
    ## fit kNN on the bootstrap sample and predict the out-of-bag data
    knpred <- knn3Train(xtrainboot, xoob, ytrainboot, k = k)
    ## extract the class vector from knn3Train
    knclass[[r]] <- as.factor(knpred[1:length(knpred)])  ## class labels
    conf <- table(knclass[[r]], yoob)
    macc[r] <- sum(diag(conf)) / length(yoob)
  }

  ## stage one: rank models by out-of-bag accuracy and keep the top q proportion
  order1 <- order(macc, decreasing = TRUE)
  training <- trainboot[order1]
  fs <- fp[order1]
  NE <- round(q * length(order1)) + 1
  filtermodel <- list()
  training2 <- list()
  fs2 <- list()
  ## Brier score of the selected models
  fBS <- c()
  p <- list()

  for (l in 1:NE) {
    filtermodel[[l]] <- knn3Train(training[[l]][, names(training[[l]]) != "Class"],
                                  xBStest[, fs[[l]]], training[[l]]$Class, k = k)
    training2[[l]] <- as.data.frame(training[[l]])
    fs2[[l]] <- fs[[l]]
    ## extract class probabilities from knn3Train
    p[[l]] <- attributes(filtermodel[[l]])$prob[, 2]
    pred <- as.vector(p[[l]])
    ## Brier score of the l-th kNN model on the held-out set
    fBS[l] <- mean((pred - (as.numeric(BStest$Class) - 1))^2)
  }

  ## rank the models by their Brier score (lower is better)
  order2 <- order(fBS, decreasing = FALSE)
  fBS <- fBS[order2]
  trainfinal2 <- training2[order2]
  fs2final <- fs2[order2]
  p <- p[order2]
  fsfinal <- list()
  fsfinal[[1]] <- fs2final[[1]]
  bfresult <- list()
  wac <- c()
  wac[1] <- fBS[1]
  wclass <- p[[1]]
  trainfinal <- list()
  trainfinal[[1]] <- as.data.frame(trainfinal2[[1]])

  ## stage two: add a model (in Brier-score order) only if the aggregated
  ## probabilities improve on the Brier score achieved so far
  for (j in 1:(NE - 1)) {
    wclass <- matrix(c(as.vector(wclass), p[[j + 1]]), nrow = length(BStest$Class))
    ## aggregated result over the models considered so far
    bfresult[[j]] <- apply(wclass, 1, mean)
    wac[j + 1] <- mean((as.vector(bfresult[[j]]) - (as.numeric(BStest$Class) - 1))^2)
    if (wac[j + 1] < wac[j]) {
      trainfinal[[j + 1]] <- as.data.frame(trainfinal2[[j + 1]])
      fsfinal[[j + 1]] <- fs2final[[j + 1]]
    }
  }
  ## remove NULL entries from the lists of training sets and feature subsets
  fsfinal[sapply(fsfinal, is.null)] <- NULL
  trainfinal[sapply(trainfinal, is.null)] <- NULL
  ## return the selected training sets and feature subsets
  return(list("trainfinal" = trainfinal, "fsfinal" = fsfinal))
}
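The stage-two rule in both training functions admits a model only when adding it lowers the held-out Brier score; a toy trace with invented numbers:

y  <- c(1, 0, 1)                               ## 0/1 recoded held-out labels
p1 <- c(0.7, 0.4, 0.6)                         ## best single model's probabilities
p2 <- c(0.9, 0.1, 0.8)                         ## candidate model's probabilities
bs1  <- mean((p1 - y)^2)                       ## ~0.137
bs12 <- mean((rowMeans(cbind(p1, p2)) - y)^2)  ## ~0.064
bs12 < bs1                                     ## TRUE, so the candidate is kept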
Binary file added data/hepatitis.rda
Binary file not shown.
Binary file added data/sonar.rda
Binary file not shown.
23 changes: 23 additions & 0 deletions man/ESKNN-package.Rd
@@ -0,0 +1,23 @@
\name{ESkNN-package}
\alias{ESkNN-package}
\alias{ESkNN}
\docType{package}
\title{ Ensemble of Subset of K-Nearest Neighbours Classifiers for Classification and Class Membership Probability Estimation
}
\description{Functions for building an ensemble of optimal k-nearest neighbours (kNN) models for classification and class membership probability estimation are provided. To address the issue of non-informative features in the data, a set of base kNN models is generated and a subset of these models is selected for the ensemble based on their individual and combined performance. Out-of-bag data and an independent training data set are used to assess the performance of the models individually and collectively. The prediction functions return class labels and class membership probability estimates, together with other measures such as the confusion matrix, classification error rate, and Brier score.
}
\details{
\tabular{ll}{
Package: \tab ESKNN\cr
Type: \tab Package\cr
Version: \tab 1.0\cr
Date: \tab 2015-09-13\cr
License: \tab GPL (>= 2)\cr
}

}
\author{Asma Gul, Aris Perperoglou, Zardad Khan, Osama Mahmoud, Miftahuddin, Werner Adler, and Berthold Lausen
Maintainer: Asma Gul <agul@essex.ac.uk>
}
\references{Gul, A., Perperoglou, A., Khan, Z., Mahmoud, O., Miftahuddin, M., Adler, W. and Lausen, B. (2014), \emph{Ensemble of subset of k-nearest neighbours classifiers}, Journal name to appear.}
\keyword{ package }
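% A minimal usage sketch that mirrors the workflow described above; it assumes
% the bundled sonar data stores its response in a column named "Class" (an
% assumption; check the sonar help page), and all parameter values are defaults.
\examples{
data(sonar)
idx <- sample(1:nrow(sonar), 0.7 * nrow(sonar))
pfit <- esknnProb(sonar[idx, names(sonar) != "Class"], sonar$Class[idx])
pout <- Predict.esknnProb(pfit, sonar[-idx, names(sonar) != "Class"],
                          sonar$Class[-idx])
pout$BrierScore
}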
