In [1]:
##
# @author Chris Bailey
#

# Train a linear SVM on the readmission dataset through the Spark helpers
# defined in svm.R, score the same rows, and print the squared-error total
# between predicted scores and the observed labels.
source("./src/main/R/svm.R")

spark <- spark.start()

# Everything after the session is started runs inside tryCatch() so that the
# Spark session is stopped even when a later step fails.
tryCatch({
  d <- read.csv('./data/readmit_200000.csv', header = TRUE)
  mat <- sparse.model.matrix(READMITTED ~ WEIGHT + SYSTOLIC + DIASTOLIC + as.factor(SERVICECODE) - 1, data = d)

  # Dimensions of the sparse design matrix.
  nRows     <- mat@Dim[1]
  nColumns  <- mat@Dim[2]
  nElements <- length(mat@i)
  # Raw compressed-column slots of the sparse matrix: row indices, column
  # pointers and the non-zero values.
  rowIndex  <- mat@i
  colBegin  <- mat@p
  matValue  <- mat@x

  # Labels. sparse.model.matrix() builds its rows through model.frame(), which
  # (with the default na.action = na.omit) drops every row that has a missing
  # value in any formula variable. Keep the labels of exactly those surviving
  # rows so that labels[i] belongs to row i of the matrix.
  modelVars <- c("READMITTED", "WEIGHT", "SYSTOLIC", "DIASTOLIC", "SERVICECODE")
  keepRows  <- complete.cases(d[modelVars])
  labels    <- as.numeric(d$READMITTED[keepRows])
  # Fail loudly instead of training on misaligned labels (e.g. when a
  # non-default na.action is in effect).
  stopifnot(length(labels) == nRows)

  modelMatrix <- create.model.matrix(nRows, nColumns, nElements, rowIndex, colBegin, matValue)
  # Consider increasing the number of partitions for larger datasets.
  nPartitions <- 100L
  trainData   <- create.training.dataset(spark, modelMatrix, nPartitions, labels)
  # NOTE(review): the test set is built from the same matrix as the training
  # set, so the figure printed below is an in-sample error.
  testData    <- create.test.dataset(spark, modelMatrix, nPartitions)

  # Create the SVM object. These hyper-parameters are meant to be tuned.
  maxIterations <- 50L
  tol           <- 1e-8
  regParam      <- 1e-8
  intercept     <- FALSE
  svm <- svm.create(maxIterations, tol, regParam, intercept)

  svm.train(svm, trainData)
  scores <- svm.predict(svm, testData)

  # Guard against silent recycling in the subtraction below.
  stopifnot(length(scores) == length(labels))

  # Sum of squared differences between scores and labels.
  # NOTE(review): this equals the number of misclassified rows only if scores
  # and labels share the same 0/1 coding -- confirm against svm.predict().
  misclassification <- sum((scores - labels)^2)
  print(misclassification)
}, finally = {
  spark$stop()
})

[1] 51967
