Permalink
Switch branches/tags
last_OK jenkins-tomk-hadoop-1 jenkins-tomas_jenkins-7 jenkins-tomas_jenkins-6 jenkins-tomas_jenkins-5 jenkins-tomas_jenkins-4 jenkins-tomas_jenkins-3 jenkins-tomas_jenkins-2 jenkins-tomas_jenkins-1 jenkins-sample-docs-3 jenkins-sample-docs-2 jenkins-sample-docs-1 jenkins-rel-wright-3 jenkins-rel-wright-2 jenkins-rel-wright-1 jenkins-rel-wolpert-11 jenkins-rel-wolpert-10 jenkins-rel-wolpert-9 jenkins-rel-wolpert-8 jenkins-rel-wolpert-7 jenkins-rel-wolpert-6 jenkins-rel-wolpert-5 jenkins-rel-wolpert-4 jenkins-rel-wolpert-3 jenkins-rel-wolpert-2 jenkins-rel-wolpert-1 jenkins-rel-wheeler-12 jenkins-rel-wheeler-11 jenkins-rel-wheeler-10 jenkins-rel-wheeler-9 jenkins-rel-wheeler-8 jenkins-rel-wheeler-7 jenkins-rel-wheeler-6 jenkins-rel-wheeler-5 jenkins-rel-wheeler-4 jenkins-rel-wheeler-3 jenkins-rel-wheeler-2 jenkins-rel-wheeler-1 jenkins-rel-weierstrass-7 jenkins-rel-weierstrass-6 jenkins-rel-weierstrass-5 jenkins-rel-weierstrass-4 jenkins-rel-weierstrass-3 jenkins-rel-weierstrass-2 jenkins-rel-weierstrass-1 jenkins-rel-vapnik-1 jenkins-rel-vajda-4 jenkins-rel-vajda-3 jenkins-rel-vajda-2 jenkins-rel-vajda-1 jenkins-rel-ueno-12 jenkins-rel-ueno-11 jenkins-rel-ueno-10 jenkins-rel-ueno-9 jenkins-rel-ueno-8 jenkins-rel-ueno-7 jenkins-rel-ueno-6 jenkins-rel-ueno-5 jenkins-rel-ueno-4 jenkins-rel-ueno-3 jenkins-rel-ueno-2 jenkins-rel-ueno-1 jenkins-rel-tverberg-6 jenkins-rel-tverberg-5 jenkins-rel-tverberg-4 jenkins-rel-tverberg-3 jenkins-rel-tverberg-2 jenkins-rel-tverberg-1 jenkins-rel-tutte-2 jenkins-rel-tutte-1 jenkins-rel-turnbull-2 jenkins-rel-turnbull-1 jenkins-rel-turing-10 jenkins-rel-turing-9 jenkins-rel-turing-8 jenkins-rel-turing-7 jenkins-rel-turing-6 jenkins-rel-turing-5 jenkins-rel-turing-4 jenkins-rel-turing-3 jenkins-rel-turing-2 jenkins-rel-turing-1 jenkins-rel-turin-4 jenkins-rel-turin-3 jenkins-rel-turin-2 jenkins-rel-turin-1 jenkins-rel-turchin-11 jenkins-rel-turchin-10 jenkins-rel-turchin-9 jenkins-rel-turchin-8 jenkins-rel-turchin-7 jenkins-rel-turchin-6 jenkins-rel-turchin-5 jenkins-rel-turchin-4 jenkins-rel-turchin-3 jenkins-rel-turchin-2 jenkins-rel-turchin-1 jenkins-rel-turan-4 jenkins-rel-turan-3 jenkins-rel-turan-2
Nothing to show
Find file Copy path
743ea66 Oct 26, 2017
@ericeckstrand @wendycwong @arnocandel
135 lines (113 sloc) 6.4 KB
library(h2o)
h2o.init()
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = FALSE
locate_source <- function(s) {
if (data_source_is_s3)
myPath <- paste0("s3n://h2o-public-test-data/", s)
else
myPath <- h2o:::.h2o.locate(s)
}
# Takes column in the specified format 'MM/dd/yyyy hh:mm:ss a' and refines
# it into 8 columns: "Day", "Month", "Year", "WeekNum", "WeekDay", "Weekend",
# "Season", "HourOfDay"
ComputeDateCols <- function(col, datePattern, dateTimeZone = "Etc/UTC") {
if(nzchar(dateTimeZone) > 0) h2o.setTimezone(dateTimeZone)
## it's now already auto-detected as date, no need to convert to a date column
## d <- as.Date(col, format = datePattern)
d <- col
ds <- c(Day = h2o.day(d), Month = h2o.month(d), Year = h2o.year(d), WeekNum = h2o.week(d),
WeekDay = h2o.dayOfWeek(d), HourOfDay = h2o.hour(d))
# Indicator column of whether day is on the weekend
ds$Weekend <- ifelse(ds$WeekDay == "Sun" | ds$WeekDay == "Sat", 1, 0)
# ds$Weekend <- as.factor(ds$Weekend)
# Categorical column of season: Spring = 0, Summer = 1, Autumn = 2, Winter = 3
ds$Season <- ifelse(ds$Month >= 2 & ds$Month <= 4, 0, # Spring = Mar, Apr, May
ifelse(ds$Month >= 5 & ds$Month <= 7, 1, # Summer = Jun, Jul, Aug
ifelse(ds$Month >= 8 & ds$Month <= 9, 2, 3))) # Autumn = Sep, Oct
ds$Season <- as.factor(ds$Season)
h2o.setLevels(ds$Season, c("Spring", "Summer", "Autumn", "Winter"))
# ds$Season <- cut(ds$Month, breaks = c(-1, 1, 4, 6, 9, 11), labels = c("Winter", "Spring", "Summer", "Autumn", "Winter"))
return(ds)
}
RefineDateColumn <- function(train, dateCol, datePattern, dateTimeZone = "Etc/UTC") {
refinedDateCols <- ComputeDateCols(train[,dateCol], datePattern, dateTimeZone)
# mapply(function(val, nam) { do.call("$<-", list(train, nam, val)) }, refinedDateCols, names(refinedDateCols))
train$Day <- refinedDateCols$Day
train$Month <- refinedDateCols$Month + 1 # Since start counting from 0
train$Year <- refinedDateCols$Year + 1900 # Since indexed starting from 1900
train$WeekNum <- refinedDateCols$WeekNum
train$WeekDay <- refinedDateCols$WeekDay
train$HourOfDay <- refinedDateCols$HourOfDay
train$Weekend <- refinedDateCols$Weekend
train$Season <- refinedDateCols$Season
train
}
weather_path <- locate_source("smalldata/chicago/chicagoAllWeather.csv")
census_path <- locate_source("smalldata/chicago/chicagoCensus.csv")
crimes_path <- locate_source("smalldata/chicago/chicagoCrimes10k.csv.zip")
print("Import and parse data...")
weather <- h2o.importFile(path=weather_path, destination_frame="weather.hex")
crimes <- h2o.importFile(path=crimes_path, destination_frame="crimes.hex")
# census <- h2o.importFile(path=census_path, destination_frame="census.hex")
# TODO: Get rid of this once merging with string cols is supported. See PUBDEV-1188.
census_raw <- h2o.importFile(census_path, parse = FALSE)
census_setup <- h2o.parseSetup(census_raw)
census_setup$column_types[2] <- "Enum" # Change community area name col from string to enum
census <- h2o.parseRaw(census_raw, col.types = census_setup$column_types)
print("Set columns names to be syntactically valid in R")
names(census) <- make.names(names(census))
names(crimes) <- make.names(names(crimes))
print("Replace Date column with Year, Month, Day, Week, Hour, Weekend and Season")
crimes <- RefineDateColumn(crimes, which(colnames(crimes) == "Date"), datePattern = "%m/%d/%Y %I:%M:%S %p")
crimes$Date <- NULL # Remove redundant date columns
weather$date <- NULL
print("Merge crimes and census data on community area number")
names(census)[names(census) == "Community.Area.Number"] <- "Community.Area"
crimeMerge <- h2o.merge(crimes, census, all.x=TRUE, method="hash")
print("Merge crimes and weather data on month, day and year")
names(weather)[match(c("month", "day", "year"), names(weather))] <- c("Month", "Day", "Year")
crimeMerge <- h2o.merge(crimeMerge, weather, all.x=TRUE, method="hash")
print("Split final dataset into test/train (ratio = 20/80)")
# BUG: h2o.splitFrame call causes an NPE. See PUBDEV-1235.
# frs <- h2o.splitFrame(crimeMerge, ratios = c(0.8,0.2))
# train <- frs[1]
# test <- frs[2]
split <- h2o.runif(crimeMerge) # Useful when number of rows too large for R to handle
train <- crimeMerge[split <= 0.8,]
test <- crimeMerge[split > 0.8,]
print("Build a GBM model and score")
myY <- "Arrest"
myX <- setdiff(1:ncol(train), which(colnames(train) == myY))
gbmModel <- h2o.gbm(x = myX, y = myY, training_frame = train, validation_frame = test, ntrees = 10, max_depth = 6, distribution = "bernoulli")
print("Build a Deep Learning model and score")
dlModel <- h2o.deeplearning(x = myX, y = myY, training_frame = train, validation_frame = test, variable_importances = TRUE)
cat("\nModel performance:")
cat("\n\tGBM:\n\t\ttrain AUC = ", gbmModel@model$training_metric@metrics$AUC)
cat("\n\t\ttest AUC = ", gbmModel@model$validation_metric@metrics$AUC)
cat("\n\tDL:\n\t\ttrain AUC = ", dlModel@model$training_metric@metrics$AUC)
cat("\n\t\ttest AUC = ", dlModel@model$validation_metric@metrics$AUC, "\n")
print("Predict on new crime data")
crimeExamples.r <- data.frame(Date = c("02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"),
IUCR = c(1811, 1150),
Primary.Type = c("NARCOTICS", "DECEPTIVE PRACTICE"),
Location.Description = c("STREET", "RESIDENCE"),
Domestic = c("false", "false"),
Beat = c(422, 923),
District = c(4, 9),
Ward = c(7, 14),
Community.Area = c(46, 63),
FBI.Code = c(18, 11))
crimeExamples <- as.h2o(crimeExamples.r)
names(crimeExamples) <- make.names(names(crimeExamples))
crimeExamples <- RefineDateColumn(crimeExamples, which(colnames(crimeExamples) == "Date"), datePattern = "%m/%d/%Y %I:%M:%S %p")
crimeExamples$Date <- NULL # Remove redundant date columns
crimeExamplesMerge <- h2o.merge(crimeExamples, census, all.x=TRUE, method="hash")
predGBM <- predict(gbmModel, crimeExamplesMerge)
predDL <- predict(dlModel, crimeExamplesMerge)
for(i in 1:nrow(crimeExamples)) {
cat("\nCrime:\n"); print(crimeExamples[i,])
cat("\n\tProbability of arrest using GBM:", as.matrix(predGBM$true[i,]))
cat("\n\tProbability of arrest using Deep Learning:", as.matrix(predDL$true[i,]), "\n")
}