-
Notifications
You must be signed in to change notification settings - Fork 2k
/
rdemo.chicago.crime.large.R
134 lines (113 loc) · 6.35 KB
/
rdemo.chicago.crime.large.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library(h2o)
h2o.init()
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = FALSE
locate_source <- function(s) {
if (data_source_is_s3)
myPath <- paste0("s3n://h2o-public-test-data/", s)
else
myPath <- h2o:::.h2o.locate(s)
}
# Takes column in the specified format 'MM/dd/yyyy hh:mm:ss a' and refines
# it into 8 columns: "Day", "Month", "Year", "WeekNum", "WeekDay", "Weekend",
# "Season", "HourOfDay"
ComputeDateCols <- function(col, datePattern, dateTimeZone = "Etc/UTC") {
if(nzchar(dateTimeZone) > 0) h2o.setTimezone(dateTimeZone)
## it's now already auto-detected as date, no need to convert to a date column
## d <- as.Date(col, format = datePattern)
d <- col
ds <- c(Day = h2o.day(d), Month = h2o.month(d), Year = h2o.year(d), WeekNum = h2o.week(d),
WeekDay = h2o.dayOfWeek(d), HourOfDay = h2o.hour(d))
# Indicator column of whether day is on the weekend
ds$Weekend <- ifelse(ds$WeekDay == "Sun" | ds$WeekDay == "Sat", 1, 0)
# ds$Weekend <- as.factor(ds$Weekend)
# Categorical column of season: Spring = 0, Summer = 1, Autumn = 2, Winter = 3
ds$Season <- ifelse(ds$Month >= 2 & ds$Month <= 4, 0, # Spring = Mar, Apr, May
ifelse(ds$Month >= 5 & ds$Month <= 7, 1, # Summer = Jun, Jul, Aug
ifelse(ds$Month >= 8 & ds$Month <= 9, 2, 3))) # Autumn = Sep, Oct
ds$Season <- as.factor(ds$Season)
h2o.setLevels(ds$Season, c("Spring", "Summer", "Autumn", "Winter"))
# ds$Season <- cut(ds$Month, breaks = c(-1, 1, 4, 6, 9, 11), labels = c("Winter", "Spring", "Summer", "Autumn", "Winter"))
return(ds)
}
RefineDateColumn <- function(train, dateCol, datePattern, dateTimeZone = "Etc/UTC") {
refinedDateCols <- ComputeDateCols(train[,dateCol], datePattern, dateTimeZone)
# mapply(function(val, nam) { do.call("$<-", list(train, nam, val)) }, refinedDateCols, names(refinedDateCols))
train$Day <- refinedDateCols$Day
train$Month <- refinedDateCols$Month + 1 # Since start counting from 0
train$Year <- refinedDateCols$Year + 1900 # Since indexed starting from 1900
train$WeekNum <- refinedDateCols$WeekNum
train$WeekDay <- refinedDateCols$WeekDay
train$HourOfDay <- refinedDateCols$HourOfDay
train$Weekend <- refinedDateCols$Weekend
train$Season <- refinedDateCols$Season
train
}
weather_path <- locate_source("smalldata/chicago/chicagoAllWeather.csv")
census_path <- locate_source("smalldata/chicago/chicagoCensus.csv")
crimes_path <- locate_source("smalldata/chicago/chicagoCrimes10k.csv.zip")
print("Import and parse data...")
weather <- h2o.importFile(path=weather_path, destination_frame="weather.hex")
crimes <- h2o.importFile(path=crimes_path, destination_frame="crimes.hex")
# census <- h2o.importFile(path=census_path, destination_frame="census.hex")
# TODO: Get rid of this once merging with string cols is supported. See PUBDEV-1188.
census_raw <- h2o.importFile(census_path, parse = FALSE)
census_setup <- h2o.parseSetup(census_raw)
census_setup$column_types[2] <- "Enum" # Change community area name col from string to enum
census <- h2o.parseRaw(census_raw, col.types = census_setup$column_types)
print("Set columns names to be syntactically valid in R")
names(census) <- make.names(names(census))
names(crimes) <- make.names(names(crimes))
print("Replace Date column with Year, Month, Day, Week, Hour, Weekend and Season")
crimes <- RefineDateColumn(crimes, which(colnames(crimes) == "Date"), datePattern = "%m/%d/%Y %I:%M:%S %p")
crimes$Date <- NULL # Remove redundant date columns
weather$date <- NULL
print("Merge crimes and census data on community area number")
names(census)[names(census) == "Community.Area.Number"] <- "Community.Area"
crimeMerge <- h2o.merge(crimes, census, all.x=TRUE)
print("Merge crimes and weather data on month, day and year")
names(weather)[match(c("month", "day", "year"), names(weather))] <- c("Month", "Day", "Year")
crimeMerge <- h2o.merge(crimeMerge, weather, all.x=TRUE)
print("Split final dataset into test/train (ratio = 20/80)")
# BUG: h2o.splitFrame call causes an NPE. See PUBDEV-1235.
# frs <- h2o.splitFrame(crimeMerge, ratios = c(0.8,0.2))
# train <- frs[1]
# test <- frs[2]
split <- h2o.runif(crimeMerge) # Useful when number of rows too large for R to handle
train <- crimeMerge[split <= 0.8,]
test <- crimeMerge[split > 0.8,]
print("Build a GBM model and score")
myY <- "Arrest"
myX <- setdiff(1:ncol(train), which(colnames(train) == myY))
gbmModel <- h2o.gbm(x = myX, y = myY, training_frame = train, validation_frame = test, ntrees = 10, max_depth = 6, distribution = "bernoulli")
print("Build a Deep Learning model and score")
dlModel <- h2o.deeplearning(x = myX, y = myY, training_frame = train, validation_frame = test, variable_importances = TRUE)
cat("\nModel performance:")
cat("\n\tGBM:\n\t\ttrain AUC = ", gbmModel@model$training_metric@metrics$AUC)
cat("\n\t\ttest AUC = ", gbmModel@model$validation_metric@metrics$AUC)
cat("\n\tDL:\n\t\ttrain AUC = ", dlModel@model$training_metric@metrics$AUC)
cat("\n\t\ttest AUC = ", dlModel@model$validation_metric@metrics$AUC, "\n")
print("Predict on new crime data")
crimeExamples.r <- data.frame(Date = c("02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"),
IUCR = c(1811, 1150),
Primary.Type = c("NARCOTICS", "DECEPTIVE PRACTICE"),
Location.Description = c("STREET", "RESIDENCE"),
Domestic = c("false", "false"),
Beat = c(422, 923),
District = c(4, 9),
Ward = c(7, 14),
Community.Area = c(46, 63),
FBI.Code = c(18, 11))
crimeExamples <- as.h2o(crimeExamples.r)
names(crimeExamples) <- make.names(names(crimeExamples))
crimeExamples <- RefineDateColumn(crimeExamples, which(colnames(crimeExamples) == "Date"), datePattern = "%m/%d/%Y %I:%M:%S %p")
crimeExamples$Date <- NULL # Remove redundant date columns
crimeExamplesMerge <- h2o.merge(crimeExamples, census, all.x=TRUE)
predGBM <- predict(gbmModel, crimeExamplesMerge)
predDL <- predict(dlModel, crimeExamplesMerge)
for(i in 1:nrow(crimeExamples)) {
cat("\nCrime:\n"); print(crimeExamples[i,])
cat("\n\tProbability of arrest using GBM:", as.matrix(predGBM$true[i,]))
cat("\n\tProbability of arrest using Deep Learning:", as.matrix(predDL$true[i,]), "\n")
}