-
Notifications
You must be signed in to change notification settings - Fork 2k
/
rdemo.citi.bike.small.R
188 lines (158 loc) · 7.57 KB
/
rdemo.citi.bike.small.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
library(h2o)
h2o.init()
# Explore a typical Data Science workflow with H2O and R
#
# Goal: assist the manager of data of NYC to load-balance the bicycles
# across the data network of stations, by predicting the number of bike
# trips taken from the station every day. Use 10 million rows of historical
# data, and eventually add weather data.
# Connect to a cluster
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = FALSE
locate_source <- function(s) {
if (data_source_is_s3)
myPath <- paste0("s3n://h2o-public-test-data/", s)
else
myPath <- h2o:::.h2o.locate(s)
}
# Pick either the big or the small demo.
# Big data is 10M rows
small_test <- locate_source("smalldata/demos/citibike_20k.csv")
# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end
# station, trip duration and trip start time and day. The larger dataset
# totals about 10 million rows
print("Import and Parse bike data...")
start <- Sys.time()
data <- h2o.importFile(path = small_test, destination_frame = "citi_bike")
parseTime <- Sys.time() - start
print(paste("Took", round(parseTime, digits = 2), units(parseTime),"to parse",
nrow(data), "rows and", ncol(data), "columns."))
# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.
print('Calculate the dates and day of week based on starttime')
secsPerDay <- 1000*60*60*24
starttime <- data$starttime
data$days <- floor(starttime/secsPerDay)
data$year <- year(starttime) + 1900
data$month <- month(starttime)
data$dayofweek <- dayOfWeek(starttime)
data$day <- day(starttime)
data$age <- data$year - data$"birth year"
print ('Group data into station & day combinations...')
start <- Sys.time()
bpd <- h2o.group_by(data, by = c("days","start station name"), nrow("day") , mean("tripduration"), mean("age"))
groupTime <- Sys.time() - start
print(paste("Took", round(groupTime, digits = 2), units(groupTime), "to group",
nrow(data), "data points into", nrow(bpd), "points."))
names(bpd) <- c("Days","start station name", "bike_count", "mean_duration", "mean_age")
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
secs <- bpd$Days*secsPerDay
bpd$Month = as.factor(h2o.month(secs))
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd$DayOfWeek = h2o.dayOfWeek(secs)
print('Examine the distribution of the number of bike rides as well as the average day of riders per day...')
print(quantile(bpd$bike_count))
print(quantile(bpd$mean_age))
print(h2o.hist(bpd$bike_count))
print(h2o.hist(bpd$mean_age))
print(summary(bpd))
# 3- Fit a model on train; using test as validation
# Function for doing class test/train/holdout split
split_fit_predict <- function(data) {
r <- h2o.runif(data$Days)
train <- data[r < 0.6,]
test <- data[(r >= 0.6) & (r < 0.9),]
hold <- data[r >= 0.9,]
print(paste("Training data has", ncol(train), "columns and", nrow(train), "rows, test has",
nrow(test), "rows, holdout has", nrow(hold)))
myY <- "bike_count"
myX <- setdiff(names(train), myY)
# Run GBM
gbm <- h2o.gbm(x = myX,
y = myY,
training_frame = train,
validation_frame = test,
ntrees = 500,
max_depth = 6,
learn_rate = 0.1)
# Run DRF
drf <- h2o.randomForest(x = myX,
y = myY,
training_frame = train,
validation_frame = test,
ntrees = 250,
max_depth = 30)
# Run GLM
glm <- h2o.glm(x = myX,
y = myY,
training_frame = train,
validation_frame = test,
family = "poisson")
# 4- Score on holdout set & report
train_rmse_gbm <- h2o.rmse(gbm, train = TRUE)
test_rmse_gbm <- h2o.rmse(gbm, valid = TRUE)
hold_perf_gbm <- h2o.performance(model = gbm, newdata = hold)
hold_rmse_gbm <- h2o.rmse(object = hold_perf_gbm)
print(paste0("GBM rmse TRAIN = ", train_rmse_gbm, ", rmse TEST = ", test_rmse_gbm, ", rmse HOLDOUT = ",
hold_rmse_gbm))
train_rmse_drf <- h2o.rmse(drf, train = TRUE)
test_rmse_drf <- h2o.rmse(drf, valid = TRUE)
hold_perf_drf <- h2o.performance(model = drf, newdata = hold)
hold_rmse_drf <- h2o.rmse(object = hold_perf_drf)
print(paste0("DRF rmse TRAIN = ", train_rmse_drf, ", rmse TEST = ", test_rmse_drf, ", rmse HOLDOUT = ",
hold_rmse_drf))
train_rmse_glm <- h2o.rmse(glm, train = TRUE)
test_rmse_glm <- h2o.rmse(glm, valid = TRUE)
hold_perf_glm <- h2o.performance(model = glm, newdata = hold)
hold_rmse_glm <- h2o.rmse(hold_perf_glm)
print(paste0("GLM rmse TRAIN = ", train_rmse_glm, ", rmse TEST = ", test_rmse_glm, ", rmse HOLDOUT = ",
hold_rmse_glm))
}
# Split the data (into test & train), fit some models and predict on the holdout data
start <- Sys.time()
split_fit_predict(bpd)
modelBuild <- Sys.time() - start
print(paste("Took", round(modelBuild, digits = 2), units(modelBuild), "to build a gbm, a random forest, and a glm model, score and report rmse values."))
# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM. This means given just
# the station, the month, and the day-of-week we can predict 90% of the
# variance of the bike-trip-starts.
# 5- Now lets add some weather
# Load weather data
wthr1 <- h2o.importFile(path =
c(locate_source("smalldata/demos/31081_New_York_City__Hourly_2013.csv"),
locate_source("smalldata/demos/31081_New_York_City__Hourly_2014.csv")))
# Peek at the data
print(summary(wthr1))
# Lots of columns in there! Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature. Alas, drop the "snow" column since it's
# all NA's. Also add in dew point and humidity just in case. Slice out just
# the columns of interest and drop the rest.
wthr2 <- wthr1[, c("Year Local","Month Local","Day Local","Hour Local","Dew Point (C)",
"Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)",
"Weather Code 1/ Description")]
colnames(wthr2)[match("Precipitation One Hour (mm)", colnames(wthr2))] <- "Rain (mm)" # Shorter column name
names(wthr2)[match("Weather Code 1/ Description", colnames(wthr2))] <- "WC1" # Shorter column name
print(summary(wthr2))
# Much better!
# Filter down to the weather at Noon
wthr3 <- wthr2[ wthr2["Hour Local"]==12 ,]
# Also, most rain numbers are missing - lets assume those are zero rain days
wthr3[,"Rain (mm)"] <- ifelse(is.na(wthr3[,"Rain (mm)"]), 0, wthr3[,"Rain (mm)"])
names(wthr3) = c("year", "month", "day", names(wthr3)[4:9])
starttime = h2o.mktime(year=wthr3$year, month=wthr3$month-1, day=wthr3$day-1, hour=wthr3["Hour Local"])
wthr3$Days = floor(starttime/secsPerDay)
# 6 - Join the weather data-per-day to the bike-starts-per-day
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather <- h2o.merge(x = bpd, y = wthr3, all.x = T, all.y = F)
summary(bpd_with_weather)
print(bpd_with_weather)
dim(bpd_with_weather)
# 7 - Test/Train split again, model build again, this time with weather
start <- Sys.time()
split_fit_predict(bpd_with_weather)
modelBuild <- Sys.time() - start
print(paste("Took", round(modelBuild, digits = 2), units(modelBuild) ,"to build a gbm, a random forest, and a glm model, score and report rmse values."))