Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
59 lines (46 sloc) 2.7 KB
library(h2o)
# Set up the H2O session used by every h2o.* call below.
h2o.init()
# Resolve the demo dataset through an internal (`:::`) h2o helper.
# NOTE(review): `.h2o.locate` is not part of the exported API — presumably it
# searches for the relative path in the H2O source tree; confirm before reuse.
job.titles.path <- h2o:::.h2o.locate("smalldata/craigslistJobTitles.csv")
# Import the CSV as a two-column frame: `category` is read as a categorical
# (Enum) column and `jobtitle` as a string column; the file's first row is
# treated as a header.
job.titles <- h2o.importFile(job.titles.path, destination_frame = "jobtitles",
col.names = c("category", "jobtitle"), col.types = c("Enum", "String"), header = TRUE)
# Default list of words that tokenize() discards.
# The entries, their order, and the repeated items ("in", "a", "by", "not",
# "from") are kept exactly as originally listed.
STOP_WORDS <- c(
  "ax", "i", "you", "edu", "s", "t", "m", "subject", "can", "lines", "re",
  "what", "there", "all", "we", "one", "the", "a", "an", "of", "or", "in",
  "for", "by", "on", "but", "is", "in", "a", "not", "with", "as", "was",
  "if", "they", "are", "this", "and", "it", "have", "from", "at", "my",
  "be", "by", "not", "that", "to", "from", "com", "org", "like", "likes",
  "so"
)
# Break sentences into a cleaned sequence of word tokens.
#
# Args:
#   sentences:  H2O string column to tokenize (handed directly to
#               h2o.tokenize).
#   stop.words: character vector of words to discard. Defaults to STOP_WORDS.
#               This argument is now honoured; previously the body always
#               filtered against the global STOP_WORDS, ignoring the caller.
#
# Returns:
#   The token frame after lower-casing and dropping tokens shorter than two
#   characters, tokens containing a digit, and stop words. Rows that are NA
#   are deliberately kept by the length and stop-word filters.
#   NOTE(review): the NA rows presumably mark sentence boundaries produced by
#   h2o.tokenize — confirm against the h2o documentation.
tokenize <- function(sentences, stop.words = STOP_WORDS) {
  tokenized <- h2o.tokenize(sentences, "\\\\W+")
  # convert to lower case
  tokenized.lower <- h2o.tolower(tokenized)
  # remove short words (less than 2 characters), keeping NA rows
  # NOTE(review): `||` is applied to H2O frames here and is assumed to resolve
  # to h2o's own element-wise operator; confirm before replacing it with `|`.
  tokenized.lengths <- h2o.nchar(tokenized.lower)
  tokenized.filtered <- tokenized.lower[is.na(tokenized.lengths) || tokenized.lengths >= 2,]
  # remove words that contain numbers
  tokenized.words <- tokenized.filtered[h2o.grep("[0-9]", tokenized.filtered, invert = TRUE, output.logical = TRUE),]
  # remove stop words (using the caller-supplied list), keeping NA rows
  tokenized.words[is.na(tokenized.words) || (! tokenized.words %in% stop.words),]
}
# Predict the category of a single job title.
# Named `.predict` because `predict` is already a generic in R's stats package.
#
# Args:
#   job.title: the job title text; converted to an H2O frame with as.h2o.
#   w2v:       word2vec model used to turn the title's tokens into a vector.
#   gbm:       model whose h2o.predict output is returned.
#
# Returns:
#   The result of h2o.predict for the averaged word vector of the title.
.predict <- function(job.title, w2v, gbm) {
  title.frame <- as.h2o(job.title)
  title.tokens <- tokenize(as.character(title.frame))
  # Collapse the token vectors into one vector for the title.
  title.vector <- h2o.transform(w2v, title.tokens, aggregate_method = "AVERAGE")
  h2o.predict(gbm, title.vector)
}
# --- Tokenize every job title with the default stop-word list ---
print("Break job titles into sequence of words")
words <- tokenize(job.titles$jobtitle)
# --- Train word embeddings on the token sequence ---
print("Build word2vec model")
w2v.model <- h2o.word2vec(words, sent_sample_rate = 0, epochs = 10)
# Print the 5 closest words to "teacher" as a quick quality check.
print("Sanity check - find synonyms for the word 'teacher'")
print(h2o.findSynonyms(w2v.model, "teacher", count = 5))
# --- Aggregate word vectors (AVERAGE) into one vector per job title ---
print("Calculate a vector for each job title")
job.title.vecs <- h2o.transform(w2v.model, words, aggregate_method = "AVERAGE")
print("Prepare training&validation data (keep only job titles made of known words)")
# A title is kept only when the first vector component (C1) is not NA.
valid.job.titles <- ! is.na(job.title.vecs$C1)
# Put the label column next to the vector columns, using the same row mask
# for both frames so that rows stay aligned.
data <- h2o.cbind(job.titles[valid.job.titles, "category"], job.title.vecs[valid.job.titles, ])
# Split with ratio 0.8: the first part is used for training, the second for
# validation. No seed is passed here.
# NOTE(review): without a seed the split presumably differs between runs — confirm.
data.split <- h2o.splitFrame(data, ratios = 0.8)
print("Build a basic GBM model")
# Predictors are all word-vector columns; the response is `category`.
gbm.model <- h2o.gbm(x = names(job.title.vecs), y = "category",
training_frame = data.split[[1]], validation_frame = data.split[[2]])
# --- Classify a few free-text job titles with the trained models ---
print("Predict!")
print(.predict("school teacher having holidays every month", w2v.model, gbm.model))
print(.predict("developer with 3+ Java experience, jumping", w2v.model, gbm.model))
print(.predict("Financial accountant CPA preferred", w2v.model, gbm.model))
You can’t perform that action at this time.