library(h2o)

# Start (or connect to) a local H2O cluster
h2o.init()
# Import the Craigslist job titles dataset (job category + free-text job title)
job.titles.path <- "https://raw.githubusercontent.com/h2oai/sparkling-water/rel-1.6/examples/smalldata/craigslistJobTitles.csv"
job.titles <- h2o.importFile(job.titles.path, destination_frame = "jobtitles",
                             col.names = c("category", "jobtitle"),
                             col.types = c("Enum", "String"), header = TRUE)
# Common English stop words to drop during tokenization
STOP_WORDS = c("ax","i","you","edu","s","t","m","subject","can","lines","re","what",
               "there","all","we","one","the","a","an","of","or","in","for","by","on",
               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so")
# Split sentences into cleaned word tokens suitable for word2vec training
tokenize <- function(sentences, stop.words = STOP_WORDS) {
  tokenized <- h2o.tokenize(sentences, "\\\\W+")

  # convert to lower case
  tokenized.lower <- h2o.tolower(tokenized)
  # remove short words (less than 2 characters)
  tokenized.lengths <- h2o.nchar(tokenized.lower)
  tokenized.filtered <- tokenized.lower[is.na(tokenized.lengths) | tokenized.lengths >= 2, ]
  # remove words that contain numbers
  tokenized.words <- tokenized.filtered[h2o.grep("[0-9]", tokenized.filtered, invert = TRUE, output.logical = TRUE), ]
  # remove stop words (use the stop.words argument rather than the global constant)
  tokenized.words[is.na(tokenized.words) | (! tokenized.words %in% stop.words), ]
}
# Score a single free-text job title: tokenize it, average its word vectors
# with the word2vec model, then classify the result with the GBM
predict <- function(job.title, w2v, gbm) {
  words <- tokenize(as.character(as.h2o(job.title)))
  job.title.vec <- h2o.transform(w2v, words, aggregate_method = "AVERAGE")
  h2o.predict(gbm, job.title.vec)
}
print("Break job titles into sequence of words")
words <- tokenize(job.titles$jobtitle)
print("Build word2vec model")
w2v.model <- h2o.word2vec(words, sent_sample_rate = 0, epochs = 10)
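# Note (not in the original example): the embedding dimension defaults to 100;
# it can be set explicitly if desired, e.g.
#   h2o.word2vec(words, vec_size = 100, sent_sample_rate = 0, epochs = 10)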
print("Sanity check - find synonyms for the word 'teacher'")
print(h2o.findSynonyms(w2v.model, "teacher", count = 5))
print("Calculate a vector for each job title")
job.title.vecs <- h2o.transform(w2v.model, words, aggregate_method = "AVERAGE")
print("Prepare training&validation data (keep only job titles made of known words)")
valid.job.titles <- ! is.na(job.title.vecs$C1)
data <- h2o.cbind(job.titles[valid.job.titles, "category"], job.title.vecs[valid.job.titles, ])
data.split <- h2o.splitFrame(data, ratios = 0.8)
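# Note (not in the original example): h2o.splitFrame accepts a seed argument,
# e.g. h2o.splitFrame(data, ratios = 0.8, seed = 1234), to make the split reproducible.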
print("Build a basic GBM model")
gbm.model <- h2o.gbm(x = names(job.title.vecs), y = "category",
                     training_frame = data.split[[1]], validation_frame = data.split[[2]])
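# Optional check (not part of the original script): inspect how the GBM performs
# on the validation split before scoring new job titles.
print(h2o.performance(gbm.model, valid = TRUE))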
print("Predict!")
print(predict("school teacher having holidays every month", w2v.model, gbm.model))
print(predict("developer with 3+ Java experience, jumping", w2v.model, gbm.model))
print(predict("Financial accountant CPA preferred", w2v.model, gbm.model))