Skip to content

Commit

Permalink
Bring web-tag example up to date, read a bit better
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Mar 4, 2012
1 parent c782126 commit 29b057d
Showing 1 changed file with 12 additions and 14 deletions.
26 changes: 12 additions & 14 deletions examples/web-tag.clj
@@ -1,17 +1,19 @@
(ns examples.web-tag (ns examples.web-tag
(:use [opennlp.nlp]) ; http://github.com/dakrone/clojure-opennlp (:use [opennlp.nlp])
(:use [clojure.pprint :only [pprint]])) (:use [clojure.pprint :only [pprint]]))


(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))

(defn strip-html-tags (defn strip-html-tags
"Messily strip html tags from a web page" "Messily strip html tags from a web page"
[string] [string]
(.replaceAll (-> string
(.replaceAll (.replaceAll "<script .*?>.*?</script>" " ")
(.replaceAll (.replaceAll "<style .*?>.*?</style>" " ")
(.replaceAll string "<script .*?>.*?</script>" " ") (.replaceAll "<.*?>" " ")
"<style .*?>.*?</style>" " ") (.replaceAll "[ ]+" " ")))
"<.*?>" " ")
"[ ]+" " "))


(defn fetch-page (defn fetch-page
[url] [url]
Expand All @@ -22,19 +24,15 @@
[url] [url]
(strip-html-tags (fetch-page url))) (strip-html-tags (fetch-page url)))


(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))

(defn- tag-sentences (defn- tag-sentences
[sent-seq] [sent-seq]
(map #(pos-tag (tokenize %)) sent-seq)) (map #(pos-tag (tokenize %)) sent-seq))


(defn tag-page (defn tag-page
[url] [url]
(let [page (fetch-plain-page url) (let [page (fetch-plain-page url)
sentences (get-sentences page) sentences (get-sentences page)
sent-seq (partition-all 10 sentences)] sent-seq (partition-all 10 sentences)]
(pmap tag-sentences sent-seq))) (pmap tag-sentences sent-seq)))


(tag-page "http://writequit.org") (tag-page "http://writequit.org")

0 comments on commit 29b057d

Please sign in to comment.