Permalink
Browse files

Bring web-tag example up to date, read a bit better

  • Loading branch information...
1 parent c782126 commit 29b057d678e53b2ba0903c52ee647d63bcd74f72 @dakrone committed Mar 4, 2012
Showing with 12 additions and 14 deletions.
  1. +12 −14 examples/web-tag.clj
View
@@ -1,17 +1,19 @@
(ns examples.web-tag
- (:use [opennlp.nlp]) ; http://github.com/dakrone/clojure-opennlp
+ (:use [opennlp.nlp])
(:use [clojure.pprint :only [pprint]]))
+(def get-sentences (make-sentence-detector "models/en-sent.bin"))
+(def tokenize (make-tokenizer "models/en-token.bin"))
+(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
+
(defn strip-html-tags
"Messily strip html tags from a web page"
[string]
- (.replaceAll
- (.replaceAll
- (.replaceAll
- (.replaceAll string "<script .*?>.*?</script>" " ")
- "<style .*?>.*?</style>" " ")
- "<.*?>" " ")
- "[ ]+" " "))
+ (-> string
+ (.replaceAll "<script .*?>.*?</script>" " ")
+ (.replaceAll "<style .*?>.*?</style>" " ")
+ (.replaceAll "<.*?>" " ")
+ (.replaceAll "[ ]+" " ")))
(defn fetch-page
[url]
@@ -22,19 +24,15 @@
[url]
(strip-html-tags (fetch-page url)))
-(def get-sentences (make-sentence-detector "models/en-sent.bin"))
-(def tokenize (make-tokenizer "models/en-token.bin"))
-(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
-
(defn- tag-sentences
[sent-seq]
(map #(pos-tag (tokenize %)) sent-seq))
(defn tag-page
[url]
- (let [page (fetch-plain-page url)
+ (let [page (fetch-plain-page url)
sentences (get-sentences page)
- sent-seq (partition-all 10 sentences)]
+ sent-seq (partition-all 10 sentences)]
(pmap tag-sentences sent-seq)))
(tag-page "http://writequit.org")

0 comments on commit 29b057d

Please sign in to comment.