Permalink
Browse files

Init

  • Loading branch information...
derek-schaefer committed Aug 25, 2012
0 parents commit fa8323dfb95b40ec95a13a8bfd37e84200b823dc
Showing with 18,014 additions and 0 deletions.
  1. +6 −0 .gitignore
  2. +15 −0 README
  3. +5 −0 project.clj
  4. +47 −0 src/clojure_classifier/classifier.clj
  5. +9 −0 src/clojure_classifier/core.clj
  6. +241 −0 src/clojure_classifier/tokenizer.clj
  7. +31 −0 src/java/org/tartarus/snowball/Among.java
  8. +432 −0 src/java/org/tartarus/snowball/SnowballProgram.java
  9. +7 −0 src/java/org/tartarus/snowball/SnowballStemmer.java
  10. +439 −0 src/java/org/tartarus/snowball/ext/danishStemmer.java
  11. +853 −0 src/java/org/tartarus/snowball/ext/dutchStemmer.java
  12. +1,330 −0 src/java/org/tartarus/snowball/ext/englishStemmer.java
  13. +1,050 −0 src/java/org/tartarus/snowball/ext/finnishStemmer.java
  14. +1,520 −0 src/java/org/tartarus/snowball/ext/frenchStemmer.java
  15. +733 −0 src/java/org/tartarus/snowball/ext/germanStemmer.java
  16. +1,174 −0 src/java/org/tartarus/snowball/ext/hungarianStemmer.java
  17. +1,196 −0 src/java/org/tartarus/snowball/ext/italianStemmer.java
  18. +374 −0 src/java/org/tartarus/snowball/ext/norwegianStemmer.java
  19. +922 −0 src/java/org/tartarus/snowball/ext/porterStemmer.java
  20. +1,132 −0 src/java/org/tartarus/snowball/ext/portugueseStemmer.java
  21. +1,040 −0 src/java/org/tartarus/snowball/ext/romanianStemmer.java
  22. +743 −0 src/java/org/tartarus/snowball/ext/russianStemmer.java
  23. +1,198 −0 src/java/org/tartarus/snowball/ext/spanishStemmer.java
  24. +365 −0 src/java/org/tartarus/snowball/ext/swedishStemmer.java
  25. +3,146 −0 src/java/org/tartarus/snowball/ext/turkishStemmer.java
  26. +6 −0 test/clojure_classifier/test/core.clj
@@ -0,0 +1,6 @@
pom.xml
*jar
/lib/
/classes/
.lein-failures
.lein-deps-sum
15 README
@@ -0,0 +1,15 @@
# clojure-classifier
A simple naive Bayes classifier implemented in Clojure.
It's not yet complete and it certainly isn't pretty.
## Usage
Use Leiningen to build and run the project.
## License
Copyright (C) 2012 FIXME
Distributed under the Eclipse Public License, the same as Clojure.
@@ -0,0 +1,5 @@
;; Leiningen project definition for clojure-classifier.
(defproject clojure-classifier "1.0.0-SNAPSHOT"
:description "FIXME: write description"
:dependencies [[org.clojure/clojure "1.4.0"]]
;; Directory holding the bundled Java sources (the Snowball stemmer classes
;; live under src/java).
;; NOTE(review): the nested-vector form looks like the older javac-plugin
;; style of this option -- confirm it matches the Leiningen version in use.
:java-source-path [["src/java"]]
;; Namespace used as the program entry point.
:main clojure-classifier.core)
@@ -0,0 +1,47 @@
;; Naive Bayes classifier: global training state and the scoring functions
;; built on top of it.
(ns clojure-classifier.classifier
(:require [clojure-classifier.tokenizer :as tokenizer])
(:gen-class))
;; Occurrence counts keyed by [category word] pairs; incremented by `train`.
(def wcount (atom {}))
;; Number of `train` calls seen per category; incremented by `train`.
(def ccount (atom {}))
;; Latest score per category, written by `category-scores`.
(def probs (atom {}))
(defn train
  "Records one training document for `category`.

  Increments the document count for `category` in `ccount`, then increments
  the `[category word]` entry in `wcount` once for every token that
  `tokenizer/each-word` yields for `text` in language `lang`.

  Each increment happens inside the `swap!` update function so the
  read-modify-write is atomic. Returns nil."
  [lang category text]
  (swap! ccount update-in [category] (fnil inc 0))
  (doseq [w (tokenizer/each-word lang text)]
    (swap! wcount update-in [[category w]] (fnil inc 0))))
(defn total-count
  "Returns the sum of all per-category counts in `ccount` (0 when nothing has
  been trained yet)."
  []
  (apply + (vals @ccount)))
(declare word-prob)

(defn word-weighted-avg
  "Returns the probability of `word` given `category`, smoothed toward an
  assumed prior.

  The result is the weighted average of `assumed-prob` (0.5, with weight 1.0)
  and `(word-prob category word)`, where the latter is weighted by the number
  of times `word` has been seen across all trained categories. A word that
  has never been seen therefore scores exactly 0.5."
  [category word]
  (let [weight       1.0
        assumed-prob 0.5
        basic-prob   (word-prob category word)
        counts       @wcount
        ;; wcount is keyed by [category word] pairs, so the word's total is the
        ;; sum of those pair counts over every known category. Looking up the
        ;; bare category never matches and would pin the total at 0.
        totals       (reduce + 0 (map #(get counts [% word] 0) (keys @ccount)))]
    (/ (+ (* weight assumed-prob) (* totals basic-prob))
       (+ weight totals))))
(defn document-prob
  "Returns the product of `word-weighted-avg` over every token that
  `tokenizer/each-word` yields for `text`; 1.0 when there are no tokens."
  [lang category text]
  (->> (tokenizer/each-word lang text)
       (map (fn [w] (word-weighted-avg category w)))
       (reduce * 1.0)))
(defn text-prob
  "Returns the score of `category` for `text`: the category's share of all
  training documents multiplied by `document-prob`."
  [lang category text]
  (let [prior      (/ (get @ccount category 0) (total-count))
        likelihood (document-prob lang category text)]
    (* prior likelihood)))
(defn category-scores
  "Computes `text-prob` for every category present in `ccount` and stores each
  result in the `probs` atom under that category. Returns nil."
  [lang text]
  (doseq [category (keys @ccount)]
    (swap! probs assoc category (text-prob lang category text))))
(defn classify
  "Scores `text` against every trained category and returns the category with
  the highest score, or nil when `probs` is empty.

  Refreshes the `probs` atom via `category-scores` as a side effect."
  [lang text]
  (category-scores lang text)
  (let [ranked (sort-by val @probs)
        ;; Ascending sort: the winner is the last entry.
        best   (last ranked)]
    (first best)))
(defn categories
  "Returns the categories recorded in `ccount`, or nil when there are none."
  []
  (keys (deref ccount)))
(defn word-prob
  "Returns the count recorded for `[category word]` divided by the count
  recorded for `category`, or 0.0 when the pair has never been seen."
  [category word]
  (if-let [occurrences (get @wcount [category word])]
    (/ occurrences (get @ccount category 1.0))
    0.0))
@@ -0,0 +1,9 @@
;; Program entry-point namespace: requires the classifier and, through
;; :gen-class with :main true, asks for a generated class with a main method.
(ns clojure-classifier.core
(:require [clojure-classifier.classifier :as classifier])
(:gen-class :main true))
(defn -main
  "Demo entry point: trains the classifier on three tiny English samples and
  prints the category chosen for the text \"herp\".

  Any command-line arguments are accepted and ignored, so launching the
  program with arguments does not fail with an arity error."
  [& _args]
  (classifier/train "english" "cat1" "herp derp")
  (classifier/train "english" "cat2" "herp thingy")
  (classifier/train "english" "cat2" "herp stuff")
  (println (classifier/classify "english" "herp")))
@@ -0,0 +1,241 @@
;; Text tokenization: word splitting, stop-word handling and stemming through
;; the org.tartarus.snowball.ext stemmer classes.
(ns clojure-classifier.tokenizer
(:gen-class))
(defn load-stemmer
  "Instantiates the stemmer class for `lang` by reflection.

  The class name is built as org.tartarus.snowball.ext.<lang>Stemmer, with
  `lang` lower-cased; the class is created through its no-argument
  constructor."
  [lang]
  (let [class-name (str "org.tartarus.snowball.ext." (.toLowerCase lang) "Stemmer")]
    (.newInstance (Class/forName class-name))))
(defn words
  "Splits `text` into a lazy sequence of lower-cased words.

  Apostrophes and backticks are deleted first (so \"don't\" becomes \"dont\"),
  then every run of non-ASCII-letter characters is turned into a single space
  before the remaining words are extracted."
  [text]
  (let [without-quotes (.replaceAll text "['`]" "")
        letters-only   (.replaceAll without-quotes "[^a-zA-Z]+" " ")]
    (map (fn [token] (.toLowerCase token))
         (re-seq #"\w+" letters-only))))
(declare stop-words)

(defn stem-word
  "Returns the stem of `word` as produced by `stemmer`, or nil when `word` is
  one of `stop-words`.

  `stemmer` is driven through setCurrent / stem / getCurrent, so its current
  word is overwritten on every call."
  [stemmer word]
  ;; `contains?` on a vector tests indices, not elements, so it can never
  ;; match a string; compare against the elements themselves instead.
  (when-not (some #{word} stop-words)
    (.setCurrent stemmer word)
    (.stem stemmer)
    (.getCurrent stemmer)))

(defn each-word
  "Returns a lazy sequence of the stemmed, non-stop-word tokens of `text`.

  A stemmer for `lang` is loaded once per call via `load-stemmer`; `text` is
  split with `words` and each token goes through `stem-word`. Stop words
  (for which `stem-word` returns nil) are dropped from the result."
  [lang text]
  (let [stemmer (load-stemmer lang)]
    ;; `(remove #{nil} ...)` would keep the nils: looking nil up in a set
    ;; yields nil, which is falsey. `nil?` is the predicate that drops them.
    (remove nil? (map #(stem-word stemmer %) (words text)))))
;; English stop words that `stem-word` is meant to skip.
;; Entries are lower-case with no surrounding whitespace, because they are
;; compared against the lower-cased tokens produced by `words`.
;; `words` deletes apostrophes before splitting, so only the apostrophe-free
;; spellings (e.g. "dont") can match a token; the apostrophe spellings are
;; kept for completeness.
(def stop-words
  ["a"
   "about"
   "above"
   "after"
   "again"
   "against"
   "all"
   "am"
   "an"
   "and"
   "any"
   "are"
   "arent"
   "aren't"
   "as"
   "at"
   "be"
   "because"
   "been"
   "before"
   "being"
   "below"
   "between"
   "both"
   "but"
   "by"
   "cant"
   "can't"
   "cannot"
   "could"
   "couldnt"
   "couldn't"
   "did"
   "didnt"
   "didn't"
   "do"
   "does"
   "doesnt"
   "doesn't"
   "doing"
   "dont"
   "don't"
   "down"
   "during"
   "each"
   "few"
   "for"
   "from"
   "further"
   "had"
   "hadnt"
   "hadn't"
   "has"
   "hasnt"
   "hasn't"
   "have"
   "havent"
   "haven't"
   "having"
   "he"
   "hed"
   "he'd"
   "he'll"
   "hes"
   "he's"
   "her"
   "here"
   "heres"
   "here's"
   "hers"
   "herself"
   "him"
   "himself"
   "his"
   "how"
   "hows"
   "how's"
   "i"
   "i'd"
   "i'll"
   "im"
   "i'm"
   "ive"
   "i've"
   "if"
   "in"
   "into"
   "is"
   "isnt"
   "isn't"
   "it"
   "it's"
   "its"
   "itself"
   "lets"
   "let's"
   "me"
   "more"
   "most"
   "mustnt"
   "mustn't"
   "my"
   "myself"
   "no"
   "nor"
   "not"
   "of"
   "off"
   "on"
   "once"
   "only"
   "or"
   "other"
   "ought"
   "our"
   "ours"
   "ourselves"
   "out"
   "over"
   "own"
   "same"
   "shant"
   "shan't"
   "she"
   "she'd"
   "she'll"
   "shes"
   "she's"
   "should"
   "shouldnt"
   "shouldn't"
   "so"
   "some"
   "such"
   "than"
   "that"
   "thats"
   "that's"
   "the"
   "their"
   "theirs"
   "them"
   "themselves"
   "then"
   "there"
   "theres"
   "there's"
   "these"
   "they"
   "theyd"
   "they'd"
   "theyll"
   "they'll"
   "theyre"
   "they're"
   "theyve"
   "they've"
   "this"
   "those"
   "through"
   "to"
   "too"
   "under"
   "until"
   "up"
   "very"
   "was"
   "wasnt"
   "wasn't"
   "we"
   "we'd"
   "we'll"
   "we're"
   "weve"
   "we've"
   "were"
   "werent"
   "weren't"
   "what"
   "whats"
   "what's"
   "when"
   "whens"
   "when's"
   "where"
   "wheres"
   "where's"
   "which"
   "while"
   "who"
   "whos"
   "who's"
   "whom"
   "why"
   "whys"
   "why's"
   "with"
   "wont"
   "won't"
   "would"
   "wouldnt"
   "wouldn't"
   "you"
   "youd"
   "you'd"
   "youll"
   "you'll"
   "youre"
   "you're"
   "youve"
   "you've"
   "your"
   "yours"
   "yourself"
   "yourselves"])
@@ -0,0 +1,31 @@
package org.tartarus.snowball;
import java.lang.reflect.Method;
/**
 * Holds one search string together with the result code and the optional
 * method associated with it.
 */
public class Among {
    /** Length of the search string, in chars. */
    public final int s_size;
    /** The search string, as a char array. */
    public final char[] s;
    /** Index to longest matching substring. */
    public final int substring_i;
    /** Result of the lookup. */
    public final int result;
    /** Method to use if substring matches; null when no method name was given. */
    public final Method method;
    /** Object to invoke the method on. */
    public final SnowballProgram methodobject;

    /**
     * Creates an entry for the search string {@code s}.
     *
     * @param s            the search string
     * @param substring_i  index to the longest matching substring
     * @param result       result of the lookup
     * @param methodname   name of a no-argument method declared on the class of
     *                     {@code methodobject}, or the empty string for none
     * @param methodobject object the method is looked up on
     * @throws RuntimeException wrapping {@link NoSuchMethodException} when
     *                          {@code methodname} is non-empty and no such
     *                          method is declared
     */
    public Among(String s, int substring_i, int result,
                 String methodname, SnowballProgram methodobject) {
        this.s_size = s.length();
        this.s = s.toCharArray();
        this.substring_i = substring_i;
        this.result = result;
        this.methodobject = methodobject;
        this.method = (methodname.length() == 0)
                ? null
                : findDeclaredMethod(methodobject, methodname);
    }

    /** Looks up the no-argument method {@code name} declared on the class of {@code owner}. */
    private static Method findDeclaredMethod(SnowballProgram owner, String name) {
        try {
            return owner.getClass().getDeclaredMethod(name, new Class[0]);
        } catch (NoSuchMethodException e) {
            throw new RuntimeException(e);
        }
    }
}
Oops, something went wrong.

0 comments on commit fa8323d

Please sign in to comment.