Module Tokenizer becomes a class - Adds support for preprocessor reg…

…exp - tokenizer-specific methods are available via (StuffClassifier.new).tokenizer.method_name
denniskuczynski · Apr 22, 2012 · f0357b9 · f0357b9
1 parent bdccc14
commit f0357b9
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 28 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@
 coverage/
 .DS_Store
 *.gem
+utils.rb
 Gemfile.lock
diff --git a/lib/stuff-classifier.rb b/lib/stuff-classifier.rb
@@ -1,8 +1,8 @@
 module StuffClassifier
   autoload :VERSION,    'stuff-classifier/version'
-  autoload :STOP_WORDS, 'stuff-classifier/stop_words'
-
   autoload :Tokenizer,  'stuff-classifier/tokenizer'
+  autoload :STOP_WORDS, 'stuff-classifier/tokenizer_properties'
+
   autoload :Base,       'stuff-classifier/base'
   autoload :Bayes,      'stuff-classifier/bayes'
   autoload :TfIdf,      'stuff-classifier/tf-idf'

diff --git a/lib/stuff-classifier/base.rb b/lib/stuff-classifier/base.rb
@@ -1,21 +1,16 @@
 # encoding: utf-8
-require "lingua/stemmer"
 
 class StuffClassifier::Base
-  include StuffClassifier::Tokenizer
+#  include StuffClassifier::Tokenizer
   attr_reader :name
 
+  def tokenizer
+    @tokenizer
+  end
   def initialize(name, opts={})
-    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
     purge_state = opts[:purge_state]
-
-    if opts[:language]
-      @language=opts[:language]
-    else
-      @language="en"
-    end
-
-    @stemmer = Lingua::Stemmer.new(:language => @language)
+
+    @tokenizer = StuffClassifier::Tokenizer.new(opts)
 
     @name = name
     @wcount = {}
@@ -64,7 +59,7 @@ def categories
   end
 
   def train(category, text)
-    each_word(text) {|w| incr_word(w, category) }
+    @tokenizer.each_word(text) {|w| incr_word(w, category) }
     incr_cat(category)
   end
 

diff --git a/lib/stuff-classifier/bayes.rb b/lib/stuff-classifier/bayes.rb
@@ -11,7 +11,7 @@ def initialize(name, opts={})
   end
 
   def doc_prob(text, category)
-    each_word(text).map {|w|
+    @tokenizer.each_word(text).map {|w|
       word_weighted_average(w, category)
     }.inject(1) {|p,c| p * c}
   end

diff --git a/lib/stuff-classifier/tf-idf.rb b/lib/stuff-classifier/tf-idf.rb
@@ -12,7 +12,7 @@ def tf_idf(word, cat)
   end
 
   def text_prob(text, cat)
-    each_word(text).map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
+    @tokenizer.each_word(text).map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
   end
 
   def cat_scores(text)

diff --git a/lib/stuff-classifier/tokenizer.rb b/lib/stuff-classifier/tokenizer.rb
@@ -1,26 +1,58 @@
 # encoding: utf-8
 
-module StuffClassifier::Tokenizer
-  attr_writer :stemming
+require "lingua/stemmer"
+class StuffClassifier::Tokenizer
+
+  def initialize(opts={})
+    if opts[:language]
+      @language=opts[:language]
+    else
+      @language="en"
+    end
+    @stemming = opts.key?(:stemming) ? opts[:stemming] : true
+    if @stemming
+      @stemmer = Lingua::Stemmer.new(:language => @language)
+    end
+    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]
+
+  end
+
+  def preprocessing_regexps=(value)
+    @preprocessing_regexps = value
+  end
+
+  def preprocessing_regexps
+    @preprocessing_regexps || @properties["preprocessing_regexps"]
+  end
 
   def ignore_words=(value)
     @ignore_words = value
   end
 
   def ignore_words
-    @ignore_words || StuffClassifier::STOP_WORDS[@language]
+    @ignore_words || @properties["stop_word"]
+  end
+
+  def stemming=(value)
+    @stemming = value
   end
 
   def stemming?
-    defined?(@stemming) ? @stemming : false
+    @stemming || false
   end
 
   def each_word(string)
     string = string.strip
     return if string == ''
 
     words = []
-
+
+    # Apply preprocessing regexps
+    if preprocessing_regexps
+      preprocessing_regexps.each { |regexp,replace_by| string.gsub!(regexp, replace_by) }
+    end
+
+    # tokenize string
     string.split("\n").each do |line|
       line.gsub(/\p{Word}+/).each do |w|
         next if w == '' || ignore_words.member?(w.downcase)

diff --git a/lib/stuff-classifier/stop_words.rb → lib/stuff-classifier/tokenizer_properties.rb b/lib/stuff-classifier/stop_words.rb → lib/stuff-classifier/tokenizer_properties.rb
@@ -1,8 +1,10 @@
 # encoding: utf-8
 require 'set'
 
-StuffClassifier::STOP_WORDS = {
-  "en" => Set.new([
+StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
+  "en" => {
+    "preprocessing_regexps" => {/['`]/ => ''},
+    "stop_word" => Set.new([
     'a', 'about', 'above', 'across', 'after', 'afterwards', 
     'again', 'against', 'all', 'almost', 'alone', 'along', 
     'already', 'also', 'although', 'always', 'am', 'among', 
@@ -56,7 +58,8 @@
     'whoever', 'whole', 'whom', 'whose', 'why', 'will', 
     'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 
     'yourself', 'yourselves'
-]),
+])
+},
 "fr" => Set.new(
 ["au",  "aux",  "avec",  "ce",  "ces",  "dans",  "de",  "des",  "du",  "elle",  "en",  "et",  "eux",
   "il",  "je",  "la",  "le",  "leur",  "lui",  "ma",  "mais",  "me",  "même",  "mes",  "moi",  "mon",

diff --git a/test/test_001_tokenizer.rb b/test/test_001_tokenizer.rb
@@ -3,12 +3,11 @@
 class Test001Tokenizer < TestBase
   before do
     @tokenizer = StuffClassifier::Bayes.new("TEST")
-
   end
 
   def test_simple_tokens
-    assert_equal ["hello", "world"], 
-      @tokenizer.each_word('Hello world! How are you?')
+     assert_equal ["hello", "world"], 
+       @tokenizer.each_word('Hello world! How are you?')
   end    
 
   def test_with_stemming
@@ -26,7 +25,7 @@ def test_complicated_tokens
       your output is ok I guess ;-)")
 
     should_return = [
-      "really", "want", "accomplish", "class",
+      "realli", "want", "accomplish", "class",
       "testeval", "test", "eval", "testeval", "new", "class", "end",
       "yields", "nil", "output", "ok", "guess"]