Skip to content

Commit

Permalink
Module Tokenizer become a class - Adding support for preprocessor reg…
Browse files Browse the repository at this point in the history
…exp - specific method of tokenizer available by (StuffClassifier.new).tokenizer.method_name
  • Loading branch information
Oliviergg committed Apr 22, 2012
1 parent bdccc14 commit f0357b9
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -2,4 +2,5 @@
coverage/
.DS_Store
*.gem
utils.rb
Gemfile.lock
4 changes: 2 additions & 2 deletions lib/stuff-classifier.rb
@@ -1,8 +1,8 @@
module StuffClassifier
autoload :VERSION, 'stuff-classifier/version'
autoload :STOP_WORDS, 'stuff-classifier/stop_words'

autoload :Tokenizer, 'stuff-classifier/tokenizer'
autoload :STOP_WORDS, 'stuff-classifier/tokenizer_properties'

autoload :Base, 'stuff-classifier/base'
autoload :Bayes, 'stuff-classifier/bayes'
autoload :TfIdf, 'stuff-classifier/tf-idf'
Expand Down
19 changes: 7 additions & 12 deletions lib/stuff-classifier/base.rb
@@ -1,21 +1,16 @@
# encoding: utf-8
require "lingua/stemmer"

class StuffClassifier::Base
include StuffClassifier::Tokenizer
# include StuffClassifier::Tokenizer
attr_reader :name

# Returns the object stored in @tokenizer, so callers can reach
# tokenizer-specific methods through the classifier.
def tokenizer
@tokenizer
end
def initialize(name, opts={})
@stemming = opts.key?(:stemming) ? opts[:stemming] : true
purge_state = opts[:purge_state]

if opts[:language]
@language=opts[:language]
else
@language="en"
end

@stemmer = Lingua::Stemmer.new(:language => @language)

@tokenizer = StuffClassifier::Tokenizer.new(opts)

@name = name
@wcount = {}
Expand Down Expand Up @@ -64,7 +59,7 @@ def categories
end

def train(category, text)
each_word(text) {|w| incr_word(w, category) }
@tokenizer.each_word(text) {|w| incr_word(w, category) }
incr_cat(category)
end

Expand Down
2 changes: 1 addition & 1 deletion lib/stuff-classifier/bayes.rb
Expand Up @@ -11,7 +11,7 @@ def initialize(name, opts={})
end

def doc_prob(text, category)
each_word(text).map {|w|
@tokenizer.each_word(text).map {|w|
word_weighted_average(w, category)
}.inject(1) {|p,c| p * c}
end
Expand Down
2 changes: 1 addition & 1 deletion lib/stuff-classifier/tf-idf.rb
Expand Up @@ -12,7 +12,7 @@ def tf_idf(word, cat)
end

def text_prob(text, cat)
each_word(text).map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
@tokenizer.each_word(text).map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
end

def cat_scores(text)
Expand Down
42 changes: 37 additions & 5 deletions lib/stuff-classifier/tokenizer.rb
@@ -1,26 +1,58 @@
# encoding: utf-8

module StuffClassifier::Tokenizer
attr_writer :stemming
require "lingua/stemmer"
class StuffClassifier::Tokenizer

# Builds a tokenizer from an options hash.
#
# opts[:language] - language code used for the stemmer and for the
#                   TOKENIZER_PROPERTIES lookup; "en" when absent or falsy.
# opts[:stemming] - whether a stemmer is created; true when the key is absent.
def initialize(opts={})
  @language = opts[:language] || "en"
  @stemming = opts.fetch(:stemming, true)

  # The stemmer is only instantiated when stemming is enabled.
  @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming

  # Per-language settings; nil when the language has no entry in the table.
  @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]
end

# Stores a custom set of preprocessing regexps in @preprocessing_regexps.
# When set to a truthy value, the reader returns it instead of the
# per-language default.
def preprocessing_regexps=(value)
@preprocessing_regexps = value
end

# Returns the preprocessing regexps to apply before tokenizing.
#
# A value assigned through preprocessing_regexps= takes precedence.
# Otherwise the "preprocessing_regexps" entry of the per-language
# properties is returned.
#
# Returns nil when no custom value is set and the language properties are
# missing (nil) or are not a keyed collection (for example a bare Set of
# stop words), instead of raising NoMethodError.
def preprocessing_regexps
  return @preprocessing_regexps if @preprocessing_regexps

  # Only look the key up when the properties object supports keyed access.
  @properties["preprocessing_regexps"] if @properties.respond_to?(:key?)
end

# Stores a custom collection of words to ignore in @ignore_words.
def ignore_words=(value)
@ignore_words = value
end

def ignore_words
@ignore_words || StuffClassifier::STOP_WORDS[@language]
@ignore_words || @properties["stop_word"]
end

# Sets the @stemming flag.
# NOTE(review): this only updates the flag. @stemmer is created in
# initialize only when stemming was enabled at construction time, so
# turning stemming on here does not create a stemmer; confirm that the
# code using @stemmer copes with that.
def stemming=(value)
@stemming = value
end

def stemming?
defined?(@stemming) ? @stemming : false
@stemming || false
end

def each_word(string)
string = string.strip
return if string == ''

words = []


# Apply preprocessing regexps
if preprocessing_regexps
preprocessing_regexps.each { |regexp,replace_by| string.gsub!(regexp, replace_by) }
end

# tokenize string
string.split("\n").each do |line|
line.gsub(/\p{Word}+/).each do |w|
next if w == '' || ignore_words.member?(w.downcase)
Expand Down
@@ -1,8 +1,10 @@
# encoding: utf-8
require 'set'

StuffClassifier::STOP_WORDS = {
"en" => Set.new([
StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES = {
"en" => {
"preprocessing_regexps" => {/['`]/ => ''},
"stop_word" => Set.new([
'a', 'about', 'above', 'across', 'after', 'afterwards',
'again', 'against', 'all', 'almost', 'alone', 'along',
'already', 'also', 'although', 'always', 'am', 'among',
Expand Down Expand Up @@ -56,7 +58,8 @@
'whoever', 'whole', 'whom', 'whose', 'why', 'will',
'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
'yourself', 'yourselves'
]),
])
},
"fr" => Set.new(
["au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et", "eux",
"il", "je", "la", "le", "leur", "lui", "ma", "mais", "me", "même", "mes", "moi", "mon",
Expand Down
7 changes: 3 additions & 4 deletions test/test_001_tokenizer.rb
Expand Up @@ -3,12 +3,11 @@
class Test001Tokenizer < TestBase
before do
@tokenizer = StuffClassifier::Bayes.new("TEST")

end

def test_simple_tokens
assert_equal ["hello", "world"],
@tokenizer.each_word('Hello world! How are you?')
assert_equal ["hello", "world"],
@tokenizer.each_word('Hello world! How are you?')
end

def test_with_stemming
Expand All @@ -26,7 +25,7 @@ def test_complicated_tokens
your output is ok I guess ;-)")

should_return = [
"really", "want", "accomplish", "class",
"realli", "want", "accomplish", "class",
"testeval", "test", "eval", "testeval", "new", "class", "end",
"yields", "nil", "output", "ok", "guess"]

Expand Down

0 comments on commit f0357b9

Please sign in to comment.