Fix tokenization for words containing non-ASCII characters

domnikl · Jan 31, 2013 · 9c974ac · 9c974ac
1 parent 796c078
commit 9c974ac
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 2 deletions.
diff --git a/lib/highscore/content.rb b/lib/highscore/content.rb
@@ -31,7 +31,7 @@ def initialize(content, wordlist = nil)
         :consonants => 0,
         :ignore_short_words => true,
         :ignore_case => false,
-        :word_pattern => /\w+/,
+        :word_pattern => /\p{Word}+/u,
         :stemming => false
       }
     end

diff --git a/test/highscore/test_content.rb b/test/highscore/test_content.rb
@@ -1,3 +1,4 @@
+# encoding: utf-8
 $:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
 require "content"
 require "test/unit"
@@ -32,6 +33,13 @@ def test_keywords_fixnum
     assert_equal 1, content.keywords.length
   end
 
+  def test_keywords_utf8
+    content = 'Schöne Grüße, caractères, русский'
+
+    content = Highscore::Content.new content
+    assert_equal 4, content.keywords.length
+  end
+
   def test_vowels_and_consonants
     keywords = 'foobar RubyGems'.keywords do
       set :vowels, 2
@@ -93,4 +101,4 @@ def test_stemming
       # do nothing, just skip this test
     end
   end
-end
+end