diff --git a/History.txt b/History.txt
index b32f67a..552ea93 100644
--- a/History.txt
+++ b/History.txt
@@ -1,6 +1,7 @@
 == 0.6.0 / 2013-02-02
 
 * added per-language support for black- and whitelists (thanks to bobjflong)
+* fixed tokenization for UTF-8 strings (this is broken in Ruby 1.8.x!) (thanks to pdg)
 
 == 0.5.2 / 2012-02-25
 
diff --git a/lib/highscore/content.rb b/lib/highscore/content.rb
index bf265ef..8c8e64d 100644
--- a/lib/highscore/content.rb
+++ b/lib/highscore/content.rb
@@ -33,9 +33,13 @@ def initialize(content, wordlist = nil)
         :consonants => 0,
         :ignore_short_words => true,
         :ignore_case => false,
-        :word_pattern => /\w+/,
+        :word_pattern => /\p{Word}+/u,
         :stemming => false
       }
+
+      if RUBY_VERSION =~ /^1\.8/
+        @emphasis[:word_pattern] = /\w+/
+      end
     end
 
     # configure ranking
diff --git a/test/highscore/test_content.rb b/test/highscore/test_content.rb
index 31f3a82..273e35f 100644
--- a/test/highscore/test_content.rb
+++ b/test/highscore/test_content.rb
@@ -1,4 +1,8 @@
-require File.dirname(__FILE__) + '/../test_highscore'
+# encoding: utf-8
+$:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
+require "content"
+require "test/unit"
+require 'rubygems'
 
 class TestContent < Highscore::TestCase
   def setup
@@ -29,6 +33,19 @@ def test_keywords_fixnum
     assert_equal 1, content.keywords.length
   end
 
+  def test_keywords_utf8
+    content = 'Schöne Grüße, caractères, русский'
+
+    content = Highscore::Content.new content
+
+    if RUBY_VERSION =~ /^1\.8/
+      # Ruby 1.8 doesn't support correct tokenization
+      assert_equal 3, content.keywords.length
+    else
+      assert_equal 4, content.keywords.length
+    end
+  end
+
   def test_vowels_and_consonants
     keywords = 'foobar RubyGems'.keywords do
       set :vowels, 2
@@ -109,4 +126,5 @@ def test_language_english
   def test_language_german
     assert_equal :german, Highscore::Content.new("Das ist sicherlich ein deutscher Text!").language
   end
-end
\ No newline at end of file
+end
+