Permalink
Browse files

+ "european" character substitution

  • Loading branch information...
1 parent d1cfb8f commit e817ffd09464df2bf5aa0abc8d983de0b7518d49 @floere committed Oct 30, 2010
@@ -0,0 +1,33 @@
+# encoding: utf-8
+#
+module CharacterSubstitution
+ # Substitutes European special characters with plain replacements:
+ # umlauts ä, ö, ü => ae, oe, ue; ß => ss; other diacritics are dropped
+ # (and more, see specs).
+ # Use #substitute(text); it returns the substituted text as a new String.
+ class European
+
+ def initialize
+ @chars = ActiveSupport::Multibyte.proxy_class
+ end
+
+ def substitute text
+ trans = @chars.new(text).normalize(:kd)
+
+ # substitute special cases
+ # (ß is replaced literally by 'ss')
+ trans.gsub!('ß', 'ss')
+
+ # substitute umlauts (of A,O,U,a,o,u): the base letter followed by
+ # \314\210 (octal UTF-8 bytes of U+0308, combining diaeresis) => letter + 'e'
+ trans.gsub!(/([AOUaou])\314\210/u, '\1e')
+
+ # get rid of acutes, graves and … (drops code points U+0300..U+035F)
+ # NOTE(review): Combining Diacritical Marks run to U+036F — confirm 0x035F
+ trans.unpack('U*').select { |cp|
+ cp < 0x0300 || cp > 0x035F
+ }.pack('U*')
+ end
+
+ end
+end
@@ -104,14 +104,17 @@ def self.load_framework
load_relative 'helpers/cache'
load_relative 'helpers/measuring'
+ # Character Substitution
+ #
+ load_relative 'character_substitution/european'
+
# Signal handling
#
load_relative 'signals'
# Various.
#
load_relative 'loggers/search'
- load_relative 'umlaut_substituter'
# Index generation strategies.
#
@@ -96,7 +96,7 @@ def tokenize text
attr_accessor :substituter
alias substituter? substituter
- def initialize substituter = UmlautSubstituter.new
+ def initialize substituter = CharacterSubstitution::European.new
@substituter = substituter
# TODO Default handling.
@@ -1,34 +0,0 @@
-# encoding: utf-8
-#
-
-# Substitutes certain umlauts, like
-# ä, ö, ü => ae, oe, ue.
-# (and more, see specs)
-#
-class UmlautSubstituter
-
- attr_reader :chars
-
- def initialize
- @chars = ActiveSupport::Multibyte.proxy_class
- end
-
- def substitute text
- trans = chars.new(text).normalize(:kd)
-
- # substitute special cases
- #
- trans.gsub!('ß', 'ss')
-
- # substitute umlauts (of A,O,U,a,o,u)
- #
- trans.gsub!(/([AOUaou])\314\210/u, '\1e')
-
- # get rid of ecutes, graves and …
- #
- trans.unpack('U*').select { |cp|
- cp < 0x0300 || cp > 0x035F
- }.pack('U*')
- end
-
-end
@@ -1,9 +1,10 @@
# encoding: utf-8
+#
require 'spec_helper'
-describe UmlautSubstituter do
+describe CharacterSubstitution do
before(:each) do
- @substituter = UmlautSubstituter.new
+ @substituter = CharacterSubstitution::European.new
end
# A bit of metaprogramming to help with the myriads of its.
@@ -82,5 +83,15 @@ def self.it_should_not_substitute(special_character)
it_should_substitute 'å', 'a'
it_should_substitute 'Å', 'A'
end
+
+ describe "diacritic" do
+ it_should_substitute 'ñ', 'n'
+ end
+
+ describe "speed" do
+ it "is fast" do
+ performance_of { @substituter.substitute('ä') }.should < 0.00006
+ end
+ end
end

0 comments on commit e817ffd

Please sign in to comment.