Skip to content

Commit

Permalink
BUGFIX: Chinese search was broken
Browse files Browse the repository at this point in the history
BUGFIX: User locale was used index data
BUGFIX: missing Norwegian fulltext config
FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search)
FEATURE: re-index posts when locale changes (in bg job)
FEATURE: allow reindexing by trucating post_search_data

Note: I removed japanese specific config cause it requires custom pg config,
  happy to add it once our base docker config ships with it
  • Loading branch information
SamSaffron committed Jun 24, 2014
1 parent 51ff644 commit 3c84876
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 16 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Expand Up @@ -223,6 +223,8 @@ gem 'gctools', require: false, platform: :mri_21
gem 'stackprof', require: false, platform: :mri_21
gem 'memory_profiler', require: false, platform: :mri_21

gem 'rmmseg-cpp', require: false

# This silly path comment just makes it easier for me to do dev
# will be removed in a few weeks
gem 'logster'#, path: '../logster'
Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Expand Up @@ -283,6 +283,7 @@ GEM
rest-client (1.6.7)
mime-types (>= 1.16)
rinku (1.7.3)
rmmseg-cpp (0.2.9)
rspec (2.14.1)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
Expand Down Expand Up @@ -462,6 +463,7 @@ DEPENDENCIES
redis
rest-client
rinku
rmmseg-cpp
rspec-given
rspec-rails
ruby-readability
Expand Down
2 changes: 2 additions & 0 deletions app/jobs/scheduled/periodical_updates.rb
Expand Up @@ -31,6 +31,8 @@ def execute(args)
unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first
Post.rebake_old(250)
end

Search.rebuild_problem_posts
end

end
Expand Down
21 changes: 18 additions & 3 deletions app/models/search_observer.rb
Expand Up @@ -8,6 +8,8 @@ def self.scrub_html_for_search(html)
end

def self.update_index(table, id, search_data)
search_data = Search.prepare_data(search_data)

table_name = "#{table}_search_data"
foreign_key = "#{table}_id"

Expand All @@ -16,9 +18,18 @@ def self.update_index(table, id, search_data)

# Would be nice to use AR here but not sure how to execut Postgres functions
# when inserting data like this.
rows = Post.exec_sql_row_count("UPDATE #{table_name} SET search_data = TO_TSVECTOR('#{stemmer}', ?) WHERE #{foreign_key} = ?", search_data, id)
rows = Post.exec_sql_row_count("UPDATE #{table_name}
SET
raw_data = :search_data,
locale = :locale,
search_data = TO_TSVECTOR('#{stemmer}', :search_data)
WHERE #{foreign_key} = :id",
search_data: search_data, id: id, locale: SiteSetting.default_locale)
if rows == 0
Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data) VALUES (?, TO_TSVECTOR('#{stemmer}', ?))", id, search_data)
Post.exec_sql("INSERT INTO #{table_name}
(#{foreign_key}, search_data, locale, raw_data)
VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :search_data)",
search_data: search_data, id: id, locale: SiteSetting.default_locale)
end
rescue
# don't allow concurrency to mess up saving a post
Expand All @@ -39,7 +50,7 @@ def self.update_categories_index(category_id, name)
update_index('category', category_id, name)
end

def after_save(obj)
def self.index(obj)
if obj.class == Post && obj.cooked_changed?
if obj.topic
category_name = obj.topic.category.name if obj.topic.category
Expand Down Expand Up @@ -67,6 +78,10 @@ def after_save(obj)
end
end

def after_save(object)
SearchObserver.index(object)
end


class HtmlScrubber < Nokogiri::XML::SAX::Document
attr_reader :scrubbed
Expand Down
11 changes: 11 additions & 0 deletions db/migrate/20140624044600_add_raw_data_to_search.rb
@@ -0,0 +1,11 @@
class AddRawDataToSearch < ActiveRecord::Migration
def change
add_column :post_search_data, :raw_data, :text
add_column :user_search_data, :raw_data, :text
add_column :category_search_data, :raw_data, :text

add_column :post_search_data, :locale, :string
add_column :user_search_data, :locale, :text
add_column :category_search_data, :locale, :text
end
end
68 changes: 55 additions & 13 deletions lib/search.rb
Expand Up @@ -19,25 +19,67 @@ def self.facets
end

def self.long_locale
case I18n.locale # Currently-present in /conf/locales/* only, sorry :-( Add as needed
when :da then 'danish'
when :de then 'german'
when :en then 'english'
when :es then 'spanish'
when :fr then 'french'
when :it then 'italian'
when :ja then 'japanese'
when :nl then 'dutch'
when :pt then 'portuguese'
when :sv then 'swedish'
when :ru then 'russian'
# if adding a language see:
# /usr/share/postgresql/9.3/tsearch_data for possible options
# Do not add languages that are missing without amending the
# base docker config
#
case SiteSetting.default_locale.to_sym
when :da then 'danish'
when :de then 'german'
when :en then 'english'
when :es then 'spanish'
when :fr then 'french'
when :it then 'italian'
when :nl then 'dutch'
when :nb_NO then 'norwegian'
when :pt then 'portuguese'
when :pt_BR then 'portuguese'
when :sv then 'swedish'
when :ru then 'russian'
else 'simple' # use the 'simple' stemmer for other languages
end
end

def self.rebuild_problem_posts(limit = 10000)
posts = Post.joins(:topic)
.where('posts.id NOT IN (
SELECT post_id from post_search_data
WHERE locale = ?
)', SiteSetting.default_locale).limit(10000)

posts.each do |post|
post.cooked += " "
SearchObserver.index(post)
end

nil
end

def self.prepare_data(search_data)
data = search_data.squish
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale)
unless defined? RMMSeg
require 'rmmseg'
RMMSeg::Dictionary.load_dictionaries
end

algo = RMMSeg::Algorithm.new(search_data)

data = ""
while token = algo.next_token
data << token.text << " "
end
end

data.force_encoding("UTF-8")
data
end

def initialize(term, opts=nil)
if term.present?
@term = term.to_s
@term = Search.prepare_data(term.to_s)
@original_term = PG::Connection.escape_string(@term)
end

Expand Down
4 changes: 4 additions & 0 deletions lib/sql_builder.rb
Expand Up @@ -77,6 +77,10 @@ def exec(args = {})
16 => :value_to_boolean
}

def self.map_exec(klass, sql, args = {})
self.new(sql).map_exec(klass, args)
end

def map_exec(klass = OpenStruct, args = {})
results = exec(args)

Expand Down
40 changes: 40 additions & 0 deletions spec/models/search_observer_spec.rb
@@ -0,0 +1,40 @@
require 'spec_helper'

describe SearchObserver do

def get_row(post_id)
SqlBuilder.map_exec(
OpenStruct,
"select * from post_search_data where post_id = :post_id",
post_id: post_id
).first
end

it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN'
data = "你好世界"
data.split(" ").length.should == 1

SearchObserver.update_posts_index(99, "你好世界", "", nil)

row = get_row(99)
row.raw_data.split(' ').length.should == 2
end

it 'correctly indexes a post' do
data = "<a>This</a> is a test"

SearchObserver.update_posts_index(99, data, "", nil)

row = get_row(99)

row.raw_data.should == "This is a test"
row.locale.should == "en"

SearchObserver.update_posts_index(99, "tester", "", nil)

row = get_row(99)

row.raw_data.should == "tester"
end
end

1 comment on commit 3c84876

@eviltrout
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

Please sign in to comment.