Skip to content
Permalink
Browse files

FIX: Relevance search will now consider document length in ranking.

The default ranking option ranks by the number of matches, which is
highly problematic when posts are stuffed with a keyword. The rank
will now be divided by the document length, which is a much fairer way to
rank.
  • Loading branch information...
tgxworld committed Apr 1, 2019
1 parent cadd1d6 commit e87ca594014733d171df0112149f49180d898678
Showing with 56 additions and 11 deletions.
  1. +3 −2 lib/search.rb
  2. +53 −9 spec/components/search_spec.rb
@@ -838,13 +838,14 @@ def posts_query(limit, opts = nil)
posts = posts.order("posts.like_count DESC")
end
else
# 0|32 default normalization scaled into the range zero to one
# 2|32 divides the rank by the document length and scales the range from
# zero to one
data_ranking = <<~SQL
(
TS_RANK_CD(
post_search_data.search_data,
#{ts_query(weight_filter: weights)},
0|32
2|32
) *
(
CASE categories.search_priority
@@ -334,6 +334,27 @@ def new_post(raw, topic = nil)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end

# Regression example: a post that repeats the search term several times must
# not be ranked above a shorter post that mentions it once.
it 'does not allow a post with repeated words to dominate the ranking' do
# The category name itself contains the search term "winter".
category = Fabricate(:category, name: "winter is coming")

# Mentions "winter" once in the body; its topic title also contains "winter".
post = Fabricate(:post,
raw: "I think winter will end soon",
topic: Fabricate(:topic,
title: "dragon john snow winter",
category: category
)
)

# Keyword-stuffed body ("winter" five times); its topic title does not
# contain "winter".
post2 = Fabricate(:post,
raw: "I think winter winter winter winter winter will end soon",
topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
)

result = Search.execute('winter')

# Order matters here (eq, not contain_exactly): the single-mention post comes
# first, then the keyword-stuffed post, then category.topic.first_post.
# NOTE(review): category.topic.first_post is presumably the first post of the
# category's own topic, matching via the category name — confirm.
expect(result.posts).to eq([post, post2, category.topic.first_post])
end
end

context 'searching for quoted title' do
@@ -940,22 +961,45 @@ def search
today = Date.today
yesterday = 1.day.ago
two_days_ago = 2.days.ago
category = Fabricate(:category)

old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago,
category: category
)

old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago)
latest_topic = Fabricate(:topic,
title: 'Second Topic, testing the created_at sort',
created_at: yesterday)
title: 'Second Topic, testing the created_at sort',
created_at: yesterday,
category: category
)

old_relevant_topic_post = Fabricate(:post,
topic: old_topic,
created_at: yesterday,
raw: 'Relevant Relevant Topic'
)

old_relevant_topic_post = Fabricate(:post, topic: old_topic, created_at: yesterday, raw: 'Relevant Topic')
latest_irelevant_topic_post = Fabricate(:post, topic: latest_topic, created_at: today, raw: 'Not Relevant')
latest_irelevant_topic_post = Fabricate(:post,
topic: latest_topic,
created_at: today,
raw: 'Not Relevant'
)

# Expecting the default results
expect(Search.execute('Topic').posts.map(&:id)).to eq([old_relevant_topic_post.id, latest_irelevant_topic_post.id])
expect(Search.execute('Topic').posts).to contain_exactly(
old_relevant_topic_post,
latest_irelevant_topic_post,
category.topic.first_post
)

# Expecting the ordered by topic creation results
expect(Search.execute('Topic order:latest_topic').posts.map(&:id)).to eq([latest_irelevant_topic_post.id, old_relevant_topic_post.id])
expect(Search.execute('Topic order:latest_topic').posts).to contain_exactly(
latest_irelevant_topic_post,
old_relevant_topic_post,
category.topic.first_post
)
end

it 'can tokenize dots' do

2 comments on commit e87ca59

@discoursebot

This comment has been minimized.

Copy link

replied Apr 4, 2019

This commit has been mentioned on Discourse Meta. There might be relevant details there:

https://meta.discourse.org/t/search-improvements-in-2-3/113411/1

Please sign in to comment.
You can’t perform that action at this time.