Skip to content

Commit

Permalink
FIX: Relevance search will now consider document length in ranking.
Browse files Browse the repository at this point in the history
The default ranking option ranks by the number of matches, which is
highly problematic when posts are stuffed with a keyword. The rank
is now divided by the document length, which is a much fairer way to
rank.
  • Loading branch information
tgxworld committed Apr 1, 2019
1 parent cadd1d6 commit e87ca59
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 11 deletions.
5 changes: 3 additions & 2 deletions lib/search.rb
Expand Up @@ -838,13 +838,14 @@ def posts_query(limit, opts = nil)
posts = posts.order("posts.like_count DESC")
end
else
# 0|32 default normalization scaled into the range zero to one
# 2|32 divides the rank by the document length and scales the range from
# zero to one
data_ranking = <<~SQL
(
TS_RANK_CD(
post_search_data.search_data,
#{ts_query(weight_filter: weights)},
0|32
2|32
) *
(
CASE categories.search_priority
Expand Down
62 changes: 53 additions & 9 deletions spec/components/search_spec.rb
Expand Up @@ -334,6 +334,27 @@ def new_post(raw, topic = nil)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end

# Ranking spec: a post that repeats the search term many times must not be
# ranked ahead of a post that mentions it once.
it 'does not allow a post with repeated words to dominate the ranking' do
# The category name itself contains the search term. The final expectation
# includes the first post of the category's own topic, so that post is part
# of the result set too (presumably matched via the category name -- confirm).
category = Fabricate(:category, name: "winter is coming")

# "winter" appears once in the post body and once in the topic title.
post = Fabricate(:post,
raw: "I think winter will end soon",
topic: Fabricate(:topic,
title: "dragon john snow winter",
category: category
)
)

# Keyword-stuffed post: "winter" appears five times in the body, while the
# topic title does not contain it at all.
post2 = Fabricate(:post,
raw: "I think winter winter winter winter winter will end soon",
topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
)

result = Search.execute('winter')

# `eq` against an array pins the exact order: the single-mention post first,
# the keyword-stuffed post second, the category topic's first post last.
expect(result.posts).to eq([post, post2, category.topic.first_post])
end
end

context 'searching for quoted title' do
Expand Down Expand Up @@ -940,22 +961,45 @@ def search
today = Date.today
yesterday = 1.day.ago
two_days_ago = 2.days.ago
category = Fabricate(:category)

old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago,
category: category
)

old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago)
latest_topic = Fabricate(:topic,
title: 'Second Topic, testing the created_at sort',
created_at: yesterday)
title: 'Second Topic, testing the created_at sort',
created_at: yesterday,
category: category
)

old_relevant_topic_post = Fabricate(:post,
topic: old_topic,
created_at: yesterday,
raw: 'Relevant Relevant Topic'
)

old_relevant_topic_post = Fabricate(:post, topic: old_topic, created_at: yesterday, raw: 'Relevant Topic')
latest_irelevant_topic_post = Fabricate(:post, topic: latest_topic, created_at: today, raw: 'Not Relevant')
latest_irelevant_topic_post = Fabricate(:post,
topic: latest_topic,
created_at: today,
raw: 'Not Relevant'
)

# Expecting the default results
expect(Search.execute('Topic').posts.map(&:id)).to eq([old_relevant_topic_post.id, latest_irelevant_topic_post.id])
expect(Search.execute('Topic').posts).to contain_exactly(
old_relevant_topic_post,
latest_irelevant_topic_post,
category.topic.first_post
)

# Expecting the ordered by topic creation results
expect(Search.execute('Topic order:latest_topic').posts.map(&:id)).to eq([latest_irelevant_topic_post.id, old_relevant_topic_post.id])
expect(Search.execute('Topic order:latest_topic').posts).to contain_exactly(
latest_irelevant_topic_post,
old_relevant_topic_post,
category.topic.first_post
)
end

it 'can tokenize dots' do
Expand Down

2 comments on commit e87ca59

@tgxworld
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@discoursebot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit has been mentioned on Discourse Meta. There might be relevant details there:

https://meta.discourse.org/t/search-improvements-in-2-3/113411/1

Please sign in to comment.