-
Notifications
You must be signed in to change notification settings - Fork 8.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FIX: Inject extra lexemes for host lexeme. #10198
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,26 +16,9 @@ def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore | |
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics) | ||
end | ||
|
||
def self.inject_extra_terms(raw) | ||
return raw if !SiteSetting.search_inject_extra_terms | ||
|
||
# insert some extra words for I.am.a.word so "word" is tokenized | ||
# I.am.a.word becomes I.am.a.word am a word | ||
raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot| | ||
|
||
split = with_dot.split(/https?:\/\/|[?:;,.\/]/) | ||
|
||
if split.length > 1 | ||
with_dot + ((+" ") << split[1..-1].reject { |x| x.blank? }.join(" ")) | ||
else | ||
with_dot | ||
end | ||
end | ||
end | ||
|
||
def self.update_index(table: , id: , raw_data:) | ||
search_data = raw_data.map do |data| | ||
inject_extra_terms(Search.prepare_data(data || "", :index)) | ||
Search.prepare_data(data || "", :index) | ||
end | ||
|
||
table_name = "#{table}_search_data" | ||
|
@@ -53,15 +36,39 @@ def self.update_index(table: , id: , raw_data:) | |
|
||
indexed_data = search_data.select { |d| d.length > 0 }.join(' ') | ||
|
||
params = { | ||
ranked_params = { | ||
a: search_data[0], | ||
b: search_data[1], | ||
c: search_data[2], | ||
d: search_data[3], | ||
} | ||
|
||
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0] | ||
additional_lexemes = [] | ||
|
||
tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)| | ||
count = 0 | ||
|
||
loop do | ||
count += 1 | ||
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots | ||
term, _, remaining = lexeme.partition(".") | ||
break if remaining.blank? | ||
array << "'#{term}':#{positions} '#{remaining}':#{positions}" | ||
lexeme = remaining | ||
end | ||
|
||
array | ||
end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My big question though is, is this injection still too much? If my post contains There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ahh I thought PG was smart enough to only retrieve the host... looks like it treats query params as file here. I wonder if we can identify what type a lexeme is. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think so since this was one of the bug reports we got previously https://meta.discourse.org/t/discourses-internal-search-does-not-find-the-phrase-pagedowncustom-but-google-does/35406/8.
Query params are tricky and it seems like the PG default parser isn't very smart.
I guess one thing we can do is to drop query params from the search index? |
||
|
||
tsvector = "#{tsvector} #{additional_lexemes.join(' ')}" | ||
|
||
params = { | ||
raw_data: indexed_data, | ||
id: id, | ||
locale: SiteSetting.default_locale, | ||
version: INDEX_VERSION | ||
version: INDEX_VERSION, | ||
tsvector: tsvector, | ||
} | ||
|
||
# Would be nice to use AR here but not sure how to execute Postgres functions | ||
|
@@ -71,7 +78,7 @@ def self.update_index(table: , id: , raw_data:) | |
SET | ||
raw_data = :raw_data, | ||
locale = :locale, | ||
search_data = #{ranked_index}, | ||
search_data = (:tsvector)::tsvector, | ||
version = :version | ||
WHERE #{foreign_key} = :id | ||
SQL | ||
|
@@ -80,7 +87,7 @@ def self.update_index(table: , id: , raw_data:) | |
DB.exec(<<~SQL, params) | ||
INSERT INTO #{table_name} | ||
(#{foreign_key}, search_data, locale, raw_data, version) | ||
VALUES (:id, #{ranked_index}, :locale, :raw_data, :version) | ||
VALUES (:id, (:tsvector)::tsvector, :locale, :raw_data, :version) | ||
SQL | ||
end | ||
rescue | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,15 @@ def self.prepare_data(search_data, purpose = :query) | |
data = strip_diacritics(data) | ||
end | ||
end | ||
|
||
data.gsub!(EmailCook.url_regexp) do |url| | ||
uri = URI.parse(url) | ||
uri.query = nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @SamSaffron I think we should just strip the query string from URLs. The following seems to work as expected
However, once a path is present
The parsing here is really inconsistent and complex query strings will end up generating noise in our search data. |
||
uri.to_s | ||
rescue URI::Error | ||
# Don't fail even if URL turns out to be invalid | ||
end | ||
|
||
data | ||
end | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1255,20 +1255,26 @@ def search | |
]) | ||
end | ||
|
||
it 'can tokenize dots' do | ||
it 'can search for terms with dots' do | ||
post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...') | ||
expect(Search.execute('bill').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('bob').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('2000').posts.map(&:id)).to eq([post.id]) | ||
end | ||
|
||
it 'can search URLS correctly' do | ||
post = Fabricate(:post, raw: 'i like http://wb.camra.org.uk/latest#test so yay') | ||
|
||
expect(Search.execute('http://wb.camra.org.uk/latest#test').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('camra').posts.map(&:id)).to eq([post.id]) | ||
|
||
complex_url = "https://test.some.site.com/path?some.range_input=74235a" | ||
post2 = Fabricate(:post, raw: "this is a complex url #{complex_url} so complex") | ||
|
||
expect(Search.execute(complex_url).posts.map(&:id)).to eq([post2.id]) | ||
expect(Search.execute('http://wb').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('wb.camra').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('wb.camra.org').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('org.uk').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('camra.org.uk').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('wb.camra.org.uk').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('wb.camra.org.uk/latest').posts.map(&:id)).to eq([post.id]) | ||
expect(Search.execute('/latest#test').posts.map(&:id)).to eq([post.id]) | ||
Comment on lines
+1270
to
+1277
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @SamSaffron IMO the searches which I've added above are all valid and makes search better. |
||
end | ||
|
||
it 'supports category slug and tags' do | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Realized we need a safeguard here to prevent extreme cases, e.g. `some.super.long.file.name.that.will.never.end.some.super.long.file.name.that.will.never.end.some.super.long.file.name.that.will.never.end.some.super.long.file.name.that.will.never.end`