Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ DistributedTransport transport
SearchInferenceId = endpoint.NoElasticInferenceService ? null : ".elser-2-elastic"
});


public abstract class ElasticsearchExporter<TChannelOptions, TChannel> : IDisposable
where TChannelOptions : CatalogIndexChannelOptionsBase<DocumentationDocument>
where TChannel : CatalogIndexChannel<DocumentationDocument, TChannelOptions>
Expand Down Expand Up @@ -149,14 +148,14 @@ protected static string CreateMappingSetting(string synonymSetName) =>
"analysis": {
"analyzer": {
"synonyms_analyzer": {
"tokenizer": "whitespace",
"tokenizer": "group_tokenizer",
"filter": [
"lowercase",
"synonyms_filter"
]
},
"highlight_analyzer": {
"tokenizer": "standard",
"tokenizer": "group_tokenizer",
"filter": [
"lowercase",
"english_stop"
Expand All @@ -176,7 +175,11 @@ protected static string CreateMappingSetting(string synonymSetName) =>
}
},
"tokenizer": {
"path_tokenizer": {
"group_tokenizer": {
"type": "char_group",
"tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ]
},
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
}
Expand Down Expand Up @@ -243,17 +246,24 @@ protected static string CreateMapping(string? inferenceId) =>
},
"stripped_body": {
"type": "text",
"search_analyzer": "highlight_analyzer",
"search_analyzer": "synonyms_analyzer",
"term_vector": "with_positions_offsets"
}
},
"headings": {
"type": "text",
"search_analyzer": "synonyms_analyzer"
},
{{(!string.IsNullOrWhiteSpace(inferenceId) ? AbstractInferenceMapping(inferenceId) : AbstractMapping())}}
}
}
""";

private static string AbstractMapping() =>
"""
, "abstract": { "type": "text" }
, "abstract": {
"type": "text",
"search_analyzer": "synonyms_analyzer"
}
""";

private static string InferenceMapping(string inferenceId) =>
Expand All @@ -278,5 +288,4 @@ public void Dispose()

GC.SuppressFinalize(this);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public record SearchResultItem
public required string Title { get; init; }
public required string Description { get; init; }
public required SearchResultItemParent[] Parents { get; init; }
public string[]? Headings { get; init; }
public float Score { get; init; }
public string? HighlightedBody { get; init; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ internal sealed record DocumentDto
[JsonPropertyName("url_segment_count")]
public int UrlSegmentCount { get; init; }

[JsonPropertyName("headings")]
public string[] Headings { get; init; } = [];

[JsonPropertyName("parents")]
public ParentDocumentDto[] Parents { get; init; } = [];

Expand Down Expand Up @@ -88,10 +91,15 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger<E

var lexicalSearchRetriever =
((Query)new PrefixQuery(Infer.Field<DocumentDto>(f => f.Title.Suffix("keyword")), searchQuery) { Boost = 10.0f, CaseInsensitive = true }
|| new MatchPhrasePrefixQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Boost = 9.0f }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for a follow-up, but jotting my thoughts down here: we probably do not want all of these prefix queries to score so high.

I think we should set up explicit completion fields:

For general purpose: https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/search-as-you-type

And explicit `.prefix` multi-fields on title, and maybe headings, using an (edge) n-gram tokenizer (or the one provided by the `search_as_you_type` field).

|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Operator = Operator.And, Boost = 8.0f }
|| new MatchBoolPrefixQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Boost = 6.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Boost = 4.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Boost = 3.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Operator = Operator.And, Boost = 5.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Operator = Operator.And, Boost = 4.5f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Headings), searchQuery) { Operator = Operator.And, Boost = 4.5f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Abstract), searchQuery) { Operator = Operator.Or, Boost = 4.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.StrippedBody), searchQuery) { Operator = Operator.Or, Boost = 3.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Headings), searchQuery) { Operator = Operator.Or, Boost = 3.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Parents.First().Title), searchQuery) { Boost = 2.0f }
|| new MatchQuery(Infer.Field<DocumentDto>(f => f.Title), searchQuery) { Fuzziness = 1, Boost = 1.0f }
)
Expand Down Expand Up @@ -129,7 +137,8 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger<E
e => e.Title,
e => e.Url,
e => e.Description,
e => e.Parents
e => e.Parents,
e => e.Headings
)
)
)
Expand Down Expand Up @@ -193,6 +202,7 @@ private static (int TotalHits, List<SearchResultItem> Results) ProcessSearchResp
Url = doc.Url,
Title = doc.Title,
Description = doc.Description ?? string.Empty,
Headings = doc.Headings,
Parents = doc.Parents.Select(parent => new SearchResultItemParent
{
Title = parent.Title,
Expand Down
Loading