Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/138140.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 138140
summary: "Fix semantic highlighting when using a `knn` query with minimum `similarity`"
area: Relevance
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,11 @@ public VectorSimilarityQuery(Query innerKnnQuery, float similarity, float docSco
this.innerKnnQuery = innerKnnQuery;
}

// For testing
Query getInnerKnnQuery() {
public Query getInnerKnnQuery() {
return innerKnnQuery;
}

float getSimilarity() {
public float getSimilarity() {
return similarity;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ public Set<NodeFeature> getFeatures() {

private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
// Feature flag listed among the test features below; the YAML REST tests named
// "Highlighting with knn with similarity" require it before running.
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_VECTOR_SIMILARITY_SUPPORT = new NodeFeature(
"semantic_text.highlighter.vector_similarity_support"
);
private static final NodeFeature TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE = new NodeFeature(
"test_reranking_service.parse_text_as_score"
);
Expand Down Expand Up @@ -82,6 +85,7 @@ public Set<NodeFeature> getTestFeatures() {
COHERE_V2_API,
SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
SEMANTIC_TEXT_HIGHLIGHTER_VECTOR_SIMILARITY_SUPPORT,
SemanticQueryBuilder.SEMANTIC_QUERY_FILTER_FIELD_CAPS_FIX
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.vectors.DenseVectorQuery;
import org.elasticsearch.search.vectors.RescoreKnnVectorQuery;
import org.elasticsearch.search.vectors.SparseVectorQueryWrapper;
import org.elasticsearch.search.vectors.VectorData;
import org.elasticsearch.search.vectors.VectorSimilarityQuery;
import org.elasticsearch.xcontent.Text;
import org.elasticsearch.xpack.inference.mapper.OffsetSourceField;
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
Expand Down Expand Up @@ -266,18 +268,26 @@ public void consumeTerms(Query query, Term... terms) {
super.consumeTerms(query, terms);
}

@Override
public void visitLeaf(Query query) {
private void visitLeaf(Query query, Float similarity) {
if (query instanceof KnnFloatVectorQuery knnQuery) {
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), similarity));
} else if (query instanceof KnnByteVectorQuery knnQuery) {
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), similarity));
} else if (query instanceof MatchAllDocsQuery) {
queries.add(new MatchAllDocsQuery());
} else if (query instanceof DenseVectorQuery.Floats floatsQuery) {
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), null));
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(floatsQuery.getQuery()), similarity));
} else if (query instanceof RescoreKnnVectorQuery rescoreQuery) {
visitLeaf(rescoreQuery.innerQuery(), similarity);
} else if (query instanceof VectorSimilarityQuery similarityQuery) {
visitLeaf(similarityQuery.getInnerKnnQuery(), similarityQuery.getSimilarity());
}
}

/**
 * Visitor entry point for a leaf query.
 *
 * Forwards to the two-argument overload with a {@code null} similarity, i.e. no
 * threshold has been picked up from an enclosing query at this point.
 *
 * @param query the leaf query handed over by the visitor
 */
@Override
public void visitLeaf(Query query) {
    final Float noSimilarityThreshold = null;
    visitLeaf(query, noSimilarityThreshold);
}
});
return queries;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,30 @@ public void testNoSemanticField() throws Exception {
);
}

/**
 * Highlights the sample document with a nested knn query built with a {@code 0.85f}
 * similarity argument and checks the result against the fixture's
 * {@code expected_with_similarity_threshold} passages, ordered by score.
 *
 * The query vector and the expected passages both come from the
 * {@code dense_vector_1} entry of the {@code queries} fixture.
 *
 * @throws Exception if building the mapper service, the request or the sample document fails
 */
@SuppressWarnings("unchecked")
public void testDenseVectorWithSimilarityThreshold() throws Exception {
    final var mappers = createDefaultMapperService(useLegacyFormat);
    final Map<String, Object> fixture = (Map<String, Object>) queries.get("dense_vector_1");
    final float[] queryVector = readDenseVector(fixture.get("embeddings"));
    final var semanticFieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mappers.mappingLookup()
        .getFieldType(SEMANTIC_FIELD_E5);

    // knn over the embeddings sub-field, wrapped in a nested query over the chunks field.
    final var embeddingsPath = semanticFieldType.getEmbeddingsField().fullPath();
    final KnnVectorQueryBuilder thresholdedKnn = new KnnVectorQueryBuilder(embeddingsPath, queryVector, 10, 10, null, 0.85f);
    final var chunksPath = semanticFieldType.getChunksField().fullPath();
    final NestedQueryBuilder chunkQuery = new NestedQueryBuilder(chunksPath, thresholdedKnn, ScoreMode.Max);

    final var request = createShardSearchRequest(chunkQuery);
    final var sampleDoc = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON);

    final List<String> expectedList = (List<String>) fixture.get("expected_with_similarity_threshold");
    final String[] expected = expectedList.toArray(String[]::new);

    // Request exactly as many fragments as the fixture lists.
    assertHighlightOneDoc(mappers, request, sampleDoc, SEMANTIC_FIELD_E5, expected.length, HighlightBuilder.Order.SCORE, expected);
}

private MapperService createDefaultMapperService(boolean useLegacyFormat) throws IOException {
var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json"));
var settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,9 @@
"After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n",
"In March 2001, Bertrand Delanoë became the first socialist mayor. He was re-elected in March 2008. In 2007, in an effort to reduce car traffic, he introduced the Vélib', a system which rents bicycles. Bertrand Delanoë also transformed a section of the highway along the Left Bank of the Seine into an urban promenade and park, the Promenade des Berges de la Seine, which he inaugurated in June 2013.\n\n\nIn 2007, President Nicolas Sarkozy launched the Grand Paris project, to integrate Paris more closely with the towns in the region around it. After many modifications, the new area, named the Metropolis of Grand Paris, with a population of 6.7 million, was created on 1 January 2016. In 2011, the City of Paris and the national government approved the plans for the Grand Paris Express, totalling 205 km (127 mi) of automated metro lines to connect Paris, the innermost three departments around Paris, airports and high-speed rail (TGV) stations, at an estimated cost of €35 billion. The system is scheduled to be completed by 2030.\n\n\nIn January 2015, Al-Qaeda in the Arabian Peninsula claimed attacks across the Paris region. 1.5 million people marched in Paris in a show of solidarity against terrorism and in support of freedom of speech. In November of the same year, terrorist attacks, claimed by ISIL, killed 130 people and injured more than 350.\n\n\n",
"Bal-musette is a style of French music and dance that first became popular in Paris in the 1870s and 1880s; by 1880 Paris had some 150 dance halls. Patrons danced the bourrée to the accompaniment of the cabrette (a bellows-blown bagpipe locally called a \"musette\") and often the vielle à roue (hurdy-gurdy) in the cafés and bars of the city. Parisian and Italian musicians who played the accordion adopted the style and established themselves in Auvergnat bars, and Paris became a major centre for jazz and still attracts jazz musicians from all around the world to its clubs and cafés.\n\n\nParis is the spiritual home of gypsy jazz in particular, and many of the Parisian jazzmen who developed in the first half of the 20th century began by playing Bal-musette in the city. Django Reinhardt rose to fame in Paris, having moved to the 18th arrondissement in a caravan as a young boy, and performed with violinist Stéphane Grappelli and their Quintette du Hot Club de France in the 1930s and 1940s.\n\n\nImmediately after the War the Saint-Germain-des-Pres quarter and the nearby Saint-Michel quarter became home to many small jazz clubs, including the Caveau des Lorientais, the Club Saint-Germain, the Rose Rouge, the Vieux-Colombier, and the most famous, Le Tabou. They introduced Parisians to the music of Claude Luter, Boris Vian, Sydney Bechet, Mezz Mezzrow, and Henri Salvador. "
],
"expected_with_similarity_threshold": [
"\nParis (.mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}French pronunciation: ⓘ) is the capital and largest city of France. With an estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. Because of its leading role in the arts and sciences and its early adaptation of extensive street lighting, it became known as the City of Light in the 19th century.\n\n\nThe City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intelligence Unit Worldwide Cost of Living Survey, in 2022, Paris was the city with the ninth-highest cost of living in the world.\n\n\n"
]
},
"sparse_vector_1": {
Expand Down Expand Up @@ -464,4 +467,4 @@
"Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n"
]
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ setup:
title: "Elasticsearch"
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
refresh: true

---
"Highlighting empty field":
- do:
Expand Down Expand Up @@ -671,3 +670,73 @@ setup:
- length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
- match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }


---
"Highlighting with knn with similarity":
# Runs only on clusters that advertise the feature below.
- requires:
cluster_features: "semantic_text.highlighter.vector_similarity_support"
reason: semantic highlighter fix for knn with similarity

# doc_1 carries three passages in `body`; doc_2 carries a single passage.
- do:
index:
index: test-dense-index
id: doc_1
body:
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!", "For a moment, nothing happened. Then, after a second or so, nothing continued to happen." ]
- do:
index:
index: test-dense-index
id: doc_2
body:
body: [ "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws."]
refresh: true

# Baseline: match_all with the semantic highlighter limited to one fragment.
# Both documents are expected, each with exactly one highlighted passage.
- do:
search:
index: test-dense-index
body:
query:
match_all: { }
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 1

- match: { hits.total.value: 2 }

- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight: 1 }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

- match: { hits.hits.1._id: "doc_2" }
- length: { hits.hits.1.highlight: 1 }
- length: { hits.hits.1.highlight.body: 1 }
- match: { hits.hits.1.highlight.body.0: "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws." }

# Case under test: knn query with `similarity` set, highlighting up to three fragments.
# Only doc_1 is expected to match, and all three of its passages are expected
# in the highlight, in the order asserted below.
- do:
search:
index: test-dense-index
body:
query:
knn:
field: "body"
query_vector_builder:
text_embedding:
model_text: "What is Elasticsearch?"
k: 10
num_candidates: 10
similarity: 0.9977
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 3

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 3 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.2: "For a moment, nothing happened. Then, after a second or so, nothing continued to happen."}
Original file line number Diff line number Diff line change
Expand Up @@ -649,5 +649,72 @@ setup:
- length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
- match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

---
"Highlighting with knn with similarity":
# Runs only on clusters that advertise the feature below.
- requires:
cluster_features: "semantic_text.highlighter.vector_similarity_support"
reason: semantic highlighter fix for knn with similarity

# doc_1 carries three passages in `body`; doc_2 carries a single passage.
- do:
index:
index: test-dense-index
id: doc_1
body:
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!", "For a moment, nothing happened. Then, after a second or so, nothing continued to happen." ]
- do:
index:
index: test-dense-index
id: doc_2
body:
body: [ "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws."]
refresh: true

# Baseline: match_all with the semantic highlighter limited to one fragment.
# Both documents are expected, each with exactly one highlighted passage.
- do:
search:
index: test-dense-index
body:
query:
match_all: { }
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 1

- match: { hits.total.value: 2 }

- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight: 1 }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

- match: { hits.hits.1._id: "doc_2" }
- length: { hits.hits.1.highlight: 1 }
- length: { hits.hits.1.highlight.body: 1 }
- match: { hits.hits.1.highlight.body.0: "Nothing travels faster than the speed of light with the possible exception of bad news, which obeys its own special laws." }

# Case under test: knn query with `similarity` set, highlighting up to three fragments.
# Only doc_1 is expected to match, and all three of its passages are expected
# in the highlight, in the order asserted below.
- do:
search:
index: test-dense-index
body:
query:
knn:
field: "body"
query_vector_builder:
text_embedding:
model_text: "What is Elasticsearch?"
k: 10
num_candidates: 10
similarity: 0.9977
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 3

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 3 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.2: "For a moment, nothing happened. Then, after a second or so, nothing continued to happen."}