From 656ceb6ce5a5a4ade3cbf65dc84839bddebf4d04 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sun, 1 Jan 2023 23:28:14 -0600 Subject: [PATCH] David leifker/elasticsearch optimization ext (#6920) --- .../datahub-gms/env/docker-without-neo4j.env | 1 + docker/datahub-gms/env/docker.cassandra.env | 1 + docker/datahub-gms/env/docker.mariadb.env | 1 + docker/datahub-gms/env/docker.postgres.env | 1 + .../env/docker-without-neo4j.env | 1 + ...ocker-compose-without-neo4j.quickstart.yml | 1 + ...ose.consumers-without-neo4j.quickstart.yml | 1 + .../metadata/client/JavaEntityClient.java | 20 +++++----- .../metadata/search/SearchService.java | 3 +- .../client/CachingEntitySearchService.java | 10 ++--- .../elasticsearch/ElasticSearchService.java | 4 +- .../indexbuilder/SettingsBuilder.java | 13 ++++++- .../elasticsearch/query/ESSearchDAO.java | 6 +-- .../query/request/SearchQueryBuilder.java | 8 ++-- .../query/request/SearchRequestHandler.java | 10 ++--- .../com/linkedin/metadata/ESTestUtils.java | 8 +++- .../metadata/search/SearchServiceTest.java | 34 +++++++++++------ .../ElasticSearchServiceTest.java | 4 +- .../fixtures/SampleDataFixtureTests.java | 38 ++++++++++++++++++- .../query/request/SearchQueryBuilderTest.java | 4 +- .../linkedin/metadata/query/SearchFlags.pdl | 2 +- ...com.linkedin.entity.entities.restspec.json | 16 +++++++- ...com.linkedin.entity.entities.snapshot.json | 6 ++- .../linkedin/entity/client/EntityClient.java | 4 +- .../entity/client/RestliEntityClient.java | 8 ++-- .../resources/entity/EntityResource.java | 14 ++++--- smoke-test/smoke-dev.sh | 29 -------------- 27 files changed, 152 insertions(+), 96 deletions(-) delete mode 100755 smoke-test/smoke-dev.sh diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env index e1917f3bbd2b9..ee6ed973cbb5b 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=mysql:3306 diff --git a/docker/datahub-gms/env/docker.cassandra.env b/docker/datahub-gms/env/docker.cassandra.env index 18263b297e7a7..ed265d0c53dd1 100644 --- a/docker/datahub-gms/env/docker.cassandra.env +++ b/docker/datahub-gms/env/docker.cassandra.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms KAFKA_BOOTSTRAP_SERVER=broker:29092 KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 ELASTICSEARCH_HOST=elasticsearch diff --git a/docker/datahub-gms/env/docker.mariadb.env b/docker/datahub-gms/env/docker.mariadb.env index a40126d7208c9..968fc8788afe0 100644 --- a/docker/datahub-gms/env/docker.mariadb.env +++ b/docker/datahub-gms/env/docker.mariadb.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=mariadb:3306 diff --git a/docker/datahub-gms/env/docker.postgres.env b/docker/datahub-gms/env/docker.postgres.env index f99134ebb0238..13d0e53a170ed 100644 --- a/docker/datahub-gms/env/docker.postgres.env +++ b/docker/datahub-gms/env/docker.postgres.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms EBEAN_DATASOURCE_USERNAME=datahub EBEAN_DATASOURCE_PASSWORD=datahub EBEAN_DATASOURCE_HOST=postgres:5432 diff --git a/docker/datahub-mae-consumer/env/docker-without-neo4j.env b/docker/datahub-mae-consumer/env/docker-without-neo4j.env index 183d66987e358..9c6d3e88aea8a 100644 --- a/docker/datahub-mae-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mae-consumer/env/docker-without-neo4j.env @@ -1,3 +1,4 @@ +BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-mcl DATAHUB_GMS_HOST=datahub-gms DATAHUB_GMS_PORT=8080 diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 7fb9d263094db..172b00edb5be1 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -76,6 +76,7 @@ services: environment: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} + - BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-gms - EBEAN_DATASOURCE_USERNAME=datahub - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_HOST=mysql:3306 diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index 614034a344704..48b0cdef426c9 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -6,6 +6,7 @@ services: datahub-mae-consumer: container_name: datahub-mae-consumer environment: + - BUILD_INDICES_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-bihe-consumer-job-client-mcl - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - MAE_CONSUMER_ENABLED=true diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index f94e6b3eb3774..9af5149bd2d0b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -28,6 +28,7 @@ import com.linkedin.metadata.query.AutoCompleteResult; import com.linkedin.metadata.query.ListResult; import com.linkedin.metadata.query.ListUrnsResult; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.entity.AspectUtils; @@ -248,15 +249,15 @@ public SearchResult search( int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { - if (Optional.ofNullable(structured).orElse(true)) { + if (Optional.ofNullable(fulltext).orElse(false)) { return ValidationUtils.validateSearchResult( - _entitySearchService.structuredSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); + _entitySearchService.fullTextSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); } else { return ValidationUtils.validateSearchResult( - _entitySearchService.fullTextSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); + _entitySearchService.structuredSearch(entity, input, newFilter(requestFilters), null, start, count), _entityService); } } @@ -305,15 +306,15 @@ public SearchResult search( int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { - if (Optional.ofNullable(structured).orElse(true)) { + if (Optional.ofNullable(fulltext).orElse(false)) { return ValidationUtils.validateSearchResult( - _entitySearchService.structuredSearch(entity, input, filter, sortCriterion, start, count), + _entitySearchService.fullTextSearch(entity, input, filter, sortCriterion, start, count), _entityService); } else { return ValidationUtils.validateSearchResult( - _entitySearchService.fullTextSearch(entity, input, filter, sortCriterion, start, count), + _entitySearchService.structuredSearch(entity, input, filter, sortCriterion, start, count), _entityService); } } @@ -338,7 +339,8 @@ public SearchResult searchAcrossEntities( int count, @Nonnull final Authentication authentication) throws RemoteInvocationException { return ValidationUtils.validateSearchResult( - _searchService.searchAcrossEntities(entities, input, filter, null, start, count, null), _entityService); + _searchService.searchAcrossEntities(entities, input, filter, null, start, count, + new SearchFlags().setFulltext(true)), _entityService); } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java index 1b136e758ea2d..220cc6971a3b3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java @@ -88,7 +88,6 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul log.debug(String.format( "Searching Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entities, input, postFilters, sortCriterion, from, size)); - SearchFlags forceFlags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()).setStructured(false); - return _cachingAllEntitiesSearchAggregator.getSearchResults(entities, input, postFilters, sortCriterion, from, size, forceFlags); + return _cachingAllEntitiesSearchAggregator.getSearchResults(entities, input, postFilters, sortCriterion, from, size, searchFlags); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index 1572304c7bdf0..fd6e9b88e89e2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -118,7 +118,7 @@ public SearchResult getCachedSearchResults( cacheManager.getCache(ENTITY_SEARCH_SERVICE_SEARCH_CACHE_NAME), batchSize, querySize -> getRawSearchResults(entityName, query, filters, sortCriterion, querySize.getFrom(), - querySize.getSize(), searchFlags.isStructured()), + querySize.getSize(), Boolean.TRUE.equals(searchFlags.isFulltext())), querySize -> Quintet.with(entityName, query, filters, sortCriterion, querySize), flags, enableCache).getSearchResults(from, size); } @@ -197,9 +197,9 @@ private SearchResult getRawSearchResults( final SortCriterion sortCriterion, final int start, final int count, - final boolean structured) { - if (structured) { - return entitySearchService.structuredSearch( + final boolean fulltext) { + if (fulltext) { + return entitySearchService.fullTextSearch( entityName, input, filters, @@ -207,7 +207,7 @@ private SearchResult getRawSearchResults( start, count); } else { - return entitySearchService.fullTextSearch( + return entitySearchService.structuredSearch( entityName, input, filters, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index 78347af4ff49c..ea09e3a4b258b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -108,7 +108,7 @@ public SearchResult fullTextSearch(@Nonnull String entityName, @Nonnull String i log.debug(String.format( "Searching FullText Search documents entityName: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entityName, input, postFilters, sortCriterion, from, size)); - return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, false); + return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, true); } @Nonnull @@ -118,7 +118,7 @@ public SearchResult structuredSearch(@Nonnull String entityName, @Nonnull String log.debug(String.format( "Searching Structured Search documents entityName: %s, input: %s, postFilters: %s, sortCriterion: %s, from: %s, size: %s", entityName, input, postFilters, sortCriterion, from, size)); - return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, true); + return esSearchDAO.search(entityName, input, postFilters, sortCriterion, from, size, false); } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index db574cc9e9321..406438ecc1e9e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -40,6 +40,7 @@ public class SettingsBuilder { public static final String NORMALIZER = "normalizer"; public static final String PATTERN = "pattern"; public static final String PATTERNS = "patterns"; + public static final String REPLACEMENT = "replacement"; public static final String PRESERVE_ORIGINAL = "preserve_original"; public static final String SEARCH_ANALYZER = "search_analyzer"; public static final String SPLIT_ON_NUMERICS = "split_on_numerics"; @@ -69,6 +70,7 @@ public class SettingsBuilder { public static final String FLATTEN_GRAPH = "flatten_graph"; public static final String LOWERCASE = "lowercase"; public static final String MIN_LENGTH_2 = "min_length_2"; + public static final String REPLACE_NUM_LENGTH_3 = "replace_num_length_3"; public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; @@ -100,6 +102,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final List ALPHA_ONLY_PATTERNS = ImmutableList.of("([a-z0-9]{2,})"); + public static final String NUM_LENGTH_3_PATTERN = "(^[0-9]{1,3}$)"; public static final List URN_STOP_WORDS = ImmutableList.of("urn", "li"); public final Map settings; @@ -136,7 +139,7 @@ private static Map buildFilters() throws IOException { // Filter to split string into words filters.put(CUSTOM_DELIMITER, ImmutableMap.builder() .put(TYPE, WORD_DELIMITER) - .put(SPLIT_ON_NUMERICS, false) + .put(SPLIT_ON_NUMERICS, true) .put(PRESERVE_ORIGINAL, true) .put(TYPE_TABLE, ImmutableList.of( COLON_SUBWORD_DELIMITER @@ -145,7 +148,7 @@ private static Map buildFilters() throws IOException { filters.put(CUSTOM_DELIMITER_GRAPH, ImmutableMap.builder() .put(TYPE, WORD_DELIMITER_GRAPH) - .put(SPLIT_ON_NUMERICS, false) + .put(SPLIT_ON_NUMERICS, true) .put(PRESERVE_ORIGINAL, true) .put(TYPE_TABLE, ImmutableList.of( COLON_SUBWORD_DELIMITER @@ -187,6 +190,12 @@ private static Map buildFilters() throws IOException { .put(PATTERNS, ALPHA_ONLY_PATTERNS) .build()); + filters.put(REPLACE_NUM_LENGTH_3, ImmutableMap.builder() + .put(TYPE, "pattern_replace") + .put(PATTERN, NUM_LENGTH_3_PATTERN) + .put(REPLACEMENT, "") + .build()); + filters.put(SHINGLE_2_3, ImmutableMap.builder() .put(TYPE, "shingle") .put(MIN_SHINGLE_SIZE, "2") diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index d75597e306843..094ebb74fa48c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -72,18 +72,18 @@ private SearchResult executeAndExtract(@Nonnull EntitySpec entitySpec, @Nonnull * @param sortCriterion {@link SortCriterion} to be applied to search results * @param from index to start the search from * @param size the number of search hits to return - * @param structured Structured or full text search modes + * @param fulltext Structured or full text search modes * @return a {@link com.linkedin.metadata.dao.SearchResult} that contains a list of matched documents and related search result metadata */ @Nonnull public SearchResult search(@Nonnull String entityName, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, int from, int size, boolean structured) { + @Nullable SortCriterion sortCriterion, int from, int size, boolean fulltext) { final String finalInput = input.isEmpty() ? "*" : input; Timer.Context searchRequestTimer = MetricUtils.timer(this.getClass(), "searchRequest").time(); EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); // Step 1: construct the query final SearchRequest searchRequest = SearchRequestHandler.getBuilder(entitySpec) - .getSearchRequest(finalInput, postFilters, sortCriterion, from, size, structured); + .getSearchRequest(finalInput, postFilters, sortCriterion, from, size, fulltext); searchRequest.indices(indexConvention.getIndexName(entitySpec)); searchRequestTimer.stop(); // Step 2: execute the query and extract results, validated against document model as well diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 9baa46561d92b..1989994d70ee8 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -37,12 +37,12 @@ public class SearchQueryBuilder { private SearchQueryBuilder() { } - public static QueryBuilder buildQuery(@Nonnull EntitySpec entitySpec, @Nonnull String query, boolean structured) { + public static QueryBuilder buildQuery(@Nonnull EntitySpec entitySpec, @Nonnull String query, boolean fulltext) { final QueryBuilder queryBuilder; - if (structured) { - queryBuilder = buildInternalQuery(entitySpec, query, false, true); - } else { + if (fulltext) { queryBuilder = buildInternalQuery(entitySpec, query, true, false); + } else { + queryBuilder = buildInternalQuery(entitySpec, query, false, true); } return QueryBuilders.functionScoreQuery(queryBuilder, buildScoreFunctions(entitySpec)) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 64a5b37741a8e..fe2522882b3a3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -154,13 +154,13 @@ public static BoolQueryBuilder getFilterQuery(@Nullable Filter filter) { * @param filter the search filter * @param from index to start the search from * @param size the number of search hits to return - * @param structured Structured or full text search modes + * @param fulltext Structured or full text search modes * @return a valid search request */ @Nonnull @WithSpan public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter filter, - @Nullable SortCriterion sortCriterion, int from, int size, boolean structured) { + @Nullable SortCriterion sortCriterion, int from, int size, boolean fulltext) { SearchRequest searchRequest = new SearchRequest(); SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); @@ -170,7 +170,7 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery() - .must(getQuery(input, structured)) + .must(getQuery(input, fulltext)) .must(filterQuery)); getAggregations().forEach(searchSourceBuilder::aggregation); searchSourceBuilder.highlighter(getHighlights()); @@ -228,8 +228,8 @@ public static SearchRequest getAggregationRequest(@Nonnull String field, @Nullab return searchRequest; } - private QueryBuilder getQuery(@Nonnull String query, boolean structured) { - return SearchQueryBuilder.buildQuery(_entitySpec, query, structured); + private QueryBuilder getQuery(@Nonnull String query, boolean fulltext) { + return SearchQueryBuilder.buildQuery(_entitySpec, query, fulltext); } private List getAggregations() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index c7433600161c9..0281275d70001 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -11,6 +11,7 @@ import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.metadata.graph.LineageDirection; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.search.LineageSearchResult; import com.linkedin.metadata.search.LineageSearchService; import com.linkedin.metadata.search.SearchResult; @@ -68,7 +69,12 @@ private ESTestUtils() { public static SearchResult search(SearchService searchService, String query) { return searchService.searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, 0, - 100, null); + 100, new SearchFlags().setFulltext(true)); + } + + public static SearchResult searchStructured(SearchService searchService, String query) { + return searchService.searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, null, 0, + 100, new SearchFlags().setFulltext(false)); } public static LineageSearchResult lineage(LineageSearchService lineageSearchService, Urn root, int hops) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java index 56f249c247bbf..647b16ad353ef 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java @@ -10,6 +10,7 @@ import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; @@ -122,9 +123,11 @@ private void clearCache() { @Test public void testSearchService() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", null, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -137,7 +140,8 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); clearCache(); @@ -151,7 +155,8 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test2", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn2); clearCache(); @@ -159,7 +164,8 @@ public void testSearchService() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test2", null, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "'test2'", null, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); } @@ -187,7 +193,8 @@ public void testAdvancedSearchOr() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -224,7 +231,8 @@ public void testAdvancedSearchOr() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 2); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); assertEquals(searchResult.getEntities().get(1).getEntity(), urn2); @@ -253,7 +261,8 @@ public void testAdvancedSearchSoftDelete() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -293,7 +302,8 @@ public void testAdvancedSearchSoftDelete() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); clearCache(); @@ -316,7 +326,8 @@ public void testAdvancedSearchNegated() throws Exception { SearchResult searchResult = - _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, null, 0, 10, null); + _searchService.searchAcrossEntities(ImmutableList.of(ENTITY_NAME), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 0); clearCache(); @@ -356,7 +367,8 @@ public void testAdvancedSearchNegated() throws Exception { syncAfterWrite(_bulkProcessor); - searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); + searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, + null, 0, 10, new SearchFlags().setFulltext(true)); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn3); clearCache(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java index 28ef4d008a809..25e2cf07c7e37 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java @@ -176,7 +176,7 @@ public void testElasticSearchServiceFulltext() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "test2", null, null, 0, 10); + searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "'test2'", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn2); @@ -187,7 +187,7 @@ public void testElasticSearchServiceFulltext() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); syncAfterWrite(_bulkProcessor); - searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "test2", null, null, 0, 10); + searchResult = _elasticSearchService.fullTextSearch(ENTITY_NAME, "'test2'", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 0); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 0); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index 92d281334e39b..d3748658f3d8e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -30,6 +30,7 @@ import static com.linkedin.metadata.ESTestUtils.autocomplete; import static com.linkedin.metadata.ESTestUtils.search; +import static com.linkedin.metadata.ESTestUtils.searchStructured; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertNotNull; @@ -283,7 +284,7 @@ public void testTokenizationWithNumber() throws IOException { assertEquals(tokens, List.of( "harshal-playground-306419", "harshal", "playground", "306419", "test_schema", "test", "schema", - "austin311_deriv", "austin311", "deriv"), + "austin311_deriv", "austin311", "deriv", "austin", "311"), String.format("Unexpected tokens. Found %s", tokens)); request = AnalyzeRequest.withIndexAnalyzer( @@ -295,7 +296,7 @@ public void testTokenizationWithNumber() throws IOException { assertEquals(tokens, List.of( "harshal-playground-306419", "harshal", "playground", "306419", "test_schema", "test", "schema", - "austin311_deriv", "austin311", "deriv"), + "austin311_deriv", "austin311", "deriv", "austin", "311"), String.format("Unexpected tokens. Found %s", tokens)); } @@ -327,6 +328,7 @@ public void testTokenizationDataPlatform() throws IOException { "urn:li:dataplatform:hive", "data", "dataplatform", "platform", "hive", "samplehivedataset-ac611929-c3ac-4b92-aafb-f4603ddb408a", "samplehivedataset", "ac611929", "c3ac", "4b92", "aafb", "f4603ddb408a", "sampl", + "ac", "611929", "92", "4603", "ddb", "408", "prod", "production"), String.format("Unexpected tokens. Found %s", tokens)); @@ -358,6 +360,38 @@ public void testChartAutoComplete() throws InterruptedException { }); } + @Test + public void testSmokeTestQueries() { + Map expectedMinimums = Map.of( + "sample", 3, + "covid", 1 + ); + + Map results = expectedMinimums.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> search(searchService, entry.getKey()))); + + results.forEach((key, value) -> { + Integer actualCount = value.getEntities().size(); + Integer expectedCount = expectedMinimums.get(key); + assertTrue(actualCount >= expectedCount, + String.format("Search term `%s` has %s fulltext results, expected %s results.", key, + actualCount, expectedCount)); + }); + } + + @Test + public void testMinNumberLengthLimit() throws IOException { + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_delimited", + "data2022.data22" + ); + List expected = List.of("data2022", "data", "2022", "data22", "22"); + List actual = getTokens(request).map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()); + assertEquals(actual, expected, + String.format("Expected: %s Actual: %s", expected, actual)); + } + private Stream getTokens(AnalyzeRequest request) throws IOException { return _searchClient.indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a9b26c4c56ca9..26cc264072901 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -18,7 +18,7 @@ public class SearchQueryBuilderTest { public void testQueryBuilderFulltext() { FunctionScoreQueryBuilder result = (FunctionScoreQueryBuilder) SearchQueryBuilder.buildQuery(TestEntitySpecBuilder.getSpec(), "testQuery", - false); + true); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 2); @@ -55,7 +55,7 @@ public void testQueryBuilderFulltext() { public void testQueryBuilderStructured() { FunctionScoreQueryBuilder result = (FunctionScoreQueryBuilder) SearchQueryBuilder.buildQuery(TestEntitySpecBuilder.getSpec(), "testQuery", - true); + false); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 1); diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index 7ce19971d9f45..9448dbf5f8aae 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -17,5 +17,5 @@ record SearchFlags { /** * Structured or unstructured fulltext query */ - structured: boolean = true + fulltext:optional boolean } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json index 0e65e5152c07a..534d0c29eeb44 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entities.restspec.json @@ -263,8 +263,10 @@ "name" : "count", "type" : "int" }, { - "name" : "structured", - "type" : "boolean" + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { @@ -290,6 +292,11 @@ }, { "name" : "count", "type" : "int" + }, { + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { @@ -326,6 +333,11 @@ }, { "name" : "count", "type" : "int" + }, { + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.LineageSearchResult" }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 1c77f4a72966f..0de11a0229fac 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -6272,8 +6272,10 @@ "name" : "count", "type" : "int" }, { - "name" : "structured", - "type" : "boolean" + "name" : "fulltext", + "type" : "boolean", + "default" : "true", + "optional" : true } ], "returns" : "com.linkedin.metadata.search.SearchResult" }, { diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index 7b81bc70d98ac..da58f077341b2 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -134,7 +134,7 @@ public void batchUpdate(@Nonnull final Set entities, @Nonnull final Auth @Nonnull public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Map requestFilters, int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException; /** @@ -164,7 +164,7 @@ public ListResult list(@Nonnull String entity, @Nullable Map req @Nonnull public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Filter filter, SortCriterion sortCriterion, int start, int count, @Nonnull Authentication authentication, - @Nullable Boolean structured) throws RemoteInvocationException; + @Nullable Boolean fulltext) throws RemoteInvocationException; /** * Searches for entities matching to a given query and filters across multiple entity types diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index dd665502a451c..cb3d43a9f66af 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -330,7 +330,7 @@ public void batchUpdate(@Nonnull final Set entities, @Nonnull final Auth @Override public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Map requestFilters, int start, int count, @Nonnull final Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { final EntitiesDoSearchRequestBuilder requestBuilder = ENTITIES_REQUEST_BUILDERS.actionSearch() @@ -339,7 +339,7 @@ public SearchResult search(@Nonnull String entity, @Nonnull String input, .filterParam(newFilter(requestFilters)) .startParam(start) .countParam(count) - .structuredParam(structured); + .fulltextParam(fulltext); return sendClientRequest(requestBuilder, authentication).getEntity(); } @@ -380,7 +380,7 @@ public ListResult list(@Nonnull String entity, @Nullable Map req @Override public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nullable Filter filter, SortCriterion sortCriterion, int start, int count, @Nonnull final Authentication authentication, - @Nullable Boolean structured) + @Nullable Boolean fulltext) throws RemoteInvocationException { final EntitiesDoSearchRequestBuilder requestBuilder = ENTITIES_REQUEST_BUILDERS.actionSearch() @@ -388,7 +388,7 @@ public SearchResult search(@Nonnull String entity, @Nonnull String input, @Nulla .inputParam(input) .startParam(start) .countParam(count) - .structuredParam(structured); + .fulltextParam(fulltext); if (filter != null) { requestBuilder.filterParam(filter); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 0ce05086b53a4..6a44968192344 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -21,6 +21,7 @@ import com.linkedin.metadata.query.AutoCompleteResult; import com.linkedin.metadata.query.ListResult; import com.linkedin.metadata.query.ListUrnsResult; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; @@ -99,7 +100,7 @@ public class EntityResource extends CollectionResourceTaskTemplate batchIngest(@ActionParam(PARAM_ENTITIES) @Nonnull Entity[] ent public Task search(@ActionParam(PARAM_ENTITY) @Nonnull String entityName, @ActionParam(PARAM_INPUT) @Nonnull String input, @ActionParam(PARAM_FILTER) @Optional @Nullable Filter filter, @ActionParam(PARAM_SORT) @Optional @Nullable SortCriterion sortCriterion, @ActionParam(PARAM_START) int start, - @ActionParam(PARAM_COUNT) int count, @ActionParam(PARAM_STRUCTURED) Boolean structured) { + @ActionParam(PARAM_COUNT) int count, @Optional @Nullable @ActionParam(PARAM_FULLTEXT) Boolean fulltext) { log.info("GET SEARCH RESULTS for {} with query {}", entityName, input); // TODO - change it to use _searchService once we are confident on it's latency return RestliUtil.toTask( () -> { final SearchResult result; - if (structured) { - result = _entitySearchService.structuredSearch(entityName, input, filter, sortCriterion, start, count); - } else { + if (Boolean.TRUE.equals(fulltext)) { result = _entitySearchService.fullTextSearch(entityName, input, filter, sortCriterion, start, count); + } else { + result = _entitySearchService.structuredSearch(entityName, input, filter, sortCriterion, start, count); } return validateSearchResult(result, _entityService); }, @@ -296,7 +297,8 @@ public Task searchAcrossEntities(@ActionParam(PARAM_ENTITIES) @Opt List entityList = entities == null ? Collections.emptyList() : Arrays.asList(entities); log.info("GET SEARCH RESULTS ACROSS ENTITIES for {} with query {}", entityList, input); return RestliUtil.toTask(() -> validateSearchResult( - _searchService.searchAcrossEntities(entityList, input, filter, sortCriterion, start, count, null), + _searchService.searchAcrossEntities(entityList, input, filter, sortCriterion, start, count, + new SearchFlags().setFulltext(true)), _entityService), "searchAcrossEntities"); } diff --git a/smoke-test/smoke-dev.sh b/smoke-test/smoke-dev.sh deleted file mode 100755 index 9237065e94835..0000000000000 --- a/smoke-test/smoke-dev.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -euxo pipefail - -# Runs a basic e2e test. It is not meant to be fully comprehensive, -# but rather should catch obvious bugs before they make it into prod. -# -# Script assumptions: -# - The gradle build has already been run. -# - Python 3.6+ is installed and in the PATH. - -# Log the locally loaded images -# docker images | grep "datahub-" - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd "$DIR" - -python3 -m venv venv -source venv/bin/activate -pip install --upgrade pip wheel setuptools -pip install -r requirements.txt - -echo "DATAHUB_VERSION = ${DATAHUB_VERSION:=acryl-datahub 0.0.0.dev0}" -DATAHUB_TELEMETRY_ENABLED=false \ -DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --build-locally --standalone_consumers --dump-logs-on-failure - -(cd ..; ./gradlew :smoke-test:yarnInstall) - -pytest -rP --durations=20 -vv --junit-xml=junit.smoke.xml $@