diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d7460d48..2bd65bf6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,9 +17,9 @@ jobs: fail-fast: false matrix: es_stack: - - 8.14.2 - - 8.15.0 - - 8.16.0-SNAPSHOT + - 8.15.3 + - 8.16.0 + - 8.17.0-SNAPSHOT runs-on: ubuntu-latest services: elasticsearch: diff --git a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/api/api.py b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/api/api.py index 44c63aaa..c98bab16 100644 --- a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/api/api.py +++ b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/api/api.py @@ -11,27 +11,30 @@ def get_client_es(): - with open('../config.yml', 'r') as file: + with open("../config.yml", "r") as file: config = yaml.safe_load(file) - return Elasticsearch( - cloud_id=config['cloud_id'], - api_key=config['api_key'] - ) + return Elasticsearch(cloud_id=config["cloud_id"], api_key=config["api_key"]) def get_text_vector(sentences): - model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') + model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") embeddings = model.encode(sentences) return embeddings def build_query(term=None, categories=None, product_types=None, brands=None): - must_query = [{"match_all": {}}] if not term else [{ - "multi_match": { - "query": term, - "fields": ["name", "category", "description"] - } - }] + must_query = ( + [{"match_all": {}}] + if not term + else [ + { + "multi_match": { + "query": term, + "fields": ["name", "category", "description"], + } + } + ] + ) filters = [] if categories: @@ -42,17 +45,23 @@ def build_query(term=None, categories=None, product_types=None, brands=None): filters.append({"terms": {"brand.keyword": brands}}) return { - "_source": ["id", "brand", "name", "price", "currency", "image_link", "category", "tag_list"], - "query": { - "bool": { - "must": must_query, - "filter": filters - } - } + "_source": [ + "id", + "brand", + "name", + "price", + "currency", + "image_link", + "category", + "tag_list", + ], + "query": {"bool": {"must": must_query, "filter": filters}}, } -def build_hybrid_query(term=None, categories=None, product_types=None, brands=None, hybrid=False): +def build_hybrid_query( + term=None, categories=None, product_types=None, brands=None, hybrid=False +): # Standard query organic_query = build_query(term, categories, product_types, brands) @@ -65,81 +74,79 @@ def build_hybrid_query(term=None, categories=None, product_types=None, brands=No "retriever": { "rrf": { "retrievers": [ - { - "standard": { - "query": organic_query['query'] - } - }, + {"standard": {"query": organic_query["query"]}}, { "knn": { "field": "description_embeddings", "query_vector": vector, "k": 5, "num_candidates": 20, - "filter": { - "bool": { - "filter": [] - } - } + "filter": {"bool": {"filter": []}}, } - } + }, ], "rank_window_size": 20, - "rank_constant": 5 + "rank_constant": 5, } }, - "_source": organic_query['_source'] + "_source": organic_query["_source"], } if categories: - query['retriever']['rrf']['retrievers'][1]['knn']['filter']['bool']['filter'].append({ - "terms": {"category": categories} - }) + query["retriever"]["rrf"]["retrievers"][1]["knn"]["filter"]["bool"][ + "filter" + ].append({"terms": {"category": categories}}) if product_types: - query['retriever']['rrf']['retrievers'][1]['knn']['filter']['bool']['filter'].append({ - "terms": {"product_type": product_types} - }) + query["retriever"]["rrf"]["retrievers"][1]["knn"]["filter"]["bool"][ + "filter" + ].append({"terms": {"product_type": product_types}}) if brands: - query['retriever']['rrf']['retrievers'][1]['knn']['filter']['bool']['filter'].append({ - "terms": {"brand.keyword": brands} - }) + query["retriever"]["rrf"]["retrievers"][1]["knn"]["filter"]["bool"][ + "filter" + ].append({"terms": {"brand.keyword": brands}}) else: query = organic_query return query -def search_products(term, categories=None, product_types=None, brands=None, promote_products=[], hybrid=False): +def search_products( + term, + categories=None, + product_types=None, + brands=None, + promote_products=[], + hybrid=False, +): query = build_hybrid_query(term, categories, product_types, brands, hybrid) if promote_products and not hybrid: query = { - "query": { - "pinned": { - "ids": promote_products, - "organic": query['query'] - } - }, - "_source": query['_source'] + "query": {"pinned": {"ids": promote_products, "organic": query["query"]}}, + "_source": query["_source"], } print(query) response = get_client_es().search(index="products-catalog", body=query, size=20) results = [] - for hit in response['hits']['hits']: + for hit in response["hits"]["hits"]: print(f"Product Name: {hit['_source']['name']}, Score: {hit['_score']}") - results.append({ - "id": hit['_source']['id'], - "brand": hit['_source']['brand'], - "name": hit['_source']['name'], - "price": hit['_source']['price'], - "currency": hit['_source']['currency'] if hit['_source']['currency'] else "USD", - "image_link": hit['_source']['image_link'], - "category": hit['_source']['category'], - "tags": hit['_source'].get('tag_list', []) - }) + results.append( + { + "id": hit["_source"]["id"], + "brand": hit["_source"]["brand"], + "name": hit["_source"]["name"], + "price": hit["_source"]["price"], + "currency": ( + hit["_source"]["currency"] if hit["_source"]["currency"] else "USD" + ), + "image_link": hit["_source"]["image_link"], + "category": hit["_source"]["category"], + "tags": hit["_source"].get("tag_list", []), + } + ) return results @@ -149,51 +156,55 @@ def get_facets_data(term, categories=None, product_types=None, brands=None): query["aggs"] = { "product_types": {"terms": {"field": "product_type"}}, "categories": {"terms": {"field": "category"}}, - "brands": {"terms": {"field": "brand.keyword"}} + "brands": {"terms": {"field": "brand.keyword"}}, } response = get_client_es().search(index="products-catalog", body=query, size=0) return { "product_types": [ - {"product_type": bucket['key'], "count": bucket['doc_count']} - for bucket in response['aggregations']['product_types']['buckets'] + {"product_type": bucket["key"], "count": bucket["doc_count"]} + for bucket in response["aggregations"]["product_types"]["buckets"] ], "categories": [ - {"category": bucket['key'], "count": bucket['doc_count']} - for bucket in response['aggregations']['categories']['buckets'] + {"category": bucket["key"], "count": bucket["doc_count"]} + for bucket in response["aggregations"]["categories"]["buckets"] ], "brands": [ - {"brand": bucket['key'], "count": bucket['doc_count']} - for bucket in response['aggregations']['brands']['buckets'] - ] + {"brand": bucket["key"], "count": bucket["doc_count"]} + for bucket in response["aggregations"]["brands"]["buckets"] + ], } -@app.route('/api/products/search', methods=['GET']) +@app.route("/api/products/search", methods=["GET"]) def search(): - query = request.args.get('query') - categories = request.args.getlist('selectedCategories[]') - product_types = request.args.getlist('selectedProductTypes[]') - brands = request.args.getlist('selectedBrands[]') - hybrid = request.args.get('hybrid', 'False').lower() == 'true' - results = search_products(query, categories=categories, product_types=product_types, - brands=brands, - promote_products=promote_products_free_gluten, - hybrid=hybrid) + query = request.args.get("query") + categories = request.args.getlist("selectedCategories[]") + product_types = request.args.getlist("selectedProductTypes[]") + brands = request.args.getlist("selectedBrands[]") + hybrid = request.args.get("hybrid", "False").lower() == "true" + results = search_products( + query, + categories=categories, + product_types=product_types, + brands=brands, + promote_products=promote_products_free_gluten, + hybrid=hybrid, + ) return jsonify(results) -@app.route('/api/products/facets', methods=['GET']) +@app.route("/api/products/facets", methods=["GET"]) def facets(): - query = request.args.get('query') - categories = request.args.getlist('selectedCategories[]') - product_types = request.args.getlist('selectedProductTypes[]') - brands = request.args.getlist('selectedBrands[]') - results = get_facets_data(query, categories=categories, - product_types=product_types, - brands=brands) + query = request.args.get("query") + categories = request.args.getlist("selectedCategories[]") + product_types = request.args.getlist("selectedProductTypes[]") + brands = request.args.getlist("selectedBrands[]") + results = get_facets_data( + query, categories=categories, product_types=product_types, brands=brands + ) return jsonify(results) -if __name__ == '__main__': +if __name__ == "__main__": app.run(debug=True) diff --git a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/files/dataset/generate_data.py b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/files/dataset/generate_data.py index 2cca232e..fd8b946c 100644 --- a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/files/dataset/generate_data.py +++ b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/files/dataset/generate_data.py @@ -1,15 +1,26 @@ import csv import json -desired_fields = ["id", "brand", "name", "price", "price_sign", "currency", - "image_link", "description", "rating", "category", - "product_type", "tag_list"] +desired_fields = [ + "id", + "brand", + "name", + "price", + "price_sign", + "currency", + "image_link", + "description", + "rating", + "category", + "product_type", + "tag_list", +] input_file = "dataset_products.csv" # Replace with your actual filename output_file = "products.json" # Open CSV file -with open(input_file, 'r') as csvfile: +with open(input_file, "r") as csvfile: # Read CSV data using DictReader csv_reader = csv.DictReader(csvfile) @@ -37,8 +48,8 @@ json_data.append(product_data) # Open JSON file for writing -with open(output_file, 'w') as jsonfile: +with open(output_file, "w") as jsonfile: # Write JSON data to file with indentation json.dump(json_data, jsonfile, indent=4) -print(f"Converted CSV data to JSON and saved to {output_file}") \ No newline at end of file +print(f"Converted CSV data to JSON and saved to {output_file}") diff --git a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/infra/create_index.py b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/infra/create_index.py index a6f647db..a9ba6c48 100644 --- a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/infra/create_index.py +++ b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/infra/create_index.py @@ -1,7 +1,7 @@ import yaml from elasticsearch import Elasticsearch -index_name = 'products-catalog' +index_name = "products-catalog" mapping = { "settings": { "index": { @@ -11,63 +11,31 @@ }, "mappings": { "properties": { - "id": { - "type": "keyword" - }, + "id": {"type": "keyword"}, "brand": { "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - }, - }, - "name": { - "type": "text" - }, - "price": { - "type": "float" - }, - "price_sign": { - "type": "keyword" - }, - "currency": { - "type": "keyword" - }, - "image_link": { - "type": "keyword" - }, - "description": { - "type": "text" - }, - "description_embeddings": { - "type": "dense_vector", - "dims": 384 - }, - "rating": { - "type": "keyword" - }, - "category": { - "type": "keyword" - }, - "product_type": { - "type": "keyword" - }, - "tag_list": { - "type": "keyword" - } + "fields": {"keyword": {"type": "keyword"}}, + }, + "name": {"type": "text"}, + "price": {"type": "float"}, + "price_sign": {"type": "keyword"}, + "currency": {"type": "keyword"}, + "image_link": {"type": "keyword"}, + "description": {"type": "text"}, + "description_embeddings": {"type": "dense_vector", "dims": 384}, + "rating": {"type": "keyword"}, + "category": {"type": "keyword"}, + "product_type": {"type": "keyword"}, + "tag_list": {"type": "keyword"}, } - } + }, } def get_client_es(): - with open('../config.yml', 'r') as file: + with open("../config.yml", "r") as file: config = yaml.safe_load(file) - return Elasticsearch( - cloud_id=config['cloud_id'], - api_key=config['api_key'] - ) + return Elasticsearch(cloud_id=config["cloud_id"], api_key=config["api_key"]) def create_index(index_name, mapping): diff --git a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.ipynb b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.ipynb index 9d6fbb1c..32ff2532 100644 --- a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.ipynb +++ b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.ipynb @@ -50,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Step 1: Elasticsearch client setup using cloud configuration\n" + "# Step 1: Elasticsearch client setup using cloud configuration" ] }, { @@ -63,12 +63,9 @@ " \"\"\"\n", " Initializes Elasticsearch client using cloud_id and api_key from config.yml\n", " \"\"\"\n", - " with open('../config.yml', 'r') as file:\n", + " with open(\"../config.yml\", \"r\") as file:\n", " config = yaml.safe_load(file)\n", - " return Elasticsearch(\n", - " cloud_id=config['cloud_id'],\n", - " api_key=config['api_key']\n", - " )" + " return Elasticsearch(cloud_id=config[\"cloud_id\"], api_key=config[\"api_key\"])" ] }, { @@ -88,7 +85,7 @@ " \"\"\"\n", " Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'.\n", " \"\"\"\n", - " model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", + " model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")\n", " embeddings = model.encode(sentences)\n", " return embeddings" ] @@ -110,7 +107,7 @@ " \"\"\"\n", " Reads and loads the dataset from a JSON file.\n", " \"\"\"\n", - " with open(file_path, 'r') as file:\n", + " with open(file_path, \"r\") as file:\n", " data = json.load(file)\n", " return data" ] @@ -133,7 +130,7 @@ " Yields chunks of data in batch sizes for bulk indexing in Elasticsearch.\n", " \"\"\"\n", " for i in range(0, len(data), batch_size):\n", - " yield data[i:i + batch_size]" + " yield data[i : i + batch_size]" ] }, { @@ -155,13 +152,9 @@ " Adds 'description_embeddings' by encoding the 'description' field.\n", " \"\"\"\n", " for item in data_batch:\n", - " document_id = item['id']\n", - " item['description_embeddings'] = get_text_vector(item['description'])\n", - " yield {\n", - " \"_index\": index_name,\n", - " \"_id\": document_id,\n", - " \"_source\": item\n", - " }" + " document_id = item[\"id\"]\n", + " item[\"description_embeddings\"] = get_text_vector(item[\"description\"])\n", + " yield {\"_index\": index_name, \"_id\": document_id, \"_source\": item}" ] }, { @@ -188,9 +181,10 @@ " success, failed = helpers.bulk(get_client_es(), actions)\n", " print(f\"Batch indexed: {success} successful, {failed} failed\")\n", "\n", + "\n", "# main execution block\n", "# if __name__ == '__main__':\n", - "# index_data_in_batches(\"../files/dataset/products.json\", \"products-catalog\", batch_size=100)\n" + "# index_data_in_batches(\"../files/dataset/products.json\", \"products-catalog\", batch_size=100)" ] }, { @@ -240,7 +234,9 @@ } ], "source": [ - "index_data_in_batches(\"../files/dataset/products.json\", \"products-catalog-2\", batch_size=100)" + "index_data_in_batches(\n", + " \"../files/dataset/products.json\", \"products-catalog-2\", batch_size=100\n", + ")" ] } ], diff --git a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.py b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.py index ed755fb0..540fc917 100644 --- a/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.py +++ b/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search/ingestion/ingestion.py @@ -6,40 +6,33 @@ def get_client_es(): - with open('../config.yml', 'r') as file: + with open("../config.yml", "r") as file: config = yaml.safe_load(file) - return Elasticsearch( - cloud_id=config['cloud_id'], - api_key=config['api_key'] - ) + return Elasticsearch(cloud_id=config["cloud_id"], api_key=config["api_key"]) def get_text_vector(sentences): - model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') + model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") embeddings = model.encode(sentences) return embeddings def read_json_file(file_path): - with open(file_path, 'r') as file: + with open(file_path, "r") as file: data = json.load(file) return data def chunk_data(data, batch_size): for i in range(0, len(data), batch_size): - yield data[i:i + batch_size] + yield data[i : i + batch_size] def generate_bulk_actions(index_name, data_batch): for item in data_batch: - document_id = item['id'] - item['description_embeddings'] = get_text_vector(item['description']) - yield { - "_index": index_name, - "_id": document_id, - "_source": item - } + document_id = item["id"] + item["description_embeddings"] = get_text_vector(item["description"]) + yield {"_index": index_name, "_id": document_id, "_source": item} def index_data_in_batches(file_path, index_name, batch_size=100): @@ -51,5 +44,7 @@ def index_data_in_batches(file_path, index_name, batch_size=100): print(f"Batch indexed: {success} successful, {failed} failed") -if __name__ == '__main__': - index_data_in_batches("../files/dataset/products.json", "products-catalog", batch_size=100) +if __name__ == "__main__": + index_data_in_batches( + "../files/dataset/products.json", "products-catalog", batch_size=100 + )