In [5]:
import json
import sys
import tempfile
from urllib.parse import urljoin

import requests
import xgboost as xgb
from opensearchpy import OpenSearch
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from xgboost import plot_tree

from logger import logger

## Setup index

In [78]:
host = 'localhost'
port = 9200
base_url = "https://{}:{}/".format(host, port)
auth = ('admin', 'admin')  # For testing only. Don't store credentials in code.

# Create the client with SSL/TLS enabled, but hostname and certificate verification disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Add our sample document to the index.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisited",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"},
    {
        "id": "doc_e",
        "title": "Pigs in a Blanket and Other Recipes",
        "price": "27.50",
        "in_stock": True,
        "body": "Pigs in a blanket aren't as cute as you would think given it's a food and not actual pigs wrapped in blankets.",
        "category": "instructional"},
    {
        "id": "doc_f",
        "title": "Dogs are the best",
        "body": "Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!",
        "price": "50.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_g",
        "title": "Dog",
        "body": "Dogs rule",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_h",
        "title": "Dog: The bounty hunter: living in the red",
        "body": "Dog is a bounty hunter who goes on pretend missions with his friends, one of whom is the Fox",
        "price": "125.99",
        "in_stock": True,
        "category": "sports"},
]

# Create a new index
index_name = 'searchml_ltr'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "body": {"type": "text", "analyzer": "english"},
            "in_stock": {"type": "boolean"},
            "category": {"type": "keyword", "ignore_above": "256"},
            "price": {"type": "float"}
        }
    }
}

client.indices.delete(index_name, ignore_unavailable=True)
client.indices.create(index_name, body=index_body)
# Index our documents
print("Indexing our documents")
for doc in docs:
    doc_id = doc["id"]
    print("\tIndexing {}".format(doc_id))
    client.index(
        index=index_name,
        body=doc,
        id=doc_id,
        refresh=True
    )

# Verify they are in:
print("We indexed:\n{}".format(client.cat.count(index_name, params={"v": "true"})))

Indexing our documents
	Indexing doc_a
	Indexing doc_b
	Indexing doc_c
	Indexing doc_d
	Indexing doc_e
	Indexing doc_f
	Indexing doc_g
	Indexing doc_h
We indexed:
epoch      timestamp count
1677960113 20:01:53  8



## Setup LTR storage

In [79]:
# Turn on the LTR store and name it the same as our index
ltr_store_name = index_name
ltr_store_path = "_ltr/" + ltr_store_name

print("Create our LTR store")
# LTR requests are not supported by the OpenSearchPy client, so we will drop down to using Python's Requests library
ltr_model_path = urljoin(base_url, ltr_store_path)
# Delete any old storage
resp = requests.delete(ltr_model_path, auth=auth, verify=False)
print("\tDeleted old store response status: %s" % resp.status_code)
# Create our new LTR storage
resp = requests.put(ltr_model_path, auth=auth, verify=False)
print("\tCreate the new store response status: %s" % resp.status_code)

Create our LTR store
	Deleted old store response status: 200
	Create the new store response status: 200


## Set up LTR feature set

In [80]:
featureset_name = "ltr_toy"
headers = {"Content-Type": 'application/json'}
featureset_path = urljoin(ltr_model_path + "/", "_featureset/{}".format(featureset_name))
# Upload our feature set to our model
body_query_feature_name = "body_query"
title_query_feature_name = "title_query"
price_func_feature_name = "price_func"
print("\tUpload our features to the LTR storage")
ltr_feature_set = {"featureset": {
    "features": [
        {  # Instead of using our multifield query_string match, break it out into parts
            "name": title_query_feature_name,
            "params": ["keywords"],
            "template_language": "mustache",
            "template": {
                "match": {
                    "title": "{{keywords}}"
                }
            }
        },
        {  # Instead of using our multifield query_string match, break it out into parts
            "name": body_query_feature_name,
            "params": ["keywords"],
            "template_language": "mustache",
            "template": {
                "match": {
                    "body": "{{keywords}}"
                }
            }
        },
        # factor in price, albeit naively for this purpose, in practice we should normalize it, which we will do in the project!
        {
            "name": ("%s" % price_func_feature_name),
            "template_language": "mustache",
            "template": {
                "function_score": {
                    "functions": [{
                        "field_value_factor": {
                            "field": "price",
                            "missing": 0
                        }
                    }],
                    "query": {
                        "match_all": {}
                    }
                }
            }

        }
    ]
}}
resp = requests.post(featureset_path, headers=headers, data=json.dumps(ltr_feature_set), auth=auth, verify=False)

	Upload our features to the LTR storage


## Collect judgements

In [7]:
class Judgment:

    def __init__(self, query, doc_id, display_name, grade=0, features=[], query_str=None):
        self.query = query
        self.query_str = query_str
        self.doc_id = doc_id
        self.display_name = display_name
        self.grade = grade
        self.features = features

    # Modified from https://github.com/o19s/elasticsearch-ltr-demo/blob/master/train/judgments.py
    def toXGBFormat(self):
        featuresAsStrs = ["%s:%s" % (idx + 1, feature.get('value', 0)) for idx, feature in enumerate(self.features)]
        comment = "# %s\t%s" % (self.doc_id, self.query_str)
        return "%s\tqid:%s\t%s %s" % (self.grade, self.query, "\t".join(featuresAsStrs), comment)


# Create a map for tracking queries
queries = {1: "dogs", 2: "red fox", 3: "wolf huffed AND puffed OR pig"}
# A map where the key is the query id and the value is a list of judgments, one per document rated for that query
judgments = {}

# Loop over queries, execute a search
for query in queries:
    # Used to get the original queries to create the judgments
    query_obj = {
        'size': 5,
        'query': {
            'multi_match': {
                'query': queries[query],
                'fields': ['title^2', 'body']
            }
        }
    }
    print("################\nExecuting search: qid: {}; query: {}\n##########".format(query, queries[query]))
    response = client.search(body=query_obj, index=index_name)
    hits = response['hits']['hits']
    if len(hits) > 0:
        print(
            "For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: {}?':".format(
                queries[query]))
        judge_vals = judgments.get(query)
        if judge_vals is None:
            judge_vals = []
            judgments[query] = judge_vals
        for hit in hits:
            print("Title: {}\n\nBody: {}\n".format(hit['_source']['title'], hit['_source']['body']))
            print("Enter 0 or 1:")
            input = ""
            for input in sys.stdin.readline():
                grade = input.rstrip()
                if grade == "0" or grade == "1":
                    judgment = Judgment(query, hit['_id'], hit['_source']['title'], int(grade))
                    judge_vals.append(judgment)
                    break
                elif grade == "skip" or grade == "s":
                    break
                elif grade == "exit" or grade == 'e':
                    input = grade  # set this back to the trimmed grade so we can exit the outer loop.  Very clunky!
                    break
            if input == "exit" or input == "e":
                break  # break out of hits, this is ugly, but OK for what we are doing here

################
Executing search: qid: 1; query: dogs
##########
For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: dogs?':
Title: Dog

Body: Dogs rule

Enter 0 or 1:
Title: Dogs are the best

Body: Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!

Enter 0 or 1:
Title: Dog: The bounty hunter: living in the red

Body: Dog is a bounty hunter who goes on pretend missions with his friends, one of whom is the Fox

Enter 0 or 1:
Title: Fox and Hounds

Body: The quick red fox jumped over the lazy brown dogs.

Enter 0 or 1:
Title: Fox wins championship

Body: Wearing all red, the Fox jumped out to a lead in the race over the Dog.

Enter 0 or 1:
################
Executing search: qid: 2; query: red fox
##########
For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: red fox?':
Title: Fox and Hounds

Bod

## Create training data

In [81]:
train_file = tempfile.NamedTemporaryFile(delete=False)
# Log our features by sending our query and it's judged documents to OpenSearch
for (idx, item) in enumerate(judgments.items()):
    judge_vals = item[1]
    # create a new SLTR query with an appropriate filter query
    doc_ids = []
    for judgment in judge_vals:
        # Note: we are executing one query per judgment doc id here because it's easier, but we could do this
        # by adding all the doc ids for this query and scoring them all at once and cut our number of queries down
        # significantly
        # Create our SLTR query, filtering so we only retrieve the doc id in question
        query_obj = {
            'query': {
                'bool': {
                    "filter": [  # use a filter so that we don't actually score anything
                        {
                            "terms": {
                                "_id": [judgment.doc_id]
                            }
                        },
                        {  # use the LTR query bring in the LTR feature set
                            "sltr": {
                                "_name": "logged_featureset",
                                "featureset": featureset_name,
                                "store": ltr_store_name,
                                "params": {
                                    "keywords": queries[judgment.query]
                                }
                            }
                        }
                    ]
                }
            },
            # Turn on feature logging so that we get weights back for our features
            "ext": {
                "ltr_log": {
                    "log_specs": {
                        "name": "log_entry",
                        "named_query": "logged_featureset"
                    }
                }
            }
        }
        # Run the query just like any other search
        response = client.search(body=query_obj, index=index_name)
        print(response)
        # For each response, extract out the features and build our training features
        # We are going to do this by iterating through the hits, which should be in doc_ids order and put the
        # values back onto the Judgment object, which has a place to store these.
        if response and len(response['hits']) > 0 and len(response['hits']['hits']) == 1:
            hits = response['hits']['hits']
            # there should only be one hit
            judgment.features = hits[0]['fields']['_ltrlog'][0]['log_entry']
            # 		<grade> qid:<query_id> <feature_number>:<weight>... # <doc_id> <comments>
            # see https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html
            xgb_format = judgment.toXGBFormat() + "\n"
            print(xgb_format)
            train_file.write(bytes(xgb_format, 'utf-8'))
        else:
            print("Weirdness. Fix")

train_file.close()

In [119]:
# Custom query on toy
query_obj = {
    'query': {
        'bool': {
            "filter": [  # use a filter so that we don't actually score anything
                {
                    "terms": {
                        "_id": ["doc_a", "doc_b", "doc_c", "doc_d"]
                }
                },
                {  # use the LTR query bring in the LTR feature set
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "ltr_toy",
                        "store": "searchml_ltr",
                        "params": {
                            "keywords": "dog"
                        }
                    }
                }
            ]
        }
    },
    # Turn on feature logging so that we get weights back for our features
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
}

response = client.search(body=query_obj, index='searchml_ltr')

In [120]:
hits = response['hits']['hits']

for i, doc_id in enumerate(["doc_a", "doc_b", "doc_c", "doc_d"]):    
    log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']
    logger.info(f'i: {i}, doc id: {doc_id}, log entry: {log_entry}')

2023-03-04 20:08:40,169 - i: 0, doc id: doc_a, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-04 20:08:40,171 - i: 1, doc id: doc_b, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-04 20:08:40,171 - i: 2, doc id: doc_c, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-04 20:08:40,172 - i: 3, doc id: doc_d, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]


In [121]:
log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']

In [134]:
from typing import List

In [136]:
def get_feature_value(log_entry: List, feature_name: str) -> float:
    """
    Find the feature name's value in log entry. Returns 0 if no value found
    
    args:
        log_entry: List of dict of feature name and value
        feature_name: Name of feature to search for
        
    Returns:
        Feature value if exists; 0 otherwise.
    """
    for log in log_entry:
        if log['name'] == feature_name:
            try:
                return log['value']
            except KeyError:
                return 0
    return 0

In [137]:
get_feature_value(log_entry, 'title_query')

0

In [138]:
get_feature_value(log_entry, 'body_query')

0.5410643

In [139]:
get_feature_value(log_entry, 'price_func')

5.99

In [142]:
# Custom query
query_obj = {
    'query': {
        'bool': {
            "filter": [  # use a filter so that we don't actually score anything
                {
                    "terms": {
                        "sku": [2052194, 2053166, 8523243, 9311586]
                }
                },
                {  # use the LTR query bring in the LTR feature set
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "bbuy_main_featureset",
                        "store": "week1",
                        "params": {
                            "keywords": "yamaha"
                        }
                    }
                }
            ]
        }
    },
    # Turn on feature logging so that we get weights back for our features
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
}

response = client.search(body=query_obj, index='bbuy_products')

In [144]:
hits

[{'_index': 'bbuy_products',
  '_id': '2052194',
  '_score': 0.0,
  '_source': {'productId': ['1218307633099'],
   'sku': ['2052194'],
   'name': ['Yamaha - 500W 5.1-Ch. A/V Home Theater Receiver'],
   'type': ['HardGood'],
   'startDate': ['2011-04-24'],
   'active': ['false'],
   'regularPrice': ['229.99'],
   'salePrice': ['229.99'],
   'artistName': [],
   'onSale': ['false'],
   'digital': ['false'],
   'frequentlyPurchasedWith': ['9904981',
    '8837716',
    '1147998',
    '9397495',
    '8293081',
    '2337715',
    '8428551',
    '8826121',
    '2138237',
    '9356617'],
   'accessories': ['4024468', '1879711', '9939314', '9837565', '9342945'],
   'relatedProducts': ['9543782'],
   'crossSell': [],
   'salesRankShortTerm': ['32475'],
   'salesRankMediumTerm': ['20142'],
   'salesRankLongTerm': ['22695'],
   'bestSellingRank': ['32500'],
   'url': [],
   'categoryPath': ['Best Buy', 'Audio & MP3', 'Home Audio', 'Receivers'],
   'categoryPathIds': ['cat00000',
    'abcat0200000'

In [145]:
hits = response['hits']['hits']

for i, doc_id in enumerate([2052194, 2053166, 8523243, 9311586]):
    log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']
    logger.info(f'i: {i}, doc id: {doc_id}, name_match: {get_feature_value(log_entry, "name_match")}')
    logger.info(f'log entry: {log_entry}')

2023-03-04 20:21:06,680 - i: 0, doc id: 2052194, name_match: 6.2757225
2023-03-04 20:21:06,681 - log entry: [{'name': 'name_match', 'value': 6.2757225}]
2023-03-04 20:21:06,682 - i: 1, doc id: 2053166, name_match: 6.2757225
2023-03-04 20:21:06,684 - log entry: [{'name': 'name_match', 'value': 6.2757225}]
2023-03-04 20:21:06,684 - i: 2, doc id: 8523243, name_match: 6.2757225
2023-03-04 20:21:06,685 - log entry: [{'name': 'name_match', 'value': 6.2757225}]
2023-03-04 20:21:06,685 - i: 3, doc id: 9311586, name_match: 6.2757225
2023-03-04 20:21:06,686 - log entry: [{'name': 'name_match', 'value': 6.2757225}]
