In [2]:
import json
import sys
import tempfile
from urllib.parse import urljoin

import pandas as pd
import requests
import xgboost as xgb
from opensearchpy import OpenSearch
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from xgboost import plot_tree

from logger import logger

## Setup index

In [4]:
host = 'localhost'
port = 9200
base_url = "https://{}:{}/".format(host, port)
auth = ('admin', 'admin')  # For testing only. Don't store credentials in code.

# Create the client with SSL/TLS enabled, but hostname and certificate verification disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Add our sample document to the index.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisited",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"},
    {
        "id": "doc_e",
        "title": "Pigs in a Blanket and Other Recipes",
        "price": "27.50",
        "in_stock": True,
        "body": "Pigs in a blanket aren't as cute as you would think given it's a food and not actual pigs wrapped in blankets.",
        "category": "instructional"},
    {
        "id": "doc_f",
        "title": "Dogs are the best",
        "body": "Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!",
        "price": "50.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_g",
        "title": "Dog",
        "body": "Dogs rule",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_h",
        "title": "Dog: The bounty hunter: living in the red",
        "body": "Dog is a bounty hunter who goes on pretend missions with his friends, one of whom is the Fox",
        "price": "125.99",
        "in_stock": True,
        "category": "sports"},
]

# Create a new index
index_name = 'searchml_ltr'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "body": {"type": "text", "analyzer": "english"},
            "in_stock": {"type": "boolean"},
            "category": {"type": "keyword", "ignore_above": "256"},
            "price": {"type": "float"}
        }
    }
}

client.indices.delete(index_name, ignore_unavailable=True)
client.indices.create(index_name, body=index_body)
# Index our documents
print("Indexing our documents")
for doc in docs:
    doc_id = doc["id"]
    print("\tIndexing {}".format(doc_id))
    client.index(
        index=index_name,
        body=doc,
        id=doc_id,
        refresh=True
    )

# Verify they are in:
print("We indexed:\n{}".format(client.cat.count(index_name, params={"v": "true"})))

Indexing our documents
	Indexing doc_a
	Indexing doc_b
	Indexing doc_c
	Indexing doc_d
	Indexing doc_e
	Indexing doc_f
	Indexing doc_g
	Indexing doc_h
We indexed:
epoch      timestamp count
1678033529 16:25:29  8



## Setup LTR storage

In [5]:
# Turn on the LTR store and name it the same as our index
ltr_store_name = index_name
ltr_store_path = "_ltr/" + ltr_store_name

print("Create our LTR store")
# LTR requests are not supported by the OpenSearchPy client, so we will drop down to using Python's Requests library
ltr_model_path = urljoin(base_url, ltr_store_path)
# Delete any old storage
resp = requests.delete(ltr_model_path, auth=auth, verify=False)
print("\tDeleted old store response status: %s" % resp.status_code)
# Create our new LTR storage
resp = requests.put(ltr_model_path, auth=auth, verify=False)
print("\tCreate the new store response status: %s" % resp.status_code)

Create our LTR store
	Deleted old store response status: 200
	Create the new store response status: 200


## Set up LTR feature set

In [6]:
featureset_name = "ltr_toy"
headers = {"Content-Type": 'application/json'}
featureset_path = urljoin(ltr_model_path + "/", "_featureset/{}".format(featureset_name))
# Upload our feature set to our model
body_query_feature_name = "body_query"
title_query_feature_name = "title_query"
price_func_feature_name = "price_func"
print("\tUpload our features to the LTR storage")
ltr_feature_set = {"featureset": {
    "features": [
        {  # Instead of using our multifield query_string match, break it out into parts
            "name": title_query_feature_name,
            "params": ["keywords"],
            "template_language": "mustache",
            "template": {
                "match": {
                    "title": "{{keywords}}"
                }
            }
        },
        {  # Instead of using our multifield query_string match, break it out into parts
            "name": body_query_feature_name,
            "params": ["keywords"],
            "template_language": "mustache",
            "template": {
                "match": {
                    "body": "{{keywords}}"
                }
            }
        },
        # factor in price, albeit naively for this purpose, in practice we should normalize it, which we will do in the project!
        {
            "name": ("%s" % price_func_feature_name),
            "template_language": "mustache",
            "template": {
                "function_score": {
                    "functions": [{
                        "field_value_factor": {
                            "field": "price",
                            "missing": 0
                        }
                    }],
                    "query": {
                        "match_all": {}
                    }
                }
            }

        }
    ]
}}
resp = requests.post(featureset_path, headers=headers, data=json.dumps(ltr_feature_set), auth=auth, verify=False)

	Upload our features to the LTR storage


## Collect judgements

In [7]:
class Judgment:

    def __init__(self, query, doc_id, display_name, grade=0, features=[], query_str=None):
        self.query = query
        self.query_str = query_str
        self.doc_id = doc_id
        self.display_name = display_name
        self.grade = grade
        self.features = features

    # Modified from https://github.com/o19s/elasticsearch-ltr-demo/blob/master/train/judgments.py
    def toXGBFormat(self):
        featuresAsStrs = ["%s:%s" % (idx + 1, feature.get('value', 0)) for idx, feature in enumerate(self.features)]
        comment = "# %s\t%s" % (self.doc_id, self.query_str)
        return "%s\tqid:%s\t%s %s" % (self.grade, self.query, "\t".join(featuresAsStrs), comment)


# Create a map for tracking queries
queries = {1: "dogs", 2: "red fox", 3: "wolf huffed AND puffed OR pig"}
# A map where the key is the query id and the value is a list of judgments, one per document rated for that query
judgments = {}

# Loop over queries, execute a search
for query in queries:
    # Used to get the original queries to create the judgments
    query_obj = {
        'size': 5,
        'query': {
            'multi_match': {
                'query': queries[query],
                'fields': ['title^2', 'body']
            }
        }
    }
    print("################\nExecuting search: qid: {}; query: {}\n##########".format(query, queries[query]))
    response = client.search(body=query_obj, index=index_name)
    hits = response['hits']['hits']
    if len(hits) > 0:
        print(
            "For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: {}?':".format(
                queries[query]))
        judge_vals = judgments.get(query)
        if judge_vals is None:
            judge_vals = []
            judgments[query] = judge_vals
        for hit in hits:
            print("Title: {}\n\nBody: {}\n".format(hit['_source']['title'], hit['_source']['body']))
            print("Enter 0 or 1:")
            input = ""
            for input in sys.stdin.readline():
                grade = input.rstrip()
                if grade == "0" or grade == "1":
                    judgment = Judgment(query, hit['_id'], hit['_source']['title'], int(grade))
                    judge_vals.append(judgment)
                    break
                elif grade == "skip" or grade == "s":
                    break
                elif grade == "exit" or grade == 'e':
                    input = grade  # set this back to the trimmed grade so we can exit the outer loop.  Very clunky!
                    break
            if input == "exit" or input == "e":
                break  # break out of hits, this is ugly, but OK for what we are doing here

################
Executing search: qid: 1; query: dogs
##########
For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: dogs?':
Title: Dog

Body: Dogs rule

Enter 0 or 1:
Title: Dogs are the best

Body: Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!

Enter 0 or 1:
Title: Dog: The bounty hunter: living in the red

Body: Dog is a bounty hunter who goes on pretend missions with his friends, one of whom is the Fox

Enter 0 or 1:
Title: Fox and Hounds

Body: The quick red fox jumped over the lazy brown dogs.

Enter 0 or 1:
Title: Fox wins championship

Body: Wearing all red, the Fox jumped out to a lead in the race over the Dog.

Enter 0 or 1:
################
Executing search: qid: 2; query: red fox
##########
For each hit answer the question: 'Is this hit relevant(1) or not relevant(0) to the query: red fox?':
Title: Fox and Hounds

Bod

## Create training data

In [8]:
train_file = tempfile.NamedTemporaryFile(delete=False)
# Log our features by sending our query and it's judged documents to OpenSearch
for (idx, item) in enumerate(judgments.items()):
    judge_vals = item[1]
    # create a new SLTR query with an appropriate filter query
    doc_ids = []
    for judgment in judge_vals:
        # Note: we are executing one query per judgment doc id here because it's easier, but we could do this
        # by adding all the doc ids for this query and scoring them all at once and cut our number of queries down
        # significantly
        # Create our SLTR query, filtering so we only retrieve the doc id in question
        query_obj = {
            'query': {
                'bool': {
                    "filter": [  # use a filter so that we don't actually score anything
                        {
                            "terms": {
                                "_id": [judgment.doc_id]
                            }
                        },
                        {  # use the LTR query bring in the LTR feature set
                            "sltr": {
                                "_name": "logged_featureset",
                                "featureset": featureset_name,
                                "store": ltr_store_name,
                                "params": {
                                    "keywords": queries[judgment.query]
                                }
                            }
                        }
                    ]
                }
            },
            # Turn on feature logging so that we get weights back for our features
            "ext": {
                "ltr_log": {
                    "log_specs": {
                        "name": "log_entry",
                        "named_query": "logged_featureset"
                    }
                }
            }
        }
        # Run the query just like any other search
        response = client.search(body=query_obj, index=index_name)
        print(response)
        # For each response, extract out the features and build our training features
        # We are going to do this by iterating through the hits, which should be in doc_ids order and put the
        # values back onto the Judgment object, which has a place to store these.
        if response and len(response['hits']) > 0 and len(response['hits']['hits']) == 1:
            hits = response['hits']['hits']
            # there should only be one hit
            judgment.features = hits[0]['fields']['_ltrlog'][0]['log_entry']
            # 		<grade> qid:<query_id> <feature_number>:<weight>... # <doc_id> <comments>
            # see https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html
            xgb_format = judgment.toXGBFormat() + "\n"
            print(xgb_format)
            train_file.write(bytes(xgb_format, 'utf-8'))
        else:
            print("Weirdness. Fix")

train_file.close()

In [9]:
# Custom query on toy
query_obj = {
    'query': {
        'bool': {
            "filter": [  # use a filter so that we don't actually score anything
                {
                    "terms": {
                        "_id": ["doc_a", "doc_b", "doc_c", "doc_d"]
                }
                },
                {  # use the LTR query bring in the LTR feature set
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "ltr_toy",
                        "store": "searchml_ltr",
                        "params": {
                            "keywords": "dog"
                        }
                    }
                }
            ]
        }
    },
    # Turn on feature logging so that we get weights back for our features
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
}

response = client.search(body=query_obj, index='searchml_ltr')

In [10]:
hits = response['hits']['hits']

for i, doc_id in enumerate(["doc_a", "doc_b", "doc_c", "doc_d"]):    
    log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']
    logger.info(f'i: {i}, doc id: {doc_id}, log entry: {log_entry}')

2023-03-05 16:25:37,163 - i: 0, doc id: doc_a, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-05 16:25:37,164 - i: 1, doc id: doc_b, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-05 16:25:37,165 - i: 2, doc id: doc_c, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]
2023-03-05 16:25:37,165 - i: 3, doc id: doc_d, log entry: [{'name': 'title_query'}, {'name': 'body_query', 'value': 0.5410643}, {'name': 'price_func', 'value': 5.99}]


In [11]:
log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']

In [12]:
from typing import List

In [13]:
def get_feature_value(log_entry: List, feature_name: str) -> float:
    """
    Find the feature name's value in log entry. Returns 0 if no value found
    
    args:
        log_entry: List of dict of feature name and value
        feature_name: Name of feature to search for
        
    Returns:
        Feature value if exists; 0 otherwise.
    """
    for log in log_entry:
        if log['name'] == feature_name:
            try:
                return log['value']
            except KeyError:
                return 0
    return 0

In [14]:
get_feature_value(log_entry, 'title_query')

0

In [15]:
get_feature_value(log_entry, 'body_query')

0.5410643

In [16]:
get_feature_value(log_entry, 'price_func')

5.99

In [17]:
# Custom query
query_obj = {
    'query': {
        'bool': {
            "filter": [  # use a filter so that we don't actually score anything
                {
                    "terms": {
                        "sku": [2052194, 2053166, 8523243, 9311586]
                }
                },
                {  # use the LTR query bring in the LTR feature set
                    "sltr": {
                        "_name": "logged_featureset",
                        "featureset": "bbuy_main_featureset",
                        "store": "week1",
                        "params": {
                            "keywords": "yamaha"
                        }
                    }
                }
            ]
        }
    },
    # Turn on feature logging so that we get weights back for our features
    "ext": {
        "ltr_log": {
            "log_specs": {
                "name": "log_entry",
                "named_query": "logged_featureset"
            }
        }
    }
}

response = client.search(body=query_obj, index='bbuy_products')

In [19]:
hits = response['hits']['hits']

for i, doc_id in enumerate([2052194, 2053166, 8523243, 9311586]):
    log_entry = hits[0]['fields']['_ltrlog'][0]['log_entry']
    logger.info(f'i: {i}, doc id: {doc_id}, name_match: {get_feature_value(log_entry, "name_match")}')
    logger.info(f'log entry: {log_entry}')

2023-03-05 16:25:41,487 - i: 0, doc id: 2052194, name_match: 5.8058386
2023-03-05 16:25:41,489 - log entry: [{'name': 'name_match', 'value': 5.8058386}, {'name': 'name_match_phrase', 'value': 5.8058386}, {'name': 'customer_review_average', 'value': 4.4}, {'name': 'customer_review_count', 'value': 42.0}, {'name': 'artist_name_match_phrase'}, {'name': 'short_desc_match_phrase'}, {'name': 'long_desc_match_phrase'}, {'name': 'sales_rank_short_term', 'value': 3.3055312e-11}]
2023-03-05 16:25:41,490 - i: 1, doc id: 2053166, name_match: 5.8058386
2023-03-05 16:25:41,490 - log entry: [{'name': 'name_match', 'value': 5.8058386}, {'name': 'name_match_phrase', 'value': 5.8058386}, {'name': 'customer_review_average', 'value': 4.4}, {'name': 'customer_review_count', 'value': 42.0}, {'name': 'artist_name_match_phrase'}, {'name': 'short_desc_match_phrase'}, {'name': 'long_desc_match_phrase'}, {'name': 'sales_rank_short_term', 'value': 3.3055312e-11}]
2023-03-05 16:25:41,492 - i: 2, doc id: 8523243, n

## Import and check training data

In [3]:
df = pd.read_csv('/workspace/datasets/train.csv')

In [23]:
gb = df.groupby('query')

In [24]:
gb.first()

Unnamed: 0_level_0,user,sku,category,click_time,query_time
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
!pod,ee0c57592de376bf9e687eaab46c05f6d8e15988,9084206,pcmcat144700050004,2011-10-16 22:22:01.996,2011-10-16 22:21:11.988
#,0240c3e637c81f402565ce97978c032b2f6454f3,1972308,cat02010,2011-10-25 23:46:17.884,2011-10-25 23:43:48.686
# 1 hits of the 50s,22ee0cd6a6dce1df2f5487ed6a6faa6b78c8fb06,17072944,cat02010,2011-09-07 18:43:00.204,2011-09-07 18:42:38.189
# TC-P50GT30,144d310cedd839dc33d5dddc8847b90a1347b048,2025359,abcat0102003,2011-09-26 02:16:49.572,2011-09-26 02:12:28.204
#1 Girl,e9fbedbfa8d6d3309b00871185a73aa40019321c,3486349,cat02009,2011-09-20 14:20:15.77,2011-09-20 14:19:59.599
...,...,...,...,...,...
��Mfr: SONY ��Model: VPCYB/15KX/P �� ��UPC: 027242819757,1b4cd9dccc522a5aa0ef1e146c5277ce704c821e,1945531,pcmcat209000050007,2011-10-04 18:17:43.862,2011-10-04 18:15:27.714
��Mfr: TOSHIBA ��Model: BDX4200 ��,28e001b8f0605e7b3a2eac7ec21c80ea10af3488,2114042,abcat0102003,2011-09-24 11:14:53.634,2011-09-24 11:14:42.439
��Model: U30JCB2B,d512aff4d504f12273f069830226aac45ed8b432,3104033,pcmcat247400050000,2011-10-08 05:22:29.851,2011-10-08 05:17:54.345
����Mfr: SONY��Model: DSCW530/B ��,3a7e036989a666d101d77d29cdcf9405c4e5261c,2507506,abcat0401004,2011-10-04 11:20:10.006,2011-10-04 11:19:25.816


In [25]:
# Get count of ipad query
gb.get_group('ipad').count()

user          5036
sku           5036
category      5036
query         5036
click_time    5036
query_time    5036
dtype: int64

In [27]:
# Get queries for this category
df[df['category'] == 'abcat0101001']

Unnamed: 0,user,sku,category,query,click_time,query_time
0,000000df17cd56a5df4a94074e133c9d4739fae3,2125233,abcat0101001,Televisiones Panasonic 50 pulgadas,2011-09-01 23:44:52.533,2011-09-01 23:43:59.752
1,000001928162247ffaf63185cd8b2a244c78e7c6,2009324,abcat0101001,Sharp,2011-09-05 12:25:37.42,2011-09-05 12:25:01.187
3,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877125,abcat0101001,rca,2011-10-25 07:18:14.722,2011-10-25 07:16:51.759
19,0000c4e9d7075985d1020c456e7ce36f83f834eb,2126065,abcat0101001,Samsung 40,2011-09-28 17:27:11.184,2011-09-28 17:26:15.832
40,00017f7beeac02736c0ce7bf1e75f3010939b34e,1831054,abcat0101001,lcd tv,2011-09-28 07:26:32.153,2011-09-28 07:26:21.476
...,...,...,...,...,...,...
1865071,fff7afecf87f8043ce4b4fcbd8ed47d7ffd52679,2620821,abcat0101001,lcd tv,2011-10-16 11:13:30.233,2011-10-16 11:13:25.858
1865077,fff7edfacf2c376fa8ea4f637c33013d057ae70a,1854328,abcat0101001,goldeneye movie,2011-09-20 18:25:30.494,2011-09-20 18:25:16.548
1865119,fff97c95234212c39d6f04c9854fa94d94bc0cce,1854819,abcat0101001,2622037 2127204 2127213 2121716 2138291,2011-10-08 07:00:01.108,2011-10-08 06:56:16.829
1865190,fffc750c85c62fe732b8223d5872be6e0fc03593,2262074,abcat0101001,Pn51d8000,2011-10-24 18:15:10.27,2011-10-24 18:15:05.628


In [28]:
val_df = pd.read_csv('/workspace/ltr_output/validity.csv')

In [29]:
val_df

Unnamed: 0,sku,status
0,2125233,1
1,2009324,1
2,1517163,1
3,2877125,1
4,2877134,1
...,...,...
69775,3027627,1
69776,19326342,1
69777,8612539,1
69778,8002092,1


In [36]:
query_obj = {
    'size': 0,
    'query': {
        'match_all': {}
    },
    'aggs': {
        'missing_short': {
            'missing': {
                'field': 'salesRankShortTerm'
            }
        },
        'missing_medium': {
            'missing': {
                'field': 'salesRankMediumTerm'
            }
        },
        'missing_long': {
            'missing': {
                'field': 'salesRankingLongTerm'
            }
        }
    }
}

client.search(body=query_obj, index='bbuy_products')

{'took': 121,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'missing_long': {'doc_count': 1275077},
  'missing_short': {'doc_count': 1162594},
  'missing_medium': {'doc_count': 1143849}}}

In [43]:
# Long tail queries
df['query'].value_counts().tail(20)

Samsumg led tv                     1
Panasonic DYWL10 Wireless LAN      1
Nikon d9                           1
Virgin mobile slide no contract    1
Sharp Aquos 60 tv                  1
Samsumg led 32tv                   1
In wall volume control             1
External drive MacBook air         1
kindle.                            1
bluetooth car speaker kit          1
Hdtv 70                            1
Panasonic sdr                      1
Symphonic DVD/vhs                  1
samsung n455                       1
Elements SE                        1
Modus                              1
800 watt kickers                   1
Epic otterbox                      1
converter compacitor               1
ttv                                1
Name: query, dtype: int64

## Debugging LTR

In [44]:
simple = pd.read_csv('/workspace/ltr_output/analysis/simple_better.csv')

In [45]:
simple

Unnamed: 0,query,sku,rank_simple,type_simple,found_simple,new_simple,score_simple,rank_ltr,type_ltr,found_ltr,new_ltr,score_ltr
0,speaker,16437642,6,simple,True,False,493.312680,222,ltr_simple,True,False,0.761895
1,speaker,2629895,110,simple,True,False,421.915470,183,ltr_simple,True,False,0.761895
2,speaker,8428551,145,simple,True,False,421.910980,229,ltr_simple,True,False,0.759103
3,shrek,9923261,32,simple,True,False,614.394700,100,ltr_simple,True,False,0.802234
4,shrek,1094314,220,simple,True,False,321.345800,443,ltr_simple,True,False,0.612005
...,...,...,...,...,...,...,...,...,...,...,...,...
462,firewire,8934745,61,simple,True,False,387.931670,205,ltr_simple,True,False,0.725689
463,firewire,7338776,84,simple,True,False,355.358460,146,ltr_simple,True,False,0.758418
464,firewire,6307392,126,simple,True,False,68.991930,237,ltr_simple,True,False,0.724641
465,firewire,8210633,166,simple,True,False,44.706367,262,ltr_simple,True,False,0.724641


In [46]:
exp = pd.read_csv('/workspace/ltr_output/analysis/simple_ltr_explains.csv')

In [48]:
exp.describe()

Unnamed: 0,sku,score,clause_0,clause_1,clause_2,clause_3,clause_4,Feature 0(name_match),Feature 1(name_match_phrase),Feature 2(customer_review_average),Feature 3(customer_review_count),Feature 4(artist_name_match_phrase),Feature 5(short_desc_match_phrase),Feature 6(long_desc_match_phrase),Feature 7(sales_rank_short_term),clause_5,clause_6
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,101001700000000.0,397.996313,53.555588,210.228696,87.034226,29.638313,17.45382,8.219951,6.377989,3.289899,7.222222,0.137051,1.092978,3.973327,0.145595,0.085671,0.0
std,1004954000000000.0,328.468286,115.026547,312.694148,97.988611,77.60856,69.733237,5.556581,5.609842,0.997905,11.789441,1.363643,2.795381,3.950636,0.302381,0.177667,0.0
min,1007701.0,0.450096,0.001,0.007,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2608174.0,77.271473,0.106243,0.078225,0.368465,0.051714,0.0,5.379574,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3540322.0,361.639,0.25,0.401117,69.1526,0.380947,0.006,7.405062,5.921458,3.0,2.0,0.0,0.0,4.480664,0.0,0.0,0.0
75%,9463792.0,527.49562,49.294497,397.133685,118.568995,14.762141,0.219416,10.971387,8.639381,4.15,10.5,0.0,0.0,6.692025,9e-06,0.0,0.0
max,9999164000000000.0,1521.9863,485.35638,1247.0166,377.65814,513.5948,390.873,26.602394,24.94033,5.0,66.0,13.568079,11.211414,16.187332,0.999994,0.538375,0.0


In [53]:
# We don't see the large values like in the appendix here
exp['Feature 7(sales_rank_short_term)'].value_counts().head()

0.000000    65
0.429543     2
0.000005     1
0.006946     1
0.843471     1
Name: Feature 7(sales_rank_short_term), dtype: int64

In [78]:
# Querying for raw term statistics
query_obj = {
    'query': {
        'match_explorer': {
            'type': 'mean_raw_tf',
            'query': {
                'match': {
                    'name': 'iphone'
                }
            }
        }
    },
    '_source': ['name']
}

client.search(body=query_obj, index='bbuy_products')['hits']['hits'][:3]

[{'_index': 'bbuy_products',
  '_id': '3812736',
  '_score': 2.0,
  '_source': {'name': ['OtterBox - Commuter Series Case for Apple® iPhone® 4 and 4S\n\n iPhone 4S - Gunmetal Gray/Yellow']}},
 {'_index': 'bbuy_products',
  '_id': '3869388',
  '_score': 2.0,
  '_source': {'name': ['Belkin - Essential 050 Case for Apple® iPhone® 4 and iPhone 4S - Pink/Purple']}},
 {'_index': 'bbuy_products',
  '_id': '3869439',
  '_score': 2.0,
  '_source': {'name': ['Belkin - Essential 050 Case for Apple® iPhone® 4 and iPhone 4S - Blue/White']}}]

In [79]:
# Querying for raw term statistics
query_obj = {
    'query': {
        'match_explorer': {
            'type': 'unique_terms_count',
            'query': {
                'match': {
                    'name': 'iphone'
                }
            }
        }
    },
    '_source': ['name']
}

client.search(body=query_obj, index='bbuy_products')['hits']['hits'][:3]

[{'_index': 'bbuy_products',
  '_id': '3515280',
  '_score': 1.0,
  '_source': {'name': ['Live After Death [Expanded] - CD']}},
 {'_index': 'bbuy_products',
  '_id': '3589196',
  '_score': 1.0,
  '_source': {'name': ['Xentris Wireless - Case for LG Cosmos 2 Mobile Phones - Black Matté']}},
 {'_index': 'bbuy_products',
  '_id': '3589202',
  '_score': 1.0,
  '_source': {'name': ['Xentris Wireless - Plastic Case for Samsung Galaxy Indulge R915 Mobile Phones - Black/Chrome']}}]

In [83]:
# Querying for raw term statistics
query_obj = {
    'query': {
        'match_explorer': {
            'type': 'min_raw_tp',
            'query': {
                'match': {
                    'name': 'iphone'
                }
            }
        }
    },
    '_source': ['name']
}

client.search(body=query_obj, index='bbuy_products')['hits']['hits'][:3]

[{'_index': 'bbuy_products',
  '_id': '2396158',
  '_score': 17.0,
  '_source': {'name': ['LuxMobile - <i>Sky and Water</i> by M. C. Escher Snap-On Case for Apple® iPhone® 4 and 4S - Black/White']}},
 {'_index': 'bbuy_products',
  '_id': '3991597',
  '_score': 16.0,
  '_source': {'name': ['RETAIL SALES SOLUTIONS - Boston Red Sox Podsta Stand for Apple® iPod® touch and iPhone®']}},
 {'_index': 'bbuy_products',
  '_id': '9225572',
  '_score': 16.0,
  '_source': {'name': ['Griffin Technology - Elan Passport Leather & Metal Accents Case for Select Apple® iPod® and iPhone Models - Black']}}]

In [64]:
# Querying for explain statistics
query_obj = {
    'query': {
        'match': {
            'name': 'iphone'
        }
    }
}

response = client.explain(id='4095076', body=query_obj, index='bbuy_products')

In [71]:
response['explanation']['details']

[{'value': 6.739648,
  'description': 'score(freq=2.0), computed as boost * idf * tf from:',
  'details': [{'value': 2.2, 'description': 'boost', 'details': []},
   {'value': 6.744475,
    'description': 'idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:',
    'details': [{'value': 1500,
      'description': 'n, number of documents containing term',
      'details': []},
     {'value': 1274453,
      'description': 'N, total number of documents with field',
      'details': []}]},
   {'value': 0.45422012,
    'description': 'tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:',
    'details': [{'value': 2.0,
      'description': 'freq, occurrences of term within document',
      'details': []},
     {'value': 1.2,
      'description': 'k1, term saturation parameter',
      'details': []},
     {'value': 0.75,
      'description': 'b, length normalization parameter',
      'details': []},
     {'value': 12.0, 'description': 'dl, length of field', 'details': []},


## Using prior query history
- I.e., Using the clicks from prior queries to predict future query results.
- This comes with the cold-start problem where a new product that's not clicked in historical data will not be predicted because the model hasn't seen this product.
- One way around this is to have good item representations. For example, the features we use on name match, description match, are a representation of the item. Alternatively, content embedding (which is similar to text matching features).