# Tuning a multi-`match` query scored as a linear combination of field scores

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import importlib
import os
import sys

from copy import deepcopy
from elasticsearch import Elasticsearch
from skopt.plots import plot_objective

In [3]:
# project library
sys.path.insert(0, os.path.abspath('..'))

import qopt
importlib.reload(qopt)

from qopt.notebooks import evaluate_mrr100_dev_templated, optimize_query_mrr100_templated, set_bm25_params
from qopt.optimize import Config

In [4]:
# Elasticsearch endpoint. Set the ES_URL environment variable to point the notebook at a
# local node (e.g. http://localhost:9200) or a Cloud instance (https://cloud.elastic.co/)
# without editing this cell; when it is unset, the previously hard-coded host is used.
es = Elasticsearch(os.environ.get('ES_URL', 'http://34.78.38.243:9200'))

# set the parallelization parameter `max_concurrent_searches` for the Rank Evaluation API calls
max_concurrent_searches = 30

# index that is tuned and the id under which the search template is referenced
index = 'msmarco-document.doc2query'
template_id = 'query'

# set best BM25 params
# NOTE(review): the trailing number on each line is presumably the score that field
# reached on its own with these params — TODO confirm
set_bm25_params(es, index, [
    ('url', {'k1': 0.33066956222950633, 'b': 0.9589101032169087}), # 0.2201
    ('title', {'k1': 0.34885436112727763, 'b': 1.0}), # 0.2354
    ('title.bigrams', {'k1': 1.2, 'b': 0.75}), # 0.1295
    ('body', {'k1': 3.0128735487205525, 'b': 0.8200709176657588}), # 0.2645
    ('body.bigrams', {'k1': 1.9100199633100623, 'b': 0.7336619962002098}), # 0.2045
    ('expansions', {'k1': 4.870954366799399, 'b': 0.9249613913608172}), # 0.3220
    ('expansions.bigrams', {'k1': 1.2, 'b': 0.75}) # 0.2837
])

# base template for tuning: the `query` body starts empty and is filled in per
# experiment on a deep copy of this list
base_templates = [{
    "id": template_id,
    "template": {
        "lang": "mustache",
        "source": { "query": {} }
    }
}]

def match_query(name):
    """Build a templated `match` clause for a single field.

    The clause matches field `name` against the `{{query_string}}` template
    variable and takes its boost from a `{{<name>|boost}}` template variable,
    where every dot in the field name is replaced by an underscore
    (e.g. `title.bigrams` -> `{{title_bigrams|boost}}`).

    Returns a dict of the form `{"match": {name: {"query": ..., "boost": ...}}}`.
    """
    boost_variable = "{{%s|boost}}" % name.replace('.', '_')
    field_clause = {
        "query": "{{query_string}}",
        "boost": boost_variable,
    }
    return {"match": {name: field_clause}}

Setting BM25 params fields:
 - url: {'k1': 0.33066956222950633, 'b': 0.9589101032169087}
 - title: {'k1': 0.34885436112727763, 'b': 1.0}
 - title.bigrams: {'k1': 1.2, 'b': 0.75}
 - body: {'k1': 3.0128735487205525, 'b': 0.8200709176657588}
 - body.bigrams: {'k1': 1.9100199633100623, 'b': 0.7336619962002098}
 - expansions: {'k1': 4.870954366799399, 'b': 0.9249613913608172}
 - expansions.bigrams: {'k1': 1.2, 'b': 0.75}


## Experiments

### Base fields

In [5]:
# Fields combined in this experiment; each one contributes a boosted `match` clause.
_field_names = ['url', 'title', 'body']
_match_queries = list(map(match_query, _field_names))

# Fill the query on a deep copy so `base_templates` itself is left unmodified.
_templates = deepcopy(base_templates)
_templates[0]['template']['source']['query'].update({'bool': {'should': _match_queries}})

In [6]:
%%time

# Reference point before tuning: every field boost fixed at 1.0.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(('url|boost', 'title|boost', 'body|boost'), 1.0),
)

Evaluation with: MRR@100
Score: 0.2866
CPU times: user 1.77 s, sys: 455 ms, total: 2.23 s
Wall time: 2min 9s


In [None]:
%%time

# Bayesian search over the field boosts; every boost shares the same [0.0, 10.0] range.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in ('url|boost', 'title|boost', 'body|boost')
        },
    }),
)

Optimizing parameters
 - metric: MRR@100
 - queries: data/msmarco-document-sampled-queries.1000.tsv
 - queries: data/msmarco/document/msmarco-doctrain-qrels.tsv
 > iteration 1/50, took 0:00:32 (remains: 0:26:47)
   | 0.2653 (best: 0.2653) - {'url|boost': 2.4655500378357047, 'title|boost': 2.107829960621424, 'body|boost': 3.103734494692036}
 > iteration 2/50, took 0:00:20 (remains: 0:16:13)
   | 0.2563 (best: 0.2653) - {'url|boost': 9.579021603479642, 'title|boost': 5.24245833820261, 'body|boost': 5.517848683453418}
 > iteration 3/50, took 0:00:17 (remains: 0:13:28)
   | 0.2597 (best: 0.2653) - {'url|boost': 2.0496250657292063, 'title|boost': 9.994536752252772, 'body|boost': 5.763678016376662}
 > iteration 4/50, took 0:00:16 (remains: 0:12:45)
   | 0.2479 (best: 0.2653) - {'url|boost': 0.13237918169020538, 'title|boost': 0.426157161134152, 'body|boost': 9.629915818745612}
 > iteration 5/50, took 0:00:21 (remains: 0:15:51)
   | 0.2639 (best: 0.2653) - {'url|boost': 0.42700619900124753, '

In [None]:
# Plot the optimization run recorded in `metadata`; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Re-run the evaluation with the boosts returned by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# Show the boost values returned by the optimization run for the base fields.
final_params

### Base fields + bigrams

In [None]:
# Base fields plus the bigram sub-fields of `title` and `body`.
_field_names = ['url', 'title', 'title.bigrams', 'body', 'body.bigrams']
_match_queries = list(map(match_query, _field_names))

# Fill the query on a deep copy so `base_templates` itself is left unmodified.
_templates = deepcopy(base_templates)
_templates[0]['template']['source']['query'].update({'bool': {'should': _match_queries}})

In [None]:
%%time

# Reference point before tuning: every field boost fixed at 1.0.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'title|boost',
            'title_bigrams|boost',
            'body|boost',
            'body_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over the field boosts; every boost shares the same [0.0, 10.0] range.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'title_bigrams|boost',
                'body|boost',
                'body_bigrams|boost',
            )
        },
    }),
)

In [None]:
# Plot the optimization run recorded in `metadata`; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Re-run the evaluation with the boosts returned by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# Show the boost values returned by the optimization run for base fields + bigrams.
final_params

### Base fields + expansions

In [None]:
# Base fields plus the `expansions` field.
_field_names = ['url', 'title', 'body', 'expansions']
_match_queries = list(map(match_query, _field_names))

# Fill the query on a deep copy so `base_templates` itself is left unmodified.
_templates = deepcopy(base_templates)
_templates[0]['template']['source']['query'].update({'bool': {'should': _match_queries}})

In [None]:
%%time

# Reference point before tuning: every field boost fixed at 1.0.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        ('url|boost', 'title|boost', 'body|boost', 'expansions|boost'),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over the field boosts; every boost shares the same [0.0, 10.0] range.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'body|boost',
                'expansions|boost',
            )
        },
    }),
)

In [None]:
# Plot the optimization run recorded in `metadata`; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Re-run the evaluation with the boosts returned by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# Show the boost values returned by the optimization run for base fields + expansions.
final_params

### Base fields + expansions + bigrams

In [None]:
# Every available field: base fields, `expansions`, and all bigram sub-fields.
_field_names = [
    'url',
    'title', 'title.bigrams',
    'body', 'body.bigrams',
    'expansions', 'expansions.bigrams',
]
_match_queries = list(map(match_query, _field_names))

# Fill the query on a deep copy so `base_templates` itself is left unmodified.
_templates = deepcopy(base_templates)
_templates[0]['template']['source']['query'].update({'bool': {'should': _match_queries}})

In [None]:
%%time

# Reference point before tuning: every field boost fixed at 1.0.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'title|boost',
            'title_bigrams|boost',
            'body|boost',
            'body_bigrams|boost',
            'expansions|boost',
            'expansions_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over the field boosts; every boost shares the same [0.0, 10.0] range.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'title_bigrams|boost',
                'body|boost',
                'body_bigrams|boost',
                'expansions|boost',
                'expansions_bigrams|boost',
            )
        },
    }),
)

In [None]:
# Plot the optimization run recorded in `metadata`; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Re-run the evaluation with the boosts returned by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# Show the boost values returned by the optimization run over all fields.
final_params

In [None]:
# Recorded result from a run "with 100/20 iterations" — presumably num_iterations=100 and
# num_initial_points=20, whereas the optimization cell in this section is set to 50/20
# (TODO confirm). 0.3412 is presumably the MRR@100 reached with the boosts below.

{
    'url|boost': 6.831285778689427,
    'title|boost': 0.0,
    'title_bigrams|boost': 0.0,
    'body|boost': 10.0,
    'body_bigrams|boost': 10.0,
    'expansions|boost': 10.0,
    'expansions_bigrams|boost': 3.348671601852749
}

### Selective fields

Based on the results above, remove the fields whose tuned boost is `0` (`title` and `title.bigrams`).

In [None]:
# All fields except `title` and `title.bigrams`.
_field_names = [
    'url',
    'body', 'body.bigrams',
    'expansions', 'expansions.bigrams',
]
_match_queries = list(map(match_query, _field_names))

# Fill the query on a deep copy so `base_templates` itself is left unmodified.
_templates = deepcopy(base_templates)
_templates[0]['template']['source']['query'].update({'bool': {'should': _match_queries}})

In [None]:
%%time

# Reference point before tuning: every field boost fixed at 1.0.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'body|boost',
            'body_bigrams|boost',
            'expansions|boost',
            'expansions_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over the field boosts; every boost shares the same [0.0, 10.0] range.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'body|boost',
                'body_bigrams|boost',
                'expansions|boost',
                'expansions_bigrams|boost',
            )
        },
    }),
)

In [None]:
# Plot the optimization run recorded in `metadata`; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Re-run the evaluation with the boosts returned by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# Show the boost values returned by the optimization run over the selected fields.
final_params

In [None]:
# Recorded result from a run "with 50/20 iterations" — presumably num_iterations=50 and
# num_initial_points=20, matching the optimization cell in this section (TODO confirm).
# 0.3415 is presumably the MRR@100 reached with the boosts below.

{
    'url|boost': 5.906335138830406,
    'body|boost': 7.285007082865544,
    'body_bigrams|boost': 0.1561216257683724,
    'expansions|boost': 9.922407448775347,
    'expansions_bigrams|boost': 4.5189511755570635
}