# Tuning a `bool` query with per-field `match` clauses

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import importlib
import os
import sys

from copy import deepcopy
from elasticsearch import Elasticsearch
from skopt.plots import plot_objective

In [None]:
# project library
# Make the parent directory importable so the local `qopt` package resolves,
# and reload it so edits to the library are picked up without a kernel restart.
sys.path.insert(0, os.path.abspath('..'))

import qopt
importlib.reload(qopt)

# `set_bm25_params` is called by the configuration cell below and must be
# imported here, otherwise that cell fails with a NameError.
from qopt.notebooks import (
    evaluate_mrr100_dev_templated,
    optimize_query_mrr100_templated,
    set_bm25_params,
)
from qopt.optimize import Config

In [None]:
# use a local Elasticsearch or Cloud instance (https://cloud.elastic.co/)
# es = Elasticsearch('http://localhost:9200')
# NOTE(review): hard-coded remote address; point this at your own cluster before running
es = Elasticsearch('http://35.246.228.72:9200')

# set the parallelization parameter `max_concurrent_searches` for the Rank Evaluation API calls
max_concurrent_searches = 30

# index passed to every helper call, and the id given to the search template in `base_templates`
index = 'msmarco-document.doc2query'
template_id = 'query'

# set best BM25 params
# one (field, {'k1': ..., 'b': ...}) pair per field; the values for `title.bigrams` and
# `expansions.bigrams` (k1=1.2, b=0.75) equal Elasticsearch's default BM25 settings
set_bm25_params(es, index, [
    ('url', {'k1': 0.2835389588290694, 'b': 0.8307098387153782}),
    ('title', {'k1': 0.3477150744985997, 'b': 0.6174817900867441}),
    ('title.bigrams', {'k1': 1.2, 'b': 0.75}),
    ('body', {'k1': 3.0128735487205525, 'b': 0.8200709176657588}),
    ('body.bigrams', {'k1': 1.9241932055770454, 'b': 0.7257382745572979}),
    ('expansions', {'k1': 4.870954366799399, 'b': 0.9249613913608172}),
    ('expansions.bigrams', {'k1': 1.2, 'b': 0.75})
])

# Skeleton search template shared by all experiments. Each experiment
# deep-copies it and fills in the empty `query` object.
base_templates = [
    {
        "id": template_id,
        "template": {
            "lang": "mustache",
            "source": {"query": {}},
        },
    },
]

def match_query(name):
    """Build a templated `match` clause for the field `name`.

    The query text is the Mustache placeholder `{{query_string}}`. The boost
    is a per-field placeholder named after the field with every `.` replaced
    by `_` and a `|boost` suffix, e.g. `title.bigrams` becomes
    `{{title_bigrams|boost}}`.
    """
    # presumably dots are replaced because they act as path separators in
    # Mustache variable names; confirm against the template engine
    boost_placeholder = "{{%s|boost}}" % name.replace('.', '_')
    field_clause = {
        "query": "{{query_string}}",
        "boost": boost_placeholder,
    }
    return {"match": {name: field_clause}}

## Experiments

### Base fields

In [None]:
# Fields searched in this experiment; each one becomes a `match` clause.
_field_names = ['url', 'title', 'body']

# Work on a copy so the shared skeleton stays untouched, then combine the
# per-field clauses in the `should` list of a `bool` query.
_templates = deepcopy(base_templates)
_match_queries = list(map(match_query, _field_names))
_templates[0]['template']['source']['query']['bool'] = {'should': _match_queries}

In [None]:
%%time

# Baseline: evaluate with every boost set to 1.0 before tuning.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(('url|boost', 'title|boost', 'body|boost'), 1.0),
)

In [None]:
%%time

# Bayesian search over one boost per field, each bounded to [0.0, 10.0].
# The third and fourth return values are kept as `final_params` / `metadata`.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in ('url|boost', 'title|boost', 'body|boost')
        },
    }),
)

In [None]:
# plot the optimizer's objective against the tuned boosts (scikit-optimize `plot_objective`)
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Evaluate again, this time with the boosts found by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# display the tuned boost values as the cell output
final_params

### Base fields + bigrams

In [None]:
# Base fields plus the bigram sub-fields of `title` and `body`; each entry
# becomes a `match` clause.
_field_names = ['url', 'title', 'title.bigrams', 'body', 'body.bigrams']

# Work on a copy so the shared skeleton stays untouched, then combine the
# per-field clauses in the `should` list of a `bool` query.
_templates = deepcopy(base_templates)
_match_queries = list(map(match_query, _field_names))
_templates[0]['template']['source']['query']['bool'] = {'should': _match_queries}

In [None]:
%%time

# Baseline: evaluate with every boost set to 1.0 before tuning.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'title|boost',
            'title_bigrams|boost',
            'body|boost',
            'body_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over one boost per field, each bounded to [0.0, 10.0].
# The third and fourth return values are kept as `final_params` / `metadata`.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'title_bigrams|boost',
                'body|boost',
                'body_bigrams|boost',
            )
        },
    }),
)

In [None]:
# plot the optimizer's objective against the tuned boosts (scikit-optimize `plot_objective`)
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Evaluate again, this time with the boosts found by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# display the tuned boost values as the cell output
final_params

### Base fields + expansions

In [None]:
# Base fields plus the `expansions` field; each entry becomes a `match` clause.
_field_names = ['url', 'title', 'body', 'expansions']

# Work on a copy so the shared skeleton stays untouched, then combine the
# per-field clauses in the `should` list of a `bool` query.
_templates = deepcopy(base_templates)
_match_queries = list(map(match_query, _field_names))
_templates[0]['template']['source']['query']['bool'] = {'should': _match_queries}

In [None]:
%%time

# Baseline: evaluate with every boost set to 1.0 before tuning.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        ('url|boost', 'title|boost', 'body|boost', 'expansions|boost'),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over one boost per field, each bounded to [0.0, 10.0].
# The third and fourth return values are kept as `final_params` / `metadata`.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'body|boost',
                'expansions|boost',
            )
        },
    }),
)

In [None]:
# plot the optimizer's objective against the tuned boosts (scikit-optimize `plot_objective`)
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Evaluate again, this time with the boosts found by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# display the tuned boost values as the cell output
final_params

### Base fields + expansions + bigrams

In [None]:
# All fields: base fields, `expansions`, and the bigram sub-field of each
# text field. Every entry becomes a `match` clause.
_field_names = [
    'url',
    'title',
    'title.bigrams',
    'body',
    'body.bigrams',
    'expansions',
    'expansions.bigrams',
]

# Work on a copy so the shared skeleton stays untouched, then combine the
# per-field clauses in the `should` list of a `bool` query.
_templates = deepcopy(base_templates)
_match_queries = list(map(match_query, _field_names))
_templates[0]['template']['source']['query']['bool'] = {'should': _match_queries}

In [None]:
%%time

# Baseline: evaluate with every boost set to 1.0 before tuning.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'title|boost',
            'title_bigrams|boost',
            'body|boost',
            'body_bigrams|boost',
            'expansions|boost',
            'expansions_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over one boost per field, each bounded to [0.0, 10.0].
# The third and fourth return values are kept as `final_params` / `metadata`.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'title|boost',
                'title_bigrams|boost',
                'body|boost',
                'body_bigrams|boost',
                'expansions|boost',
                'expansions_bigrams|boost',
            )
        },
    }),
)

In [None]:
# plot the optimizer's objective against the tuned boosts (scikit-optimize `plot_objective`)
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Evaluate again, this time with the boosts found by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# display the tuned boost values as the cell output
final_params

In [None]:
# Recorded result of a run with 100/20 iterations (presumably
# num_iterations / num_initial_points; note the optimization cell for this
# experiment is currently configured with 50/20).
# Score: 0.3412 (presumably MRR@100 on the dev set, going by the helper names)

{
    'url|boost': 6.831285778689427,
    'title|boost': 0.0,
    'title_bigrams|boost': 0.0,
    'body|boost': 10.0,
    'body_bigrams|boost': 10.0,
    'expansions|boost': 10.0,
    'expansions_bigrams|boost': 3.348671601852749
}

### Selective fields

Based on the results above, remove the fields whose tuned boost was `0` (`title` and `title.bigrams`).

In [None]:
# Reduced field set without `title` and `title.bigrams`; each entry becomes
# a `match` clause.
_field_names = ['url', 'body', 'body.bigrams', 'expansions', 'expansions.bigrams']

# Work on a copy so the shared skeleton stays untouched, then combine the
# per-field clauses in the `should` list of a `bool` query.
_templates = deepcopy(base_templates)
_match_queries = list(map(match_query, _field_names))
_templates[0]['template']['source']['query']['bool'] = {'should': _match_queries}

In [None]:
%%time

# Baseline: evaluate with every boost set to 1.0 before tuning.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=dict.fromkeys(
        (
            'url|boost',
            'body|boost',
            'body_bigrams|boost',
            'expansions|boost',
            'expansions_bigrams|boost',
        ),
        1.0,
    ),
)

In [None]:
%%time

# Bayesian search over one boost per field, each bounded to [0.0, 10.0].
# The third and fourth return values are kept as `final_params` / `metadata`.
_, _, final_params, metadata = optimize_query_mrr100_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    config_space=Config.parse({
        'method': 'bayesian',
        'num_iterations': 50,
        'num_initial_points': 20,
        'space': {
            boost_name: {'low': 0.0, 'high': 10.0}
            for boost_name in (
                'url|boost',
                'body|boost',
                'body_bigrams|boost',
                'expansions|boost',
                'expansions_bigrams|boost',
            )
        },
    }),
)

In [None]:
# plot the optimizer's objective against the tuned boosts (scikit-optimize `plot_objective`)
_ = plot_objective(metadata, sample_source='result')

In [None]:
%%time

# Evaluate again, this time with the boosts found by the optimizer.
_ = evaluate_mrr100_dev_templated(
    es, max_concurrent_searches, index, _templates, template_id,
    params=final_params,
)

In [None]:
# display the tuned boost values as the cell output
final_params

In [None]:
# Recorded result of a run with 50/20 iterations (presumably
# num_iterations / num_initial_points, matching the optimization cell for
# this experiment).
# Score: 0.3415 (presumably MRR@100 on the dev set, going by the helper names)

{
    'url|boost': 5.906335138830406,
    'body|boost': 7.285007082865544,
    'body_bigrams|boost': 0.1561216257683724,
    'expansions|boost': 9.922407448775347,
    'expansions_bigrams|boost': 4.5189511755570635
}