# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

# Base URL used for every Elasticsearch request in this notebook. Taken from the
# ELASTICSEARCH_URL environment variable, falling back to a local node.
ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200')

# 3.4 Indexing TMDB Movies

In [2]:
def reindex(movies, analysis=None, mappings=None):
    """Drop and re-create the `tmdb` index, then bulk-index `movies` into it.

    Args:
        movies: Mapping whose values are JSON-serializable movie dicts. Each
            movie's "id" value is used as the document `_id`; the mapping's
            keys are not used.
        analysis: Optional dict placed under `settings.index.analysis` of the
            index-creation request. Defaults to an empty dict.
        mappings: Optional dict sent as `mappings` in the index-creation
            request. Defaults to an empty dict.

    Returns:
        The `requests` response object of the `_bulk` call.

    Raises:
        requests.HTTPError: if the index-creation request is rejected.
    """
    settings = {
        'settings': {
            'number_of_shards': 1,
            'index': {
                'analysis': {} if analysis is None else analysis,
            },
        },
        'mappings': {} if mappings is None else mappings,
    }

    index_url = ELASTICSEARCH_URL + '/tmdb'
    # Best effort: the delete result is ignored because the index may not
    # exist yet (e.g. on the first run).
    requests.delete(index_url)
    # Fail loudly if the settings/mappings are rejected; otherwise the bulk
    # request below would proceed without the index having been created here.
    requests.put(index_url, json=settings).raise_for_status()

    # Bulk body is NDJSON: one action line followed by one document line per movie.
    bulk_lines = []
    for movie in movies.values():
        add_op = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulk_lines.append(json.dumps(add_op))
        bulk_lines.append(json.dumps(movie))
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    response = requests.post(
        ELASTICSEARCH_URL + '/_bulk?refresh=wait_for',
        data=bulk_movies,
        headers={'Content-Type': 'application/x-ndjson'}
    )
    return response

In [3]:
# Load the TMDB movie dump and index it with default analysis and mappings.
# The file is decoded explicitly as UTF-8 (the standard JSON encoding) instead
# of depending on the platform's default text encoding.
with open('tmdb.json', encoding='utf-8') as f:
    movies = json.load(f)

reindex(movies)

<Response [200]>

# 3.4.1 Basic Searching

In [4]:
def search(query):
    """Run `query` against the tmdb movie index and print a ranked list of titles.

    `query` is sent as the JSON body of a GET request to
    `/tmdb/movie/_search`. One row is printed per hit: 1-based rank,
    relevance score and title.
    """
    response = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_search', json=query)
    payload = response.json()['hits']

    # Note: the header names an Overview column, but rows only carry
    # rank, score and title.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, result in enumerate(payload['hits'], start=1):
        row = (rank, result['_score'], result['_source']['title'])
        print("%s\t%08.5f\t\t%s" % row)

In [5]:
# Search title and overview for the user's phrase, boosting title matches by 10.
query = {
    'query': {
        'multi_match': {
            'query': 'basketball with cartoon aliens',
            'fields': ['title^10', 'overview'],
        },
    },
    # Number of hits to return; sent as an integer rather than a numeric string.
    'size': 37,
}
search(query)

Num	Relevance Score		Movie Title		Overview
1	85.56930		Aliens
2	73.71077		The Basketball Diaries
3	71.32020		Cowboys & Aliens
4	61.13922		Monsters vs Aliens
5	53.50182		Aliens vs Predator: Requiem
6	53.50182		Aliens in the Attic
7	45.22109		Fire with Fire
8	45.22109		Dances with Wolves
9	45.22109		Friends with Benefits
10	45.22109		Friends with Kids
11	39.57216		From Paris with Love
12	39.57216		Trouble with the Curve
13	39.57216		To Rome with Love
14	39.57216		Gone with the Wind
15	39.57216		Hobo with a Shotgun
16	39.57216		Interview with the Vampire
17	39.57216		From Russia With Love
18	39.57216		Sleeping with the Enemy
19	39.57216		Just Go With It
20	39.57216		My Week with Marilyn
21	35.17781		Girl with a Pearl Earring
22	35.17781		Die Hard: With a Vengeance
23	35.17781		Fun with Dick and Jane
24	31.66188		You Don't Mess With the Zohan
25	31.66188		The Life Aquatic With Steve Zissou
26	31.66188		The Girl with the Dragon Tattoo
27	31.66188		Cloudy with a Chance of Meatballs
28	31.661

# 3.5.1 Query Validation API

In [6]:
# Send the query to the _validate/query endpoint with ?explain and
# pretty-print the JSON response.
query = {
    'query': {
        'multi_match': {
            'query': 'basketball with cartoon aliens',
            'fields': ['title^10', 'overview'],
        },
    },
}
response = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_validate/query?explain', json=query)

pprint(response.json())

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketball title:with '
                                  'title:cartoon title:aliens)^10.0 | '
                                  '(overview:basketball overview:with '
                                  'overview:cartoon overview:aliens)) #*:*',
                   'index': 'tmdb',
                   'valid': True}],
 'valid': True}


# 3.5.3 Debugging Analysis

In [7]:
# Ask the _analyze endpoint (YAML output) which tokens are produced for this
# text on the `title` field, then print the raw response body.
response = requests.get(ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml',
                        json={'text': "Fire with Fire", 'field': 'title'})
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.5.5 Solving The Matching Problem

In [8]:
# Re-create the index with `title` and `overview` mapped as text fields that
# use the `english` analyzer.
mappings = {
    'movie': {
        'properties': {
            'title': {'type': 'text', 'analyzer': 'english'},
            'overview': {'type': 'text', 'analyzer': 'english'},
        },
    },
}

reindex(movies, mappings=mappings)

<Response [200]>

In [9]:
# Repeat the _analyze request for the `title` field after re-indexing, to see
# the tokens now produced for the same text.
response = requests.get(ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml',
                        json={'text': "Fire with Fire", 'field': 'title'})
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [10]:
# Re-run the _validate/query?explain request after re-indexing and
# pretty-print the JSON response.
query = {
    'query': {
        'multi_match': {
            'query': 'basketball with cartoon aliens',
            'fields': ['title^10', 'overview'],
        },
    },
}
response = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_validate/query?explain', json=query)
pprint(response.json())

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketbal title:cartoon '
                                  'title:alien)^10.0 | (overview:basketbal '
                                  'overview:cartoon overview:alien)) #*:*',
                   'index': 'tmdb',
                   'valid': True}],
 'valid': True}


In [11]:
# Same multi_match search as before, run against the re-indexed data.
query = {
    'query': {
        'multi_match': {
            'query': 'basketball with cartoon aliens',
            'fields': ['title^10', 'overview'],
        },
    },
    # Number of hits to return; sent as an integer rather than a numeric string.
    'size': 11,
}
search(query)

Num	Relevance Score		Movie Title		Overview
1	78.76022		The Basketball Diaries
2	74.09074		Alien³
3	74.09074		Alien
4	74.09074		Aliens
5	59.67700		Alien: Resurrection
6	59.67700		Aliens in the Attic
7	59.67700		Cowboys & Aliens
8	49.95806		Monsters vs Aliens
9	42.96141		Aliens vs Predator: Requiem
10	42.96141		AVP: Alien vs. Predator
11	12.88235		Space Jam


# 3.6.1	Decomposing Relevance Score With Lucene’s Explain

In [12]:
def simpler_explain(explain_json, depth=0):
    """Render an explanation tree as indented "value, description" lines.

    Args:
        explain_json: Dict with 'value' and 'description' keys and, optionally,
            a 'details' list of child nodes of the same shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents before
        their children. Newlines inside a description are removed.
    """
    indent = " " * (2 * depth)
    description = explain_json['description'].replace('\n', '')
    pieces = [indent + "%s, %s\n" % (explain_json['value'], description)]

    children = explain_json['details'] if 'details' in explain_json else ()
    for child in children:
        pieces.append(simpler_explain(child, depth=depth + 1))
    return "".join(pieces)

def search_explain(query, indices=(3, 10)):
    """Run `query` with explanations enabled and print them for selected hits.

    Args:
        query: Search request body (dict). It is left unmodified; a shallow
            copy with 'explain' set to True is what gets sent.
        indices: Positions within the returned hits list to explain.
            Defaults to (3, 10). Positions outside the returned hits are
            reported and skipped instead of raising IndexError.
    """
    # Copy so the caller's dict does not silently gain an 'explain' key.
    explained_query = dict(query, explain=True)
    response = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_search', json=explained_query)
    hits = response.json()['hits']['hits']
    for i in indices:
        if not -len(hits) <= i < len(hits):
            print("No hit at index %s (%s hits returned)" % (i, len(hits)))
            continue
        hit = hits[i]
        print("Explain for %s" % hit['_source']['title'])
        print(simpler_explain(hit['_explanation']))

# Print score explanations for the `query` defined in the previous cell.
search_explain(query)

Explain for Aliens
74.090744, max of:
  74.090744, sum of:
    74.090744, weight(title:alien in 849) [PerFieldSimilarity], result of:
      74.090744, score(doc=849,freq=1.0 = termFreq=1.0), product of:
        10.0, boost
        5.7722607, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
          9.0, docFreq
          3051.0, docCount
        1.2835655, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
          1.0, termFreq=1.0
          1.2, parameter k1
          0.75, parameter b
          2.1740413, avgFieldLength
          1.0, fieldLength
  3.6068602, sum of:
    3.6068602, weight(overview:alien in 849) [PerFieldSimilarity], result of:
      3.6068602, score(doc=849,freq=1.0 = termFreq=1.0), product of:
        3.739638, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
          72.0, docFreq
          3050.0, docCount
        0.96449447, tfNorm, computed as (freq * 

# 3.6.6 Lies, damned lies, and similarity

Elasticsearch 6 defaults to BM25 similarity, and the classic similarity has been deprecated since 6.3.0.

# 3.6.8 Fixing Space Jam vs Alien Ranking

In [13]:
# Same search, but with the title boost lowered from 10 to 0.1.
query = {
    'query': {
        'multi_match': {
            'query': 'basketball with cartoon aliens',
            'fields': ['title^0.1', 'overview'],
        },
    },
}
search(query)

Num	Relevance Score		Movie Title		Overview
1	12.88235		Space Jam
2	07.53847		Grown Ups
3	07.49968		Speed Racer
4	07.24409		Semi-Pro
5	07.16264		The Flintstones
6	06.94339		Coach Carter
7	06.76537		White Men Can't Jump
8	05.84522		Meet Dave
9	05.80056		Aliens vs Predator: Requiem
10	05.44030		Bedazzled


# Notes

Takeaways:

1. By default, `multi_match` takes the score of the best-matching field; it is not the kind of smart weighted aggregation some people imagine it to be.
2. Boosting should not be decided beforehand based on intuition about field importance; instead, it should be applied after checking experimentally whether fields of different natures need their scores balanced before they are compared or aggregated.

Experiments:

- **[1,2]**: the title field *is* more important than the overview field, but it is also usually shorter and sometimes only loosely representative of the movie's content (and may therefore have a higher diversity of words). As a result, it naturally yields higher field scores per match, which have to be boosted down before being implicitly compared with the overview field's scores by a `multi_match` aggregation.