# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

# Base URL of the Elasticsearch node; set the ELASTICSEARCH_URL environment
# variable to point the notebook at something other than a local instance.
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')

# 3.4 Indexing TMDB Movies

In [2]:
def extract():
    """Load the TMDB movie dump from ``tmdb.json`` in the working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If ``tmdb.json`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which can mis-decode non-ASCII titles on some systems.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def reindex(analysis={}, mappings={}, movies={}):
    """Drop and recreate the ``tmdb`` index, then bulk-index ``movies`` into it.

    Args:
        analysis: Value sent as the index's ``settings.index.analysis`` section.
        mappings: Value sent as the ``mappings`` section of the create request.
        movies: Dict whose values are JSON-serializable movie documents; each
            document's ``id`` entry is used as its ``_id``.

    Returns:
        The ``requests.Response`` of the ``_bulk`` request.

    Raises:
        requests.HTTPError: If the index cannot be created.
    """
    # NOTE: the dict defaults are shared between calls but are only read here,
    # never mutated.
    settings = {
        'settings': {
            'number_of_shards': 1,
            'index': {
                'analysis': analysis,
            },
        },
        'mappings': mappings,
    }

    # Best-effort delete: the index may not exist yet, so the outcome of this
    # request is deliberately ignored.
    requests.delete(ELASTICSEARCH_URL + '/tmdb')

    # Fail loudly when creation is rejected; otherwise the bulk request below
    # would index into something that lacks the requested analysis/mappings.
    create_response = requests.put(ELASTICSEARCH_URL + '/tmdb', json=settings)
    create_response.raise_for_status()

    # The bulk body is newline-delimited JSON: one action line followed by one
    # document line per movie, each terminated by a newline.
    bulk_lines = []
    for movie in movies.values():
        add_op = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulk_lines.append(json.dumps(add_op))
        bulk_lines.append(json.dumps(movie))
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    response = requests.post(
        ELASTICSEARCH_URL + '/_bulk',
        data=bulk_movies,
        headers={'Content-Type': 'application/x-ndjson'}
    )
    return response

In [4]:
# Load the movies from tmdb.json and index them using reindex's defaults,
# i.e. with empty analysis and mapping settings.
movies = extract()
reindex(movies=movies)

<Response [200]>

# 3.4.1 Basic Searching

In [5]:
def search(query):
    """Run ``query`` against the tmdb movie index and print the ranked hits.

    Prints a header line, then one line per hit holding its 1-based rank, its
    relevance score rounded to three decimals, and its title.
    """
    response = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_search', json=query)
    results = response.json()['hits']
    # NOTE: the header names an "Overview" column, but rows only carry the
    # rank, score and title.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, hit in enumerate(results['hits'], start=1):
        title = hit['_source']['title']
        print("%s\t%.3f\t\t%s" % (rank, hit['_score'], title))

In [6]:
# Explicitly refresh the index so every bulk-indexed document is searchable
# before querying, instead of sleeping for a guessed amount of time.
refresh_response = requests.post(ELASTICSEARCH_URL + '/tmdb/_refresh')
refresh_response.raise_for_status()

In [7]:
# Match the phrase against both text fields; the ^10 suffix boosts the title
# field's score relative to overview.
multi_match = {
    'query': 'basketball with cartoon aliens',
    'fields': ['title^10', 'overview'],
}
query = {'query': {'multi_match': multi_match}, 'size': '37'}
search(query)

Num	Relevance Score		Movie Title		Overview
1	85.569		Aliens
2	73.711		The Basketball Diaries
3	71.320		Cowboys & Aliens
4	61.139		Monsters vs Aliens
5	53.502		Aliens vs Predator: Requiem
6	53.502		Aliens in the Attic
7	45.221		Friends with Kids
8	45.221		Dances with Wolves
9	45.221		Fire with Fire
10	45.221		Friends with Benefits
11	39.572		To Rome with Love
12	39.572		Just Go With It
13	39.572		Interview with the Vampire
14	39.572		From Russia With Love
15	39.572		Gone with the Wind
16	39.572		Sleeping with the Enemy
17	39.572		From Paris with Love
18	39.572		Hobo with a Shotgun
19	39.572		My Week with Marilyn
20	39.572		Trouble with the Curve
21	35.178		Fun with Dick and Jane
22	35.178		Girl with a Pearl Earring
23	35.178		Die Hard: With a Vengeance
24	31.662		The Girl Who Played with Fire
25	31.662		The Man with the Iron Fists
26	31.662		The Man with the Golden Gun
27	31.662		You Don't Mess With the Zohan
28	31.662		The Girl with the Dragon Tattoo
29	31.662		Twin Peaks: Fire Walk wi

# 3.5.1 Query Validation API

In [8]:
# Ask Elasticsearch to explain how it interprets the multi_match query via the
# validate endpoint, then pretty-print its answer.
multi_match = {
    'query': 'basketball with cartoon aliens',
    'fields': ['title^10', 'overview'],
}
query = {'query': {'multi_match': multi_match}}
validate_url = ELASTICSEARCH_URL + '/tmdb/movie/_validate/query?explain'
response = requests.get(validate_url, json=query)

pprint(response.json())

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketball title:with '
                                  'title:cartoon title:aliens)^10.0 | '
                                  '(overview:basketball overview:with '
                                  'overview:cartoon overview:aliens)) #*:*',
                   'index': 'tmdb',
                   'valid': True}],
 'valid': True}


# 3.5.3 Debugging Analysis

In [9]:
# Show the tokens produced when the sample text is analyzed the way the
# `title` field is, formatted as YAML.
analyze_url = ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml'
analyze_request = {'text': "Fire with Fire", 'field': 'title'}
response = requests.get(analyze_url, json=analyze_request)
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.5.5 Solving The Matching Problem

In [10]:
# Analyze both text fields with the `english` analyzer, then rebuild the index
# so the new mapping takes effect.
english_text = {'type': 'text', 'analyzer': 'english'}
mappings = {
    'movie': {
        'properties': {
            # Separate copies so the two field definitions stay independent.
            'title': dict(english_text),
            'overview': dict(english_text),
        },
    },
}
reindex(mappings=mappings, movies=movies)

<Response [200]>

In [11]:
# Re-run the analysis of the sample text against the `title` field to see
# which tokens the current mapping produces (YAML output).
analyze_url = ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml'
analyze_request = {'text': "Fire with Fire", 'field': 'title'}
response = requests.get(analyze_url, json=analyze_request)
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [12]:
# Validate the same multi_match query again to see how it is interpreted
# against the current index.
multi_match = {
    'query': 'basketball with cartoon aliens',
    'fields': ['title^10', 'overview'],
}
query = {'query': {'multi_match': multi_match}}
validate_url = ELASTICSEARCH_URL + '/tmdb/movie/_validate/query?explain'
response = requests.get(validate_url, json=query)
pprint(response.json())

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketbal title:cartoon '
                                  'title:alien)^10.0 | (overview:basketbal '
                                  'overview:cartoon overview:alien)) #*:*',
                   'index': 'tmdb',
                   'valid': True}],
 'valid': True}


In [13]:
# Explicitly refresh the index so every bulk-indexed document is searchable
# before querying, instead of sleeping for a guessed amount of time.
refresh_response = requests.post(ELASTICSEARCH_URL + '/tmdb/_refresh')
refresh_response.raise_for_status()

In [14]:
# Same boosted multi_match search as before, limited to the top 11 hits.
multi_match = {
    'query': 'basketball with cartoon aliens',
    'fields': ['title^10', 'overview'],
}
query = {'query': {'multi_match': multi_match}, 'size': '11'}
search(query)

Num	Relevance Score		Movie Title		Overview
1	78.760		The Basketball Diaries
2	74.091		Alien³
3	74.091		Aliens
4	74.091		Alien
5	59.677		Cowboys & Aliens
6	59.677		Alien: Resurrection
7	59.677		Aliens in the Attic
8	49.958		Monsters vs Aliens
9	42.961		Aliens vs Predator: Requiem
10	42.961		AVP: Alien vs. Predator
11	12.882		Space Jam


# 3.6.1 Decomposing Relevance Score With Lucene’s Explain

In [15]:
def simpler_explain(explain_json, depth=0):
    """Flatten an explanation tree into indented ``value, description`` lines.

    Each node becomes one line indented by two spaces per ``depth`` level, with
    newlines stripped from its description; entries under ``details`` (when the
    key is present) are rendered recursively one level deeper.
    """
    indent = " " * (depth * 2)
    description = explain_json['description'].replace('\n', '')
    pieces = [indent + "%s, %s\n" % (explain_json['value'], description)]
    if 'details' in explain_json:
        pieces.extend(
            simpler_explain(child, depth=depth + 1)
            for child in explain_json['details']
        )
    return "".join(pieces)

# Re-issue the current `query` with explain turned on so each hit carries its
# `_explanation`.
query['explain'] = True
search_url = ELASTICSEARCH_URL + '/tmdb/movie/_search'
response = requests.get(search_url, json=query)
data = response.json()
# Zero-based positions 3 and 10 are the 4th and 11th hits; in the run recorded
# in this notebook those are "Alien" and "Space Jam".
for i in [3, 10]:
    hit = data['hits']['hits'][i]
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for Alien
74.090744, max of:
  74.090744, sum of:
    74.090744, weight(title:alien in 338) [PerFieldSimilarity], result of:
      74.090744, score(doc=338,freq=1.0 = termFreq=1.0), product of:
        10.0, boost
        5.7722607, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
          9.0, docFreq
          3051.0, docCount
        1.2835655, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
          1.0, termFreq=1.0
          1.2, parameter k1
          0.75, parameter b
          2.1740413, avgFieldLength
          1.0, fieldLength
  3.3211906, sum of:
    3.3211906, weight(overview:alien in 338) [PerFieldSimilarity], result of:
      3.3211906, score(doc=338,freq=1.0 = termFreq=1.0), product of:
        3.739638, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
          72.0, docFreq
          3050.0, docCount
        0.8881048, tfNorm, computed as (freq * (k

# 3.6.6 Lies, damned lies, and similarity

Elasticsearch 6 defaults to BM25 and, since 6.3.0, has deprecated the classic similarity.

# 3.6.8 Fixing Space Jam vs Alien Ranking

In [16]:
# Same search, but with the title field's score scaled down (^0.1) instead of
# boosted up.
multi_match = {
    'query': 'basketball with cartoon aliens',
    'fields': ['title^0.1', 'overview'],
}
query = {'query': {'multi_match': multi_match}}
search(query)

Num	Relevance Score		Movie Title		Overview
1	12.882		Space Jam
2	7.538		Grown Ups
3	7.500		Speed Racer
4	7.244		Semi-Pro
5	7.163		The Flintstones
6	6.943		Coach Carter
7	6.765		White Men Can't Jump
8	5.845		Meet Dave
9	5.801		Aliens vs Predator: Requiem
10	5.440		Bedazzled


# Notes

Takeaways:

1. `multi_match`'s default behavior takes the score of the best-matching field; it is not the kind of smart weighted aggregation some people imagine it to be
2. boosting should not be decided beforehand based on intuition about field importance; instead, it should be applied after experimentally checking whether fields of different natures need their scores balanced before they are compared/aggregated

Experiments:

- **[1,2]**: the title field *is* more important than the overview field, but it is also usually shorter and sometimes only loosely representative of the movie's content (and may therefore have a higher diversity of words). As a result, it naturally yields higher field scores per match, which have to be boosted down before being implicitly compared with the overview field's scores by a `multi_match` aggregation.