# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests

# Base URL of the Elasticsearch server. The ELASTICSEARCH_URL environment
# variable, when set, overrides the built-in default address.
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL', 'http://192.168.99.100:9200')
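# Optional: enable client-side HTTP caching for TMDB.
# Requires: https://httpcache.readthedocs.org/en/latest/
# NOTE: `tmdb_api` is not defined in this notebook; it must exist (an object
# with a `mount` method) before the lines below are uncommented.
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())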

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting: an iterable of iterables becomes one list.

    Args:
        l: An iterable whose items are themselves iterable.

    Returns:
        A new list holding every inner item, in the original order.
    """
    # The comprehension used to be evaluated and discarded, so every call
    # returned None; the result is now actually returned.
    return [item for sublist in l for item in sublist]

def simpler_explain(explain_json, depth=0):
    """Render an explain tree as indented text, one node per line.

    Args:
        explain_json: A node with 'value' and 'description' keys and an
            optional 'details' list of child nodes of the same form.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one "<value>, <description>" line for this node,
        followed by the lines of all its descendants, each newline-terminated.
    """
    indent = " " * (2 * depth)
    parts = [indent + "%s, %s\n" % (explain_json['value'], explain_json['description'])]
    if 'details' in explain_json:
        # Children are rendered one level deeper, in their listed order.
        parts.extend(
            simpler_explain(child, depth=depth + 1)
            for child in explain_json['details']
        )
    return "".join(parts)

# 3.4 Indexing TMDB Movies

In [2]:
def extract():
    """Read ``tmdb.json`` and return its decoded JSON content.

    The file is opened by its bare name, so it is resolved against the
    current working directory.

    Returns:
        The Python object decoded from the file's JSON text.
    """
    with open('tmdb.json') as tmdb_file:
        raw_text = tmdb_file.read()
    return json.loads(raw_text)

In [3]:
def reindex(analysis={}, mappings={}, movies={}):
    """Drop and recreate the ``tmdb`` index, then bulk-index ``movies`` into it.

    Args:
        analysis: Value sent as the index's ``analysis`` settings.
        mappings: Value sent as the index's ``mappings``.
        movies: Dict of movie documents. Only the values are used; each must
            carry an ``id`` key, which becomes the document ``_id``.

    Returns:
        The ``requests`` response of the final ``_bulk`` call. The responses
        of the delete and create calls are not inspected.

    Note:
        The ``{}`` defaults are kept for interface compatibility; they are
        only read here, never mutated.
    """
    settings = {
        'settings': {
            'number_of_shards': 1,
            'index': {
                'analysis': analysis,
            },
        },
        'mappings': mappings,
    }

    index_url = ELASTICSEARCH_URL + '/tmdb'
    requests.delete(index_url)
    requests.put(index_url, json=settings)

    # Bulk body: one action line followed by one document line per movie.
    # Lines are collected and joined once instead of growing a string in a loop.
    bulk_lines = []
    for movie in movies.values():
        add_op = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulk_lines.append(json.dumps(add_op))
        bulk_lines.append(json.dumps(movie))
    # Every line, including the last, is newline-terminated.
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    return requests.post(
        ELASTICSEARCH_URL + '/_bulk',
        data=bulk_movies,
        headers={'Content-Type': 'application/x-ndjson'}
    )

In [4]:
# Load the movie documents from tmdb.json; `movies` is reused by later cells.
movies = extract()
# Index them with the default (empty) analysis and mappings. As the cell's last
# expression, the returned bulk response is shown as the cell output.
reindex(movies=movies)

<Response [200]>

# 3.4.1 Basic Searching

In [5]:
def search(query):
    """Run ``query`` against the tmdb movie index and print the ranked hits.

    Args:
        query: Request body, sent as JSON to ``/tmdb/movie/_search``.

    Prints a header row, then one row per hit with its 1-based rank, its
    ``_score`` to three decimals and the ``title`` from ``_source``.
    Returns None.
    """
    endpoint = ELASTICSEARCH_URL + '/tmdb/movie/_search'
    hits = requests.get(endpoint, json=query).json()['hits']
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, hit in enumerate(hits['hits'], start=1):
        print("%s\t%.3f\t\t%s" % (rank, hit['_score'], hit['_source']['title']))

In [8]:
# Free-text search as a user might type it; later cells reuse `inquiry`.
inquiry = 'basketball with cartoon aliens'
# Match the text against title (boost ^10) and overview, asking for 37 hits.
query = dict(
    query=dict(
        multi_match=dict(
            query=inquiry,
            fields=['title^10', 'overview'],
        ),
    ),
    size='37',
)
search(query)

Num	Relevance Score		Movie Title		Overview
1	85.569		Aliens
2	73.711		The Basketball Diaries
3	71.320		Cowboys & Aliens
4	61.139		Monsters vs Aliens
5	53.502		Aliens vs Predator: Requiem
6	53.502		Aliens in the Attic
7	45.221		Dances with Wolves
8	45.221		Friends with Kids
9	45.221		Friends with Benefits
10	45.221		Fire with Fire
11	39.572		My Week with Marilyn
12	39.572		Trouble with the Curve
13	39.572		Sleeping with the Enemy
14	39.572		From Paris with Love
15	39.572		Interview with the Vampire
16	39.572		Just Go With It
17	39.572		Gone with the Wind
18	39.572		From Russia With Love
19	39.572		To Rome with Love
20	39.572		Hobo with a Shotgun
21	35.178		Fun with Dick and Jane
22	35.178		Die Hard: With a Vengeance
23	35.178		Girl with a Pearl Earring
24	31.662		The Girl with the Dragon Tattoo
25	31.662		The Life Aquatic With Steve Zissou
26	31.662		The Man with the Iron Fists
27	31.662		Twin Peaks: Fire Walk with Me
28	31.662		The Man with the Golden Gun
29	31.662		Cloudy with a Chanc

# 3.5.1 Query Validation API

In [9]:
# Same multi_match query as before, without an explicit result size.
query = {
    'query': {
        'multi_match': {'query': inquiry, 'fields': ['title^10', 'overview']},
    },
}
# Send it to the validate endpoint with ?explain; the decoded JSON reply is
# the cell's output.
response = requests.get(
    '%s/tmdb/movie/_validate/query?explain' % ELASTICSEARCH_URL,
    json=query,
)
response.json()

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens)) #*:*',
   'index': 'tmdb',
   'valid': True}],
 'valid': True}

# 3.5.3 Debugging Analysis

In [10]:
# Inner layer of the onion: why did the search engine consider these movies
# matches? There are two sides to this:
#   (1) What tokens are placed in the search engine?
#   (2) What did the search engine attempt to match exactly?

# Run the sample text through the analysis configured for the `title` field
# and print the YAML-formatted reply.
response = requests.get(
    '%s/tmdb/_analyze?format=yaml' % ELASTICSEARCH_URL,
    json=dict(text="Fire with Fire", field='title'),
)
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.5.5 Solving The Matching Problem

In [11]:
# Map both searched fields of the `movie` type as text analyzed with the
# `english` analyzer, then rebuild the index with these mappings.
mappings = {
    'movie': {
        'properties': {
            'title': {'type': 'text', 'analyzer': 'english'},
            'overview': {'type': 'text', 'analyzer': 'english'},
        },
    },
}
reindex(mappings=mappings, movies=movies)

<Response [200]>

In [12]:
# Repeat the analysis of the sample title against the re-created index and
# print the YAML-formatted reply.
response = requests.get(
    '%s/tmdb/_analyze?format=yaml' % ELASTICSEARCH_URL,
    json=dict(text="Fire with Fire", field='title'),
)
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [13]:
# Re-validate the same multi_match query against the re-created index.
query = {
    'query': {
        'multi_match': {'query': inquiry, 'fields': ['title^10', 'overview']},
    },
}
# The decoded JSON reply of the validate call is the cell's output.
response = requests.get(
    '%s/tmdb/movie/_validate/query?explain' % ELASTICSEARCH_URL,
    json=query,
)
response.json()

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketbal title:cartoon title:alien)^10.0 | (overview:basketbal overview:cartoon overview:alien)) #*:*',
   'index': 'tmdb',
   'valid': True}],
 'valid': True}

In [15]:
# Same user search as before, now limited to the first 11 hits.
inquiry = 'basketball with cartoon aliens'
query = dict(
    query=dict(
        multi_match=dict(
            query=inquiry,
            fields=['title^10', 'overview'],
        ),
    ),
    size='11',
)
search(query)

Num	Relevance Score		Movie Title		Overview
1	78.760		The Basketball Diaries
2	74.091		Alien³
3	74.091		Aliens
4	74.091		Alien
5	59.677		Cowboys & Aliens
6	59.677		Aliens in the Attic
7	59.677		Alien: Resurrection
8	49.958		Monsters vs Aliens
9	42.961		Aliens vs Predator: Requiem
10	42.961		AVP: Alien vs. Predator
11	12.882		Space Jam


# 2.4.1 Decomposing Relevance Score With Lucene’s Explain

In [81]:
# Ask for a per-hit score explanation in addition to the normal results.
query['explain'] = True
# Use the configured server and a JSON body, like the other cells in this
# notebook, instead of a hard-coded localhost URL and a raw string payload.
httpResp = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_search', json=query)
jsonResp = httpResp.json()
explained_hits = jsonResp['hits']['hits']

# Raw explanation of the top hit, followed by the flattened form for a few
# selected ranks.
print(json.dumps(explained_hits[0]['_explanation'], indent=True))
# Index 10 is the 11th hit, so `query` must request at least 11 results
# (the preceding search cell sets 'size' to '11').
for hit_index in (0, 1, 2, 3, 10):
    explained_hit = explained_hits[hit_index]
    print("Explain for %s" % explained_hit['_source']['title'])
    print(simpler_explain(explained_hit['_explanation']))


{
 "description": "sum of:", 
 "value": 1.0754712, 
 "details": [
  {
   "description": "max of:", 
   "value": 1.0754712, 
   "details": [
    {
     "description": "product of:", 
     "value": 1.0754712, 
     "details": [
      {
       "description": "sum of:", 
       "value": 3.2264135, 
       "details": [
        {
         "description": "weight(title:alien in 23) [PerFieldSimilarity], result of:", 
         "value": 3.2264135, 
         "details": [
          {
           "description": "score(doc=23,freq=1.0), product of:", 
           "value": 3.2264135, 
           "details": [
            {
             "description": "queryWeight, product of:", 
             "value": 0.48007536, 
             "details": [
              {
               "description": "idf(docFreq=9, maxDocs=3051)", 
               "value": 6.7206397, 
               "details": []
              }, 
              {
               "description": "queryNorm", 
               "value": 0.071432985, 
         

# 3.4.4 Fixing Space Jam vs Alien Ranking

In [82]:
# Re-run the user's search with the title boost lowered from ^10 to ^0.1.
# `inquiry` is the search text defined in the earlier cells; the name used
# here before (`usersSearch`) is not defined in this notebook.
query = {
    'query': {
        'multi_match': {
            'query': inquiry,  # User's query
            'fields': ['title^0.1', 'overview'],
        }
    },
    # search() only prints rank, score and title, so the explanation this
    # requests is not displayed by this cell.
    'explain': True
}
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0016364		Space Jam
2	0.29594672		Grown Ups
3	0.28491083		Speed Racer
4	0.28491083		The Flintstones
5	0.2536686		White Men Can't Jump
6	0.2536686		Coach Carter
7	0.21968345		Semi-Pro
8	0.20324169		The Thing
9	0.1724563		Meet Dave
10	0.16911241		Teen Wolf
