# Boilerplate Setup

In [99]:
import json
import os
from pprint import pprint
import requests

# Base URL of the Elasticsearch node used by every request in this notebook.
# Override the default by setting the ELASTICSEARCH_URL environment variable.
ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL', 'http://192.168.99.100:9200')
# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
# NOTE(review): `tmdb_api` is not defined anywhere in the visible notebook, so the
# lines below cannot simply be uncommented -- a requests session must be created first.
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables (e.g. a list of lists).

    Returns:
        A new list containing every item of every sub-iterable, in order.
        An empty input yields an empty list.
    """
    # The original built this list but never returned it, so callers got None.
    return [item for sublist in l for item in sublist]

def simpler_explain(explain_json, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explain_json: A dict with 'value' and 'description' keys and an
            optional 'details' list of child dicts of the same shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A newline-terminated string with one line per node, children
        following their parent in depth-first order.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explain_json['value'], explain_json['description'])]
    # Nodes without a 'details' key are leaves.
    for child in explain_json.get('details', ()):
        pieces.append(simpler_explain(child, depth=depth + 1))
    return "".join(pieces)

# 3.4 Indexing TMDB Movies

In [20]:
def extract(path='tmdb.json'):
    """Load the movie dump from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to 'tmdb.json' in the
            current working directory, which is what the notebook uses.

    Returns:
        The decoded JSON document.
    """
    # Be explicit about UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the data contains non-ASCII titles.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [52]:
def reindex(analysis=None, mappings=None, movies=None):
    """Drop and recreate the `tmdb` index, then bulk-index the given movies.

    Args:
        analysis: Analysis settings placed under settings.index.analysis.
            None or empty means no custom analysis.
        mappings: Index mappings, sent as the top-level "mappings" key of the
            create-index request. None or empty means no explicit mappings.
        movies: Dict of movie documents. Each value must contain an "id" key,
            which is used as the document _id. None or empty skips indexing.

    Raises:
        requests.HTTPError: If deleting (other than "index not found"),
            creating, or bulk-loading the index returns an error status.
    """
    # Operate on the tmdb index only; the bare cluster URL is not an index.
    index_url = ELASTICSEARCH_URL + '/tmdb'

    body = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysis or {},
            },
        },
    }
    # "mappings" is a sibling of "settings" in the create-index body.
    if mappings:
        body["mappings"] = mappings

    resp = requests.delete(index_url)
    # A 404 only means the index did not exist yet, which is fine here.
    if resp.status_code != 404:
        resp.raise_for_status()
    requests.put(index_url, json=body).raise_for_status()

    # An empty bulk body is rejected by the server, so skip the request.
    if not movies:
        return

    bulk_lines = []
    for movie in movies.values():
        add_op = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulk_lines.append(json.dumps(add_op))
        bulk_lines.append(json.dumps(movie))
    # The bulk format is newline-delimited JSON and must end with a newline.
    bulk_movies = "\n".join(bulk_lines) + "\n"

    resp = requests.post(
        ELASTICSEARCH_URL + '/_bulk',
        data=bulk_movies,
        headers={'Content-Type': 'application/x-ndjson'}
    )
    resp.raise_for_status()


In [53]:
# Load the movie dump and build the index; analysis and mappings are left at
# reindex()'s defaults for this first pass.
movies = extract()
reindex(movies=movies)

# 3.4.1 Basic Searching

In [101]:
def search(query):
    """Run `query` against the tmdb movie index and print a ranked hit table.

    Args:
        query: The search request body as a dict; it is sent as JSON.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    url = ELASTICSEARCH_URL + '/tmdb/movie/_search'
    response = requests.get(url, json=query)
    # Fail here with the HTTP error instead of an opaque KeyError on 'hits'
    # when the server answers with an error body.
    response.raise_for_status()
    hits = response.json()['hits']
    # Note: the header names an Overview column, but only rank, score and
    # title are printed per row.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, hit in enumerate(hits['hits'], start=1):
        print("%s\t%.3f\t\t%s" % (rank, hit['_score'], hit['_source']['title']))

In [102]:
# The user's free-text search, matched against title and overview.
inquiry = 'basketball with cartoon aliens'
# Title matches are boosted by a factor of 10 relative to overview matches.
multi_match_clause = {'query': inquiry, 'fields': ['title^10', 'overview']}
query = {'query': {'multi_match': multi_match_clause}, 'size': '10'}
search(query)

Num	Relevance Score		Movie Title		Overview
1	71.448		Aliens
2	59.805		Cowboys & Aliens
3	57.590		The Basketball Diaries
4	57.590		Monsters vs Aliens
5	49.728		Friends with Kids
6	49.450		Aliens in the Attic
7	45.105		Aliens vs Predator: Requiem
8	43.483		From Russia With Love
9	43.221		Hobo with a Shotgun
10	42.459		Dances with Wolves


# 3.5.1 Query Validation API

In [104]:
# Ask the validate API to explain how the multi_match query is interpreted.
validate_url = ELASTICSEARCH_URL + '/tmdb/movie/_validate/query?explain'
query = {'query': {'multi_match': {'query': inquiry,
                                   'fields': ['title^10', 'overview']}}}
response = requests.get(validate_url, json=query)
# Last expression of the cell, so the decoded response is displayed.
response.json()

{'_shards': {'failed': 0, 'successful': 1, 'total': 1},
 'explanations': [{'explanation': '+((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens)) #*:*',
   'index': 'tmdb',
   'valid': True}],
 'valid': True}

# 3.5.3 Debugging Analysis

In [111]:
# Inner layer of the onion -- why did the search engine consider these movies matches?
# There are two sides to this:
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Show which tokens the _analyze API produces for a sample title.
# The request names no field or analyzer, so Elasticsearch chooses the analyzer
# (presumably the index default -- confirm).
response = requests.get(
    ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml', 
    json={'text': "Fire with Fire"}
)
print(response.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 2.3.5 -- Solving The Matching Problem

In [67]:
# Analyze both searchable fields with the built-in English analyzer.
# The field type is 'text'; the legacy 'string' type is rejected by
# Elasticsearch 6 and later.
mappingSettings = {
    'movie': {
        'properties': {
            'title': {
                'type': 'text',
                'analyzer': 'english'
            },
            'overview': {
                'type': 'text',
                'analyzer': 'english'
            }
        }
    }
}
# reindex() takes `mappings` and `movies`; `movies` was loaded by extract().
reindex(mappings=mappingSettings, movies=movies)


building...
indexing...


In [73]:
# Analyze the sample text with the analyzer configured for the `title` field.
# Field and text go in the JSON body, and the shared ELASTICSEARCH_URL is used
# instead of a hard-coded localhost address.
resp = requests.get(
    ELASTICSEARCH_URL + '/tmdb/_analyze?format=yaml',
    json={'field': 'title', 'text': "Fire with Fire"}
)
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [77]:
# Same search text as before, re-run against the rebuilt index.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, # the user's search text
            'fields': ['title^10', 'overview'], # title boosted by 10 over overview
        },
    },
    'size': '100'
}
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0754712		Alien
2	1.0754712		Aliens
3	1.0754712		Alien³
4	1.032656		The Basketball Diaries
5	0.67216945		Cowboys & Aliens
6	0.67216945		Aliens in the Attic
7	0.67216945		Alien: Resurrection
8	0.5377356		AVP: Alien vs. Predator
9	0.5377356		Monsters vs Aliens
10	0.5377356		Aliens vs Predator: Requiem
11	0.08214001		Space Jam
12	0.024269354		Grown Ups
13	0.023364348		Speed Racer
14	0.023364348		The Flintstones
15	0.020802302		White Men Can't Jump
16	0.020802302		Coach Carter
17	0.018015321		Semi-Pro
18	0.016667001		The Thing
19	0.014142419		Meet Dave
20	0.0138682015		Teen Wolf
21	0.0138682015		High School Musical
22	0.0138682015		Bedazzled
23	0.013333602		Invasion of the Body Snatchers
24	0.013333602		Escape from Planet Earth
25	0.013333602		Slither
26	0.013333602		The Darkest Hour
27	0.01178535		District 9
28	0.011666901		Avatar
29	0.011666901		The Last Starfighter
30	0.010103833		The X Files
31	0.010000201		Scary Movie 3
32	0.010000201		The

# 2.4.1 Decomposing Relevance Score With Lucene’s Explain

In [81]:
# Re-run the previous query with per-hit score explanations switched on.
query['explain'] = True
httpResp = requests.get(ELASTICSEARCH_URL + '/tmdb/movie/_search', json=query)
httpResp.raise_for_status()
jsonResp = httpResp.json()
explainedHits = jsonResp['hits']['hits']

# Raw explanation of the top hit, for comparison with the condensed form below.
if explainedHits:
    print(json.dumps(explainedHits[0]['_explanation'], indent=True))

# Condensed explanations for the top four hits and for the hit at index 10.
for hitIndex in (0, 1, 2, 3, 10):
    if hitIndex >= len(explainedHits):
        break  # fewer hits came back than the positions we want to inspect
    hit = explainedHits[hitIndex]
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))


{
 "description": "sum of:", 
 "value": 1.0754712, 
 "details": [
  {
   "description": "max of:", 
   "value": 1.0754712, 
   "details": [
    {
     "description": "product of:", 
     "value": 1.0754712, 
     "details": [
      {
       "description": "sum of:", 
       "value": 3.2264135, 
       "details": [
        {
         "description": "weight(title:alien in 23) [PerFieldSimilarity], result of:", 
         "value": 3.2264135, 
         "details": [
          {
           "description": "score(doc=23,freq=1.0), product of:", 
           "value": 3.2264135, 
           "details": [
            {
             "description": "queryWeight, product of:", 
             "value": 0.48007536, 
             "details": [
              {
               "description": "idf(docFreq=9, maxDocs=3051)", 
               "value": 6.7206397, 
               "details": []
              }, 
              {
               "description": "queryNorm", 
               "value": 0.071432985, 
         

# 3.4.4 Fixing Space Jam vs Alien Ranking

In [82]:
# Same search text, but the title boost is lowered from 10 to 0.1 so that
# overview matches carry more weight than title matches.
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  # the user's search text
            'fields': ['title^0.1', 'overview'],
        }
    },
    # Explanations are requested, but search() prints only rank, score and title.
    'explain': True
}
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0016364		Space Jam
2	0.29594672		Grown Ups
3	0.28491083		Speed Racer
4	0.28491083		The Flintstones
5	0.2536686		White Men Can't Jump
6	0.2536686		Coach Carter
7	0.21968345		Semi-Pro
8	0.20324169		The Thing
9	0.1724563		Meet Dave
10	0.16911241		Teen Wolf
