# Boilerplate Setup

In [7]:
import requests
import json


# Sent with the Elasticsearch requests in this notebook so JSON bodies are declared as such.
headers = {'Content-Type': 'application/json'}

# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    l -- an iterable of iterables, e.g. [[1, 2], [3]].

    Returns a new list with the items of every sub-iterable in order,
    e.g. [1, 2, 3]. An empty input yields an empty list.
    """
    # The comprehension result must be returned; without `return` the
    # function silently produced None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    explainJson -- a dict with 'value' and 'description' keys and an
                   optional 'details' list of child nodes of the same shape.
    depth       -- nesting level of this node; each level indents two spaces.

    Returns one newline-terminated line for this node followed by the
    lines of all of its descendants, depth first.
    """
    indent = " " * (depth * 2)
    lines = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    # Children are rendered one level deeper than their parent.
    for child in explainJson.get('details', ()):
        lines.append(simplerExplain(child, depth=depth + 1))
    return "".join(lines)

# 3.2.2 Indexing TMDB Movies

In [8]:
def extract():
    """Load the movie dump from ``tmdb.json`` in the current working directory.

    Returns the decoded JSON document. Later cells pass it to reindex() as
    a dict of movie documents.

    Raises IOError/OSError if the file cannot be opened and ValueError if
    its content is not valid JSON.
    """
    # `with` guarantees the file handle is closed; the previous version
    # leaked it, and its `if f:` guard was always true (open() raises on
    # failure rather than returning a falsy value).
    with open('tmdb.json') as f:
        return json.load(f)

In [20]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Drop and recreate the local ``tmdb`` index, then bulk-index the movies.

    analysisSettings -- placed under settings.index.analysis of the new index.
    mappingSettings  -- when non-empty, sent as the index "mappings".
    movieDict        -- mapping whose values are movie documents; each
                        document's "id" becomes its _id in the index.

    The dict defaults are shared between calls but are only read here,
    never mutated.

    Prints "building..." and "indexing..." as progress and returns None.
    Raises requests.HTTPError if creating the index or the bulk request is
    rejected. The bulk request is skipped when there is nothing to index.
    """
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings

    # Best effort: the index may not exist yet, so the outcome of the
    # delete is deliberately not checked.
    requests.delete("http://localhost:9200/tmdb")
    resp = requests.put("http://localhost:9200/tmdb",
                        data=json.dumps(settings), headers=headers)
    # Surface a rejected index definition instead of carrying on silently.
    resp.raise_for_status()

    print("building...")
    # The bulk body is newline-delimited JSON: an action line followed by
    # the document itself, for every movie.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))

    print("indexing...")
    if bulkLines:
        bulkMovies = "\n".join(bulkLines) + "\n"
        resp = requests.post("http://localhost:9200/_bulk",
                             data=bulkMovies, headers=headers)
        resp.raise_for_status()


In [21]:
# Load the movie dump once; later cells reuse movieDict when they re-index.
movieDict = extract()
# First indexing pass: no analysis or mapping settings are passed.
reindex(movieDict=movieDict)


building...
indexing...


{
 "poster_path": "/6diHusTm8eQJnfoI7MyCSDgaBRp.jpg", 
 "production_countries": [
  {
   "iso_3166_1": "AU", 
   "name": "Australia"
  }, 
  {
   "iso_3166_1": "US", 
   "name": "United States of America"
  }
 ], 
 "revenue": 23006849, 
 "overview": "Two teenage girls discover a mermaid in their beach club's swimming pool.", 
 "video": false, 
 "id": 14191, 
 "genres": [
  {
   "id": 35, 
   "name": "Comedy"
  }, 
  {
   "id": 14, 
   "name": "Fantasy"
  }, 
  {
   "id": 10749, 
   "name": "Romance"
  }, 
  {
   "id": 10751, 
   "name": "Family"
  }
 ], 
 "title": "Aquamarine", 
 "tagline": "A Fish-Out-Of-Water Comedy.", 
 "vote_count": 59, 
 "homepage": "http://www.aquamarinemovie.com/", 
 "belongs_to_collection": null, 
 "original_language": "en", 
 "status": "Released", 
 "spoken_languages": [
  {
   "iso_639_1": "en", 
   "name": "English"
  }
 ], 
 "imdb_id": "tt0429591", 
 "adult": false, 
 "backdrop_path": "/6LN4NJfNWzpJMcfzfqplsm8lVfe.jpg", 
 "production_companies": [
  {
   "n

# 3.2.3 Basic Searching

In [24]:
def search(query):
    """Run `query` against the tmdb movie index and print a ranked table.

    query -- the search request as a dict; it is JSON-encoded and sent as
             the request body.

    Prints a header row, then one tab-separated row per hit with its
    1-based rank, `_score` and title. Returns None.
    Raises requests.HTTPError when the server answers with an error status.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, data=json.dumps(query), headers=headers)
    # Report the HTTP failure itself rather than a KeyError on 'hits' below.
    httpResp.raise_for_status()
    searchHits = json.loads(httpResp.text)['hits']
    # NOTE: the header names an Overview column, but only rank, score and
    # title are printed for each hit.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for rank, hit in enumerate(searchHits['hits'], 1):
        print("%s\t%s\t\t%s" % (rank, hit['_score'], hit['_source']['title']))


In [25]:
# The raw text a user might type; later cells reuse usersSearch.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, # the user's search text
            'fields': ['title^10', 'overview'], # search both fields; ^10 boosts title matches
        },
    },
    'size': '100' # number of hits requested (sent as a string)
}
search(query)


Num	Relevance Score		Movie Title		Overview
1	85.5693		Aliens
2	73.71077		The Basketball Diaries
3	71.3202		Cowboys & Aliens
4	61.13922		Monsters vs Aliens
5	53.501823		Aliens in the Attic
6	53.501823		Aliens vs Predator: Requiem
7	45.221092		Dances with Wolves
8	45.221092		Friends with Kids
9	45.221092		Friends with Benefits
10	45.221092		Fire with Fire
11	39.57216		From Paris with Love
12	39.57216		Sleeping with the Enemy
13	39.57216		Interview with the Vampire
14	39.57216		Just Go With It
15	39.57216		To Rome with Love
16	39.57216		Gone with the Wind
17	39.57216		My Week with Marilyn
18	39.57216		Hobo with a Shotgun
19	39.57216		From Russia With Love
20	39.57216		Trouble with the Curve
21	35.177814		Die Hard: With a Vengeance
22	35.177814		Fun with Dick and Jane
23	35.177814		Girl with a Pearl Earring
24	31.661877		The Girl with the Dragon Tattoo
25	31.661877		The Life Aquatic With Steve Zissou
26	31.661877		The Man with the Golden Gun
27	31.661877		Twin Peaks: Fire Walk with Me
28	3

# 3.3.1 Query Validation API

In [27]:
# Ask the validate API how the multi_match query is interpreted.
# usersSearch comes from the basic-search cell above.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # the user's search text
            'fields': ['title^10', 'overview']
        }
    }
}
httpResp = requests.get('http://localhost:9200' +
                        '/tmdb/movie/_validate/query?explain',
                        data=json.dumps(query), headers=headers)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(json.loads(httpResp.text))

{u'valid': True, u'explanations': [{u'index': u'tmdb', u'explanation': u'+((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens)) #*:*', u'valid': True}], u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}}


# 3.3.3 Debugging Analysis

In [36]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Explain of what's happening when we construct these terms

# Send a sample title through the _analyze endpoint, naming the `title`
# field, and print the tokens that come back (as YAML).
# (Presumably the field's mapping could also be inspected with
#  GET /tmdb/_mapping/movie/field/title?format=yaml -- not executed here.)
data = {"text": "Fire with Fire", "field": "title"}
print(json.dumps(data))

resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml',
                    data=json.dumps(data), headers=headers)
print(resp.text)

{"text": "Fire with Fire", "field": "title"}
---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.3.5 -- Solving The Matching Problem

In [38]:
# Map both searched fields to the 'english' analyzer, then rebuild the
# index from the already-loaded movieDict so the new mapping takes effect.
mappingSettings = {
    'movie': {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': 'english'
            },
            'overview': {
                'type': 'string',
                'analyzer': 'english'
            }
        }
    }
}
reindex(mappingSettings=mappingSettings, movieDict=movieDict)


building...
indexing...


In [41]:
# Repeat the _analyze request now that the index has been rebuilt with the
# new mappings; `data` is the payload defined in the earlier analysis cell.
resp = requests.get('http://localhost:9200/tmdb/_analyze?format=yaml',
                    data=json.dumps(data), headers=headers)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [77]:
# Same search as before the re-index, to compare the two result lists.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, # the user's search text
            'fields': ['title^10', 'overview'], # search both fields; ^10 boosts title matches
        },
    },
    'size': '100' # number of hits requested (sent as a string)
}
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0754712		Alien
2	1.0754712		Aliens
3	1.0754712		Alien³
4	1.032656		The Basketball Diaries
5	0.67216945		Cowboys & Aliens
6	0.67216945		Aliens in the Attic
7	0.67216945		Alien: Resurrection
8	0.5377356		AVP: Alien vs. Predator
9	0.5377356		Monsters vs Aliens
10	0.5377356		Aliens vs Predator: Requiem
11	0.08214001		Space Jam
12	0.024269354		Grown Ups
13	0.023364348		Speed Racer
14	0.023364348		The Flintstones
15	0.020802302		White Men Can't Jump
16	0.020802302		Coach Carter
17	0.018015321		Semi-Pro
18	0.016667001		The Thing
19	0.014142419		Meet Dave
20	0.0138682015		Teen Wolf
21	0.0138682015		High School Musical
22	0.0138682015		Bedazzled
23	0.013333602		Invasion of the Body Snatchers
24	0.013333602		Escape from Planet Earth
25	0.013333602		Slither
26	0.013333602		The Darkest Hour
27	0.01178535		District 9
28	0.011666901		Avatar
29	0.011666901		The Last Starfighter
30	0.010103833		The X Files
31	0.010000201		Scary Movie 3
32	0.010000201		The

# 3.4.1	Decomposing Relevance Score With Lucene’s Explain

In [81]:
# Re-run the current query with explain switched on so every hit carries
# an `_explanation` tree describing how its score was computed.
query['explain'] = True
# Send the same JSON Content-Type header as every other request in this
# notebook; it was missing from this call only.
httpResp = requests.get('http://localhost:9200/tmdb/movie/_search',
                        data=json.dumps(query), headers=headers)
jsonResp = json.loads(httpResp.text)
explainHits = jsonResp['hits']['hits']

# Raw explain JSON for the top hit, for reference.
print(json.dumps(explainHits[0]['_explanation'], indent=True))

# Condensed explains for the first four hits and for the hit at index 10.
# This needs at least 11 hits in the response.
for hitIndex in (0, 1, 2, 3, 10):
    print("Explain for %s" % explainHits[hitIndex]['_source']['title'])
    print(simplerExplain(explainHits[hitIndex]['_explanation']))


{
 "description": "sum of:", 
 "value": 1.0754712, 
 "details": [
  {
   "description": "max of:", 
   "value": 1.0754712, 
   "details": [
    {
     "description": "product of:", 
     "value": 1.0754712, 
     "details": [
      {
       "description": "sum of:", 
       "value": 3.2264135, 
       "details": [
        {
         "description": "weight(title:alien in 23) [PerFieldSimilarity], result of:", 
         "value": 3.2264135, 
         "details": [
          {
           "description": "score(doc=23,freq=1.0), product of:", 
           "value": 3.2264135, 
           "details": [
            {
             "description": "queryWeight, product of:", 
             "value": 0.48007536, 
             "details": [
              {
               "description": "idf(docFreq=9, maxDocs=3051)", 
               "value": 6.7206397, 
               "details": []
              }, 
              {
               "description": "queryNorm", 
               "value": 0.071432985, 
         

# 3.4.4	Fixing Space Jam vs Alien Ranking

In [82]:
# Re-weight the fields: title matches are now scaled by 0.1 instead of 10.
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  # the user's search text
            'fields': ['title^0.1', 'overview'],
        }
    },
    # Requests per-hit explanations, but search() prints only rank, score
    # and title, so they are not shown by the call below.
    'explain': True
}
# No 'size' is given here, unlike the earlier search cells.
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0016364		Space Jam
2	0.29594672		Grown Ups
3	0.28491083		Speed Racer
4	0.28491083		The Flintstones
5	0.2536686		White Men Can't Jump
6	0.2536686		Coach Carter
7	0.21968345		Semi-Pro
8	0.20324169		The Thing
9	0.1724563		Meet Dave
10	0.16911241		Teen Wolf
