# Boilerplate Setup

In [22]:
import requests
import json
import logging
import numpy as np
from itertools import islice
from elasticsearch import Elasticsearch

# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable of iterables (e.g. a list of lists).

    Returns:
        A new list holding every item of every sub-iterable, in order.
    """
    # The comprehension must be returned; without `return` the function
    # silently produced None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented text, one node per line.

    Args:
        explainJson: dict with 'value' and 'description' keys and an
            optional 'details' list of child nodes of the same shape.
        depth: nesting level of this node; each level indents two spaces.

    Returns:
        A string with one "<value>, <description>" line for this node,
        followed by the lines of all of its descendants.
    """
    indent = " " * (depth * 2)
    lines = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    # Children are rendered depth-first, one indentation level deeper.
    for child in explainJson.get('details', ()):
        lines.append(simplerExplain(child, depth=depth + 1))
    return "".join(lines)

# 3.2.2 Indexing TMDB Movies

In [2]:
def extract():
    """Load the movie dump from ``tmdb.json`` in the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        IOError/OSError: if ``tmdb.json`` cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    # `open` either succeeds or raises, so the former `if f:` guard (and its
    # `return {}` fallback) could never trigger. The context manager makes
    # sure the handle is closed once parsing is done.
    with open('tmdb.json') as f:
        return json.load(f)

In [20]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Drop and recreate the ``tmdb`` index, then bulk-index every movie.

    Args:
        analysisSettings: placed under ``settings.index.analysis`` of the
            index-creation body.
        mappingSettings: if non-empty, sent as the ``mappings`` section of
            the index-creation body.
        movieDict: mapping whose values are movie documents; each document
            must carry an ``id`` key, which becomes the document ``_id``.

    Returns:
        None. Error responses from index creation or the bulk request are
        logged as warnings rather than raised.
    """
    # NOTE: the dict defaults are shared between calls; they are only read
    # here, never mutated.
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings

    # Same scheme/host as every other request in this notebook (plain http).
    esUrl = "http://localhost:9200"

    # Best effort: the index may not exist yet, so the result is ignored.
    requests.delete(esUrl + "/tmdb")
    resp = requests.put(esUrl + "/tmdb", data=json.dumps(settings))
    if resp.status_code >= 400:
        logging.warning("index creation failed (%s): %s", resp.status_code, resp.text)

    logging.info("building...")
    # The bulk body is newline-delimited JSON: an action line followed by the
    # document itself, each terminated by "\n". `.values()` works on both
    # Python 2 and 3, unlike `.iteritems()`.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
    bulkMovies = "".join(line + "\n" for line in bulkLines)

    logging.info("indexing...")
    resp = requests.post(esUrl + "/_bulk", data=bulkMovies)
    if resp.status_code >= 400:
        logging.warning("bulk indexing failed (%s): %s", resp.status_code, resp.text)


In [15]:
# Load the movie dump into memory; a later cell passes it to reindex().
movieDict = extract()

In [16]:
# Show only the first two (key, movie) pairs as a sanity check of the load.
print(list(islice(movieDict.items(), 2)))

[('93837', {'poster_path': '/mfMndRWFbzXbTx0g3rHUXFAxyOh.jpg', 'production_countries': [{'iso_3166_1': 'US', 'name': 'United States of America'}], 'revenue': 0, 'overview': 'When the FBI hires her to go undercover at a college sorority, Molly Morris (Miley Cyrus) must transform herself from a tough, streetwise private investigator to a refined, sophisticated university girl to help protect the daughter of a one-time Mobster. With several suspects on her list, Molly unexpectedly discovers that not everyone is who they appear to be, including herself.', 'video': False, 'id': 93837, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}], 'title': 'So Undercover', 'tagline': "Meet the FBI's new secret weapon", 'vote_count': 55, 'homepage': '', 'belongs_to_collection': None, 'original_language': 'en', 'status': 'Released', 'spoken_languages': [{'iso_639_1': 'en', 'name': 'English'}], 'imdb_id': 'tt1766094', 'adult': False, 'backdrop_path': '/o4Tt60z94Hbgk8adeZG9WE4S2im.jpg',

In [23]:
# Client object for the local Elasticsearch node.
# NOTE(review): none of the cells visible here use `es`; they all talk to the
# REST API through `requests` -- confirm whether this client is still needed.
es = Elasticsearch("http://localhost:9200/")


# 3.2.3 Basic Searching

In [None]:
def search(query):
    """Run `query` against the tmdb movie index and print a ranked table.

    Args:
        query: search request body as a dict; it is JSON-encoded and sent
            as the body of a GET to the ``_search`` endpoint.

    Prints a header line followed by one line per hit containing the
    1-based rank, the hit's ``_score`` and the movie title.
    """
    url = 'http://localhost:9200/tmdb/movie/_search'
    httpResp = requests.get(url, data=json.dumps(query))
    searchHits = json.loads(httpResp.text)['hits']
    # print() with a single pre-formatted string behaves the same on
    # Python 2 and Python 3; the old print statements were a SyntaxError
    # on Python 3.
    # NOTE(review): the header names an "Overview" column that the rows
    # below never print.
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview")
    for idx, hit in enumerate(searchHits['hits']):
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], hit['_source']['title']))


In [None]:
# First attempt: search title and overview for the user's text.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, # the user's free-text search string
            'fields': ['title^10', 'overview'], # fields to search; '^10' boosts title matches
        },
    },
    'size': '100' # request up to 100 hits (given as a string here)
}
search(query)


# 3.3.1 Query Validation API

In [None]:
# Ask the _validate/query endpoint (with ?explain) to report on the query,
# then print the decoded JSON response.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,  # User's query
            'fields': ['title^10', 'overview']
        }
    }
}
httpResp = requests.get('http://localhost:9200' +
                        '/tmdb/movie/_validate/query?explain',
                        data=json.dumps(query))
# print() call form works on both Python 2 and Python 3.
print(json.loads(httpResp.text))

# 3.3.3 Debugging Analysis

In [None]:
# Inner layer of the onion -- why did the search engine consider these movies
# matches? There are two sides to this:
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Run the text "Fire with Fire" through the analyzer configured for the
# `title` field and print the response (requested in YAML format).
resp = requests.get('http://localhost:9200/tmdb/_analyze?field=title&format=yaml',
                    data="Fire with Fire")
# print() call form works on both Python 2 and Python 3.
print(resp.text)

# 3.3.5 -- Solving The Matching Problem

In [None]:
# Map both text fields of the `movie` type to the 'english' analyzer, then
# rebuild the index (reindex deletes and recreates it) with this mapping.
mappingSettings = {
    'movie': {
        'properties': {
            'title': {'type': 'string', 'analyzer': 'english'},
            'overview': {'type': 'string', 'analyzer': 'english'},
        },
    },
}
reindex(mappingSettings=mappingSettings, movieDict=movieDict)


In [None]:
# Re-run the same analysis request after reindexing so the output can be
# compared with the earlier run.
resp = requests.get('http://localhost:9200/tmdb/_analyze?field=title&format=yaml',
                    data="Fire with Fire")
# print() call form works on both Python 2 and Python 3.
print(resp.text)

## Repeat the search

In [None]:
# Same query as the first search, issued again now that the index has been
# rebuilt with the new mapping.
usersSearch = 'basketball with cartoon aliens'
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, # the user's free-text search string
            'fields': ['title^10', 'overview'], # fields to search; '^10' boosts title matches
        },
    },
    'size': '100' # request up to 100 hits (given as a string here)
}
search(query)


# 3.4.1 Decomposing Relevance Score With Lucene’s Explain

In [None]:
# Re-issue the current query with explain enabled so that every hit carries
# an `_explanation` tree describing how its score was computed.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/movie/_search', data=json.dumps(query))
jsonResp = json.loads(httpResp.text)
explainHits = jsonResp['hits']['hits']

if explainHits:
    # Raw explanation of the top hit, for comparison with the flattened form.
    print(json.dumps(explainHits[0]['_explanation'], indent=True))

# Flattened explanations for the first four hits and the 11th one. Ranks
# beyond the end of the result list are skipped instead of raising
# IndexError.
for rank in (0, 1, 2, 3, 10):
    if rank >= len(explainHits):
        break
    print("Explain for %s" % explainHits[rank]['_source']['title'])
    print(simplerExplain(explainHits[rank]['_explanation']))


# 3.4.4 Fixing Space Jam vs Alien Ranking

In [None]:
# Same multi_match search, but with the title boost lowered from 10 to 0.1.
# NOTE(review): 'explain' is requested, but search() prints only the rank,
# score and title of each hit, so the explanations are not displayed.
query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  #User's query
            'fields': ['title^0.1', 'overview'],
        }
    },
    'explain': True
}
search(query)
