# Setup Code from Prev Chapters (run first)

In [21]:
import requests
import json
import os
from elasticsearch import (
    Elasticsearch,
    helpers
)
client = Elasticsearch()
Headers= {'Content-Type' : "application/json" }
# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable of iterables (e.g. a list of lists).

    Returns:
        A new list holding every item of every sub-iterable, in order.
    """
    # The comprehension must be returned; without the return the function
    # always evaluated to None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explainJson: dict with 'value' and 'description' keys and an
            optional 'details' list of dicts of the same shape.
        depth: nesting level of this node; each level indents two spaces.

    Returns:
        One newline-terminated line for this node followed by the lines of
        all nested details, as a single string.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    # Children are rendered one level deeper, in their original order.
    for child in explainJson.get('details', []):
        pieces.append(simplerExplain(child, depth=depth + 1))
    return "".join(pieces)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the HTTP interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3, with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we see
## which matches we should expect.

def analyze(text, field=None, analyzer=None):
    """Print the token stream the tmdb index's _analyze endpoint returns for ``text``.

    Args:
        text: the text to run through analysis.
        field: name of a mapped field whose analyzer should be used. Takes
            precedence over ``analyzer`` when both are given.
        analyzer: name of an analyzer to use when ``field`` is None.

    Returns:
        None. The YAML-formatted response body is printed.
    """
    # Parameters go in a JSON request body: the request is sent with a JSON
    # Content-Type, and Elasticsearch 6.0+ no longer accepts field/analyzer as
    # URL parameters or plain text as the body.
    body = {"text": text}
    if field is not None:
        body["field"] = field
    elif analyzer is not None:
        body["analyzer"] = analyzer
    # With neither field nor analyzer only the text is sent; the previous
    # query-string form produced a malformed "?&format=yaml" in that case.
    resp = requests.get("http://localhost:9200/tmdb/_analyze?format=yaml",
                        data=json.dumps(body), headers=Headers)
    print(resp.text)
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results.
## If a scoring explain is associated with the results, then it also gets displayed.
## We'll also be sure to dump the query DSL
def search(query, verbose=False):
    """Run a query DSL search against the tmdb index and print the hits.

    Args:
        query: dict holding the full search request body. When ``verbose``
            is set it must contain a 'query' key, which is sent to the
            validate endpoint.
        verbose: when True, also print each hit's title, tagline, overview,
            id, directors, cast, characters and (if present) its flattened
            scoring explanation, followed by the _validate/query?explain
            response for the query.

    Returns:
        None. Everything is printed.
    """
    url = 'http://localhost:9200/tmdb/_search'
    httpResp = requests.get(url, data=json.dumps(query), headers=Headers)
    if httpResp.status_code != 200:
        print("Search Failed <%s>" % httpResp.status_code)
        print("%s" % httpResp.text)
        # Stop here: a failed request is not expected to carry a 'hits'
        # section, so continuing would only raise a confusing KeyError.
        return

    searchHits = json.loads(httpResp.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(searchHits['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue

        # Cast/director details are only needed for the verbose dump, and a
        # document without those sections simply prints empty lists.
        cast = source.get('cast', [])
        castNames = [member['name'] for member in cast]
        castCharacters = [member['character'] for member in cast]
        directorNames = [director['name'] for director in source.get('directors', [])]

        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        # Ask Elasticsearch how it interprets the query itself.
        httpResp = requests.get('http://localhost:9200' +
                                '/tmdb/_validate/query?explain',
                                data=json.dumps({'query': query['query']}),
                                headers=Headers)
        print(json.loads(httpResp.text))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies using the _bulk index API. There are other ways for modifying the configuration
## of the index besides dropping and restarting, however for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict=None):
    """Drop and recreate the tmdb index, then bulk index the given movies.

    Args:
        analysisSettings: dict placed under settings.index.analysis of the
            new index.
        mappingSettings: optional dict used as the index 'mappings'.
        movieDict: mapping whose values are movie documents; each value is
            turned into a bulk action by ``format_doc``. Defaults to an
            empty mapping.

    Returns:
        None. Progress and any failed bulk items are printed.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if movieDict is None:
        movieDict = {}

    # Destroy any existing index (equiv to SQL "drop table")
    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete TMDB Index <%s>" % resp.status_code)

    # Create the index with explicit settings
    # We need to explicitly set number of shards to 1 to eliminate the impact of
    # distributed IDF on our small collection
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = requests.put("http://localhost:9200/tmdb", data=json.dumps(settings), headers=Headers)
    print("Create TMDB Index <%s>" % resp.status_code)
    if resp.status_code != 200:
        print(resp.text)
        # Do not index into an index that was not created with the requested
        # analysis and mappings; the results would be silently misleading.
        return

    # Bulk index the movie documents
    print("Indexing %i movies" % len(movieDict))
    actions = (format_doc(doc) for doc in movieDict.values())
    failures = [details
                for success, details in helpers.streaming_bulk(client, actions, chunk_size=5000)
                if not success]

    print("Bulk Index into TMDB Index <%s>" % failures)


## Extract
## The major difference between our use of TMDB here and in chapter 3 is that we pull more data. Not only do we access the
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=None, numMovies=10000):
    """Load the movie dictionary from the local 'tmdb.json' cache.

    Args:
        movieIds: optional list of movie ids. The cache file is only
            consulted when this is empty or omitted.
        numMovies: unused by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The parsed contents of 'tmdb.json' when ``movieIds`` is empty and
        the file can be opened. Otherwise the module-level ``movieDict`` if
        one has already been defined, else an empty dict.
    """
    if not movieIds:
        try:
            # The context manager closes the file even if parsing fails.
            with open('tmdb.json') as f:
                return json.load(f)
        except IOError:
            pass
    # Fall back to a previously loaded module-level movieDict, without raising
    # NameError when none has been defined yet.
    return globals().get('movieDict', {})

def format_doc(doc):
    """Wrap a movie document in a bulk-index action for the tmdb index.

    Args:
        doc: dict with at least an 'id' key.

    Returns:
        A dict with '_index', '_type', '_id' (taken from doc['id']) and
        '_source' (the document itself, not a copy).
    """
    return {
        "_index": "tmdb",
        "_type": "_doc",
        "_id": doc['id'],
        "_source": doc,
    }

# Index to ES, Chapter 5 Settings

In [16]:
# Load the movies, then rebuild the index with an English default analyzer and
# a bigram sub-field on cast and director names.
movieDict = extract([])

analysisSettings = {
    "analyzer": {
        # Analyzer registered under the name "default".
        "default": {
            "type": "english",
        },
        # Custom analyzer: standard tokenizer, then lowercase, Porter
        # stemming and the two-word shingle filter defined below.
        "english_bigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        # Shingles of exactly two tokens, with unigram output switched off.
        "bigram_filter": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false",
        },
    },
}

mappingSettings = {
    "properties": {
        # cast.name: English-analyzed text plus a "bigramed" sub-field.
        "cast": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        },
        # directors.name: same shape as cast.name.
        "directors": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        },
    },
}

reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <[]>


# 6.2.2, Listing 3 -- Most Fields undue promotion due to director AND cast member

In [19]:
# most_fields multi_match over the text fields plus the bigram name sub-fields
# of both cast and directors.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "multi_match": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
            "type": "most_fields",
        },
    },
    "size": 5,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	23.29123		Star Trek: Generations
2	22.615887		Star Trek IV: The Voyage Home
3	20.892147		Star Trek V: The Final Frontier
4	18.406107		Star Trek: Nemesis
5	15.065897		Star Trek: Insurrection


# 6.3.2, Listing 7 Query Parser

In [23]:
# Same fields as the most_fields listing, but sent through query_string.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "query_string": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
        },
    },
    "size": 5,
    "explain": True,
}
search(query)


Num	Relevance Score		Movie Title
1	14.751948		Hannah Montana: The Movie
2	10.895414		Star Trek: Generations
3	10.760254		Star Trek
4	10.057888		Star Trek IV: The Voyage Home
5	9.007851		Star Trek: Nemesis


# Listing 8 -- Searching fields that work in sync

In [24]:
# query_string over the plain name fields (cast.name / directors.name) rather
# than their "bigramed" sub-fields.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "query_string": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "cast.name",       # non-bigram name fields
                "directors.name",
            ],
        },
    },
    "size": 5,
    "explain": True,
}
search(query)


Num	Relevance Score		Movie Title
1	14.751948		Hannah Montana: The Movie
2	13.600754		Star Trek V: The Final Frontier
3	13.512724		Star Trek: Generations
4	10.760254		Star Trek
5	10.057888		Star Trek IV: The Voyage Home


# 6.3.5, Listing 9 -- Tuning Term-Centric Search

In [29]:
# Same query_string search as the previous listing, except that cast.name now
# carries a "^10" suffix in the field list.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "query_string": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "cast.name^10",    # the only field with a ^ suffix
                "directors.name",
            ],
        },
    },
    "size": 5,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	135.12723		Star Trek: Generations
2	97.44539		Showtime
3	94.6119		Osmosis Jones
4	92.86618		The Wild
5	86.62215		Miss Congeniality 2: Armed and Fabulous


# 6.4.1, Listings 10&11 Combining Fields into Custom All Fields

In [30]:
# Mapping with a custom "people.name" field: cast.name and directors.name are
# both declared with copy_to 'people.name'.
mappingSettings = {
    "properties": {
        # Because of a bug, you have to be very explicit
        # about analyzers for cross_field search
        "title": {
            "type": "text",
            "analyzer": "english",
        },
        "overview": {
            "type": "text",
            "analyzer": "english",
        },
        # Target of the copy_to settings on cast.name and directors.name.
        "people": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        },
        "cast": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "copy_to": "people.name",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        },
        "directors": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "copy_to": "people.name",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        },
    },
}

reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <200>
Create TMDB Index <200>
Indexing 3051 movies
Bulk Index into TMDB Index <[]>


# 6.4.1, Listing 12 -- Simple use of a custom all field

In [31]:
# Plain match query against the combined people.name field only.
usersSearch = 'patrick stewart william shatner'
query = {
    "query": {
        "match": {
            "people.name": usersSearch,  # the user's search string
        },
    },
    "size": 5,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	13.423487		Star Trek: Generations
2	10.511824		Star Trek V: The Final Frontier
3	9.807777		The Wild
4	9.551281		Showtime
5	9.049201		Osmosis Jones


# Listing 13 -- Searching _all

In [33]:
# Match query against the '_all' field.
# NOTE(review): mappingSettings in this file defines no '_all' field and the
# recorded output for this cell lists no hits -- confirm this listing still
# applies to the Elasticsearch version in use.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "match": {
            "_all": usersSearch,  # the user's search string
        },
    },
    "size": 5,
    "explain": True,
}
search(query)


Num	Relevance Score		Movie Title


# 6.4.2, Listing 14 -- Cross Field Search over useful fields

In [35]:
# cross_fields multi_match over the text fields and the plain name fields;
# asks for ten hits instead of five.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "multi_match": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "cast.name",
                "directors.name",
            ],
            "type": "cross_fields",
        },
    },
    "size": 10,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	21.658974		Star Trek: Generations
2	15.192904		Star Trek: Nemesis
3	14.535864		Star Trek: Insurrection
4	14.071022		Star Trek II: The Wrath of Khan
5	13.833348		Star Trek: The Motion Picture
6	13.661812		Star Trek V: The Final Frontier
7	13.471859		Star Trek IV: The Voyage Home
8	13.214497		Star Trek: First Contact
9	13.203346		Star Trek III: The Search for Spock
10	12.359765		Star Trek VI: The Undiscovered Country


# 6.5.1 -- Listing 15 -- Our Search combining term-centric all field (people.name) w/ other fields 

In [36]:
# most_fields multi_match that uses the combined people.name field alongside
# title and overview.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "multi_match": {
            "query": usersSearch,  # the user's search string
            "fields": [
                "title",
                "overview",
                "people.name",
            ],
            "type": "most_fields",
        },
    },
    "size": 5,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	25.819304		Star Trek: Generations
2	24.581379		Star Trek IV: The Voyage Home
3	19.045544		Star Trek: Nemesis
4	17.306519		Star Trek V: The Final Frontier
5	16.891613		Hannah Montana: The Movie


# 6.5.2, Listing 16 -- Searching two field groupings – people and text

In [37]:
# bool query with two "should" clauses, each a cross_fields multi_match over
# one group of fields: the bigram name fields, and the text fields.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "bool": {
            "should": [
                # Group 1: people, via the bigram name sub-fields.
                {
                    "multi_match": {
                        "query": usersSearch,  # the user's search string
                        "fields": [
                            "directors.name.bigramed",
                            "cast.name.bigramed",
                        ],
                        "type": "cross_fields",
                    },
                },
                # Group 2: descriptive text.
                {
                    "multi_match": {
                        "query": usersSearch,  # the user's search string
                        "fields": ["overview", "title"],
                        "type": "cross_fields",
                    },
                },
            ],
        },
    },
    "size": 5,
    "explain": True,
}
search(query)

Num	Relevance Score		Movie Title
1	21.525768		Star Trek IV: The Voyage Home
2	19.041664		Star Trek: Generations
3	16.960918		Star Trek: Nemesis
4	14.751948		Hannah Montana: The Movie
5	13.620708		Star Trek: Insurrection


# 6.5.3 Listing 17 Greedy Term-Centric Paired With Highly Discriminating Like Fields

In [38]:
# bool query with two "should" clauses: a cross_fields search over the bigram
# name fields, and a wider cross_fields search over the text fields together
# with the plain name fields.
usersSearch = 'star trek patrick stewart william shatner'
query = {
    "query": {
        "bool": {
            "should": [
                # Clause 1: bigram name sub-fields only.
                {
                    "multi_match": {
                        "query": usersSearch,  # the user's search string
                        "fields": [
                            "directors.name.bigramed",
                            "cast.name.bigramed",
                        ],
                        "type": "cross_fields",
                    },
                },
                # Clause 2: text fields plus the non-bigram name fields.
                {
                    "multi_match": {
                        "query": usersSearch,  # the user's search string
                        "fields": [
                            "overview",
                            "title",
                            "directors.name",
                            "cast.name",
                        ],
                        "type": "cross_fields",
                    },
                },
            ],
        },
    },
    "size": 5,
    "explain": True,
}
search(query)


Num	Relevance Score		Movie Title
1	32.55439		Star Trek: Generations
2	20.59391		Star Trek: Insurrection
3	20.580244		Star Trek: Nemesis
4	20.264833		Star Trek II: The Wrath of Khan
5	19.561811		Star Trek V: The Final Frontier
