# Setup Code (Listings 3 & 4) 5.2.1

In [22]:
import requests
import json
import os
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

# Module-level Elasticsearch client pointed at a local node on port 9200.
client = Elasticsearch("http://localhost:9200")

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single list.

    l: iterable of iterables.
    Returns a new list holding every inner item in order.
    """
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented text.

    Every node becomes one "value, description" line. Children found under
    'details' are rendered recursively, indented two more spaces per level.

    explainJson: dict with 'value' and 'description', optionally 'details'.
    depth:       nesting level of this node (0 for the root).
    Returns the rendered text, each line terminated by a newline.
    """
    indent = " " * (depth * 2)
    lines = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        lines.extend(simplerExplain(child, depth=depth + 1)
                     for child in explainJson['details'])
    return "".join(lines)


# To speed up the pace of development, we really need to focus more heavily on the analysis and query
# settings of the search engine, rather than fiddly bits of the HTTP interface.
#
# To that end, we're going to collapse some of the code you were introduced to in chapter 3 into more general functions,
# so we can reuse them. Largely, this is the exact same code you saw in chapter 3, with some more generality.

## Analyze
## The analyze function is a helper for accessing the _analyze endpoint like we did in chapter 3. Recall,
## given a field or analyzer, passing some text to _analyze will return the token stream that results from
## that analyzer. This token stream, if you recall, shows us exactly how the search engine translates text
## into individual tokens to be consumed by the underlying data structures. When we debug analysis, we can
## see which matches to expect.
def analyze(text, field=None, analyzer=None):
    """Run `text` through the tmdb index's analyze API and print the result.

    text:     the text to analyze.
    field:    optional field name whose analyzer should be applied.
    analyzer: optional analyzer name to apply.

    The response is printed as indented JSON; nothing is returned.
    """
    body = {"text": text}
    # Only send the options the caller actually supplied, so the request
    # never carries explicit null values for an unset field/analyzer.
    if field is not None:
        body["field"] = field
    if analyzer is not None:
        body["analyzer"] = analyzer
    resp = client.indices.analyze(index="tmdb", body=body)
    print(json.dumps(resp, indent=2))
    
## Search
## Next we need to wrap up our execution of query DSL queries. The function 'search' will execute the passed query DSL
## query and display the results. 
## If a scoring explain is associated with the results, then it also gets displayed,
## We'll also be sure to dump the query DSL
def search(query, verbose=False):
    """Run a query DSL body against the tmdb index and print the ranked hits.

    query:   query DSL dict sent as the search request body.
    verbose: when True, each hit is followed by its title, tagline,
             overview, id, director names, cast names and character names,
             plus the flattened scoring explanation if the hit carries one;
             after all hits, the validate_query response for `query` is
             printed as JSON.

    Nothing is returned; all output goes to stdout.
    """
    resp = client.search(index="tmdb", body=query)
    print("Num\tRelevance Score\t\tMovie Title")
    for idx, hit in enumerate(resp['hits']['hits']):
        source = hit['_source']
        print("%s\t%s\t\t%s" % (idx + 1, hit['_score'], source['title']))
        if not verbose:
            continue

        # The people lists are only displayed in verbose mode, so they are
        # only built here rather than for every hit.
        directorNames = [director['name'] for director in source['directors']]
        castNames = [cast['name'] for cast in source['cast']]
        castCharacters = [cast['character'] for cast in source['cast']]

        print("%s" % source['title'])
        print("%s" % source['tagline'])
        print("%s" % source['overview'])
        print("%s" % hit['_id'])
        print("DIRS %s" % directorNames)
        print("CAST %s" % castNames)
        print("CHAR %s" % castCharacters)
        # '_explanation' is only present when the query asked for it.
        if '_explanation' in hit:
            print("%s" % simplerExplain(hit['_explanation']))
            print("*************************************")

    if verbose:
        resp = client.indices.validate_query(index="tmdb", doc_type="_doc", body=query, rewrite=True)
        print("*************************************")
        print(json.dumps(resp, indent=2))

## Reindex
## Reindex takes analyzer and field mappings, recreates the index, and then reindexes
## TMDB movies one document at a time. There are other ways of modifying the configuration
## of the index besides dropping and restarting; however, for convenience and because our data
## isn't truly that large, we'll just delete and start from scratch when we need to.
def reindex(analysisSettings, mappingSettings=None, movieDict=None):
    """Drop and recreate the tmdb index, then index every movie in `movieDict`.

    analysisSettings: dict placed under settings.index.analysis of the new index.
    mappingSettings:  optional dict used as the index's 'mappings'.
    movieDict:        mapping whose values are movie dicts, each with an
                      'id' key; defaults to no movies.

    Progress messages are printed to stdout; nothing is returned.
    """
    if movieDict is None:
        movieDict = {}

    # Destroy any existing index (equiv to SQL "drop table"); 400/404
    # responses are ignored so a missing index is not an error.
    result = client.indices.delete(index='tmdb', ignore=[400, 404])
    print("Delete TMDB Index <%s>" % result)

    # Create the index with explicit settings.
    # We need to explicitly set the number of shards to 1 to eliminate the
    # impact of distributed IDF on our small collection.
    # See also "Relevance is Broken!"
    # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
    resp = client.indices.create(index="tmdb", body=settings)
    print("Create TMDB Index <%s>" % resp)

    # Index each movie as its own document, keyed by the movie's 'id'.
    print("Indexing %i movies" % len(movieDict))
    print("creating...")
    for movie in movieDict.values():
        client.create(index='tmdb', doc_type='_doc', id=movie["id"], body=json.dumps(movie))

    print("indexing...")
## Extract
## major difference between our use of TMDB here and in chapter 3: pulling more data. Not only do we access the 
## movie endpoint, we also extract the credits -- pulling in the cast (actors and such) and extracting the director.
def extract(movieIds=None, numMovies=10000):
    """Load the movie dictionary from the local ``tmdb.json`` cache file.

    movieIds:  optional list of ids; the cache file is only read when this
               is empty or None.
    numMovies: accepted for interface compatibility; not used here.

    Returns the parsed JSON when the cache file was read. Otherwise returns
    a module-level ``movieDict`` if one has already been defined, else an
    empty dict.
    """
    if not movieIds:
        try:
            # The context manager closes the file even if parsing fails.
            with open('tmdb.json') as f:
                return json.loads(f.read())
        except IOError:
            # No readable cache file; fall through to the fallback below.
            pass
    # Fall back to a previously loaded module-level movieDict when there is
    # one; a bare reference would raise NameError if it were not defined yet.
    return globals().get('movieDict', {})

# 5.2.2 -- Listing 4, Index to ES, Search

In [23]:
# Load the movies that the later reindex calls will use.
movieDict = extract()

# Analysis settings that make the built-in English analyzer the default.
analysis = {"analyzer": {"default": {"type": "english"}}}

# Uncomment to rebuild the index with the settings above.
# reindex(analysisSettings=analysis, movieDict=movieDict)

usersSearch = 'basketball with cartoon aliens'

# First pass: five hits with scoring explanations requested, summary rows only.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': ['title^0.1', 'overview'],
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

# Second pass: same match clause without size/explain, printed verbosely.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': ['title^0.1', 'overview'],
        },
    },
}
print("===============")
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	12.882349		Space Jam
2	7.5384703		Grown Ups
3	7.4996777		Speed Racer
4	7.244087		Semi-Pro
5	7.1626425		The Flintstones
Num	Relevance Score		Movie Title
1	12.882349		Space Jam
Space Jam
Get ready to jam.
Michael Jordan agrees to help the Looney Tunes play a basketball game against alien slavers to determine their freedom.
2300
DIRS [u'Joe Pytka']
CAST [u'Michael Jordan', u'Wayne Knight', u'Billy West', u'Dee Bradley Baker', u'Theresa Randle', u'Danny DeVito', u'Brandon Hammond', u'Larry Bird', u'Bill Murray', u'Charles Barkley', u'Patrick Ewing', u'Tyrone Bogues', u'Larry Johnson', u'Shawn Bradley', u'Ahmad Rashad', u'Del Harris', u'Vlade Divac', u'Cedric Ceballos', u'Jim Rome', u'Paul Westphal', u'Danny Ainge', u'Alonzo Mourning', u'A.C. Green', u'Charles Oakley', u'Derek Harper', u'Jeff Malone', u'Anthony Miller', u'Sharone Wright']
CHAR [u'Himself', u'Stan Podolak', u'Bugs Bunny/Elmer Fudd (voice)', u'Daffy Duck/Tazmanian Devil/Bull (voice)', u'Juan

# 5.2.4 -- Listing 5 Inspecting Nested Star Trek Docs

In [21]:
# Fetch one stored document straight over HTTP and dump its _source.
spaceJamId = 2300
httpResp = requests.get("http://localhost:9200/tmdb/_doc/{0}".format(spaceJamId))
spaceJamDoc = json.loads(httpResp.text)
print(json.dumps(spaceJamDoc['_source'], indent=True))

{
 "poster_path": "/9T9ucCk6wO0crRBUIkBJMRAVcKp.jpg", 
 "production_countries": [
  {
   "iso_3166_1": "US", 
   "name": "United States of America"
  }
 ], 
 "revenue": 230000000, 
 "overview": "Michael Jordan agrees to help the Looney Tunes play a basketball game against alien slavers to determine their freedom.", 
 "video": false, 
 "id": 2300, 
 "genres": [
  {
   "id": 16, 
   "name": "Animation"
  }, 
  {
   "id": 35, 
   "name": "Comedy"
  }, 
  {
   "id": 18, 
   "name": "Drama"
  }, 
  {
   "id": 14, 
   "name": "Fantasy"
  }, 
  {
   "id": 10751, 
   "name": "Family"
  }
 ], 
 "title": "Space Jam", 
 "tagline": "Get ready to jam.", 
 "vote_count": 275, 
 "homepage": "", 
 "belongs_to_collection": null, 
 "original_language": "en", 
 "status": "Released", 
 "spoken_languages": [
  {
   "iso_639_1": "cs", 
   "name": "\u010cesk\u00fd"
  }, 
  {
   "iso_639_1": "fr", 
   "name": "Fran\u00e7ais"
  }, 
  {
   "iso_639_1": "pl", 
   "name": "Polski"
  }, 
  {
   "iso_639_1": "en", 


# 5.3.1, Listing 6 Star Trek Query Using Query from Ch 3

In [56]:
usersSearch = 'patrick stewart'

# best_fields multi_match over title, overview, cast names and director names.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': ['title', 'overview', 'cast.name', 'directors.name'],
            'type': 'best_fields',
        },
    },
    'size': 50,
    'explain': True,
}

# Summary listing first, then the same query again with full detail.
search(query)
print("===============")
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
2	7.1596975		Vertigo
3	6.9732013		Star Trek: Insurrection
4	6.9093037		One Flew Over the Cuckoo's Nest
5	6.906671		Legion
6	6.906671		Halo 4: Forward Unto Dawn
7	6.906671		Priest
8	6.906671		Dark Skies
9	6.71099		Star Trek: First Contact
10	6.5468693		Gnomeo & Juliet
11	6.5468693		Excalibur
12	6.465148		X-Men: Days of Future Past
13	6.3152065		Panic Room
14	6.2415876		Conspiracy Theory
15	6.228061		The Bounty Hunter
16	6.0993795		Star Trek: Nemesis
17	6.0993795		Star Trek: Generations
18	6.0993795		Robin Hood: Men in Tights
19	6.030678		The Wolverine
20	6.009052		Drive Angry
21	6.009052		Feast
22	6.009052		District 13: Ultimatum
23	6.009052		The Expendables 3
24	6.009052		Underworld: Rise of the Lycans
25	6.009052		My Bloody Valentine
26	5.833557		X-Men
27	5.7091494		Dune
28	5.6086707		Ted
29	5.589937		Save the Last Dance
30	5.589937		TMNT
31	5.475602		X2: X-Men United
32	5.475602		The Prince of Egypt
33	5.443809		

# 5.3.2 -- Listing 7 -- Reducing the Impact of directors.name

In [57]:
usersSearch = 'patrick stewart'

# Same fields as before, but directors.name carries a 0.1 boost to reduce
# its impact on the score.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title',
                'overview',
                'cast.name',
                'directors.name^0.1',
            ],
        },
    },
}
search(query)


Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
2	7.1596975		Vertigo
3	6.9732013		Star Trek: Insurrection
4	6.9093037		One Flew Over the Cuckoo's Nest
5	6.71099		Star Trek: First Contact
6	6.5468693		Gnomeo & Juliet
7	6.5468693		Excalibur
8	6.465148		X-Men: Days of Future Past
9	6.3152065		Panic Room
10	6.2415876		Conspiracy Theory


# 5.3.3 -- Listings 8&9  – Analysis Extracting English Bigrams

In [47]:
# Analysis settings: English analyzer by default, plus a custom
# "english_bigrams" analyzer whose last filter is a two-word shingle filter.
analysisSettings = {
    "analyzer": {
        "default": {"type": "english"},
        "english_bigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        "bigram_filter": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false",
        },
    },
}


def _englishText():
    """Mapping for a text field analyzed with the 'english' analyzer."""
    return {'type': 'text', 'analyzer': 'english'}


def _nameWithBigrams():
    """Mapping for an object whose 'name' also has a 'bigramed' sub-field."""
    nameField = _englishText()
    nameField['fields'] = {
        "bigramed": {"type": "text", "analyzer": "english_bigrams"},
    }
    return {'properties': {"name": nameField}}


# From listing 9: title and overview are plain English text; cast and
# directors names are additionally indexed as bigrams under name.bigramed.
mappingSettings = {
    'properties': {
        'title': _englishText(),
        'overview': _englishText(),
        "cast": _nameWithBigrams(),
        "directors": _nameWithBigrams(),
    },
}

reindex(analysisSettings, mappingSettings, movieDict)

Delete TMDB Index <{u'acknowledged': True}>
Create TMDB Index <{u'index': u'tmdb', u'acknowledged': True, u'shards_acknowledged': True}>
Indexing 3051 movies
creating...
indexing...


# 5.3.3 -- Listing 10 -- Searching *.bigramed fields, reindexing

In [58]:
usersSearch = 'patrick stewart'

# Search the bigramed name sub-fields instead of the plain name fields.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
        },
    },
}
search(query)


Num	Relevance Score		Movie Title
1	7.1845765		Hannah Montana: The Movie
2	6.9093037		One Flew Over the Cuckoo's Nest
3	6.228061		The Bounty Hunter
4	6.058046		Star Trek: Insurrection
5	5.910869		Star Trek: First Contact
6	5.7706733		Gnomeo & Juliet
7	5.7706733		Excalibur
8	5.443809		The SpongeBob SquarePants Movie
9	5.443809		10 Things I Hate About You
10	5.3873396		Conspiracy Theory


# 5.3.4	Letting Losers Share The Glory (no listing number)

In [59]:
usersSearch = 'star trek patrick stewart'

# best_fields with a tie_breaker of 0.4 and a boost of 5 on the bigramed
# cast names.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed^5',
                'directors.name.bigramed',
            ],
            'type': 'best_fields',
            'tie_breaker': 0.4,
        },
    },
    'size': 5,
    'explain': True,
}

# Summary listing first, then the same query again with full detail.
search(query)
print("===============")
search(query, verbose=True)

Num	Relevance Score		Movie Title
1	33.893368		Star Trek: Insurrection
2	32.65286		Star Trek: First Contact
3	31.895023		Star Trek: Generations
4	30.539837		Star Trek: Nemesis
5	28.853365		Gnomeo & Juliet
Num	Relevance Score		Movie Title
1	33.893368		Star Trek: Insurrection
Star Trek: Insurrection
The battle for paradise has begun.
When an alien race and factions within Starfleet attempt to take over a planet that has "regenerative" properties, it falls upon Captain Picard and the crew of the Enterprise to defend the planet's people as well as the very ideals upon which the Federation itself was founded.
200
DIRS [u'Jonathan Frakes']
CAST [u'Patrick Stewart', u'Jonathan Frakes', u'Brent Spiner', u'LeVar Burton', u'Gates McFadden', u'Marina Sirtis', u'F. Murray Abraham', u'Anthony Zerbe', u'Donna Murphy', u'Gregg Henry', u'Michael Dorn']
CHAR [u'Captain Jean-Luc Picard', u'Commander William T. Riker', u'Lt. Commander Data', u'Lt. Commander Geordi La Forge', u'Doctor Beverly Crusher', u'C

# 5.3.5, Listing 11 Counting Multiple Signals using Most Fields 

In [60]:
usersSearch = 'star trek patrick stewart'

# Same four fields, now combined with the most_fields type.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
}
search(query)


Num	Relevance Score		Movie Title
1	17.783155		Star Trek: Generations
2	15.065897		Star Trek: Insurrection
3	14.39519		Star Trek: Nemesis
4	13.657165		Star Trek: First Contact
5	10.878148		Hannah Montana: The Movie
6	10.760254		Star Trek
7	9.007851		Star Trek Into Darkness
8	8.387256		Maps to the Stars
9	8.1140175		Dinosaur
10	8.019917		The Beaver


# 5.3.6, Listing 12	Boosting in Most-Fields

In [61]:
usersSearch = 'star trek patrick stewart'

# most_fields again, with the title field boosted by 0.2.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title^0.2',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
}
search(query)

Num	Relevance Score		Movie Title
1	10.878148		Hannah Montana: The Movie
2	10.576875		Star Trek: Generations
3	8.1140175		Dinosaur
4	8.019917		The Beaver
5	7.8596163		Star Trek: Insurrection
6	7.4601283		Star Trek: First Contact
7	7.1889095		Star Trek: Nemesis
8	6.94131		Ted
9	6.9093037		One Flew Over the Cuckoo's Nest
10	6.811893		The Lord of the Rings: The Two Towers


# 5.3.7	When Additional Matches Don’t Matter (no listing number)

In [62]:
usersSearch = 'star trek patrick stewart william shatner'

# most_fields over the same fields with a longer query naming two actors;
# top five hits with scoring explanations requested.
query = {
    'query': {
        'multi_match': {
            'query': usersSearch,
            'fields': [
                'title',
                'overview',
                'cast.name.bigramed',
                'directors.name.bigramed',
            ],
            'type': 'most_fields',
        },
    },
    'size': 5,
    'explain': True,
}
search(query)

Num	Relevance Score		Movie Title
1	23.29123		Star Trek: Generations
2	22.615887		Star Trek IV: The Voyage Home
3	20.892147		Star Trek V: The Final Frontier
4	18.406107		Star Trek: Nemesis
5	15.065897		Star Trek: Insurrection
