<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Defining-Helper-Functions" data-toc-modified-id="Defining-Helper-Functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Defining Helper Functions</a></span></li><li><span><a href="#Multi-Search" data-toc-modified-id="Multi-Search-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Multi Search</a></span></li><li><span><a href="#Bigram" data-toc-modified-id="Bigram-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Bigram</a></span></li></ul></div>

In [1]:
import json
import requests

In [2]:
def extract():
    """
    Load the TMDB movie dump from ``tmdb.json`` in the current working directory.

    Returns
    -------
    movies : dict
        The decoded JSON document. The rest of the notebook uses it as a
        dictionary keyed by movie id (as a string), e.g. ``movies['93837']``.
    """
    # JSON text is UTF-8 by specification, being explicit here avoids
    # depending on the platform's default locale encoding
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)
    
    
movies = extract()

# Peek at a single record to get a sense of what the documents look like.
# Other ids that can be inspected the same way:
# '8193', '8195', '5', '8202', '11'
sample_movie_id = '93837'
movies[sample_movie_id]

{'poster_path': '/mfMndRWFbzXbTx0g3rHUXFAxyOh.jpg',
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'revenue': 0,
 'overview': 'When the FBI hires her to go undercover at a college sorority, Molly Morris (Miley Cyrus) must transform herself from a tough, streetwise private investigator to a refined, sophisticated university girl to help protect the daughter of a one-time Mobster. With several suspects on her list, Molly unexpectedly discovers that not everyone is who they appear to be, including herself.',
 'video': False,
 'id': 93837,
 'genres': [{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}],
 'title': 'So Undercover',
 'tagline': "Meet the FBI's new secret weapon",
 'vote_count': 55,
 'homepage': '',
 'belongs_to_collection': None,
 'original_language': 'en',
 'status': 'Released',
 'spoken_languages': [{'iso_639_1': 'en', 'name': 'English'}],
 'imdb_id': 'tt1766094',
 'adult': False,
 'backdrop_path': '/o4Tt60z94Hbgk8adeZG9WE4S

## Defining Helper Functions

In [3]:
class ElasticSearchUtils:
    """
    Small helper around the Elasticsearch REST API for the TMDB movie examples.

    All communication goes through ``requests`` and results are printed rather
    than returned, as the class is meant for interactive use.

    Parameters
    ----------
    index_name : str, default 'tmdb'
        Name of the index to (re)create and search.

    base_url : str, default 'http://localhost:9200'
        Root url of the Elasticsearch node.
    """

    def __init__(self, index_name='tmdb', base_url='http://localhost:9200'):
        self.base_url = base_url
        self.index_name = index_name
        self.index_url = self.base_url + '/' + self.index_name
        self.index_type_name = '_doc'
        self.index_type_url = self.index_url + '/' + self.index_type_name
        self.headers = {'Content-Type': 'application/json'}

    def reindex(self, movies, analysis_settings, mapping_settings=None):
        """
        Reindex takes analyzer and field mappings, recreates the index, and then reindexes
        TMDB movies using the _bulk index API. There are other ways for modifying the configuration
        of the index besides dropping and restarting, however for convenience and because our data
        isn't truly that large, we'll just delete and start from scratch when we need to.

        Parameters
        ----------
        movies : dict
            Mapping of movie id to the movie document that will be indexed.

        analysis_settings : dict
            Placed under the ``settings.analysis`` key of the index creation request.

        mapping_settings : dict, default None
            When given, placed under the ``mappings`` key of the index creation request.
        """
        response = requests.delete(self.index_url)
        print('deleted TMDB index: ', response.status_code)

        # create the index with explicit settings
        # We need to explicitly set number of shards to 1 to eliminate the impact of
        # distributed IDF on our small collection
        # See also 'Relevance is Broken!'
        # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
        settings = {
            'settings': {
                'index': {
                    'number_of_replicas': 1,
                    'number_of_shards': 1
                },
                'analysis': analysis_settings
            }
        }
        if mapping_settings is not None:
            settings['mappings'] = mapping_settings

        response = requests.put(self.index_url, data=json.dumps(settings), headers=self.headers)
        print('Created TMDB index: ', response.status_code)

        self._bulk_index(movies)

    def _bulk_index(self, movies):
        """
        Send every movie to the ``_bulk`` endpoint in a single request.

        Parameters
        ----------
        movies : dict
            Mapping of movie id to the movie document; the key is used as the document's ``_id``.
        """
        bulk_lines = []
        for movie_id, movie in movies.items():
            index_cmd = {
                'index': {
                    '_index': self.index_name,
                    '_type': self.index_type_name,
                    '_id': movie_id
                }
            }
            # each document takes two lines: the action metadata followed by the document itself
            bulk_lines.append(json.dumps(index_cmd))
            bulk_lines.append(json.dumps(movie))

        # the payload is newline delimited and every line, including the last one,
        # has to end with a newline. Joining once also avoids growing a string
        # piece by piece inside the loop
        bulk_index_cmd = ''.join(line + '\n' for line in bulk_lines)

        response = requests.post(self.base_url + '/_bulk',
                                 data=bulk_index_cmd,
                                 headers=self.headers)

        print('Bulk index into TMDB index:', response.status_code)

        # a bulk request can come back with a successful status code even though
        # individual documents were rejected, the `errors` flag of the response
        # body is what tells us about it
        try:
            has_item_errors = bool(json.loads(response.text).get('errors'))
        except (ValueError, AttributeError):
            # body wasn't a json object, nothing more we can report
            has_item_errors = False

        if has_item_errors:
            print('Bulk index reported item-level failures, inspect the bulk response for details')

    def search(self, query, verbose=False):
        """
        Run the query against the index and print one ranked line per hit
        (rank, score, title).

        Parameters
        ----------
        query : dict
            Request body for the ``_search`` endpoint.

        verbose : bool, default False
            When True, also print the directors, cast, characters and overview of
            each hit, plus the flattened score explanation if the hit carries one
            (i.e. the query was sent with ``'explain': True``).

        Returns
        -------
        None, everything is printed.
        """
        search_url = self.index_type_url + '/_search'
        response = requests.get(search_url, data=json.dumps(query), headers=self.headers)

        search_hits = json.loads(response.text)['hits']['hits']
        for rank, hit in enumerate(search_hits, start=1):
            source = hit['_source']
            print("%s\t%s\t%s" % (rank, hit['_score'], source.get('title')))

            if verbose:
                # not every document is guaranteed to carry all of these fields,
                # fall back to empty values instead of failing half way through
                # printing the result list
                cast_members = source.get('cast') or []
                directors = source.get('directors') or []
                cast_names = [member.get('name') for member in cast_members]
                cast_characters = [member.get('character') for member in cast_members]
                director_names = [director.get('name') for director in directors]

                print('director: ', director_names)
                print('cast: ', cast_names)
                print('character: ', cast_characters)
                print('overview:', source.get('overview', ''))
                if '_explanation' in hit:
                    result = ElasticSearchUtils.flatten_explain(hit['_explanation'])
                    print(result)

                print('=============================================')

    @staticmethod
    def flatten_explain(explain_json, depth=0):
        """
        Turn a nested score explanation into an indented, human readable string.

        Parameters
        ----------
        explain_json : dict
            Explanation node with a ``value`` and ``description`` key, and optionally
            a ``details`` key holding a list of child nodes of the same shape.

        depth : int, default 0
            Nesting level of the node, each level is indented by two spaces.

        Returns
        -------
        result : str
            One ``value, description`` line per node, parents before their children.
        """
        # getting rid of potential next line character to make things prettier
        description = explain_json['description'].replace('\n', '')
        result = ' ' * (depth * 2) + '%s, %s\n' % (explain_json['value'], description)
        for detail in explain_json.get('details', []):
            result += ElasticSearchUtils.flatten_explain(detail, depth=depth + 1)

        return result

In [4]:
# register the built-in `english` analyzer as the index's `default` analyzer
english_analyzer = {'type': 'english'}
analysis_settings = {
    'analyzer': {
        'default': english_analyzer
    }
}

# no explicit field mappings for this first pass
es_utils = ElasticSearchUtils()
es_utils.reindex(movies, analysis_settings)

deleted TMDB index:  200
Created TMDB index:  200
Bulk index into TMDB index: 200


In [5]:
# A first sample query to check that everything is wired up.
# multi_match is essentially lucene's MultiFieldQueryParser
# https://www.elastic.co/guide/en/elasticsearch/guide/master/multi-field-search.html
user_search = 'basketball with cartoon aliens'

# title carries a ^0.1 boost, overview is left un-boosted
multi_match_clause = {
    'query': user_search,
    'fields': ['title^0.1', 'overview']
}
query = {
    'query': {'multi_match': multi_match_clause},
    'size': 5
}
es_utils.search(query)

1	13.054171	Space Jam
2	7.1151	The Flintstones
3	6.8627505	White Men Can't Jump
4	5.907187	Meet Dave
5	5.859599	Aliens vs Predator: Requiem


## Multi Search

We will start with field-centric search, which runs the search query against each field, and combines the score into one final score after each field is searched in isolation.

For `multi_match`, there are two ways to combine the score from multiple fields:

- `best_fields`: Winner-takes-all search. This is suitable when there is a single field in the document that best matches the search query, e.g. in the context of movie search, it's saying this is a title search, and title search only.
- `most_fields`: Every field gets a share. All of the fields we declare count towards the final relevance score. This is saying the ideal search query would consist of the document's title, overview and cast members' names.

In [6]:
# background on how nested objects (e.g. cast.name) are indexed in elasticsearch
# https://www.elastic.co/blog/managing-relations-inside-elasticsearch

user_search = 'patrick stewart'

# best_fields query over the four fields listed below
searched_fields = ['title', 'overview', 'cast.name', 'directors.name']
query = {
    'query': {
        'multi_match': {
            'type': 'best_fields',
            'query': user_search,
            'fields': searched_fields
        }
    },
    'size': 3,
    'explain': True
}

# Reading the explanation of the top hit below: its score comes from the
# term 'patrick' matching in the overview field, which has nothing to do
# with the actor we were looking for. Since the query is tokenized into
# 'patrick' and 'stewart', a cast list that contains e.g. 'James Stewart'
# and 'Lee Patrick' can match in the same unintended way
es_utils.search(query, verbose=True)

1	7.5371456	One Flew Over the Cuckoo's Nest
director:  ['Miloš Forman']
cast:  ['Jack Nicholson', 'Louise Fletcher', 'Danny DeVito', 'William Redfield', 'Scatman Crothers', 'Brad Dourif', 'Christopher Lloyd', 'Will Sampson', 'Dean R. Brooks', 'Michael Berryman', 'Sydney Lassick', 'William Duell', 'Vincent Schiavelli']
character:  ['Randle Patrick McMurphy', 'Nurse Mildred Ratched', 'Martini', 'Harding', 'Orderly Turkle', 'Billy Bibbit', 'Taber', 'Chief Bromden', 'Dr. John Spivey', 'Ellis', 'Charlie Cheswick', 'Jim Sefelt', 'Fredrickson']
overview: While serving time for insanity at a state mental hospital, implacable rabble-rouser Randle Patrick McMurphy inspires his fellow patients to rebel against the authoritarian rule of head nurse Mildred Ratched.
7.537145, max of:
  7.537145, sum of:
    7.537145, weight(overview:patrick in 145) [PerFieldSimilarity], result of:
      7.537145, score(doc=145,freq=1.0 = termFreq=1.0), product of:
        6.569201, idf, computed as log(1 + (docCount

## Bigram

We can build a more precise signal by making sure the name goes together as a unit, i.e. instead of searching for cast.name:patrick cast.name:stewart, we search for cast.name:"patrick stewart".

In [7]:
# shingle token filter: emits combinations of adjacent tokens as a single token,
# configured here to produce two-word shingles only, without the single words
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-shingle-tokenfilter.html
bigram_filter = {
    'type': 'shingle',
    'max_shingle_size': 2,
    'min_shingle_size': 2,
    'output_unigrams': False
}
english_stemmer = {
    'type': 'stemmer',
    'name': 'english'
}

# custom analyzer chaining: standard tokenizer -> lowercase -> stemmer -> bigrams
english_bigram_analyzer = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': ['lowercase', 'english_stemmer', 'bigram_filter']
}

analysis_settings = {
    'filter': {
        'bigram_filter': bigram_filter,
        'english_stemmer': english_stemmer
    },
    'analyzer': {
        'english_bigram': english_bigram_analyzer
    }
}


def _bigrammed_name_mapping():
    """
    Mapping shared by cast.name and directors.name: the name itself is analyzed
    with the english analyzer and additionally indexed as a `bigrammed` sub-field.
    """
    return {
        'type': 'text',
        'analyzer': 'english',

        # fields: allows indexing the same field in multiple ways for different
        # purposes, the sub-field defined here becomes <parent>.name.bigrammed,
        # e.g. cast.name.bigrammed
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html
        'fields': {
            'bigrammed': {
                'type': 'text',
                'analyzer': 'english_bigram'
            }
        }
    }


mapping_settings = {
    '_doc': {
        'properties': {
            'cast': {
                'properties': {'name': _bigrammed_name_mapping()}
            },
            'directors': {
                'properties': {'name': _bigrammed_name_mapping()}
            }
        }
    }
}

es_utils = ElasticSearchUtils()
es_utils.reindex(movies, analysis_settings, mapping_settings)

deleted TMDB index:  200
Created TMDB index:  200
Bulk index into TMDB index: 200


In [8]:
# best_fields: the score of the single best matching field becomes the overall
# score of the document, rather than combining the scores of every field
# https://www.elastic.co/guide/en/elasticsearch/guide/master/_best_fields.html
user_search = 'patrick stewart'

# people's names are now searched through their bigrammed sub-fields
searched_fields = [
    'title',
    'overview^0.5',
    'cast.name.bigrammed',
    'directors.name.bigrammed'
]
multi_match_clause = {
    'type': 'best_fields',
    'query': user_search,
    'fields': searched_fields
}
query = {
    'query': {'multi_match': multi_match_clause},
    'size': 3,
    'explain': True
}

es_utils.search(query, verbose=True)

1	6.4320474	Star Trek: First Contact
director:  ['Jonathan Frakes']
cast:  ['James Cromwell', 'Alfre Woodard', 'Alice Krige', 'Neal McDonough', 'Patrick Stewart', 'Brent Spiner', 'LeVar Burton', 'Michael Dorn', 'Gates McFadden', 'Marina Sirtis', 'Jonathan Frakes', 'Dwight Schultz', 'Cameron Oppenheimer']
character:  ['Dr. Zefram Cochrane', 'Lily Sloane', 'Borg Queen', 'Lieutenant Hawk', 'Captain Jean-Luc Picard', 'Lt. Commander Data', 'Commander Geordi La Forge', 'Lieutenant Commander Worf', 'Commander Beverly Crusher', 'Ships Counselor Commander Deanna Troi', 'Commander William Riker', 'Lt. Barclay', 'Ensign Kellogg']
overview: The Borg, a relentless race of cyborgs, are on a direct course for Earth. Violating orders to stay away from the battle, Captain Picard and the crew of the newly-commissioned USS Enterprise E pursue the Borg back in time to prevent the invaders from changing Federation history and assimilating the galaxy.
6.432047, max of:
  6.432047, weight(cast.name.bigrammed

It is common for users to want to apply multiple criteria to their searches, expecting the search to account for all the information when searching. When searching using the `best_fields` approach, we can add the argument `tie_breaker`, which lets us add some scores from the matches that didn't have the highest score. When the `tie_breaker` starts to approach 1, then the formula will closely resemble the score when using `most_fields`.

In [17]:
# NOTE(review): 'treck' looks like a typo for 'trek'; left untouched so the
# saved output below still corresponds to the query that produced it
user_search = 'star treck patrick stewart'

# best_fields combined with a tie_breaker, the cast's bigrammed name is boosted by 5
searched_fields = [
    'title',
    'overview',
    'cast.name.bigrammed^5',
    'directors.name.bigrammed'
]
multi_match_clause = {
    'type': 'best_fields',
    'tie_breaker': 0.4,
    'query': user_search,
    'fields': searched_fields
}
query = {
    'query': {'multi_match': multi_match_clause},
    'size': 15,
    'explain': True
}

es_utils.search(query)

1	32.245945	Star Trek: Insurrection
2	31.265759	Star Trek: First Contact
3	30.545128	Star Trek: Generations
4	28.892412	Star Trek: Nemesis
5	28.853365	Excalibur
6	28.853365	Gnomeo & Juliet
7	26.936695	Conspiracy Theory
8	26.353165	Robin Hood: Men in Tights
9	25.794382	X-Men
10	25.794382	The Wolverine
11	25.258802	Dune
12	24.251707	X2: X-Men United
13	23.777687	TMNT
14	23.777687	The Prince of Egypt
15	22.883144	X-Men: The Last Stand


We can see that changing the `multi_match` type to `most_fields` gives us similar results compared to using `best_fields` with `tie_breaker`. When we are not sure which of the several signals at hand ought to take precedence, `most_fields` is a good place to start.

In [16]:
# most_fields reference
# https://www.elastic.co/guide/en/elasticsearch/guide/master/most-fields.html

# example of how a most_fields query translates into a boolean query
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#type-most-fields

# NOTE(review): 'treck' looks like a typo for 'trek'; left untouched so the
# saved output below still corresponds to the query that produced it
user_search = 'star treck patrick stewart'

# even with most_fields the per-field boosts still need tweaking: scores of
# different fields aren't really directly comparable, so without it the
# results can still end up lopsided
searched_fields = [
    'title',
    'overview',
    'cast.name.bigrammed^5',
    'directors.name.bigrammed'
]
multi_match_clause = {
    'type': 'most_fields',
    'query': user_search,
    'fields': searched_fields
}
query = {
    'query': {'multi_match': multi_match_clause},
    'size': 5,
    'explain': True
}

es_utils.search(query)

1	35.95778	Star Trek: Generations
2	35.17952	Star Trek: Insurrection
3	33.83288	Star Trek: First Contact
4	31.825989	Star Trek: Nemesis
5	28.853365	Excalibur


In [20]:
# same most_fields setup as before, this time with a second actor's name
# added to the search string
# NOTE(review): 'treck' looks like a typo for 'trek'; left untouched so the
# saved output below still corresponds to the query that produced it
user_search = 'star treck patrick stewart william shatner'

searched_fields = [
    'title',
    'overview',
    'cast.name.bigrammed^5',
    'directors.name.bigrammed'
]
multi_match_clause = {
    'type': 'most_fields',
    'query': user_search,
    'fields': searched_fields
}
query = {
    'query': {'multi_match': multi_match_clause},
    'size': 3,
    'explain': True
}

es_utils.search(query, verbose=True)

1	63.49815	Star Trek: Generations
director:  ['David Carson']
cast:  ['Patrick Stewart', 'Jonathan Frakes', 'Brent Spiner', 'LeVar Burton', 'Michael Dorn', 'Gates McFadden', 'Marina Sirtis', 'William Shatner', 'James Doohan', 'Walter Koenig', 'Malcolm McDowell', 'Alan Ruck', 'Whoopi Goldberg', 'Thomas Dekker', 'Cameron Oppenheimer', 'Jenette Goldstein', 'Tim Russ']
character:  ['Captain Jean-Luc Picard', 'Commander William T. Riker', 'Lt. Commander Data', 'Lt. Commander Geordi La Forge', 'Lt. Commander Worf', 'Dr. Beverly Crusher', 'Commander Deanna Troi', 'James T. Kirk', 'Montgomery Scott', 'Pavel Chekov', 'Dr. Tolian Soran', 'Capt. John Harriman', 'Guinan', "Picard's Kid", 'Ensign Kellogg', 'Science Officer', 'Lieutenant']
overview: Captain Jean-Luc Picard and the crew of the Enterprise-D find themselves at odds with the renegade scientist Soran who is destroying entire star systems. Only one man can help Picard stop Soran's scheme...and he's been dead for seventy-eight years.
63.49