<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Relevance-Score" data-toc-modified-id="Relevance-Score-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Relevance Score</a></span><ul class="toc-item"><li><span><a href="#Boolean-Query" data-toc-modified-id="Boolean-Query-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Boolean Query</a></span></li><li><span><a href="#Filtering" data-toc-modified-id="Filtering-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Filtering</a></span></li><li><span><a href="#Function-Query" data-toc-modified-id="Function-Query-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Function Query</a></span></li></ul></li></ul></div>

# Relevance Score

https://www.elastic.co/guide/en/elasticsearch/guide/master/controlling-relevance.html

In [1]:
import json
import requests

In [2]:
def extract(path='tmdb.json'):
    """
    Load the movie dump from a JSON file.

    Parameters
    ----------
    path : str, default 'tmdb.json'
        Location of the JSON file to read. The default keeps the original
        behaviour of reading 'tmdb.json' from the working directory.

    Returns
    -------
    The decoded JSON document (whatever top-level value the file holds).
    """
    # be explicit about the encoding so that non-ASCII text (e.g. titles,
    # names) is decoded the same way regardless of the platform's default
    with open(path, encoding='utf-8') as f:
        return json.load(f)
    
    
# load the movie data returned by extract() into memory
movies = extract()

# look up a single movie by its id to get a sense of what the data looks like;
# other sample ids worth trying: '93837', '8193', '8195', '5', '8202', '11'
movies['93837']

{'poster_path': '/mfMndRWFbzXbTx0g3rHUXFAxyOh.jpg',
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'revenue': 0,
 'overview': 'When the FBI hires her to go undercover at a college sorority, Molly Morris (Miley Cyrus) must transform herself from a tough, streetwise private investigator to a refined, sophisticated university girl to help protect the daughter of a one-time Mobster. With several suspects on her list, Molly unexpectedly discovers that not everyone is who they appear to be, including herself.',
 'video': False,
 'id': 93837,
 'genres': [{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}],
 'title': 'So Undercover',
 'tagline': "Meet the FBI's new secret weapon",
 'vote_count': 55,
 'homepage': '',
 'belongs_to_collection': None,
 'original_language': 'en',
 'status': 'Released',
 'spoken_languages': [{'iso_639_1': 'en', 'name': 'English'}],
 'imdb_id': 'tt1766094',
 'adult': False,
 'backdrop_path': '/o4Tt60z94Hbgk8adeZG9WE4S

In [3]:
class ElasticSearchUtils:
    """
    Thin convenience wrapper around the Elasticsearch REST API for a single index.

    It covers the operations this notebook needs: (re)creating the index,
    bulk-loading documents, running a search and validating a query.

    Parameters
    ----------
    index_name : str, default 'tmdb'
        Name of the index every request is issued against.

    base_url : str, default 'http://localhost:9200'
        Root url of the Elasticsearch node.
    """

    def __init__(self, index_name='tmdb', base_url='http://localhost:9200'):
        self.base_url = base_url
        self.index_name = index_name
        self.index_url = self.base_url + '/' + self.index_name
        self.index_type_name = '_doc'
        self.index_type_url = self.index_url + '/' + self.index_type_name
        self.headers = {'Content-Type': 'application/json'}

    def reindex(self, movies, analysis_settings, mapping_settings=None):
        """
        Reindex takes analyzer and field mappings, recreates the index, and then reindexes
        TMDB movies using the _bulk index API. There are other ways for modifying the configuration
        of the index besides dropping and restarting, however for convenience and because our data
        isn't truly that large, we'll just delete and start from scratch when we need to.

        Parameters
        ----------
        movies : dict
            Mapping of movie id to the movie document to index.

        analysis_settings : dict
            Stored under the 'analysis' key of the index settings.

        mapping_settings : dict, default None
            When given, sent as the 'mappings' of the new index.
        """
        # the status code is only printed, not checked: on the very first run
        # there may be no index to delete yet
        response = requests.delete(self.index_url)
        print('deleted TMDB index: ', response.status_code)

        # create the index with explicit settings
        # We need to explicitly set number of shards to 1 to eliminate the impact of
        # distributed IDF on our small collection
        # See also 'Relevance is Broken!'
        # http://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-is-broken.html
        settings = {
            'settings': {
                'index': {
                    'number_of_replicas': 1,
                    'number_of_shards': 1
                },
                'analysis': analysis_settings
            }
        }
        if mapping_settings is not None:
            settings['mappings'] = mapping_settings

        response = requests.put(self.index_url, data=json.dumps(settings), headers=self.headers)
        print('Created TMDB index: ', response.status_code)

        self._bulk_index(movies)

    def _bulk_index(self, movies):
        """
        Send every movie to the _bulk endpoint in a single request.

        The payload is newline-delimited JSON: one action line followed by
        one document line per movie.
        """
        if not movies:
            # an empty payload carries no action for the _bulk endpoint to perform
            print('Bulk index into TMDB index: skipped, no movies to index')
            return

        # collect the lines and join once instead of growing a string inside the loop
        bulk_lines = []
        for movie_id, movie in movies.items():
            index_cmd = {
                'index': {
                    '_index': self.index_name,
                    '_type': self.index_type_name,
                    '_id': movie_id
                }
            }
            bulk_lines.append(json.dumps(index_cmd))
            bulk_lines.append(json.dumps(movie))

        # every line, including the last one, has to be terminated by a newline
        bulk_index_cmd = '\n'.join(bulk_lines) + '\n'
        response = requests.post(self.base_url + '/_bulk',
                                 data=bulk_index_cmd,
                                 headers=self.headers)

        print('Bulk index into TMDB index:', response.status_code)

        # the bulk API reports per-document failures through the 'errors' flag
        # of the response body, the HTTP status alone does not reveal them
        if response.ok and response.json().get('errors'):
            print('Bulk index reported item-level errors, inspect the bulk response for details')

    def search(self, query, verbose=False):
        """
        Run the query against the index and print the ranked hits.

        Each hit is printed as rank, score and title separated by tabs.
        Nothing is returned.

        Parameters
        ----------
        query : dict
            Request body for the _search endpoint.

        verbose : bool, default False
            Additionally print directors, cast, characters, overview and,
            when the hit carries an '_explanation', the flattened explanation.

        Raises
        ------
        requests.HTTPError
            When Elasticsearch answers with an error status.
        """
        search_url = self.index_type_url + '/_search'
        response = requests.get(search_url, data=json.dumps(query), headers=self.headers)

        # surface the HTTP error instead of failing later with an opaque
        # KeyError because the error body has no 'hits' key
        response.raise_for_status()

        search_hits = response.json()['hits']['hits']
        for rank, hit in enumerate(search_hits, start=1):
            source = hit['_source']
            print("%s\t%s\t%s" % (rank, hit['_score'], source['title']))

            if verbose:
                # not every document is guaranteed to carry these fields
                cast = source.get('cast') or []
                directors = source.get('directors') or []
                cast_names = [member.get('name') for member in cast]
                cast_characters = [member.get('character') for member in cast]
                director_names = [director.get('name') for director in directors]

                print('director: ', director_names)
                print('cast: ', cast_names)
                print('character: ', cast_characters)
                print('overview:', source.get('overview', ''))
                if '_explanation' in hit:
                    result = ElasticSearchUtils.flatten_explain(hit['_explanation'])
                    print(result)

                print('=============================================')

    @staticmethod
    def flatten_explain(explain_json, depth=0):
        """
        Render a nested explanation as indented text, one line per node.

        Parameters
        ----------
        explain_json : dict
            Node with 'value' and 'description' keys and optionally a
            'details' list holding child nodes of the same shape.

        depth : int, default 0
            Nesting level of the node; every level indents by two spaces.

        Returns
        -------
        str
            '<value>, <description>' lines, each terminated by a newline.
        """
        # getting rid of potential next line character to make things prettier
        description = explain_json['description'].replace('\n', '')
        result = ' ' * (depth * 2) + '%s, %s\n' % (explain_json['value'], description)
        if 'details' in explain_json:
            for detail in explain_json['details']:
                result += ElasticSearchUtils.flatten_explain(detail, depth=depth + 1)

        return result

    def validate(self, query):
        """
        Send the query to the _validate/query endpoint with the explain flag
        and return the decoded JSON response.
        """
        url = self.index_type_url + '/_validate/query?explain'
        response = requests.get(url, data=json.dumps(query), headers=self.headers)
        return response.json()

In [4]:
# rebuild the index from chapter 5: an 'english_bigram' analyzer that lowercases,
# stems and then emits two-word shingles only, applied as a 'bigrammed' sub-field
# next to the regular english-analyzed name of cast members and directors
def _name_field_mapping():
    """Mapping of a person's name: english text plus a 'bigrammed' sub-field."""
    return {
        'type': 'text',
        'analyzer': 'english',
        'fields': {
            'bigrammed': {'type': 'text', 'analyzer': 'english_bigram'}
        }
    }


# shingles of exactly two tokens, the single tokens themselves are dropped
bigram_filter = {
    'type': 'shingle',
    'max_shingle_size': 2,
    'min_shingle_size': 2,
    'output_unigrams': False
}
english_stemmer = {'type': 'stemmer', 'name': 'english'}
english_bigram_analyzer = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': ['lowercase', 'english_stemmer', 'bigram_filter']
}

analysis_settings = {
    'filter': {
        'bigram_filter': bigram_filter,
        'english_stemmer': english_stemmer
    },
    'analyzer': {'english_bigram': english_bigram_analyzer}
}

# 'cast' and 'directors' share the exact same mapping for their 'name' property
mapping_settings = {
    '_doc': {
        'properties': {
            person_field: {'properties': {'name': _name_field_mapping()}}
            for person_field in ('cast', 'directors')
        }
    }
}

es_utils = ElasticSearchUtils()
es_utils.reindex(movies, analysis_settings, mapping_settings)

deleted TMDB index:  200
Created TMDB index:  200
Bulk index into TMDB index: 200


## Boolean Query

In [5]:
user_search = 'william shatner patrick stewart'

# first clause: the user's free text searched across several fields at once
free_text_clause = {
    'multi_match': {
        'query': user_search,
        'type': 'cross_fields',
        'fields': ['overview', 'title', 'directors.name', 'cast.name']
    }
}

# second clause: a phrase match on the title field.
# A question worth asking ourselves is whether a yes/no signal like this one
# needs the tf-idf relevance scheme at all, or whether plain filtering would do
# https://www.elastic.co/guide/en/elasticsearch/guide/master/phrase-matching.html
title_phrase_clause = {
    'match_phrase': {'title': 'star trek'}
}

query = {
    'query': {
        'bool': {'should': [free_text_clause, title_phrase_clause]}
    }
}
es_utils.search(query)

1	21.624168	Star Trek: Generations
2	15.678394	Star Trek: Insurrection
3	14.812566	Star Trek: Nemesis
4	14.366369	Star Trek: First Contact
5	14.048485	Star Trek IV: The Voyage Home
6	13.350748	Star Trek: The Motion Picture
7	13.245766	Star Trek V: The Final Frontier
8	13.065597	Star Trek II: The Wrath of Khan
9	12.303821	Star Trek III: The Search for Spock
10	12.09503	Star Trek VI: The Undiscovered Country


In [6]:
user_search = 'william shatner patrick stewart'

# the user's free text searched across several fields at once
free_text_clause = {
    'multi_match': {
        'query': user_search,
        'type': 'cross_fields',
        'fields': ['overview', 'title', 'directors.name', 'cast.name']
    }
}

# same phrase match as in the previous query, but with a boost
# below 1 so that the title carries less weight in the final score
title_phrase_clause = {
    'match_phrase': {
        'title': {'query': 'star trek', 'boost': 0.5}
    }
}

query = {
    'query': {
        'bool': {'should': [free_text_clause, title_phrase_clause]}
    }
}
es_utils.search(query)

1	17.380772	Star Trek: Generations
2	11.434998	Star Trek: Insurrection
3	11.070782	Star Trek IV: The Voyage Home
4	10.649588	Star Trek: First Contact
5	10.569169	Star Trek: Nemesis
6	10.357176	Star Trek II: The Wrath of Khan
7	10.268063	Star Trek V: The Final Frontier
8	10.044303	Star Trek: The Motion Picture
9	9.5954	Star Trek III: The Search for Spock
10	9.117327	Star Trek VI: The Undiscovered Country


## Filtering

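One good recipe when shaping relevance is excluding search results. By excluding irrelevant results that the user doesn't wish to see, we avoid corner-cases in our relevance. In the query below, the phrase match on the title moves from a scoring `should` clause into the `filter` of the bool query: it now only decides which documents are eligible and no longer contributes to the score.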

In [11]:
user_search = 'william shatner patrick stewart'

# the scoring part: the user's free text searched across several fields
free_text_clause = {
    'multi_match': {
        'query': user_search,
        'type': 'cross_fields',
        'fields': ['overview', 'title', 'directors.name', 'cast.name']
    }
}

# the filtering part: only documents whose title contains the phrase are kept
title_filter = {
    'match_phrase': {'title': 'star trek'}
}

# the filter sits under the bool query, next to the should clauses
# https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-filtered-query.html
query = {
    'query': {
        'bool': {
            'should': [free_text_clause],
            'filter': title_filter
        }
    }
}
es_utils.search(query)

1	13.512723	Star Trek: Generations
2	9.731844	Star Trek IV: The Voyage Home
3	8.366446	Star Trek II: The Wrath of Khan
4	7.9572344	Star Trek V: The Final Frontier
5	7.4987698	Star Trek III: The Search for Spock
6	7.3298416	Star Trek: The Motion Picture
7	6.9732013	Star Trek: Insurrection
8	6.7109895	Star Trek: First Contact
9	6.655188	Star Trek VI: The Undiscovered Country
10	6.0993795	Star Trek: Nemesis


## Function Query

With function queries, we can directly define our ranking function based on a combination of quantitative factors.

In [24]:
# boosting by popularity:
# a field such as vote_average (the users' rating) measures the value of the
# content itself, so we can feed it into the score through 'field_value_factor'
# https://www.elastic.co/guide/en/elasticsearch/guide/master/boosting-by-popularity.html
user_search = 'william shatner patrick stewart'

# the main query, it is executed first
main_query = {
    'multi_match': {
        'query': user_search,
        'type': 'cross_fields',
        'fields': ['overview', 'title', 'directors.name', 'cast.name']
    }
}

# its score is then boosted by the field named here. Specifying a modifier
# matters, as the influence of this kind of factor shouldn't be linear:
# a 10-star movie probably isn't twice as important to the user as a 5-star one
popularity_factor = {
    'field': 'vote_average',
    'modifier': 'sqrt',
    'missing': 3.5
}

query = {
    'query': {
        'function_score': {
            'query': main_query,
            'field_value_factor': popularity_factor
        }
    }
}
es_utils.search(query)

1	34.45082	Star Trek: Generations
2	25.190252	Star Trek IV: The Voyage Home
3	22.433615	Showtime
4	22.293085	Star Trek II: The Wrath of Khan
5	21.985819	Osmosis Jones
6	20.765507	The Wild
7	20.641468	Conspiracy Theory
8	20.208668	One Flew Over the Cuckoo's Nest
9	19.95512	Bill & Ted's Bogus Journey
10	19.941917	Miss Congeniality 2: Armed and Fabulous


In [25]:
# the closer, the better
# https://www.elastic.co/guide/en/elasticsearch/guide/master/decay-functions.html
user_search = 'william shatner patrick stewart'

# the main query, it is executed first
main_query = {
    'multi_match': {
        'query': user_search,
        'type': 'cross_fields',
        'fields': ['overview', 'title', 'directors.name', 'cast.name']
    }
}

# films released 900 days in the past (controlled by scale) are half as
# valuable (controlled by decay); origin controls the anchoring point
recency_decay = {
    'gauss': {
        'release_date': {
            'origin': 'now',
            'scale': '900d',
            'decay': 0.5
        }
    }
}

query = {
    'query': {
        'function_score': {
            'query': main_query,
            # the functions key holds the list of scoring functions to apply
            'functions': [recency_decay]
        }
    }
}
es_utils.search(query)

1	0.9109527	The SpongeBob Movie: Sponge Out of Water
2	0.72538525	Interstellar
3	0.7072919	Night at the Museum: Secret of the Tomb
4	0.6877321	Still Alice
5	0.6397457	Kill the Messenger
6	0.63583016	X-Men: Days of Future Past
7	0.55854934	Frozen Fever
8	0.51337415	Gone Girl
9	0.4167139	Mommy
10	0.41290846	Inherent Vice
