How to set up Elasticsearch (single node, via Docker):
- docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.14.0

In [2]:
import json
import requests

In [3]:
def extract(json_path: str):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # garbles or rejects non-ASCII text on systems where it is not UTF-8.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
def reindex(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Recreate the ``tmdb`` index and bulk-load movies into it.

    The existing index is deleted, a new one is created with one shard and no
    replicas (plus any analysis/mapping settings supplied), and every movie in
    ``movieDict`` is sent in a single ``_bulk`` request.

    Args:
        analysisSettings: Optional analysis configuration; placed under
            ``settings.index.analysis`` of the create-index body.
        mappingSettings: Optional mapping definition; placed under
            ``mappings`` of the create-index body.
        movieDict: Mapping whose values are movie documents. Each document
            must contain an ``"id"`` key, which is used as the document
            ``_id``.

    Returns:
        The ``requests`` response of the ``_bulk`` call, or the response of
        the create-index call when there are no movies to index.

    Raises:
        requests.HTTPError: If creating the index fails.
    """
    index_url = "http://localhost:9200/tmdb"

    settings = {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            }
        }
    }

    if analysisSettings:
        # Analysis configuration is an index *setting*; as a top-level key of
        # the create-index body it is not a recognised section.
        settings["settings"]["index"]["analysis"] = analysisSettings

    if mappingSettings:
        settings["mappings"] = mappingSettings

    # Best effort: the index may not exist yet, so the outcome of the delete
    # is deliberately not checked.
    requests.delete(index_url)

    put_headers = {'Content-Type': 'application/json'}
    resp = requests.put(index_url,
                        data=json.dumps(settings), headers=put_headers)
    # Stop here if the index could not be created; otherwise the documents
    # would be loaded without the requested settings/mappings.
    resp.raise_for_status()

    if not movieDict:
        # Nothing to load: skip the bulk call rather than post an empty body.
        return resp

    # Newline-delimited JSON: one action line followed by one document line
    # per movie, with a trailing newline after the last line.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
    bulkMovies = "\n".join(bulkLines) + "\n"

    headers = {'Content-Type': 'application/x-ndjson'}
    resp = requests.post("http://localhost:9200/_bulk", headers=headers, data=bulkMovies)
    return resp


In [5]:
# Load the TMDB movie dump; later cells treat the result as a dict of movie documents.
movieDict = extract("./tmdb.json")

In [6]:
# Peek at one entry of the movie dictionary (the first key in iteration
# order) to see what a movie document looks like.
some_key = list(movieDict)[0]
print(movieDict[some_key])

{'poster_path': '/mfMndRWFbzXbTx0g3rHUXFAxyOh.jpg', 'production_countries': [{'iso_3166_1': 'US', 'name': 'United States of America'}], 'revenue': 0, 'overview': 'When the FBI hires her to go undercover at a college sorority, Molly Morris (Miley Cyrus) must transform herself from a tough, streetwise private investigator to a refined, sophisticated university girl to help protect the daughter of a one-time Mobster. With several suspects on her list, Molly unexpectedly discovers that not everyone is who they appear to be, including herself.', 'video': False, 'id': 93837, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}], 'title': 'So Undercover', 'tagline': "Meet the FBI's new secret weapon", 'vote_count': 55, 'homepage': '', 'belongs_to_collection': None, 'original_language': 'en', 'status': 'Released', 'spoken_languages': [{'iso_639_1': 'en', 'name': 'English'}], 'imdb_id': 'tt1766094', 'adult': False, 'backdrop_path': '/o4Tt60z94Hbgk8adeZG9WE4S2im.jpg', 'productio

In [7]:
#
# index the entire movie dictionary with the default settings
# (no custom analysis or mappings); reindex() deletes and recreates
# the `tmdb` index before loading the documents
#
reindex(movieDict=movieDict)

<Response [200]>

In [8]:
#
# function to query a movie, and return the top 10 query matches
#
def search(query):
    """Run ``query`` against the ``tmdb`` index and print the ranked hits.

    Prints a "Relevance Score" header followed by one line per returned hit
    showing its 1-based rank, its ``_score`` and its ``_source.title``.

    Args:
        query: Elasticsearch request body (a JSON-serialisable dict) sent to
            ``/tmdb/_search``.

    Raises:
        requests.HTTPError: If Elasticsearch answers with an error status.
    """
    url = "http://localhost:9200/tmdb/_search"
    headers = {'Content-Type': 'application/json'}
    httpResp = requests.get(url, data=json.dumps(query), headers=headers)
    # Fail with the HTTP error itself; an error body has no 'hits' key and
    # would otherwise surface below as an unhelpful KeyError.
    httpResp.raise_for_status()
    searchHits = json.loads(httpResp.text)['hits']
    print("Relevance Score")
    for rank, hit in enumerate(searchHits['hits'], start=1):
        relev_score = hit['_score']
        movie_title = hit['_source']['title']
        print(f"Num:{rank}\tRelevance Score:{relev_score}\tMovie Title:{movie_title}")

In [9]:
# First attempt: search for "basketball with cartoon aliens" across the title
# and overview fields, with the title field boosted by a factor of 10.
usersSearch = 'basketball with cartoon aliens'
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch,
            fields=["title^10", "overview"],
        )
    )
)
search(query)

Relevance Score
Num:1	Relevance Score:82.75214	Movie Title:Aliens
Num:2	Relevance Score:69.15401	Movie Title:Cowboys & Aliens
Num:3	Relevance Score:67.66199	Movie Title:The Basketball Diaries
Num:4	Relevance Score:52.04843	Movie Title:Aliens vs Predator: Requiem
Num:5	Relevance Score:45.524403	Movie Title:Dances with Wolves
Num:6	Relevance Score:45.524403	Movie Title:Friends with Benefits
Num:7	Relevance Score:45.524403	Movie Title:Fire with Fire
Num:8	Relevance Score:39.894062	Movie Title:Interview with the Vampire
Num:9	Relevance Score:39.894062	Movie Title:From Russia With Love
Num:10	Relevance Score:39.894062	Movie Title:Gone with the Wind


In [10]:
#
# looking at the query results, you notice that some of these movies don't make sense being in the top 10 for that query
# 

In [11]:
# Debug the query: send it to the validate API with `explain` to see how
# Elasticsearch interprets the multi_match request.
usersSearch = 'basketball with cartoon aliens'
query = {"query": {"multi_match": {"query": usersSearch,
                                   "fields": ["title^10", "overview"]}}}
headers = {'Content-Type': 'application/json'}
httpResp = requests.get(
    "http://localhost:9200/tmdb/_validate/query?explain",
    headers=headers,
    data=json.dumps(query),
)

print(json.loads(httpResp.text))

{'_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'valid': True, 'explanations': [{'index': 'tmdb', 'valid': True, 'explanation': '((overview:basketball overview:with overview:cartoon overview:aliens) | (title:basketball title:with title:cartoon title:aliens)^10.0)'}]}


In [12]:
# The `explanation` field of the response above is the part you’re interested in:
# ((overview:basketball overview:with overview:cartoon overview:aliens) | (title:basketball title:with title:cartoon title:aliens)^10.0)

In [13]:
# Continue debugging: run a sample title through the _analyze API without
# naming an analyzer, and print the resulting token stream as YAML.
headers = {'Content-Type': 'application/json'}
query = dict(text="Fire with Fire")
httpResp = requests.get(
    "http://localhost:9200/tmdb/_analyze?format=yaml",
    headers=headers,
    data=json.dumps(query),
)
print(httpResp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



The result is three entries in the token stream, each showing the extracted properties of a token:
- start/end offsets indicate where the token occurs in the source text
- position indicates term ordering, distance, and adjacency

In this token stream, you extract three tokens: fire, with, and fire. Notice how the text has been tokenized by whitespace and lowercased?

In [14]:
# Same sample title, but this time analyzed explicitly with the `english`
# analyzer, to compare its token stream with the previous one.
headers = {'Content-Type': 'application/json'}
query = dict(text="Fire with Fire", analyzer="english")
httpResp = requests.get(
    "http://localhost:9200/tmdb/_analyze?format=yaml",
    headers=headers,
    data=json.dumps(query),
)
print(httpResp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [15]:
# Rebuild the index with both text fields mapped to the `english` analyzer.
mappingSettings = {
    'properties': {
        field: {'type': 'text', 'analyzer': 'english'}
        for field in ('title', 'overview')
    }
}
reindex(mappingSettings=mappingSettings, movieDict=movieDict)

<Response [200]>

Notice the removal of "with" from the token stream once the `english` analyzer is used (compare the two `_analyze` outputs above).

In [16]:
# Repeat the original search now that the index has been rebuilt.
usersSearch = 'basketball with cartoon aliens'
query = {"query": {"multi_match": {"query": usersSearch,
                                   "fields": ["title^10", "overview"]}}}
search(query)

Relevance Score
Num:1	Relevance Score:72.222855	Movie Title:Alien
Num:2	Relevance Score:72.222855	Movie Title:Aliens
Num:3	Relevance Score:58.34751	Movie Title:Cowboys & Aliens
Num:4	Relevance Score:42.15139	Movie Title:Aliens vs Predator: Requiem
Num:5	Relevance Score:6.7331104	Movie Title:The Flintstones
Num:6	Relevance Score:6.713963	Movie Title:White Men Can't Jump
Num:7	Relevance Score:5.738433	Movie Title:The Thing
Num:8	Relevance Score:5.394642	Movie Title:Bedazzled
Num:9	Relevance Score:5.285317	Movie Title:High School Musical
Num:10	Relevance Score:5.2432656	Movie Title:Independence Day


In [17]:
#
# Debugging ranking: request a score explanation with the search results
# and print it for the top-ranked hit
#
usersSearch = 'basketball with cartoon aliens'
query = {
    "explain": True,
    "query": {
        "multi_match": {
            "query": usersSearch,
            "fields": ["title^10", "overview"]
        }
    }
}
url = "http://localhost:9200/tmdb/_search"
headers = {'Content-Type': 'application/json'}
httpResp = requests.get(url, data=json.dumps(query), headers=headers)
# Only the first hit in the response is inspected.
searchHits = json.loads(httpResp.text)['hits']['hits'][0]
print(f"Explain for {searchHits['_source']['title']}")
print(f"Explain for {json.dumps(searchHits['_explanation'], indent=4, sort_keys=True)}")

Explain for Alien
Explain for {
    "description": "max of:",
    "details": [
        {
            "description": "sum of:",
            "details": [
                {
                    "description": "weight(overview:alien in 229) [PerFieldSimilarity], result of:",
                    "details": [
                        {
                            "description": "score(freq=1.0), computed as boost * idf * tf from:",
                            "details": [
                                {
                                    "description": "boost",
                                    "details": [],
                                    "value": 2.2
                                },
                                {
                                    "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                                    "details": [
                                        {
                                            "description": "n, nu

In [18]:
# Same search, but with the title boost lowered from 10 to 0.1 to
# de-emphasise title matches relative to the overview field.
usersSearch = 'basketball with cartoon aliens'
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch,
            fields=["title^0.1", "overview"],
        )
    )
)
search(query)

Relevance Score
Num:1	Relevance Score:6.7331104	Movie Title:The Flintstones
Num:2	Relevance Score:6.713963	Movie Title:White Men Can't Jump
Num:3	Relevance Score:6.24872	Movie Title:Aliens vs Predator: Requiem
Num:4	Relevance Score:5.738433	Movie Title:The Thing
Num:5	Relevance Score:5.394642	Movie Title:Bedazzled
Num:6	Relevance Score:5.285317	Movie Title:High School Musical
Num:7	Relevance Score:5.2432656	Movie Title:Independence Day
Num:8	Relevance Score:5.145266	Movie Title:The X Files
Num:9	Relevance Score:4.8164744	Movie Title:The Day the Earth Stood Still
Num:10	Relevance Score:4.695741	Movie Title:Star Trek IV: The Voyage Home
