# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

# Base URL of the Elasticsearch node used by every request in this notebook.
# Override it with the ELASTICSEARCH_URL environment variable. Trailing slashes
# are stripped because each request appends a path that already starts with '/'.
ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200').rstrip('/')

# 4.2.2 Analysis for precision or recall

In [2]:
def extract():
    """Load and parse ``tmdb.json`` from the current working directory.

    Returns:
        The parsed JSON document (whatever top-level value the file holds).

    Raises:
        OSError: if ``tmdb.json`` cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Pin the encoding so non-ASCII text is decoded the same way regardless of
    # the platform's default locale encoding.
    with open('tmdb.json', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def reindex(settings):
    """Drop the ``my_library`` index and create it again with ``settings``.

    Args:
        settings: dict sent as the JSON body of the create-index request
            (callers in this notebook pass ``settings`` and/or ``mappings``).

    Returns:
        The ``requests.Response`` of the create-index call.

    Raises:
        requests.HTTPError: if the delete fails for a reason other than the
            index not existing, or if the index cannot be created.
    """
    index_url = ELASTICSEARCH_URL + '/my_library'

    # A 404 only means there was nothing to delete (e.g. the very first run).
    delete_response = requests.delete(index_url)
    if delete_response.status_code != 404:
        delete_response.raise_for_status()

    # Fail here, with the HTTP status, instead of letting a rejected body
    # surface later as a confusing error in an unrelated cell.
    response = requests.put(index_url, json=settings)
    response.raise_for_status()
    return response

In [4]:
def tokenize(text, analyzer):
    """Print the tokens that ``analyzer`` of ``my_library`` emits for ``text``.

    The text is sent to the index's ``_analyze`` endpoint and every returned
    token is printed on a single line as ``[token]``.

    Args:
        text: the string to analyze.
        analyzer: name of an analyzer available on the ``my_library`` index.

    Raises:
        requests.HTTPError: if the analyze request is rejected.
    """
    response = requests.get(
        ELASTICSEARCH_URL + '/my_library/_analyze',
        json={
            'text': text,
            'analyzer': analyzer
        }
    )
    # Without this check an error reply would only show up below as
    # ``KeyError: 'tokens'``, hiding the actual HTTP failure.
    response.raise_for_status()

    tokens = response.json()['tokens']
    print(''.join('[{}]'.format(token_term['token']) for token_term in tokens))

## Standard analyzer

In [5]:
# Analyzer assembled from its parts: the standard tokenizer followed by the
# "standard", "lowercase" and "stop" token filters.
standard_clone_analyzer = {
    "tokenizer": "standard",
    "filter": ["standard", "lowercase", "stop"],
}

reindex({
    "settings": {
        "analysis": {
            "analyzer": {"standard_clone": standard_clone_analyzer},
        }
    }
})

<Response [200]>

In [6]:
tokenize(
    text="Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb",
    analyzer='standard_clone',
)

[dr][strangelove][how][i][learned][stop][worrying][love][bomb]


## English analyzer

In [7]:
# Named token filters referenced by the english_clone analyzer below.
english_filters = {
    "english_stop": {"type": "stop", "stopwords": "_english_"},
    "english_keywords": {"type": "keyword_marker", "keywords": []},
    "english_stemmer": {"type": "stemmer", "language": "english"},
    "english_possessive_stemmer": {"type": "stemmer", "language": "possessive_english"},
}

english_clone_analyzer = {
    "tokenizer": "standard",
    "filter": [
        "english_possessive_stemmer",
        "lowercase",
        "english_stop",
        "english_keywords",
        "english_stemmer",
    ],
}

reindex({
    "settings": {
        "analysis": {
            "filter": english_filters,
            "analyzer": {"english_clone": english_clone_analyzer},
        }
    }
})

<Response [200]>

In [8]:
tokenize(
    text="Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb",
    analyzer='english_clone',
)

[dr][strangelov][how][i][learn][stop][worri][love][bomb]


# 4.2.3 Taking recall to extremes

## Phonetic analyzer

In [9]:
%%bash
# Install the analysis-phonetic plugin inside the container named
# "elasticsearch"; the installer's stdout is discarded.
docker exec elasticsearch bash -c "bin/elasticsearch-plugin install analysis-phonetic > /dev/null"
# Restart the container (presumably so the node loads the new plugin -- confirm),
# then pause for a fixed 60 seconds before the following cells run.
docker restart elasticsearch
sleep 60s

elasticsearch


In [10]:
# Token filter of type "phonetic" using the doublemetaphone encoder.
double_metaphone_filter = {
    "type": "phonetic",
    "encoder": "doublemetaphone",
    "replace": True,
}

phonetic_analyzer = {
    "tokenizer": "standard",
    "filter": ["standard", "lowercase", "my_doublemetaphone"],
}

reindex({
    "settings": {
        "analysis": {
            "analyzer": {"phonetic": phonetic_analyzer},
            "filter": {"my_doublemetaphone": double_metaphone_filter},
        }
    }
})

<Response [200]>

In [11]:
# Run both phrases through the phonetic analyzer, one output line each.
for phrase in ("message from Dalai Lama", "message from tall llama"):
    tokenize(phrase, 'phonetic')

[MSJ][MSK][FRM][TL][LM]
[MSJ][MSK][FRM][TL][LM]


# 4.3.1 Scoring strength of a feature in a single field

In [12]:
# Index three example documents under ids 1..3 and print each response.
example_titles = [
    "apple apple apple apple apple",
    "apple apple apple banana banana",
    "apple banana blueberry coconut",
]
for doc_id, title in enumerate(example_titles, start=1):
    print(requests.put(ELASTICSEARCH_URL + '/my_library/example/%d' % doc_id, json={"title": title}))

<Response [201]>
<Response [201]>
<Response [201]>


In [13]:
# Make the documents indexed so far visible to search. An explicit refresh is
# deterministic and returns as soon as it is done, unlike a fixed pause.
refresh_response = requests.post(ELASTICSEARCH_URL + '/my_library/_refresh')
refresh_response.raise_for_status()

In [14]:
def simpler_explain(explain_json, depth=0):
    """Flatten an explanation tree into indented ``value, description`` lines.

    Args:
        explain_json: dict with ``value`` and ``description`` keys and an
            optional ``details`` list of nested dicts of the same form.
        depth: nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents first.
    """
    indent = " " * (depth * 2)
    # Newlines inside a description are dropped so each node stays on one line.
    description = explain_json['description'].replace('\n', '')
    lines = ["%s%s, %s\n" % (indent, explain_json['value'], description)]
    for child in explain_json.get('details', ()):
        lines.append(simpler_explain(child, depth=depth + 1))
    return "".join(lines)

# Match query on "title" for "apple", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple apple apple banana banana
0.4520719, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.4520719, score(doc=0,freq=3.0 = termFreq=3.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.5714288, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      3.0, termFreq=3.0
    

In [15]:
# Fourth example document: contains both "apples" and "apple".
apples_doc = {"title": "apples apple"}
print(requests.put(ELASTICSEARCH_URL + '/my_library/example/4', json=apples_doc))

<Response [201]>


In [16]:
# Make the document indexed above visible to search. An explicit refresh is
# deterministic and returns as soon as it is done, unlike a fixed pause.
refresh_response = requests.post(ELASTICSEARCH_URL + '/my_library/_refresh')
refresh_response.raise_for_status()

In [17]:
# Same "apple" match query on "title", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple banana blueberry coconut
0.2876821, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, 

In [18]:
# Both fields are text fields analyzed with the "english" analyzer.
english_text_field = {'type': 'text', 'analyzer': 'english'}

reindex({
    'mappings': {
        'example': {
            'properties': {
                'title': dict(english_text_field),
                'overview': dict(english_text_field),
            }
        }
    }
})

<Response [200]>

In [19]:
# Index the four example documents under ids 1..4 and print each response.
example_titles = [
    "apple apple apple apple apple",
    "apple apple apple banana banana",
    "apple banana blueberry coconut",
    "apples apple",
]
for doc_id, title in enumerate(example_titles, start=1):
    print(requests.put(ELASTICSEARCH_URL + '/my_library/example/%d' % doc_id, json={"title": title}))

<Response [201]>
<Response [201]>
<Response [201]>
<Response [201]>


In [20]:
# Make the documents indexed above visible to search. An explicit refresh is
# deterministic and returns as soon as it is done, unlike a fixed pause.
refresh_response = requests.post(ELASTICSEARCH_URL + '/my_library/_refresh')
refresh_response.raise_for_status()

In [21]:
# "apple" match query on "title", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple banana blueberry coconut
0.2876821, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, pa

# 4.4.1 Dealing with delimiters

## Acronyms

In [22]:
# word_delimiter filter that only emits the catenated form of each token
# (word parts and number parts are switched off).
acronyms_filter = {
    "type": "word_delimiter",
    "catenate_all": True,
    "generate_word_parts": False,
    "generate_number_parts": False,
}

standard_with_acronyms_analyzer = {
    "tokenizer": "standard",
    "filter": ["standard", "lowercase", "acronyms"],
}

reindex({
    "settings": {
        "analysis": {
            "filter": {"acronyms": acronyms_filter},
            "analyzer": {"standard_with_acronyms": standard_with_acronyms_analyzer},
        }
    }
})
tokenize(text="I.B.M. versus IBM versus ibm", analyzer='standard_with_acronyms')

<Response [200]>

## Phone numbers

In [27]:
phone_filters = {
    "phone_num_filter": {
        "type": "word_delimiter",
        "catenate_all": True,
        "generate_number_parts": False,
    },
    # The two patterns capture the trailing 7 and trailing 10 digits.
    "phone_num_parts": {
        "type": "pattern_capture",
        "patterns": ["(\\d{7}$)", "(\\d{10}$)"],
        "preserve_original": True,
    },
}

# The keyword tokenizer hands the whole input to the filters as one token.
phone_num_analyzer = {
    "tokenizer": "keyword",
    "filter": ["phone_num_filter", "phone_num_parts"],
}

reindex({
    "settings": {
        "analysis": {
            "filter": phone_filters,
            "analyzer": {"phone_num": phone_num_analyzer},
        }
    }
})
tokenize(text="1(800)867-5309", analyzer='phone_num')

[18008675309][8008675309][8675309]


# 4.4.2 Capturing meaning with synonyms

In [36]:
# Token filters available to the retail analyzer.
retail_filters = {
    # NOTE(review): english_stop is defined here but retail_analyzer below
    # does not list it, so it is not applied by this analyzer.
    "english_stop": {"type": "stop", "stopwords": "_english_"},
    "english_keywords": {"type": "keyword_marker", "keywords": []},
    "english_stemmer": {"type": "stemmer", "language": "english"},
    "english_possessive_stemmer": {"type": "stemmer", "language": "possessive_english"},
    "retail_syn_filter": {
        "type": "synonym",
        "synonyms": [
            "dress shoe, dress shoes => dress_shoe, shoe"
        ],
    },
}

retail_analyzer = {
    "tokenizer": "standard",
    "filter": [
        "english_possessive_stemmer",
        "lowercase",
        "retail_syn_filter",
        "english_keywords",
        "english_stemmer",
    ],
}

reindex({
    "settings": {
        "analysis": {
            "filter": retail_filters,
            "analyzer": {"retail_analyzer": retail_analyzer},
        }
    },
    "mappings": {
        "items": {
            "properties": {
                "desc": {
                    "type": "text",
                    "analyzer": "retail_analyzer",
                }
            }
        }
    },
})

# Index the three item descriptions under ids 1..3 and print each response.
item_descriptions = [
    "bob's brand dress shoes are the bomb diggity",
    "this little black dress is sure to impress",
    "tennis shoes... you know, for tennis",
]
for item_id, desc in enumerate(item_descriptions, start=1):
    print(requests.put(ELASTICSEARCH_URL + '/my_library/items/%d' % item_id, json={"desc": desc}))

# The next cell searches these documents straight away; refresh explicitly so
# they are visible to search even when the notebook is run top to bottom.
refresh_response = requests.post(ELASTICSEARCH_URL + '/my_library/_refresh')
refresh_response.raise_for_status()

<Response [201]>
<Response [201]>
<Response [201]>


In [38]:
# Match query on "desc" for "dress", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "dress"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for this little black dress is sure to impress
0.2876821, weight(desc:dress in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      8.0, fieldLength



In [39]:
# Match query on "desc" for "shoes", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "shoes"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for bob's brand dress shoes are the bomb diggity
0.30318588, weight(desc:shoe in 0) [PerFieldSimilarity], result of:
  0.30318588, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0538921, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      7.0, fieldLength

Explain for tennis shoes... you know, for tennis
0.2876821, weight(desc:shoe in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFre

In [40]:
# Match query on "desc" for "dress shoes", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "dress shoes"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for bob's brand dress shoes are the bomb diggity
0.40997607, weight(Synonym(desc:dress_sho desc:shoe) in 0) [PerFieldSimilarity], result of:
  0.40997607, score(doc=0,freq=2.0 = termFreq=2.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.4251012, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      2.0, termFreq=2.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      7.0, fieldLength

Explain for tennis shoes... you know, for tennis
0.2876821, weight(Synonym(desc:dress_sho desc:shoe) in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLe

In [49]:
# Token filters shared by the index-time and search-time retail analyzers.
# The two synonym filters differ only in their right-hand side: the index-time
# rule also emits "shoe", the search-time rule emits only "dress_shoe".
retail_filters = {
    "english_stop": {"type": "stop", "stopwords": "_english_"},
    "english_keywords": {"type": "keyword_marker", "keywords": []},
    "english_stemmer": {"type": "stemmer", "language": "english"},
    "english_possessive_stemmer": {"type": "stemmer", "language": "possessive_english"},
    "retail_syn_filter_index": {
        "type": "synonym",
        "synonyms": [
            "dress shoe, dress shoes => dress_shoe, shoe"
        ],
    },
    "retail_syn_filter_search": {
        "type": "synonym",
        "synonyms": [
            "dress shoe, dress shoes => dress_shoe"
        ],
    },
}

retail_analyzer_index = {
    "tokenizer": "standard",
    "filter": [
        "english_possessive_stemmer",
        "lowercase",
        "retail_syn_filter_index",
        "english_stop",
        "english_keywords",
        "english_stemmer",
    ],
}

retail_analyzer_search = {
    "tokenizer": "standard",
    "filter": [
        "english_possessive_stemmer",
        "lowercase",
        "retail_syn_filter_search",
        "english_stop",
        "english_keywords",
        "english_stemmer",
    ],
}

# The response is kept in ``a`` exactly as before; nothing in this cell reads it.
a = reindex({
    "settings": {
        "analysis": {
            "filter": retail_filters,
            "analyzer": {
                "retail_analyzer_index": retail_analyzer_index,
                "retail_analyzer_search": retail_analyzer_search,
            },
        }
    },
    "mappings": {
        "items": {
            "properties": {
                "desc": {
                    "type": "text",
                    "analyzer": "retail_analyzer_index",
                    "search_analyzer": "retail_analyzer_search",
                }
            }
        }
    },
})

# Index the three item descriptions under ids 1..3 and print each response.
item_descriptions = [
    "bob's brand dress shoes are the bomb diggity",
    "this little black dress is sure to impress",
    "tennis shoes... you know, for tennis",
]
for item_id, desc in enumerate(item_descriptions, start=1):
    print(requests.put(ELASTICSEARCH_URL + '/my_library/items/%d' % item_id, json={"desc": desc}))

# The next cell searches these documents straight away; refresh explicitly so
# they are visible to search even when the notebook is run top to bottom.
refresh_response = requests.post(ELASTICSEARCH_URL + '/my_library/_refresh')
refresh_response.raise_for_status()

<Response [201]>
<Response [201]>
<Response [201]>


In [50]:
# Match query on "desc" for "dress", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "dress"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for this little black dress is sure to impress
0.2876821, weight(desc:dress in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength



In [51]:
# Match query on "desc" for "shoes", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "shoes"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for bob's brand dress shoes are the bomb diggity
0.30873197, weight(desc:shoe in 0) [PerFieldSimilarity], result of:
  0.30873197, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0731707, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      6.0, avgFieldLength
      5.0, fieldLength

Explain for tennis shoes... you know, for tennis
0.2876821, weight(desc:shoe in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFre

In [52]:
# Match query on "desc" for "dress shoes", with score explanations requested.
query = {
    "explain": "true",
    "query": {"match": {"desc": "dress shoes"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/items/_search', json=query)
data = response.json()
# One heading per hit, followed by its condensed score breakdown.
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['desc'])
    print(simpler_explain(hit['_explanation']))

Explain for bob's brand dress shoes are the bomb diggity
0.30873197, weight(desc:dress_sho in 0) [PerFieldSimilarity], result of:
  0.30873197, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0731707, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      6.0, avgFieldLength
      5.0, fieldLength



# Notes

Takeaways:

1. ?

Experiments:

- **[1]**: 