# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200')

# 4.2.2 Analysis for precision or recall

In [2]:
def reindex(settings):
    """Recreate the ``my_library`` index from scratch.

    Issues a DELETE for ``my_library`` (its result is not inspected) and then
    a PUT that creates the index again with ``settings`` as the JSON body.

    Returns the ``requests`` response of the PUT call.
    """
    index_url = ELASTICSEARCH_URL + '/my_library'
    requests.delete(index_url)
    return requests.put(index_url, json=settings)

In [3]:
def tokenize(text, analyzer):
    """Print the tokens that ``analyzer`` produces for ``text``.

    Sends ``text`` to the ``_analyze`` endpoint of the ``my_library`` index and
    prints every returned token wrapped in square brackets on one line,
    e.g. ``[dr][strangelove]``.

    Raises:
        RuntimeError: if the response body has no ``tokens`` key (for example
            when ``analyzer`` is not defined on the index). The message
            carries the HTTP status code and the decoded response body.
    """
    response = requests.get(
        ELASTICSEARCH_URL + '/my_library/_analyze',
        json={'text': text,
              'analyzer': analyzer}
    )
    payload = response.json()

    # Without this check a failed request only shows up as KeyError('tokens'),
    # hiding the error details that came back in the response body.
    if 'tokens' not in payload:
        raise RuntimeError('_analyze request failed (HTTP {}): {}'.format(
            response.status_code, payload))

    print(''.join('[{}]'.format(token_term['token']) for token_term in payload['tokens']))

## Standard analyzer

In [4]:
# Analyzer built by hand from the standard tokenizer followed by the
# 'standard', 'lowercase' and 'stop' token filters.
reindex({
    'settings': {
        'analysis': {
            'analyzer': {
                'standard_clone': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'lowercase', 'stop'],
                },
            },
        },
    },
})

tokenize("Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb", 'standard_clone')

[dr][strangelove][how][i][learned][stop][worrying][love][bomb]


## English analyzer

In [5]:
# Analyzer assembled from explicitly declared filters: possessive stemmer,
# lowercase, English stop words, keyword marker (empty list) and English stemmer.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {
                    'type': 'stop',
                    'stopwords': '_english_',
                },
                'english_keywords': {
                    'type': 'keyword_marker',
                    'keywords': [],
                },
                'english_stemmer': {
                    'type': 'stemmer',
                    'language': 'english',
                },
                'english_possessive_stemmer': {
                    'type': 'stemmer',
                    'language': 'possessive_english',
                },
            },
            'analyzer': {
                'english_clone': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
})

tokenize("Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb", 'english_clone')

[dr][strangelov][how][i][learn][stop][worri][love][bomb]


# 4.2.3 Taking recall to extremes

## Phonetic analyzer

In [6]:
%%bash -s "$ELASTICSEARCH_URL"
# $1 is the Elasticsearch URL passed in through `-s` on the magic line above.
# Install the analysis-phonetic plugin inside the `elasticsearch` container.
docker exec elasticsearch bash -c "bin/elasticsearch-plugin install analysis-phonetic >/dev/null"
# Restart the container, then probe the URL (no download) with up to 10 tries,
# retrying on refused connections.
docker restart elasticsearch >/dev/null
# The URL is quoted so it reaches wget as one argument, without word splitting or globbing.
wget "$1" --retry-connrefused --tries=10 -q --wait=10 --spider

In [7]:
# Phonetic analyzer: standard tokenizer, then the 'standard' and 'lowercase'
# filters and a custom 'phonetic' filter (doublemetaphone encoder, replace=True).
reindex({
    'settings': {
        'analysis': {
            'analyzer': {
                'phonetic': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'lowercase', 'my_doublemetaphone'],
                },
            },
            'filter': {
                'my_doublemetaphone': {
                    'type': 'phonetic',
                    'encoder': 'doublemetaphone',
                    'replace': True,
                },
            },
        },
    },
})

<Response [200]>

In [8]:
# Run two different phrases through the 'phonetic' analyzer, one per output line.
for phrase in ("message from Dalai Lama", "message from tall llama"):
    tokenize(phrase, 'phonetic')

[MSJ][MSK][FRM][TL][LM]
[MSJ][MSK][FRM][TL][LM]


# 4.3.1 Scoring strength of a feature in a single field

In [9]:
def put(id, item):
    """Index ``item`` as document number ``id`` under ``my_library/example``.

    ``item`` is sent as the JSON body of a PUT request that carries
    ``refresh=wait_for``. ``id`` is formatted with ``%d``, so it must be an
    integer.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so a
            failed indexing call is reported instead of silently ignored.
    """
    response = requests.put(ELASTICSEARCH_URL + '/my_library/example/%d?refresh=wait_for' % id, json=item)
    # Previously the response was dropped, so a rejected document went unnoticed.
    response.raise_for_status()

In [10]:
# Index three sample titles as documents 1..3.
for doc_id, title in enumerate(["apple apple apple apple apple",
                                "apple apple apple banana banana",
                                "apple banana blueberry coconut"], start=1):
    put(doc_id, {'title': title})

In [11]:
def simpler_explain(explain_json, depth=0):
    """Render an explanation tree as indented text.

    Each node contributes one line, ``"<value>, <description>"``, indented by
    two spaces per nesting level; newlines inside the description are removed.
    Entries under the optional ``details`` key are rendered recursively, one
    level deeper, right after their parent.

    Returns the whole rendering as a single string (every line ends in a newline).
    """
    indent = " " * (depth * 2)
    description = explain_json['description'].replace('\n', '')
    pieces = [indent + "%s, %s\n" % (explain_json['value'], description)]
    children = explain_json['details'] if 'details' in explain_json else ()
    pieces.extend(simpler_explain(child, depth=depth + 1) for child in children)
    return "".join(pieces)

def search_explain(query):
    """Run ``query`` with explanations enabled and print them for every hit.

    The search is sent to ``my_library/example/_search``. For each returned hit
    the ``title`` of its ``_source`` is printed, followed by the hit's
    ``_explanation`` rendered by ``simpler_explain``.

    The caller's ``query`` dict is not modified: ``explain`` is forced to
    ``True`` on a shallow copy that becomes the request body.
    """
    # Copy instead of assigning into ``query`` so the argument is left as passed in.
    request_body = dict(query, explain=True)
    response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=request_body)
    data = response.json()
    for hit in data['hits']['hits']:
        print("Explain for `%s`" % hit['_source']['title'])
        print(simpler_explain(hit['_explanation']))

In [12]:
# Match "apple" against the title field and print the scoring explanation per hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for `apple apple apple banana banana`
0.4520719, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.4520719, score(doc=0,freq=3.0 = termFreq=3.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.5714288, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      3.0, termFreq=3.0


In [13]:
# Add a fourth document whose title holds both "apples" and "apple".
put(4, dict(title="apples apple"))

In [14]:
# Same "apple" title search as before, run again now that document 4 exists.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for `apple banana blueberry coconut`
0.2876821, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1

In [15]:
# Re-create the index with 'title' and 'overview' mapped as text fields that
# both use the 'english' analyzer.
reindex({
    'mappings': {
        'example': {
            'properties': {
                'title': {
                    'type': 'text',
                    'analyzer': 'english',
                },
                'overview': {
                    'type': 'text',
                    'analyzer': 'english',
                },
            },
        },
    },
})

<Response [200]>

In [16]:
# Index the four sample titles again, as documents 1..4 of the rebuilt index.
for doc_id, title in enumerate(["apple apple apple apple apple",
                                "apple apple apple banana banana",
                                "apple banana blueberry coconut",
                                "apples apple"], start=1):
    put(doc_id, {'title': title})

In [17]:
# "apple" title search against the index whose title field uses the 'english' analyzer.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.51040375, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for `apple banana blueberry coconut`
0.2876821, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2

# 4.4.1 Dealing with delimiters

### Acronyms

In [18]:
# 'acronyms' is a word_delimiter filter with catenate_all on and word/number
# part generation off; it runs after the 'standard' and 'lowercase' filters.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'acronyms': {
                    'type': 'word_delimiter',
                    'catenate_all': True,
                    'generate_word_parts': False,
                    'generate_number_parts': False,
                },
            },
            'analyzer': {
                'standard_with_acronyms': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'lowercase', 'acronyms'],
                },
            },
        },
    },
})

tokenize("I.B.M. versus IBM versus ibm", 'standard_with_acronyms')

[ibm][versus][ibm][versus][ibm]


### Phone numbers

In [19]:
# Phone-number analyzer: the keyword tokenizer keeps the input as one token,
# 'phone_num_filter' (word_delimiter, catenate_all) is applied first, then
# 'phone_num_parts' captures the trailing 7 and 10 digits while preserving
# the original token.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'phone_num_filter': {
                    'type': 'word_delimiter',
                    'catenate_all': True,
                    'generate_number_parts': False,
                },
                'phone_num_parts': {
                    'type': 'pattern_capture',
                    'patterns': ["(\\d{7}$)", "(\\d{10}$)"],
                    'preserve_original': True,
                },
            },
            'analyzer': {
                'phone_num': {
                    'tokenizer': 'keyword',
                    'filter': ['phone_num_filter', 'phone_num_parts'],
                },
            },
        },
    },
})

tokenize("1(800)867-5309", 'phone_num')

[18008675309][8008675309][8675309]


# 4.4.2 Capturing meaning with synonyms

In [20]:
# One 'retail_analyzer' used for the title field: the synonym filter rewrites
# "dress shoe" / "dress shoes" to the tokens dress_shoe and shoe.
# NOTE(review): 'english_stop' is declared below but is not part of
# retail_analyzer's filter list - confirm whether that omission is intended.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {
                    'type': 'stop',
                    'stopwords': '_english_',
                },
                'english_keywords': {
                    'type': 'keyword_marker',
                    'keywords': [],
                },
                'english_stemmer': {
                    'type': 'stemmer',
                    'language': 'english',
                },
                'english_possessive_stemmer': {
                    'type': 'stemmer',
                    'language': 'possessive_english',
                },
                'retail_syn_filter': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe, shoe'],
                },
            },
            'analyzer': {
                'retail_analyzer': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
    'mappings': {
        'example': {
            'properties': {
                'title': {
                    'type': 'text',
                    'analyzer': 'retail_analyzer',
                },
            },
        },
    },
})

<Response [200]>

In [21]:
# Index three product-style titles as documents 1..3.
for doc_id, title in enumerate(["bob's brand dress shoes are the bomb diggity",
                                "this little black dress is sure to impress",
                                "tennis shoes... you know, for tennis"], start=1):
    put(doc_id, {'title': title})

In [22]:
# Search the title field for "dress" and print the explanation for each hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress"},
    },
}

search_explain(query)

Explain for `this little black dress is sure to impress`
0.2876821, weight(title:dress in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      8.0, fieldLength



In [23]:
# Search the title field for "shoes" and print the explanation for each hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.30318588, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.30318588, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0538921, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      7.0, fieldLength

Explain for `tennis shoes... you know, for tennis`
0.2876821, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, t

In [24]:
# Search the title field for the two-word phrase "dress shoes".
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.40997607, weight(Synonym(title:dress_sho title:shoe) in 0) [PerFieldSimilarity], result of:
  0.40997607, score(doc=0,freq=2.0 = termFreq=2.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.4251012, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      2.0, termFreq=2.0
      1.2, parameter k1
      0.75, parameter b
      8.0, avgFieldLength
      7.0, fieldLength

Explain for `tennis shoes... you know, for tennis`
0.2876821, weight(Synonym(title:dress_sho title:shoe) in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b *

In [25]:
# Separate analyzers for indexing and searching the title field. They differ
# only in the synonym filter: at index time "dress shoe(s)" becomes
# dress_shoe and shoe, at search time it becomes dress_shoe only.
a = reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {
                    'type': 'stop',
                    'stopwords': '_english_',
                },
                'english_keywords': {
                    'type': 'keyword_marker',
                    'keywords': [],
                },
                'english_stemmer': {
                    'type': 'stemmer',
                    'language': 'english',
                },
                'english_possessive_stemmer': {
                    'type': 'stemmer',
                    'language': 'possessive_english',
                },
                'retail_syn_filter_index': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe, shoe'],
                },
                'retail_syn_filter_search': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe'],
                },
            },
            'analyzer': {
                'retail_analyzer_index': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter_index',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
                'retail_analyzer_search': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter_search',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
    'mappings': {
        'example': {
            'properties': {
                'title': {
                    'type': 'text',
                    'analyzer': 'retail_analyzer_index',
                    'search_analyzer': 'retail_analyzer_search',
                },
            },
        },
    },
})

In [26]:
# Index the three product-style titles again, as documents 1..3 of the rebuilt index.
for doc_id, title in enumerate(["bob's brand dress shoes are the bomb diggity",
                                "this little black dress is sure to impress",
                                "tennis shoes... you know, for tennis"], start=1):
    put(doc_id, {'title': title})

In [27]:
# "dress" title search against the index with separate index/search analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress"},
    },
}

search_explain(query)

Explain for `this little black dress is sure to impress`
0.2876821, weight(title:dress in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength



In [28]:
# "shoes" title search against the index with separate index/search analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.30873197, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.30873197, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0731707, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      6.0, avgFieldLength
      5.0, fieldLength

Explain for `tennis shoes... you know, for tennis`
0.2876821, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, t

In [29]:
# "dress shoes" title search against the index with separate index/search analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.30873197, weight(title:dress_sho in 0) [PerFieldSimilarity], result of:
  0.30873197, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0731707, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, parameter k1
      0.75, parameter b
      6.0, avgFieldLength
      5.0, fieldLength



# Notes

Takeaways:

1. ?

Experiments:

- **[1]**: 