# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

# Base URL of the Elasticsearch node that every request in this notebook targets;
# override it by setting the ELASTICSEARCH_URL environment variable.
# NOTE(review): Elasticsearch's stock HTTP port is 9200 — the 9292 default presumably
# matches this project's Docker port mapping; confirm.
ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9292')

# 4.2.2 Analysis for precision or recall

In [2]:
def reindex(settings):
    """Drop the `my_library` index and create it again from `settings`.

    Sends a DELETE followed by a PUT (with `settings` as the JSON body) to the
    index URL. The DELETE response is not inspected; the PUT response is
    returned to the caller.
    """
    index_url = ELASTICSEARCH_URL + '/my_library'
    requests.delete(index_url)
    return requests.put(index_url, json=settings)

In [3]:
def tokenize(text, analyzer):
    """Print the tokens `analyzer` produces for `text`.

    Calls the `_analyze` endpoint of the `my_library` index and prints every
    returned token wrapped in square brackets, all on a single line.
    """
    payload = {'text': text, 'analyzer': analyzer}
    analysis = requests.get(ELASTICSEARCH_URL + '/my_library/_analyze', json=payload).json()
    bracketed = ('[{}]'.format(entry['token']) for entry in analysis['tokens'])
    print(''.join(bracketed))

## Standard analyzer

In [4]:
# Recreate the index with 'standard_clone': standard tokenizer followed by the
# lowercase and stop filters, then show how it tokenizes a movie title.
reindex({
    'settings': {
        'analysis': {
            'analyzer': {
                'standard_clone': {
                    'tokenizer': 'standard',
                    'filter': ['lowercase', 'stop'],
                },
            },
        },
    },
})

tokenize("Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb", 'standard_clone')

[dr][strangelove][how][i][learned][stop][worrying][love][bomb]


## English analyzer

In [5]:
# Recreate the index with 'english_clone', an analyzer assembled from four custom
# filters (stop words, keyword marker, stemmer, possessive stemmer) plus lowercase,
# then show how it tokenizes the same movie title.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {'type': 'stop', 'stopwords': '_english_'},
                'english_keywords': {'type': 'keyword_marker', 'keywords': []},
                'english_stemmer': {'type': 'stemmer', 'language': 'english'},
                'english_possessive_stemmer': {'type': 'stemmer', 'language': 'possessive_english'},
            },
            'analyzer': {
                'english_clone': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
})

tokenize("Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb", 'english_clone')

[dr][strangelov][how][i][learn][stop][worri][love][bomb]


# 4.2.3 Taking recall to extremes

## Phonetic analyzer

In [6]:
%%bash -s "$ELASTICSEARCH_URL"
# $1 is the Elasticsearch base URL handed to the script by `-s` on the magic line above.
# Install the analysis-phonetic plugin inside the `relsearch` container, then restart it.
docker exec relsearch bash -c "bin/elasticsearch-plugin install analysis-phonetic >/dev/null"
docker restart relsearch >/dev/null
# Probe the URL (no download) with up to 10 tries, retrying on "connection refused".
# $1 is quoted so the URL is never word-split or glob-expanded by the shell.
wget "$1" --retry-connrefused --tries=10 -q --wait=10 --spider



In [7]:
# Recreate the index with a 'phonetic' analyzer: standard tokenizer, lowercase, then a
# double-metaphone filter of type 'phonetic' (the analysis-phonetic plugin is installed
# by the preceding bash cell). 'replace': True is passed to that filter.
reindex({
    'settings': {
        'analysis': {
            'analyzer': {
                'phonetic': {
                    'tokenizer': 'standard',
                    'filter': ['lowercase', 'my_doublemetaphone'],
                },
            },
            'filter': {
                'my_doublemetaphone': {
                    'type': 'phonetic',
                    'encoder': 'doublemetaphone',
                    'replace': True,
                },
            },
        },
    },
})

<Response [200]>

In [8]:
# Run two different phrases through the 'phonetic' analyzer so their encoded
# tokens can be compared side by side.
tokenize("message from Dalai Lama", 'phonetic')
tokenize("message from tall llama", 'phonetic')

[MSJ][MSK][FRM][TL][LM]
[MSJ][MSK][FRM][TL][LM]


# 4.3.1 Scoring strength of a feature in a single field

In [9]:
def put(id, item):
    """Index `item` as document number `id` in the `my_library` index.

    The document is sent as the JSON body of a PUT request issued with
    `refresh=wait_for` (per the Elasticsearch docs, the call then returns only
    once the change is visible to search).

    Args:
        id: integer document id; it is formatted into the URL with `%d`.
        item: JSON-serializable document body.

    Raises:
        requests.HTTPError: if Elasticsearch answers with a 4xx/5xx status.
    """
    response = requests.put(ELASTICSEARCH_URL + '/my_library/_doc/%d?refresh=wait_for' % id, json=item)
    # Fail loudly: a silently rejected document would otherwise only show up later
    # as puzzling search results.
    response.raise_for_status()

In [10]:
# Three sample titles in which "apple" occurs five times, three times and once.
put(1, {'title': "apple apple apple apple apple"})
put(2, {'title': "apple apple apple banana banana"})
put(3, {'title': "apple banana blueberry coconut"})

In [11]:
def simpler_explain(explain_json, depth=0):
    """Render an explanation tree as indented text.

    Each node becomes one line of the form "<value>, <description>" (newlines
    inside the description are removed), indented two spaces per nesting level.
    Nodes listed under the optional 'details' key are rendered recursively,
    one level deeper, directly after their parent.
    """
    value = explain_json['value']
    description = explain_json['description'].replace('\n', '')
    pieces = [" " * (depth * 2) + "%s, %s\n" % (value, description)]
    for child in explain_json.get('details', []):
        pieces.append(simpler_explain(child, depth=depth + 1))
    return "".join(pieces)

def search_explain(query):
    """Run `query` against `my_library` and print a score explanation per hit.

    The search is always sent with `explain` set to True, whatever the caller
    put in `query`. For every hit, a header line with the document title is
    printed, followed by the output of `simpler_explain` for that hit's
    `_explanation`.

    Args:
        query: search request body as a dict. It is not modified.
    """
    # Work on a shallow copy so the caller's dict is left untouched.
    request_body = dict(query, explain=True)
    response = requests.get(ELASTICSEARCH_URL + '/my_library/_search', json=request_body)
    for hit in response.json()['hits']['hits']:
        # Not every document is guaranteed to carry a 'title' field; fall back to
        # the document id instead of raising KeyError halfway through the output.
        title = hit['_source'].get('title', '<no title, _id=%s>' % hit.get('_id'))
        print("Explain for `%s`" % title)
        print(simpler_explain(hit['_explanation']))

In [12]:
# Match query for "apple" on the title field; search_explain prints the score
# breakdown of every hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.2344793, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.2344793, score(freq=5.0), product of:
    2.2, boost
    0.13353139, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      3, n, number of documents containing term
      3, N, total number of documents with field
    0.7981756, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      5.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      4.6666665, avgdl, average length of field

Explain for `apple apple apple banana banana`
0.20667168, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.20667168, score(freq=3.0), product of:
    2.2, boost
    0.13353139, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      3, n, number of documents containing term
      3, N, total number of documents with field
    0.70351

In [13]:
# Add a fourth document whose title contains both the plural "apples" and the singular "apple".
put(4, {'title': "apples apple"})

In [14]:
# Same "apple" title query as before, now run with four documents in the index.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.18038376, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.18038376, score(freq=5.0), product of:
    2.2, boost
    0.105360515, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      4, n, number of documents containing term
      4, N, total number of documents with field
    0.7782101, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      5.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      4.0, avgdl, average length of field

Explain for `apple apple apple banana banana`
0.1571479, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.1571479, score(freq=3.0), product of:
    2.2, boost
    0.105360515, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      4, n, number of documents containing term
      4, N, total number of documents with field
    0.6779661, 

In [15]:
# Recreate the index with 'title' and 'overview' mapped as text fields that use
# the 'english' analyzer.
reindex({
    'mappings': {
        'properties': {
            'title': {'type': 'text', 'analyzer': 'english'},
            'overview': {'type': 'text', 'analyzer': 'english'},
        },
    },
})

<Response [200]>

In [16]:
# reindex() deleted the previous index, so the four sample titles are indexed again.
put(1, {'title': "apple apple apple apple apple"})
put(2, {'title': "apple apple apple banana banana"})
put(3, {'title': "apple banana blueberry coconut"})
put(4, {'title': "apples apple"})

In [17]:
# Same "apple" title query, this time against the index whose title field uses
# the 'english' analyzer.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "apple"},
    },
}

search_explain(query)

Explain for `apple apple apple apple apple`
0.18038376, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.18038376, score(freq=5.0), product of:
    2.2, boost
    0.105360515, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      4, n, number of documents containing term
      4, N, total number of documents with field
    0.7782101, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      5.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      4.0, avgdl, average length of field

Explain for `apples apple`
0.16857684, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.16857684, score(freq=2.0), product of:
    2.2, boost
    0.105360515, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      4, n, number of documents containing term
      4, N, total number of documents with field
    0.72727275, tf, computed as fr

# 4.4.1 Dealing with delimiters

### Acronyms

In [18]:
# Recreate the index with 'standard_with_acronyms': standard tokenizer, lowercase, then a
# word_delimiter filter with catenate_all on and word/number part generation off.
# The same text is then tokenized with 'standard' and with the new analyzer for comparison.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'acronyms': {
                    'type': 'word_delimiter',
                    'catenate_all': True,
                    'generate_word_parts': False,
                    'generate_number_parts': False,
                },
            },
            'analyzer': {
                'standard_with_acronyms': {
                    'tokenizer': 'standard',
                    'filter': ['lowercase', 'acronyms'],
                },
            },
        },
    },
})

tokenize("I.B.M. versus IBM versus ibm", 'standard')
tokenize("I.B.M. versus IBM versus ibm", 'standard_with_acronyms')

[i.b.m][versus][ibm][versus][ibm]
[ibm][versus][ibm][versus][ibm]


### Phone numbers

In [19]:
# Recreate the index with a 'phone_num' analyzer: the keyword tokenizer feeds a
# word_delimiter filter (catenate_all) and then a pattern_capture filter whose patterns
# capture the trailing 7 and trailing 10 digits while preserving the original token.
# The same number is then tokenized with 'standard' and 'phone_num' for comparison.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'phone_num_filter': {
                    'type': 'word_delimiter',
                    'catenate_all': True,
                    'generate_number_parts': False,
                },
                'phone_num_parts': {
                    'type': 'pattern_capture',
                    'patterns': ["(\\d{7}$)", "(\\d{10}$)"],
                    'preserve_original': True,
                },
            },
            'analyzer': {
                'phone_num': {
                    'tokenizer': 'keyword',
                    'filter': ['phone_num_filter', 'phone_num_parts'],
                },
            },
        },
    },
})

tokenize("1(800)867-5309", 'standard')
tokenize("1(800)867-5309", 'phone_num')

[1][800][867][5309]
[18008675309][8008675309][8675309]


# 4.4.2. Capturing meaning with synonyms

In [20]:
# Recreate the index with 'retail_analyzer' on the title field. Its synonym filter
# rewrites "dress shoe" / "dress shoes" to the two terms dress_shoe and shoe.
# Note: 'english_stop' is defined here but is not part of retail_analyzer's filter chain.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {'type': 'stop', 'stopwords': '_english_'},
                'english_keywords': {'type': 'keyword_marker', 'keywords': []},
                'english_stemmer': {'type': 'stemmer', 'language': 'english'},
                'english_possessive_stemmer': {'type': 'stemmer', 'language': 'possessive_english'},
                'retail_syn_filter': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe, shoe'],
                },
            },
            'analyzer': {
                'retail_analyzer': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
    'mappings': {
        'properties': {
            'title': {'type': 'text', 'analyzer': 'retail_analyzer'},
        },
    },
})

<Response [200]>

In [21]:
# Three sample titles: one mentions "dress shoes", one only "dress", one only "shoes".
put(1, {'title': "bob's brand dress shoes are the bomb diggity"})
put(2, {'title': "this little black dress is sure to impress"})
put(3, {'title': "tennis shoes... you know, for tennis"})

In [22]:
# Search the title field for "dress" and print the score breakdown of every hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress"},
    },
}

search_explain(query)

Explain for `this little black dress is sure to impress`
0.94566005, weight(title:dress in 0) [PerFieldSimilarity], result of:
  0.94566005, score(freq=1.0), product of:
    2.2, boost
    0.98082924, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      1, n, number of documents containing term
      3, N, total number of documents with field
    0.43824703, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      8.0, dl, length of field
      7.3333335, avgdl, average length of field



In [23]:
# Search the title field for "shoes" and print the score breakdown of every hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "shoes"},
    },
}

search_explain(query)

Explain for `tennis shoes... you know, for tennis`
0.5077718, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.5077718, score(freq=1.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total number of documents with field
    0.4910714, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      6.0, dl, length of field
      7.3333335, avgdl, average length of field

Explain for `bob's brand dress shoes are the bomb diggity`
0.478909, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.478909, score(freq=1.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total number of documents with fie

In [24]:
# Search the title field for the phrase "dress shoes" and print the score
# breakdown of every hit.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.6546238, weight(Synonym(title:dress_sho title:shoe) in 0) [PerFieldSimilarity], result of:
  0.6546238, score(freq=2.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total number of documents with field
    0.63309354, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      2.0, termFreq=2.0
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      7.0, dl, length of field
      7.3333335, avgdl, average length of field

Explain for `tennis shoes... you know, for tennis`
0.5077718, weight(Synonym(title:dress_sho title:shoe) in 0) [PerFieldSimilarity], result of:
  0.5077718, score(freq=1.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total numb

In [25]:
# Recreate the index with separate index-time and search-time analyzers for title.
# They differ only in their synonym filter: at index time "dress shoe(s)" becomes
# dress_shoe AND shoe; at search time it becomes dress_shoe only.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'english_stop': {'type': 'stop', 'stopwords': '_english_'},
                'english_keywords': {'type': 'keyword_marker', 'keywords': []},
                'english_stemmer': {'type': 'stemmer', 'language': 'english'},
                'english_possessive_stemmer': {'type': 'stemmer', 'language': 'possessive_english'},
                'retail_syn_filter_index': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe, shoe'],
                },
                'retail_syn_filter_search': {
                    'type': 'synonym',
                    'synonyms': ['dress shoe, dress shoes => dress_shoe'],
                },
            },
            'analyzer': {
                'retail_analyzer_index': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter_index',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
                'retail_analyzer_search': {
                    'tokenizer': 'standard',
                    'filter': [
                        'english_possessive_stemmer',
                        'lowercase',
                        'retail_syn_filter_search',
                        'english_stop',
                        'english_keywords',
                        'english_stemmer',
                    ],
                },
            },
        },
    },
    'mappings': {
        'properties': {
            'title': {
                'type': 'text',
                'analyzer': 'retail_analyzer_index',
                'search_analyzer': 'retail_analyzer_search',
            },
        },
    },
})

<Response [200]>

In [26]:
# reindex() deleted the previous index, so the three retail titles are indexed again.
put(1, {'title': "bob's brand dress shoes are the bomb diggity"})
put(2, {'title': "this little black dress is sure to impress"})
put(3, {'title': "tennis shoes... you know, for tennis"})

In [27]:
# Search the title field for "dress", now against the index with separate
# index-time and search-time analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress"},
    },
}

search_explain(query)

Explain for `this little black dress is sure to impress`
1.0065652, weight(title:dress in 0) [PerFieldSimilarity], result of:
  1.0065652, score(freq=1.0), product of:
    2.2, boost
    0.98082924, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      1, n, number of documents containing term
      3, N, total number of documents with field
    0.4664723, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      5.3333335, avgdl, average length of field



In [28]:
# Search the title field for "shoes", now against the index with separate
# index-time and search-time analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
0.4823361, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.4823361, score(freq=1.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total number of documents with field
    0.4664723, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      5.3333335, avgdl, average length of field

Explain for `tennis shoes... you know, for tennis`
0.4823361, weight(title:shoe in 0) [PerFieldSimilarity], result of:
  0.4823361, score(freq=1.0), product of:
    2.2, boost
    0.47000363, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      2, n, number of documents containing term
      3, N, total number of documents with f

In [29]:
# Search the title field for "dress shoes", now against the index with separate
# index-time and search-time analyzers.
query = {
    'explain': 'true',
    'query': {
        'match': {'title': "dress shoes"},
    },
}

search_explain(query)

Explain for `bob's brand dress shoes are the bomb diggity`
1.0065652, weight(title:dress_sho in 0) [PerFieldSimilarity], result of:
  1.0065652, score(freq=1.0), product of:
    2.2, boost
    0.98082924, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      1, n, number of documents containing term
      3, N, total number of documents with field
    0.4664723, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      5.0, dl, length of field
      5.3333335, avgdl, average length of field



# 4.4.5. Modeling specificity with paths 

In [30]:
# Recreate the index with a 'path_hierarchy' analyzer (built on the path_hierarchy
# tokenizer) and use it for the 'inventory_dir' text field.
reindex({
    'settings': {
        'analysis': {
            'analyzer': {
                'path_hierarchy': {'tokenizer': 'path_hierarchy'},
            },
        },
    },
    'mappings': {
        'properties': {
            'inventory_dir': {'type': 'text', 'analyzer': 'path_hierarchy'},
        },
    },
})

<Response [200]>

In [31]:
# Three inventory items at different depths of the /fruit hierarchy.
# The field key must be exactly 'title' (no trailing space): search_explain reads
# `_source['title']` when it prints a hit.
put(1, {'inventory_dir': '/fruit/apples/fuji', 'title': "crisp, sweet-flavored, long shelf-life"})
put(2, {'inventory_dir': '/fruit/apples/gala', 'title': "sweet, pleasant apple"})
put(3, {'inventory_dir': '/fruit', 'title': "edible, seed-bearing portion of plants"})

In [32]:
# Term query for the exact path '/fruit/apples/fuji' on inventory_dir, wrapped in
# a bool/should clause; search_explain prints the score breakdown of every hit.
query = {
    'query': {
        'bool': {
            'should': [
                {'term': {'inventory_dir': '/fruit/apples/fuji'}},
            ],
        },
    },
}

search_explain(query)

Explain for `crisp, sweet-flavored, long shelf-life`
1.2800652, weight(inventory_dir:/fruit/apples/fuji in 0) [PerFieldSimilarity], result of:
  1.2800652, score(freq=1.0), product of:
    2.2, boost
    0.98082924, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
      1, n, number of documents containing term
      3, N, total number of documents with field
    0.5932203, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
      1.0, freq, occurrences of term within document
      1.2, k1, term saturation parameter
      0.75, b, length normalization parameter
      1.0, dl, length of field
      2.3333333, avgdl, average length of field



There is a specialized Path Hierarchy Tokenizer that can be used to control the specificity of the query or index.

# 4.4.9. Tokenizing melodies 

In [33]:
# Recreate the index with a 'parsons' analyzer: the keyword tokenizer keeps the whole
# input as one token, and the n-gram filter then emits its 5-character n-grams.
# The filter type is spelled 'ngram'; the camel-case 'nGram' alias is deprecated in
# Elasticsearch 7.x and rejected by later versions.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'parsons-ngram': {
                    'type': 'ngram',
                    'min_gram': 5,
                    'max_gram': 5,
                },
            },
            'analyzer': {
                'parsons': {
                    'tokenizer': 'keyword',
                    'filter': ['parsons-ngram'],
                },
            },
        },
    },
})

<Response [200]>

In [34]:
# Show the tokens the 'parsons' analyzer produces for a sample contour string.
tokenize("*RRDURDURDRD", 'parsons')

[*RRDU][RRDUR][RDURD][DURDU][URDUR][RDURD][DURDR][URDRD]


In [35]:
# Recreate the index with the 'parsons' analyzer widened to emit both 4- and
# 5-character n-grams of the single keyword token.
# The filter type is spelled 'ngram'; the camel-case 'nGram' alias is deprecated in
# Elasticsearch 7.x and rejected by later versions.
reindex({
    'settings': {
        'analysis': {
            'filter': {
                'parsons-ngram': {
                    'type': 'ngram',
                    'min_gram': 4,
                    'max_gram': 5,
                },
            },
            'analyzer': {
                'parsons': {
                    'tokenizer': 'keyword',
                    'filter': ['parsons-ngram'],
                },
            },
        },
    },
})

<Response [200]>

In [36]:
# Tokenize the same contour string again, now that the 'parsons' analyzer emits
# n-grams of length 4 and 5.
tokenize("*RRDURDURDRD", 'parsons')

[*RRD][*RRDU][RRDU][RRDUR][RDUR][RDURD][DURD][DURDU][URDU][URDUR][RDUR][RDURD][DURD][DURDR][URDR][URDRD][RDRD]


# Notes

Takeaways:

1. Generalizing a concept, either by synonym lists or by tokenization techniques, is called *semantic expansion* and is the basis of asymmetric analysis.
2. Asymmetric analysis allows general concepts to be extracted at index time or at query time. This can be used to answer queries with either generalized or specialized concepts.
3. Asymmetric analysis is the default behavior of several built-in filters and analyzers, such as those used for integers or geopoints. It also allows efficient querying of ranges and of different levels of precision.

Experiments:

- **[1,2]**: Mapping expressions to a concise term that represents the concept, like `dress shoe => dress_shoe, shoe`, lets us treat "dress shoes" both as a shoe and as a specific type of shoe, but not as a dress.