# Boilerplate Setup

In [1]:
import json
import os
from pprint import pprint
import requests
import time

# Base URL of the Elasticsearch server used by every request in this notebook.
# Taken from the ELASTICSEARCH_URL environment variable when it is set.
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')

# 4.2.2 Analysis for precision or recall

In [2]:
def extract(path='tmdb.json'):
    """Load a JSON file and return the decoded object.

    Args:
        path: Location of the JSON file. Defaults to 'tmdb.json' in the
            current working directory, which is what the original
            zero-argument call read.

    Returns:
        Whatever Python object the JSON document decodes to.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale-default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [23]:
def reindex(settings):
    """Delete the my_library index and create it again with `settings`.

    Args:
        settings: JSON-serialisable body sent with the PUT that creates
            the index.

    Returns:
        The `requests` response of the PUT (create) call.
    """
    index_url = ELASTICSEARCH_URL + '/my_library'
    # The outcome of the delete is not inspected; only the create response
    # is handed back to the caller.
    requests.delete(index_url)
    return requests.put(index_url, json=settings)

In [43]:
def tokenize(text, analyzer):
    """Analyze `text` with `analyzer` on my_library and print the tokens.

    Sends the text to the index's _analyze endpoint and prints every
    returned token wrapped in square brackets, all on one line.

    Args:
        text: The string to analyze.
        analyzer: Name of the analyzer to apply.
    """
    payload = {'text': text, 'analyzer': analyzer}
    analysis = requests.get(ELASTICSEARCH_URL + '/my_library/_analyze', json=payload).json()

    bracketed = ''
    for entry in analysis['tokens']:
        bracketed += '[{}]'.format(entry['token'])
    print(bracketed)

## Standard analyzer

In [44]:
# "standard_clone": standard tokenizer followed by the standard, lowercase
# and stop token filters.
standard_clone_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "standard_clone": {
                    "tokenizer": "standard",
                    "filter": ["standard", "lowercase", "stop"],
                },
            },
        },
    },
}
reindex(standard_clone_settings)

<Response [200]>

In [46]:
# Run a sample movie title through the standard_clone analyzer.
tokenize(
    text="Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb",
    analyzer='standard_clone',
)

[dr][strangelove][how][i][learned][stop][worrying][love][bomb]


## English analyzer

In [47]:
# "english_clone": standard tokenizer plus a chain of custom filters
# (possessive stemmer, lowercase, stop words, keyword marker, stemmer).
english_clone_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "english_stop": {"type": "stop", "stopwords": "_english_"},
                "english_keywords": {"type": "keyword_marker", "keywords": []},
                "english_stemmer": {"type": "stemmer", "language": "english"},
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english",
                },
            },
            "analyzer": {
                "english_clone": {
                    "tokenizer": "standard",
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer",
                    ],
                },
            },
        },
    },
}
reindex(english_clone_settings)

<Response [200]>

In [48]:
# Run the same sample movie title through the english_clone analyzer.
tokenize(
    text="Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb",
    analyzer='english_clone',
)

[dr][strangelov][how][i][learn][stop][worri][love][bomb]


# 4.2.3 Taking recall to extremes

## Phonetic analyzer

In [37]:
%%bash
# Install the analysis-phonetic plugin inside the "elasticsearch" container;
# the installer's stdout is discarded.
docker exec elasticsearch bash -c "bin/elasticsearch-plugin install analysis-phonetic > /dev/null"
# Restart the container (presumably so the newly installed plugin is loaded).
docker restart elasticsearch
# Fixed 30-second pause, presumably to let the server come back up before
# later cells send requests to it.
sleep 30s

elasticsearch


In [49]:
# "phonetic": standard tokenizer, then the standard and lowercase filters,
# then a custom double-metaphone phonetic filter.
phonetic_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "phonetic": {
                    "tokenizer": "standard",
                    "filter": ["standard", "lowercase", "my_doublemetaphone"],
                },
            },
            "filter": {
                "my_doublemetaphone": {
                    "type": "phonetic",
                    "encoder": "doublemetaphone",
                    "replace": True,
                },
            },
        },
    },
}
reindex(phonetic_settings)

<Response [200]>

In [51]:
# Two differently spelled phrases, to compare their phonetic encodings.
for phrase in ("message from Dalai Lama", "message from tall llama"):
    tokenize(phrase, 'phonetic')

[MSJ][MSK][FRM][TL][LM]
[MSJ][MSK][FRM][TL][LM]


# 4.3.1 Scoring strength of a feature in a single field

In [56]:
# Index three sample documents under ids 1-3 and print each response.
sample_titles = [
    "apple apple apple apple apple",
    "apple apple apple banana banana",
    "apple banana blueberry coconut",
]
for doc_id, title in enumerate(sample_titles, start=1):
    doc_url = ELASTICSEARCH_URL + '/my_library/example/' + str(doc_id)
    print(requests.put(doc_url, json={"title": title}))

<Response [200]>
<Response [200]>
<Response [200]>


In [78]:
def simpler_explain(explain_json, depth=0):
    """Render an explanation tree as indented "value, description" lines.

    Args:
        explain_json: Mapping with 'value' and 'description' entries and,
            optionally, a 'details' list of nested mappings of the same
            shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents before
        their children. Newlines inside a description are removed.
    """
    value = explain_json['value']
    description = explain_json['description'].replace('\n', '')
    pieces = [" " * (2 * depth) + "%s, %s\n" % (value, description)]
    if 'details' in explain_json:
        pieces.extend(
            simpler_explain(child, depth=depth + 1)
            for child in explain_json['details']
        )
    return "".join(pieces)

# Search the title field for "apple" and print a condensed score
# explanation for every hit.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple banana blueberry coconut
0.2876821, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, 

In [70]:
# Index a fourth document containing both "apples" and "apple".
doc_url = ELASTICSEARCH_URL + '/my_library/example/4'
doc_body = {"title": "apples apple"}
print(requests.put(doc_url, json=doc_body))

<Response [200]>


In [80]:
# Repeat the "apple" title search and print the condensed score
# explanation of each hit.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple banana blueberry coconut
0.2876821, weight(title:apple in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, 

In [94]:
# Recreate the index with the built-in "english" analyzer on both the
# title and overview text fields of the "example" type.
english_mappings = {
    'mappings': {
        'example': {
            'properties': {
                'title': {'type': 'text', 'analyzer': 'english'},
                'overview': {'type': 'text', 'analyzer': 'english'},
            },
        },
    },
}
reindex(english_mappings)

<Response [200]>

In [95]:
# Index the four sample documents under ids 1-4 and print each response.
sample_titles = [
    "apple apple apple apple apple",
    "apple apple apple banana banana",
    "apple banana blueberry coconut",
    "apples apple",
]
for doc_id, title in enumerate(sample_titles, start=1):
    doc_url = ELASTICSEARCH_URL + '/my_library/example/' + str(doc_id)
    print(requests.put(doc_url, json={"title": title}))

<Response [201]>
<Response [201]>
<Response [201]>
<Response [201]>


In [96]:
# Explicitly refresh the index so previously indexed documents are visible
# to the next search. This replaces a fixed five-second sleep that only
# waited for the periodic background refresh to happen.
# raise_for_status() surfaces a failed refresh and returns None, so the
# cell still produces no output on success.
requests.post(ELASTICSEARCH_URL + '/my_library/_refresh').raise_for_status()

In [98]:
# Run the "apple" title search once more and print the condensed score
# explanation of each hit.
query = {
    "explain": "true",
    "query": {"match": {"title": "apple"}},
}
response = requests.get(ELASTICSEARCH_URL + '/my_library/example/_search', json=query)
data = response.json()
for hit in data['hits']['hits']:
    print("Explain for %s" % hit['_source']['title'])
    print(simpler_explain(hit['_explanation']))

Explain for apple apple apple apple apple
0.51040375, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.51040375, score(doc=0,freq=5.0 = termFreq=5.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.7741936, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      5.0, termFreq=5.0
      1.2, parameter k1
      0.75, parameter b
      5.0, avgFieldLength
      5.0, fieldLength

Explain for apple banana blueberry coconut
0.2876821, weight(title:appl in 0) [PerFieldSimilarity], result of:
  0.2876821, score(doc=0,freq=1.0 = termFreq=1.0), product of:
    0.2876821, idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:
      1.0, docFreq
      1.0, docCount
    1.0, tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:
      1.0, termFreq=1.0
      1.2, pa

# Notes

Takeaways:

1. ?

Experiments:

- **[1]**: 