# Index Receipts


In [1]:
from ir import IndexDataframe, search_loop, make_dictionary, make_raw_to_web, \
SimpleQueryMaker, WildQueryMaker, MashedWildQueryMaker, FuzzyMashedWildQueryMaker, \
NgramWildQueryMaker, XorNgramWildQueryMaker, BoostedNgramWildQueryMaker, FuzzyNgramWildQueryMaker, \
Config, SimpleSearcher, evaluate
import pandas as pd
from datetime import datetime
from org.apache.lucene.analysis.standard import StandardAnalyzer

In [2]:
INDEX_DIR = "indexes/receipts1"

In [3]:
path = '../data/raw_web_joined/703_00198_2020-03-20_3_1391204_joined.json'
df = pd.read_json(path)

In [4]:
def index_test(quiet=False):
    """Run ``IndexDataframe`` over the notebook-level ``df`` and print the elapsed time.

    The index is written to ``INDEX_DIR`` using a ``StandardAnalyzer``.

    Args:
        quiet: Forwarded to ``IndexDataframe`` as its fourth argument
            (presumably suppresses progress output -- confirm against
            ``ir.IndexDataframe``).

    Raises:
        Exception: Any error raised while indexing; it is reported with a
            ``Failed: ...`` line and then propagated to the caller.
    """
    start = datetime.now()
    try:
        IndexDataframe(df, INDEX_DIR, StandardAnalyzer(), quiet)
    except Exception as e:
        print("Failed: %s" % e)
        # Bare ``raise`` propagates the active exception as-is rather than
        # re-raising it by name from this frame.
        raise
    else:
        # Only report timing when indexing actually succeeded.
        print('Elapsed: %s' % (datetime.now() - start))

In [5]:
index_test(True)

.done
Elapsed: 0:00:00.215915


In [6]:
#search_loop(INDEX_DIR, 'web', explain=True)

## Equivalency Evaluation
```
build a table of raw -> [web]
where [web] is all the possible values of matches for raw
example: 'FFST CAT FOOD' -> ['Fancy Feast Flaked Fish Cat Food', 'Purina Fancy Feast Chicken Cat Food']

search on query (e.g. FFST CAT FOOD) if top result is any of the ones associated with query, it counts as a hit
```

## Wildcard Technique
```
when building a query evaluate against dictionary of seen web terms
unseen tokens get wildcard treatment
wildcard treatment means insert * between each letter
e.g., FFST CAT FOOD -> F*F*S*T CAT FOOD
since cat and food exist in dictionary
```

In [7]:
WORDS = make_dictionary(df)

In [8]:
RAW_TO_WEB = make_raw_to_web(df)

## Scoring
### Interface
```
def is_hit(raw, config): -> bool

config:
  Index
  Searcher
  QueryMaker
```

Algorithm:
We look up the webs for the raw to generate the hit candidates
we process raw into a query using QueryMaker
We run the query using Searcher
Index may not be necessary
If the top result is in webs, we return true, otherwise false

In [9]:
qm = SimpleQueryMaker()

In [10]:
ss = SimpleSearcher(INDEX_DIR)

In [11]:
simple_config = Config(qm, ss)

In [12]:
queries = ['AVOCADO', 'FFST CAT FOOD']

In [13]:
evaluate(queries, simple_config, RAW_TO_WEB)

(0.5, ['AVOCADO'])

In [14]:
queries = df.raw.unique()

In [15]:
simple_score, simple_misses = evaluate(queries, simple_config, RAW_TO_WEB)

In [16]:
len(queries)

69

In [17]:
wqm = WildQueryMaker(WORDS)

In [18]:
wild_config = Config(wqm, ss)

In [19]:
wild_score, wild_misses = evaluate(queries, wild_config, RAW_TO_WEB)

In [20]:
simple_score

0.6231884057971014

In [21]:
wild_score

0.8405797101449275

In [22]:
simple_misses

['KRO WATER',
 'CA REDEM VAL',
 'ARTICHOKES',
 'BYND SSG HT ITLN',
 'BRHD CHEESE',
 'CUCUMBERS',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO SOAP',
 'KRO CCNT MK',
 'MSHRM BYBL WHL',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'ASP ORG',
 'PPRS BL GRN ORGN',
 'RADISH ORG',
 'SQSH YLW ORG',
 'TOMATO ORGNC',
 'PRSL MPL TKY GNG',
 'PRSL MUENSTR',
 'STO CRT BABY ORGNC',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC',
 'STN CHCK RST',
 'SFTSOAP KTCHN FRSH']

### wild_misses
`wild_misses` is the set of queries that weren't matched by using wild queries on terms.  
Examples
* `CA REDEM VAL` - difficult to match since little lexical overlap
* `ARTICHOKES`, `CUCUMBERS` - plural mismatch
* `BRHD *`, `PRSL *`, `STO *` - Wildcard won't match terms because BRHD, PRSL, and STO each span multiple terms.  However, a wildcard match against the entire web string might work ok
* `KRO SOAP` - Matched `Kroger® Pear & Coconut Hand Soap` which had associated raw field of `KRO PEAR COCONUT`.  The actual hit was at rank 2.  Measuring precision@2 would have caught it.
* `LES PET CHEESE BAR` - Matched `Les Petites Havarati Cheese Wedge`.  The second hit was correct, with web value of `Les Petites Kosher Colby Jack Cheese`.  Note that neither web had disambiguating `BAR` term present.
* `BROWN ONIONS` - Matched `Onions - Green`.  Correct hit was ranked second: `Onions - Yellow`.  Note that there was no Yellow Onions in the web texts.

In [23]:
wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'BRHD CHEESE',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC']

### wild_eliminees
`wild_eliminees` is the set of simple_misses that were eliminated by using wildcard queries on terms
Example: `ASP ORG` is eliminated by wildcard queries, probably due to match of `Asparagus` and `Organic`

In [24]:
wild_eliminees = set(simple_misses) - set(wild_misses)
wild_eliminees

{'ASP ORG',
 'BYND SSG HT ITLN',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO CCNT MK',
 'KRO WATER',
 'MSHRM BYBL WHL',
 'PPRS BL GRN ORGN',
 'PRSL MPL TKY GNG',
 'RADISH ORG',
 'SFTSOAP KTCHN FRSH',
 'SQSH YLW ORG',
 'STN CHCK RST',
 'STO CRT BABY ORGNC',
 'TOMATO ORGNC'}

In [25]:
# Regression check: queries missed by the wildcard run but not by the simple run.
# An empty set means wildcards did not turn any simple-query hit into a miss,
# i.e. using wildcards did not hurt accuracy on this dataset.
set(wild_misses) - set(simple_misses)

set()

## TODO
* Try not analyzing the entire web string and doing wildcard matches against only the unanalyzed string (don't do wildcard matches against terms)
* Try wildcard matching against terms and entire unanalyzed web string

### Fuzzy match
Notice that wildcard can over match, resulting in false positives
For example `m*u*e*n*s*t*r*` matches `MoisturePartSkimOriginalMozzarellaStringCheese`
See the next example
Combining the wildcard match with a fuzzy match makes Muenster surface to the top (see following examples)

In [26]:
ss.search('mashed_web:m*u*e*n*s*t*r*')[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [27]:
ss.explain('mashed_web:m*u*e*n*s*t*r*')

mashed_web:m*u*e*n*s*t*r*
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 1.0
1.0 = mashed_web:m*u*e*n*s*t*r*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 1.0
1.0 = mashed_web:m*u*e*n*s*t*r*

------------


In [28]:
ss.search('mashed_web:m*u*e*n*s*t*r*  meunstr~')[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:PRSL MUENSTR> stored,indexed,tokenized<web:Private Selection™ Grab & Go Muenster Cheese> stored,indexed,tokenized<mashed_web:privateselectiongrabgomuenstercheese selectiongrabgomuenstercheese grabgomuenstercheese gomuenstercheese gomuenstercheese muenstercheese cheese> stored,indexed,tokenized<unigrams:private selection grab  go muenster cheese> stored,indexed,tokenized<bigrams:private_selection selection_grab go_muenster muenster_cheese> stored,indexed,tokenized<trigrams:private_selection_grab go_muenster_cheese> stored,indexed,tokenized<ngrams:private selection grab  go muenster cheese private_selection selection_grab go_muenster muenster_cheese private_selection_grab go_muenster_cheese> stored<id:20615390000>>>

In [29]:
ss.explain('mashed_web:m*u*e*n*s*t*r*  meunstr~')

mashed_web:m*u*e*n*s*t*r*  meunstr~
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.2037241458892822
2.2037241 = sum of:
  1.0 = mashed_web:m*u*e*n*s*t*r*
  1.2037241 = weight(web:muenster in 59) [BM25Similarity], result of:
    1.2037241 = score(freq=1.0), product of:
      0.71428573 = boost
      4.037186 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        1 = n, number of documents containing term
        84 = N, total number of documents with field
      0.41742286 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        7.0 = dl, length of field
        5.75 = avgdl, average length of field

------------
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 1.0
1.0 = sum of:
  1.0 = mashed_web:m*u*e*n*s*t*r*

------------


In [30]:
ss.search('mashed_web:p*r*s*l* prsl~ mashed_web:m*u*e*n*s*t*r*  meunstr~')[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:PRSL MUENSTR> stored,indexed,tokenized<web:Private Selection™ Grab & Go Muenster Cheese> stored,indexed,tokenized<mashed_web:privateselectiongrabgomuenstercheese selectiongrabgomuenstercheese grabgomuenstercheese gomuenstercheese gomuenstercheese muenstercheese cheese> stored,indexed,tokenized<unigrams:private selection grab  go muenster cheese> stored,indexed,tokenized<bigrams:private_selection selection_grab go_muenster muenster_cheese> stored,indexed,tokenized<trigrams:private_selection_grab go_muenster_cheese> stored,indexed,tokenized<ngrams:private selection grab  go muenster cheese private_selection selection_grab go_muenster muenster_cheese private_selection_grab go_muenster_cheese> stored<id:20615390000>>>

In [31]:
ss.explain('mashed_web:p*r*s*l* prsl~ mashed_web:m*u*e*n*s*t*r*  meunstr~')

mashed_web:p*r*s*l* prsl~ mashed_web:m*u*e*n*s*t*r*  meunstr~
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 3.2037241458892822
3.2037241 = sum of:
  1.0 = mashed_web:p*r*s*l*
  1.0 = mashed_web:m*u*e*n*s*t*r*
  1.2037241 = weight(web:muenster in 59) [BM25Similarity], result of:
    1.2037241 = score(freq=1.0), product of:
      0.71428573 = boost
      4.037186 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        1 = n, number of documents containing term
        84 = N, total number of documents with field
      0.41742286 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        7.0 = dl, length of field
        5.75 = avgdl, average length of field

------------
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 2.0
2.0 = sum of:
  1.0 = mashed_

## Mashed and Fuzzy
* mashed wildcard (matching mashed_web field with wildcard query)
* fuzzy mashed wildcard (mix in fuzzy terms to mashed wildcard)

In [32]:
mwqm = MashedWildQueryMaker(WORDS)

In [33]:
mashed_wild_config = Config(mwqm, ss)

In [34]:
mashed_wild_score, mashed_wild_misses = evaluate(queries, mashed_wild_config, RAW_TO_WEB)

In [35]:
mashed_wild_score

0.8840579710144928

In [36]:
mashed_wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CARROTS ORGNC']

### mashed_wild_eliminees
`mashed_wild_eliminees` is the set of `wild_misses` that were eliminated by using mashed wildcard queries on terms
Example: `BRHD CHEESE` is eliminated by mashed wildcard queries, probably because `BRHD` matches `Boar's Head` once the web words are mashed together

In [37]:
mashed_wild_eliminees = set(wild_misses) - set(mashed_wild_misses)
mashed_wild_eliminees

{'BRHD CHEESE', 'STO BROTH', 'STO CCNT MILK'}

In [38]:
regressions = set(mashed_wild_misses) - set(wild_misses)
regressions

set()

In [39]:
fmwqm = FuzzyMashedWildQueryMaker(WORDS)

In [40]:
fuzzy_mashed_wild_config = Config(fmwqm, ss)

In [41]:
fuzzy_mashed_wild_score, fuzzy_mashed_wild_misses = evaluate(queries, fuzzy_mashed_wild_config, RAW_TO_WEB)

In [42]:
fuzzy_mashed_wild_score

0.8985507246376812

In [43]:
fuzzy_mashed_wild_misses

['CA REDEM VAL',
 'FRGO STR CHS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'OCEANS HALO BROTH',
 'BROWN ONIONS',
 'STO CARROTS ORGNC']

### fuzzy_mashed_wild_eliminees
By introducing fuzzy matching, we get plurals (`ARTICHOKES`, `CUCUMBERS`) in addition to mild misspellings (`MUENSTR`).

In [44]:
fuzzy_mashed_wild_eliminees = set(mashed_wild_misses) - set(fuzzy_mashed_wild_misses)
fuzzy_mashed_wild_eliminees

{'ARTICHOKES', 'CUCUMBERS', 'PRSL MUENSTR'}

In [45]:
fuzzy_mashed_regressions = set(fuzzy_mashed_wild_misses) - set(mashed_wild_misses)
fuzzy_mashed_regressions

{'FRGO STR CHS', 'OCEANS HALO BROTH'}

In [46]:
ss.search(fmwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:BEEF STR FRY> stored,indexed,tokenized<web:Beef Choice For Stir Fry> stored,indexed,tokenized<mashed_web:beefchoiceforstirfry choiceforstirfry forstirfry stirfry fry> stored,indexed,tokenized<unigrams:beef choice for stir fry> stored,indexed,tokenized<bigrams:> stored,indexed,tokenized<trigrams:> stored,indexed,tokenized<ngrams:beef choice for stir fry  > stored<id:20254600000>>>

FRGO STR CHS matches Beef Choice For Stir Fry
* frgo~ matches fry  i think???
* frgo~ matches for i think???
* str~ matches stir
* c*h*s* matches choice for stir.  This would be eliminated if we only consider using bigram/trigram terms (choice for stir hopefully isn't a trigram)

This indicates that we should eliminate web words that are shorter than raw words: both `for` and `fry` are shorter than `frgo`.

The False Negative is the second hit
* Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese
* frgo~ matches Frigo
* chs~ does not match Cheese... why?
* str~ does not match String... why?

I think maybe bonus points should be awarded for prefix matches
Examples:
KRO => Kroger
STR => String

This could be accomplished by adding prefix queries 
e.g., in addition to S*T*R*, have STR*


In [47]:
ss.explain(fmwqm.make_query('FRGO STR CHS'))

mashed_web:f*r*g*o* frgo~ mashed_web:s*t*r* str~ mashed_web:c*h*s* chs~
Beef Choice For Stir Fry | BEEF STR FRY | 3.921565532684326
3.9215655 = sum of:
  0.5105596 = weight(web:fry in 10) [BM25Similarity], result of:
    0.5105596 = score(freq=1.0), product of:
      0.3333333 = boost
      3.1898882 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        3 = n, number of documents containing term
        84 = N, total number of documents with field
      0.48016697 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        5.0 = dl, length of field
        5.75 = avgdl, average length of field
  1.0 = mashed_web:s*t*r*
  0.47033533 = weight(web:for in 10) [BM25Similarity], result of:
    0.47033533 = score(freq=1.0), product of:
      0.3333333 = boost
      2.9385738 = idf, computed as log(1 + (N - n + 0.5)

Note above matches a stop word `(web:for)`
Maybe eliminating stop words could help

In [48]:
ss.search(mwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [49]:
fmwqm.make_query('FRGO STR CHS')

'mashed_web:f*r*g*o* frgo~ mashed_web:s*t*r* str~ mashed_web:c*h*s* chs~'

OCEANS HALO BROTH matches Pero Organic Green Beans
* o*c*e*a*n*s* matches OrganiC green bEANS - this would have been eliminated using ngrams

In [50]:
ss.explain(fmwqm.make_query('OCEANS HALO BROTH'))

mashed_web:o*c*e*a*n*s* oceans~ mashed_web:h*a*l*o* halo~ broth
Pero Organic Green Beans | GREEN BEANS ORGNC | 2.257633686065674
2.2576337 = sum of:
  1.0 = mashed_web:o*c*e*a*n*s*
  1.2576336 = weight(web:beans in 56) [BM25Similarity], result of:
    1.2576336 = score(freq=1.0), product of:
      0.6 = boost
      4.037186 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        1 = n, number of documents containing term
        84 = N, total number of documents with field
      0.51918733 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        4.0 = dl, length of field
        5.75 = avgdl, average length of field

------------
NO CHICKEN BROTH TETRA | OCEANS HALO BROTH | 1.8308416604995728
1.8308417 = sum of:
  1.8308417 = weight(web:broth in 40) [BM25Similarity], result of:
    1.8308417 = score(freq=1.

## ngrams
Instead of matching wildcards on mashed_web field, match wildcards on web, bigrams, and trigrams

In [51]:
nwqm = NgramWildQueryMaker(WORDS)

In [52]:
nwqm.make_query('FRGO STR CHS')

'web:f*r*g*o* bigrams:f*r*g*o* trigrams:f*r*g*o* web:s*t*r* bigrams:s*t*r* trigrams:s*t*r* web:c*h*s* bigrams:c*h*s* trigrams:c*h*s*'

In [53]:
ss.search(nwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [54]:
ss.explain(mwqm.make_query('FRGO STR CHS'))

mashed_web:f*r*g*o* mashed_web:s*t*r* mashed_web:c*h*s*
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 3.0
3.0 = sum of:
  1.0 = mashed_web:f*r*g*o*
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*s*

------------
Purina Fancy Feast Sliced Chicken Hearts & Liver Feast in Gravy Wet Cat Food Can | FFST CAT FOOD | 3.0
3.0 = sum of:
  1.0 = mashed_web:f*r*g*o*
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*s*

------------
Beef Choice For Stir Fry | BEEF STR FRY | 2.0
2.0 = sum of:
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*s*

------------
Galbani String Cheese | GLBNI STR CHS | 2.0
2.0 = sum of:
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*s*

------------
Kroger® Honey Citrus & Shea Butter Hand Soap Bottle | KRO SOAP | 2.0
2.0 = sum of:
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = mashed_web:s*t*r*
  1.0 = mashed_web:c*h*

In [55]:
ss.explain(nwqm.make_query('FRGO STR CHS'))

web:f*r*g*o* bigrams:f*r*g*o* trigrams:f*r*g*o* web:s*t*r* bigrams:s*t*r* trigrams:s*t*r* web:c*h*s* bigrams:c*h*s* trigrams:c*h*s*
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 4.0
4.0 = sum of:
  1.0 = web:f*r*g*o*
  1.0 = web:s*t*r*
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Galbani String Cheese | GLBNI STR CHS | 3.0
3.0 = sum of:
  1.0 = web:s*t*r*
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food | FFST CAT FOOD | 2.0
2.0 = sum of:
  1.0 = bigrams:c*h*s*
  1.0 = trigrams:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Purina Fancy Feast Classic Pate Chicken Feast Wet Cat Food Can | FFST CAT FOOD | 2.0
2.0 = sum of:
  1.0 = bigrams:c*h*s*
  1.0 = trigrams:c*h*s*

------------
Simple Truth Organic™ Baby Carrots | STO CRT BABY ORGNC | 2.0
2.0 = 

In [56]:
ngram_wild_config = Config(nwqm, ss)

In [57]:
ngram_wild_score, ngram_wild_misses = evaluate(queries, ngram_wild_config, RAW_TO_WEB)

In [58]:
ngram_wild_score

0.782608695652174

In [59]:
ngram_wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'ATKINS BARS',
 'CUCUMBERS',
 'KRO SOAP',
 'KRO CCNT MK',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'ASP ORG',
 'CABBAGE ORG',
 'RADISH ORG',
 'TOMATO ORGNC',
 'STO CARROTS ORGNC',
 'STN CHCK RST',
 'MBA CHK GROUND']

In [60]:
set(ngram_wild_misses) - set(mashed_wild_misses)

{'ASP ORG',
 'ATKINS BARS',
 'CABBAGE ORG',
 'KRO CCNT MK',
 'MBA CHK GROUND',
 'RADISH ORG',
 'STN CHCK RST',
 'TOMATO ORGNC'}

k*r*o* matches bigrams and trigrams for Kroger Spring Water getting a score of 3
k*r*o* only matches unigrams for Kroger Premium Coconut Milk even though the score is still 3
This indicates that unigrams should be boosted over bigrams, bigrams boosted over trigrams
Alternative? More complex boolean query, like web:k*r*o* or (bigram:k*r*o* and not web:k*r*o*)

In [61]:
ss.explain(nwqm.make_query('KRO CCNT MK'))

web:k*r*o* bigrams:k*r*o* trigrams:k*r*o* web:c*c*n*t* bigrams:c*c*n*t* trigrams:c*c*n*t* web:m*k* bigrams:m*k* trigrams:m*k*
(LMTD QTY) Kroger® Spring Water | KRO WATER | 3.0
3.0 = sum of:
  1.0 = web:k*r*o*
  1.0 = bigrams:k*r*o*
  1.0 = trigrams:k*r*o*

------------
Kroger® Premium Coconut Milk | KRO CCNT MK | 3.0
3.0 = sum of:
  1.0 = web:k*r*o*
  1.0 = web:c*c*n*t*
  1.0 = web:m*k*

------------
Kroger® Romaine Leaf Single Cut Leaf | KRO ROMAINE | 3.0
3.0 = sum of:
  1.0 = web:k*r*o*
  1.0 = bigrams:k*r*o*
  1.0 = trigrams:k*r*o*

------------
Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food | FFST CAT FOOD | 2.0
2.0 = sum of:
  1.0 = bigrams:c*c*n*t*
  1.0 = trigrams:c*c*n*t*

------------
Kroger® Pear & Coconut Hand Soap | KRO PEAR COCONUT | 2.0
2.0 = sum of:
  1.0 = web:k*r*o*
  1.0 = web:c*c*n*t*

------------
Kroger® Romaine Lettuce Hearts | KRO ROMAINE | 2.0
2.0 = sum of:
  1.0 = web:k*r*o*
  1.0 = bigrams:k*r*o*

------------
Purina Fancy Feast Classic Pat

In [62]:
ss.explain(nwqm.make_query('ASP ORG'))

web:a*s*p* bigrams:a*s*p* trigrams:a*s*p* web:o*r*g* bigrams:o*r*g* trigrams:o*r*g*
Organic Italian Parsley | PARSLEY ORG | 3.0
3.0 = sum of:
  1.0 = web:o*r*g*
  1.0 = bigrams:o*r*g*
  1.0 = trigrams:o*r*g*

------------
Simple Truth Organic™ Fat Free Beef Stock | STO STOCK | 3.0
3.0 = sum of:
  1.0 = web:o*r*g*
  1.0 = bigrams:o*r*g*
  1.0 = trigrams:o*r*g*

------------
Simple Truth Organic™ Fat Free Free Range Chicken Broth | STO BROTH | 3.0
3.0 = sum of:
  1.0 = web:o*r*g*
  1.0 = bigrams:o*r*g*
  1.0 = trigrams:o*r*g*

------------
Organic - Asparagus | ASP ORG | 2.0
2.0 = sum of:
  1.0 = web:a*s*p*
  1.0 = web:o*r*g*

------------
Simple Truth Organic™ Baby Carrots | STO CRT BABY ORGNC | 2.0
2.0 = sum of:
  1.0 = web:o*r*g*
  1.0 = bigrams:o*r*g*

------------
Simple Truth Organic™ Baby Spring Mix | STO BABY SPRING MX | 2.0
2.0 = sum of:
  1.0 = web:o*r*g*
  1.0 = bigrams:o*r*g*

------------
Smart Chicken Organic Ground Chicken 95% Lean | MBA CHK GROUND | 2.0
2.0 = sum of:
  1.

## XOR Ngram Wild Query Maker

In [63]:
xnwqm = XorNgramWildQueryMaker(WORDS)

In [64]:
xnwqm.make_query('FRGO STR CHS')

'(web:f*r*g*o* bigrams:f*r*g*o* trigrams:f*r*g*o*) OR (web:s*t*r* bigrams:s*t*r* trigrams:s*t*r*) OR (web:c*h*s* bigrams:c*h*s* trigrams:c*h*s*)'

In [65]:
# Top hit for the XOR n-gram wildcard query, using the `xnwqm` maker defined in this section.
ss.search(xnwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [66]:
ss.explain(xnwqm.make_query('FRGO STR CHS'))

(web:f*r*g*o* bigrams:f*r*g*o* trigrams:f*r*g*o*) OR (web:s*t*r* bigrams:s*t*r* trigrams:s*t*r*) OR (web:c*h*s* bigrams:c*h*s* trigrams:c*h*s*)
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 4.0
4.0 = sum of:
  1.0 = web:f*r*g*o*
  1.0 = web:s*t*r*
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Galbani String Cheese | GLBNI STR CHS | 3.0
3.0 = sum of:
  1.0 = web:s*t*r*
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food | FFST CAT FOOD | 2.0
2.0 = sum of:
  1.0 = bigrams:c*h*s*
  1.0 = trigrams:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = bigrams:s*t*r*
  1.0 = web:c*h*s*

------------
Purina Fancy Feast Classic Pate Chicken Feast Wet Cat Food Can | FFST CAT FOOD | 2.0
2.0 = sum of:
  1.0 = bigrams:c*h*s*
  1.0 = trigrams:c*h*s*

------------
Simple Truth Organic™ Baby Carrots | STO CRT BABY ORGNC 

## Boosted ngrams

In [67]:
bnwqm = BoostedNgramWildQueryMaker(WORDS)

In [68]:
bnwqm.make_query('FRGO STR CHS')

'ngrams:f*r*g*o* OR ngrams:s*t*r* OR ngrams:c*h*s*'

In [69]:
ss.search(bnwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [70]:
ss.search(bnwqm.make_query('GLBNI STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:GLBNI STR CHS> stored,indexed,tokenized<web:Galbani String Cheese> stored,indexed,tokenized<mashed_web:galbanistringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:galbani string cheese> stored,indexed,tokenized<bigrams:galbani_string string_cheese> stored,indexed,tokenized<trigrams:galbani_string_cheese> stored,indexed,tokenized<ngrams:galbani string cheese galbani_string string_cheese galbani_string_cheese> stored<id:7403006610>>>

In [71]:
ss.explain(bnwqm.make_query('FRGO STR CHS'))

ngrams:f*r*g*o* OR ngrams:s*t*r* OR ngrams:c*h*s*
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 3.0
3.0 = sum of:
  1.0 = ngrams:f*r*g*o*
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Galbani String Cheese | GLBNI STR CHS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Simple Truth™ Sea Salt Roasted Cashews | ST CASHEWS RS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
(LMTD QTY) Kroger® Spring Water | KRO WATER | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
(LMTD QTY) Robitussin Max Strength Blue Raspberry Nighttime Cough DM Liquid | ROBITUSSIN COUGH | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Beef Choice For Stir Fry | BEEF STR FRY | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Boar's Head Monterey Ja

In [72]:
ss.explain(bnwqm.make_query('GLBNI STR CHS'))

ngrams:g*l*b*n*i* OR ngrams:s*t*r* OR ngrams:c*h*s*
Galbani String Cheese | GLBNI STR CHS | 3.0
3.0 = sum of:
  1.0 = ngrams:g*l*b*n*i*
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Simple Truth™ Sea Salt Roasted Cashews | ST CASHEWS RS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
(LMTD QTY) Kroger® Spring Water | KRO WATER | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
(LMTD QTY) Robitussin Max Strength Blue Raspberry Nighttime Cough DM Liquid | ROBITUSSIN COUGH | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Beef Choice For Stir Fry | BEEF STR FRY | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Boar's Head Montere

In [73]:
ss.search(bnwqm.make_query('FRGO STR CHS'))[0]

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:FRGO STR CHS> stored,indexed,tokenized<web:Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese> stored,indexed,tokenized<mashed_web:frigocheeseheadslowmoisturepartskimoriginalmozzarellastringcheese cheeseheadslowmoisturepartskimoriginalmozzarellastringcheese headslowmoisturepartskimoriginalmozzarellastringcheese lowmoisturepartskimoriginalmozzarellastringcheese moisturepartskimoriginalmozzarellastringcheese partskimoriginalmozzarellastringcheese skimoriginalmozzarellastringcheese originalmozzarellastringcheese mozzarellastringcheese stringcheese cheese> stored,indexed,tokenized<unigrams:frigo cheese heads low moisture part skim original mozzarella string cheese> stored,indexed,tokenized<bigrams:mozzarella_string string_cheese> stored,indexed,tokenized<trigrams:mozzarella_string_cheese> stored,indexed,tokenized<ngrams:frigo cheese heads low moisture part skim original mozzarella string c

In [74]:
bngram_wild_config = Config(bnwqm, ss)

In [75]:
bngram_wild_score, bngram_wild_misses = evaluate(queries, bngram_wild_config, RAW_TO_WEB)

In [76]:
bngram_wild_score

0.8985507246376812

In [77]:
bngram_wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'STO CARROTS ORGNC']

In [78]:
# Misses from the mashed-wildcard run (computed in an earlier cell), shown again
# for side-by-side comparison with bngram_wild_misses.
mashed_wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CARROTS ORGNC']

In [79]:
# 'PRSL MUENSTR' is in mashed_wild_misses but not in bngram_wild_misses; print the
# query built by mwqm and the per-document score breakdown to see why it misses.
ss.explain(mwqm.make_query('PRSL MUENSTR'))

mashed_web:p*r*s*l* mashed_web:m*u*e*n*s*t*r*
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 2.0
2.0 = sum of:
  1.0 = mashed_web:p*r*s*l*
  1.0 = mashed_web:m*u*e*n*s*t*r*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = mashed_web:p*r*s*l*
  1.0 = mashed_web:m*u*e*n*s*t*r*

------------
Earthwise Palm Trees Reusable Shopping Bag | PALM SHOPPING BAGS | 1.0
1.0 = sum of:
  1.0 = mashed_web:p*r*s*l*

------------
Organic - Parsley - Curly | PARSLEY ORG | 1.0
1.0 = sum of:
  1.0 = mashed_web:p*r*s*l*

------------
Organic Italian Parsley | PARSLEY ORG | 1.0
1.0 = sum of:
  1.0 = mashed_web:p*r*s*l*

------------
Private Selection™ Campari Tomatoes | TOMATOES CAMPARI | 1.0
1.0 = sum of:
  1.0 = mashed_web:p*r*s*l*

------------
Private Selection™ Grab & Go Maple Turkey Breast | PRSL MPL TKY GNG | 1.0
1.0 = sum of:
  1.0 = mashed_web:p*r*s*l*

------------
Purina Fancy Feast Classic Pate Chic

In [80]:
# Same receipt text, this time through bnwqm (queries the `ngrams` field rather
# than `mashed_web`), to compare the ranking with the mashed-wild explanation.
ss.explain(bnwqm.make_query('PRSL MUENSTR'))

ngrams:p*r*s*l* OR ngrams:m*u*e*n*s*t*r*
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = ngrams:p*r*s*l*
  1.0 = ngrams:m*u*e*n*s*t*r*

------------
Organic - Parsley - Curly | PARSLEY ORG | 1.0
1.0 = sum of:
  1.0 = ngrams:p*r*s*l*

------------
Organic Italian Parsley | PARSLEY ORG | 1.0
1.0 = sum of:
  1.0 = ngrams:p*r*s*l*

------------
Private Selection™ Campari Tomatoes | TOMATOES CAMPARI | 1.0
1.0 = sum of:
  1.0 = ngrams:p*r*s*l*

------------
Private Selection™ Grab & Go Maple Turkey Breast | PRSL MPL TKY GNG | 1.0
1.0 = sum of:
  1.0 = ngrams:p*r*s*l*

------------
Purina Fancy Feast Classic Pate Savory Salmon Feast Wet Cat Food Can | FFST CAT FOOD | 1.0
1.0 = sum of:
  1.0 = ngrams:p*r*s*l*

------------


In [81]:
# Misses from the plain wildcard run (computed in an earlier cell), for comparison.
wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'BRHD CHEESE',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC']

In [82]:
# 'KRO SOAP' is still listed in bngram_wild_misses; inspect the score breakdown
# to see which document outranks the expected one and on which term.
ss.explain(bnwqm.make_query('KRO SOAP'))

ngrams:k*r*o* OR soap^1.0
Kroger® Pear & Coconut Hand Soap | KRO PEAR COCONUT | 2.3123726844787598
2.3123727 = sum of:
  1.0 = ngrams:k*r*o*
  1.3123727 = weight(web:soap in 29) [BM25Similarity], result of:
    1.3123727 = score(freq=1.0), product of:
      2.9385738 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        4 = n, number of documents containing term
        84 = N, total number of documents with field
      0.44660193 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        6.0 = dl, length of field
        5.75 = avgdl, average length of field

------------
Kroger® Honey Citrus & Shea Butter Hand Soap Bottle | KRO SOAP | 2.084866762161255
2.0848668 = sum of:
  1.0 = ngrams:k*r*o*
  1.0848668 = weight(web:soap in 27) [BM25Similarity], result of:
    1.0848668 = score(freq=1.0), product of:
  

In [83]:
# 'LES PET CHEESE BAR' is still listed in bngram_wild_misses; inspect its score breakdown.
ss.explain(bnwqm.make_query('LES PET CHEESE BAR'))

les^1.0 OR ngrams:p*e*t* OR cheese^1.0 OR bar^1.0
Les Petites Havarati Cheese Wedge | LES PET CHEESE | 3.9276785850524902
3.9276786 = sum of:
  1.6932418 = weight(web:les in 34) [BM25Similarity], result of:
    1.6932418 = score(freq=1.0), product of:
      3.5263605 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        2 = n, number of documents containing term
        84 = N, total number of documents with field
      0.48016697 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        5.0 = dl, length of field
        5.75 = avgdl, average length of field
  1.0 = ngrams:p*e*t*
  1.2344369 = weight(web:cheese in 34) [BM25Similarity], result of:
    1.2344369 = score(freq=1.0), product of:
      2.5708492 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        6 = n, number of documents contai

In [84]:
# Score of the plain wildcard run (computed in an earlier cell), for comparison.
wild_score

0.8405797101449275

In [85]:
# Score of the mashed-wildcard run (computed in an earlier cell), for comparison.
mashed_wild_score

0.8840579710144928

In [86]:
# Receipt strings missed by the boosted n-gram run but not by the mashed-wild run.
set(bngram_wild_misses).difference(mashed_wild_misses)

set()

In [87]:
# Receipt strings missed by the mashed-wild run but not by the boosted n-gram run.
set(mashed_wild_misses).difference(bngram_wild_misses)

{'PRSL MUENSTR'}

In [88]:
# Print the bnwqm query and per-document score breakdown for 'GLBNI STR CHS',
# a receipt text made up entirely of abbreviated tokens.
ss.explain(bnwqm.make_query('GLBNI STR CHS'))

ngrams:g*l*b*n*i* OR ngrams:s*t*r* OR ngrams:c*h*s*
Galbani String Cheese | GLBNI STR CHS | 3.0
3.0 = sum of:
  1.0 = ngrams:g*l*b*n*i*
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Frigo Cheese Heads Low Moisture Part Skim Original Mozzarella String Cheese | FRGO STR CHS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Private Selection™ Grab & Go Muenster Cheese | PRSL MUENSTR | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
Simple Truth™ Sea Salt Roasted Cashews | ST CASHEWS RS | 2.0
2.0 = sum of:
  1.0 = ngrams:s*t*r*
  1.0 = ngrams:c*h*s*

------------
(LMTD QTY) Kroger® Spring Water | KRO WATER | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
(LMTD QTY) Robitussin Max Strength Blue Raspberry Nighttime Cough DM Liquid | ROBITUSSIN COUGH | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Beef Choice For Stir Fry | BEEF STR FRY | 1.0
1.0 = sum of:
  1.0 = ngrams:s*t*r*

------------
Boar's Head Montere

## Fuzzy ngrams

In [89]:
# Build the fuzzy n-gram wildcard query maker from WORDS, the dictionary of
# terms produced by make_dictionary(df).
fnwqm = FuzzyNgramWildQueryMaker(WORDS)

In [90]:
# Show the query string generated for a single token; the result pairs the
# wildcard `ngrams:` clause with a fuzzy (`~`) clause for the same token.
fnwqm.make_query('ARTICHOKES')

'ngrams:a*r*t*i*c*h*o*k*e*s* OR artichokes~'

In [91]:
# Bundle the fuzzy n-gram query maker with the same ss searcher for evaluation.
fngram_wild_config = Config(fnwqm, ss)

In [92]:
# Evaluate the same `queries` with the fuzzy n-gram config; yields its score and
# the list of raw receipt strings that missed.
fngram_wild_score, fngram_wild_misses = evaluate(queries, fngram_wild_config, RAW_TO_WEB)

In [93]:
# Score returned by evaluate() for the fngram_wild_config run.
fngram_wild_score

0.927536231884058

In [94]:
# Raw receipt strings that evaluate() reported as misses for fngram_wild_config.
fngram_wild_misses

['CA REDEM VAL',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'STO CARROTS ORGNC']

In [95]:
# Boosted n-gram misses shown again for side-by-side comparison with fngram_wild_misses.
bngram_wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'STO CARROTS ORGNC']

In [96]:
# Receipt strings the fuzzy variant fixed: missed by the boosted n-gram run
# but no longer missed by the fuzzy n-gram run.
eliminees = set(bngram_wild_misses).difference(fngram_wild_misses)
eliminees

{'ARTICHOKES', 'CUCUMBERS'}

In [97]:
# Receipt strings the fuzzy variant broke: missed by the fuzzy n-gram run
# although the boosted n-gram run did not miss them.
regressions = set(fngram_wild_misses).difference(bngram_wild_misses)
regressions

set()

In [98]:
# 'STO CARROTS ORGNC' is still listed in fngram_wild_misses; print the fuzzy
# n-gram query and per-document score breakdown to see what outranks it.
ss.explain(fnwqm.make_query('STO CARROTS ORGNC'))

ngrams:s*t*o* OR sto~ OR carrots^1.0 OR ngrams:o*r*g*n*c* OR orgnc~
Simple Truth Organic™ Baby Carrots | STO CRT BABY ORGNC | 3.983488082885742
3.983488 = sum of:
  1.0 = ngrams:s*t*o*
  1.5748794 = weight(web:carrots in 65) [BM25Similarity], result of:
    1.5748794 = score(freq=1.0), product of:
      3.5263605 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        2 = n, number of documents containing term
        84 = N, total number of documents with field
      0.44660193 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        6.0 = dl, length of field
        5.75 = avgdl, average length of field
  1.0 = ngrams:o*r*g*n*c*
  0.40860876 = weight(web:organic in 65) [BM25Similarity], result of:
    0.40860876 = score(freq=1.0), product of:
      0.6 = boost
      1.5248805 = idf, computed as log(1 + (N