# Index Receipts


In [1]:
from ir import IndexDataframe, search_loop, make_dictionary, make_raw_to_web, \
SimpleQueryMaker, WildQueryMaker, Config, SimpleSearcher, evaluate
import pandas as pd
from datetime import datetime
from org.apache.lucene.analysis.standard import StandardAnalyzer

In [2]:
# Index location used throughout: passed to IndexDataframe, search_loop and SimpleSearcher below.
INDEX_DIR = "indexes/receipts1"

In [3]:
# Joined raw/web records (going by the directory name) loaded from one JSON file.
# The index, the dictionary, the raw -> [web] table and the query list below are all derived from this frame.
path = '../data/raw_web_joined/703_00198_2020-03-20_3_1391204_joined.json'
df = pd.read_json(path)

In [4]:
def index_test(quiet=False):
    """Index the notebook-level ``df`` into ``INDEX_DIR`` and print the elapsed time.

    Relies on the globals ``df`` and ``INDEX_DIR`` defined in earlier cells.

    Args:
        quiet: Forwarded unchanged to ``IndexDataframe`` as its fourth argument
            (presumably suppresses progress output -- confirm in ``ir``).

    Raises:
        Exception: Whatever ``IndexDataframe`` raised, re-raised after a
            "Failed: ..." line has been printed.
    """
    start = datetime.now()
    try:
        IndexDataframe(df, INDEX_DIR, StandardAnalyzer(), quiet)
    except Exception as e:
        print("Failed: %s" % e)
        # Bare `raise` re-raises the active exception with its traceback untouched.
        raise
    else:
        # Only report timing when indexing actually succeeded.
        print('Elapsed: %s' % (datetime.now() - start))

In [5]:
# Build the index with quiet=True; on success the only line index_test itself prints is the elapsed time.
index_test(True)

.done
Elapsed: 0:00:00.146639


In [6]:
# Interactive cell: it prompts for queries and, per the prompt it prints, quits on empty input,
# so it blocks waiting for the user when the notebook is run top to bottom.
# NOTE(review): 'web' is presumably the indexed field to search -- confirm against ir.search_loop.
search_loop(INDEX_DIR, 'web')

Hit enter with no input to quit.
Query:


## Equivalency Evaluation
```
build a table of raw -> [web]
where [web] is all the possible values of matches for raw
example: 'FFST CAT FOOD' -> ['Fancy Feast Flaked Fish Cat Food', 'Purina Fancy Feast Chicken Cat Food']

search on the query (e.g. FFST CAT FOOD); if the top result is any of the webs associated with that query, it counts as a hit
```

## Wildcard Technique
```
when building a query evaluate against dictionary of seen web terms
unseen tokens get wildcard treatment
wildcard treatment means insert * between each letter
e.g., FFST CAT FOOD -> F*F*S*T CAT FOOD
since cat and food exist in dictionary
```

In [7]:
# Dictionary built from df and handed to WildQueryMaker below; per the Wildcard Technique notes,
# tokens not found in it get the wildcard treatment.
WORDS = make_dictionary(df)

In [8]:
# The raw -> [web] table described under Equivalency Evaluation; passed to evaluate() to decide hits.
RAW_TO_WEB = make_raw_to_web(df)

## Scoring
### Interface
```
def is_hit(raw, config) -> bool:

config:
  Index
  Searcher
  QueryMaker
```

Algorithm:
We look up the webs for the raw to generate the hit candidates
We process raw into a query using QueryMaker
We run the query using Searcher
Index may not be necessary
If the top result is in webs, we return true, otherwise false

In [9]:
# Baseline query maker; unlike WildQueryMaker below it is constructed without the dictionary.
qm = SimpleQueryMaker()

In [10]:
# Searcher pointed at INDEX_DIR; the same instance is reused by both configs below.
ss = SimpleSearcher(INDEX_DIR)

In [11]:
# Baseline configuration: simple query maker paired with the shared searcher.
simple_config = Config(qm, ss)

In [12]:
# Two hand-picked queries as a smoke test before scoring the full set.
queries = ['AVOCADO', 'FFST CAT FOOD']

In [13]:
# The result is unpacked as (score, misses) in later cells; in the recorded output
# one of the two queries hits and 'AVOCADO' is listed as the miss.
evaluate(queries, simple_config, RAW_TO_WEB)

(0.5, ['AVOCADO'])

In [14]:
# Rebinds `queries` from the two-item smoke test to every distinct value in the raw column.
queries = df.raw.unique()

In [15]:
# Score and list of missed queries for the baseline configuration over the full query set.
simple_score, simple_misses = evaluate(queries, simple_config, RAW_TO_WEB)

In [16]:
len(queries)

69

In [17]:
# Wildcard query maker, given the WORDS dictionary (see the Wildcard Technique notes).
wqm = WildQueryMaker(WORDS)

In [18]:
# Reuses the same searcher as simple_config, so the two configurations differ only in the query maker.
wild_config = Config(wqm, ss)

In [19]:
# Same queries and same raw -> [web] table as the baseline run, scored with the wildcard configuration.
wild_score, wild_misses = evaluate(queries, wild_config, RAW_TO_WEB)

In [20]:
simple_score

0.6231884057971014

In [21]:
wild_score

0.8405797101449275

In [22]:
simple_misses

['KRO WATER',
 'CA REDEM VAL',
 'ARTICHOKES',
 'BYND SSG HT ITLN',
 'BRHD CHEESE',
 'CUCUMBERS',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO SOAP',
 'KRO CCNT MK',
 'MSHRM BYBL WHL',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'ASP ORG',
 'PPRS BL GRN ORGN',
 'RADISH ORG',
 'SQSH YLW ORG',
 'TOMATO ORGNC',
 'PRSL MPL TKY GNG',
 'PRSL MUENSTR',
 'STO CRT BABY ORGNC',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC',
 'STN CHCK RST',
 'SFTSOAP KTCHN FRSH']

In [23]:
wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'BRHD CHEESE',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC']

In [24]:
# Queries the simple configuration missed but the wildcard configuration did not.
set(simple_misses).difference(wild_misses)

{'ASP ORG',
 'BYND SSG HT ITLN',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO CCNT MK',
 'KRO WATER',
 'MSHRM BYBL WHL',
 'PPRS BL GRN ORGN',
 'PRSL MPL TKY GNG',
 'RADISH ORG',
 'SFTSOAP KTCHN FRSH',
 'SQSH YLW ORG',
 'STN CHCK RST',
 'STO CRT BABY ORGNC',
 'TOMATO ORGNC'}

In [25]:
# Queries the wildcard configuration missed but the simple configuration did not.
set(wild_misses).difference(simple_misses)

set()