# Index Receipts


In [1]:
import pandas as pd
import sys, os, lucene, threading, time
from datetime import datetime

from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser

In [2]:
INDEX_DIR = "indexes/receipts1"

In [3]:
path = '../data/raw_web_joined/703_00198_2020-03-20_3_1391204_joined.json'
df = pd.read_json(path)

In [4]:
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f6f31898530>

In [5]:
class Ticker(object):
    """Progress indicator that prints a dot once per second until stopped.

    ``run`` is used as a thread target; setting ``tick`` to ``False`` from
    another thread makes the loop exit on its next check.
    """

    def __init__(self):
        # Loop flag polled by run(); flip it to False to stop ticking.
        self.tick = True

    def run(self):
        """Write '.' to stdout every second for as long as ``tick`` is truthy."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

In [22]:
class IndexFiles(object):
    """Build a Lucene index from a DataFrame of receipt items.

    Loosely based on the Lucene (Java implementation) demo class
    org.apache.lucene.demo.IndexFiles. Instead of walking a directory of
    files, it adds one document per DataFrame row with the fields ``raw``,
    ``web`` and ``id``. The index is written to ``storeDir`` and is opened
    with ``OpenMode.CREATE``.
    """

    def __init__(self, df, storeDir, analyzer):
        """Index every row of ``df`` into ``storeDir``, then commit and close.

        Args:
            df: DataFrame providing the 'raw', 'web' and 'id' columns.
            storeDir: Directory that will hold the index; created if missing.
            analyzer: Lucene analyzer handed to the IndexWriterConfig.
        """
        # makedirs (rather than mkdir) so a nested path such as
        # "indexes/receipts1" works even when the parent does not exist yet.
        os.makedirs(storeDir, exist_ok=True)

        store = SimpleFSDirectory(Paths.get(storeDir))
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = Ticker()
        try:
            self.indexDocs(df, writer)
            print('commit index')
            # Print progress dots from a background thread during the commit.
            threading.Thread(target=ticker.run).start()
            writer.commit()
        finally:
            # Always release the writer and stop the ticker thread, even when
            # indexing or the commit fails; otherwise the thread never exits.
            try:
                writer.close()
            finally:
                ticker.tick = False
        print('done')

    def indexDocs(self, df, writer):
        """Add one document per row of ``df`` to ``writer``.

        Rows whose document cannot be built or added are reported on stdout
        and skipped, so a single bad row does not abort the whole run.

        Args:
            df: DataFrame providing the 'raw', 'web' and 'id' columns.
            writer: Open IndexWriter receiving the documents.
        """
        # t1: stored, not tokenized -- used for 'raw' and 'id'.
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # t2: stored and tokenized with positions -- used for 'web', the
        # field the search code in this notebook queries.
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for _, row in df.iterrows():
            raw, web, item_id = row['raw'], row['web'], row['id']
            print("adding %s" % web)
            try:
                doc = Document()
                doc.add(Field("raw", raw, t1))
                doc.add(Field("web", web, t2))
                doc.add(Field("id", item_id, t1))
                writer.addDocument(doc)
            except Exception as e:
                # Deliberately best-effort: report the row and keep going.
                print("Failed in indexDocs: %s" % e)


In [23]:
def index_test1():
    """Rebuild the receipts index from ``df`` and report the elapsed time.

    Prints the Lucene version first. Any exception raised while indexing is
    reported on stdout and then re-raised to the caller.
    """
    print('lucene version %s' % lucene.VERSION)
    started = datetime.now()
    try:
        IndexFiles(df, INDEX_DIR, StandardAnalyzer())
        elapsed = datetime.now() - started
        print('Elapsed: %s' % elapsed)
    except Exception as e:
        print("Failed: %s" % e)
        raise e

In [24]:
index_test1()

lucene version 8.1.1
adding (LMTD QTY) Essentia Ionized Alkaline Water
adding (LMTD QTY) FIJI Natural Artesian Water
adding (LMTD QTY) Kroger® Spring Water
adding (LMTD QTY) Robitussin Max Strength Blue Raspberry Nighttime Cough DM Liquid
adding $000.10 CRV DEPOSIT
adding $000.10 CRV DEPOSIT
adding ABOUND™ Natural Clumping Cat Litter
adding Artichoke
adding Atkins Endulge Chocolate Coconut Treat Bar
adding Avocado - Small
adding Beef Choice For Stir Fry
adding Beyond Meat Hot Italian Plant-Based Sausage
adding Beyond Meat The Beyond Burger Plant-Based Burger Patties
adding Boar's Head Monterey Jack with Jalapeno Pre-Sliced Cheese
adding Broccoli
adding Cauliflower
adding Celery - Small
adding Cholula Original Hot Sauce
adding Cucumber - English
adding Del Cabo Cucumber Og 16 Oz
adding Earthwise Palm Trees Reusable Shopping Bag
adding Fancy Feast Classic Pate Cod Sole & Shrimp Feast Wet Cat Food
adding Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food
adding Fancy Feast

In [25]:
def search_loop():
    """Interactive prompt that searches the 'web' field of the index.

    Reads queries from stdin until an empty line is entered. For each query
    it prints up to 50 hits as ``id : web | raw``. A query the parser rejects
    is reported and the prompt continues instead of aborting the session.
    """
    directory = SimpleFSDirectory(Paths.get(INDEX_DIR))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        # The parser is reusable, so build it once rather than per query.
        parser = QueryParser("web", StandardAnalyzer())
        print("Hit enter with no input to quit.")
        while True:
            command = input("Query:")
            if command == '':
                return
            print("Searching for: %s" % command)
            try:
                query = parser.parse(command)
            except lucene.JavaError as e:
                # Malformed query syntax (e.g. an unbalanced parenthesis).
                print("Could not parse query: %s" % e)
                continue
            scoreDocs = searcher.search(query, 50).scoreDocs
            # NOTE: this is the number of returned hits, capped at 50.
            print("%s total matching documents." % len(scoreDocs))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                print(doc.get("id"), ':', doc.get("web"), "|", doc.get("raw"))
    finally:
        # Release the index files when the loop ends for any reason.
        try:
            reader.close()
        finally:
            directory.close()


In [33]:
search_loop()

Hit enter with no input to quit.
Query:ABOUND CAT LITTER
Searching for: ABOUND CAT LITTER
9 total matching documents.
1111080010 : ABOUND™ Natural Clumping Cat Litter | ABOUND CAT LITTER
5000042874 : Fancy Feast Flaked Fish & Shrimp Feast Wet Cat Food | FFST CAT FOOD
5000042894 : Fancy Feast Classic Pate Cod Sole & Shrimp Feast Wet Cat Food | FFST CAT FOOD
5000042904 : Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food | FFST CAT FOOD
5000042994 : Purina Fancy Feast Classic Pate Chicken Feast Wet Cat Food Can | FFST CAT FOOD
5000042944 : Purina Fancy Feast Classic Pate Savory Salmon Feast Wet Cat Food Can | FFST CAT FOOD
5000042954 : Purina Fancy Feast Classic Pate Tender Beef Feast Wet Cat Food Can | FFST CAT FOOD
5000043494 : Purina Fancy Feast Minced Turkey Feast in Sauce Wet Cat Food Can | FFST CAT FOOD
5000043464 : Purina Fancy Feast Sliced Chicken Hearts & Liver Feast in Gravy Wet Cat Food Can | FFST CAT FOOD
Query:ARTICHOKES
Searching for: ARTICHOKES
0 total matc

## TODO EVALUATE
```
build a table of raw -> [web]
where [web] is all the possible values of matches for raw
example: 'FFST CAT FOOD' -> ['Fancy Feast Flaked Fish Cat Food', 'Purina Fancy Feast Chicken Cat Food']

search on the query (e.g. FFST CAT FOOD); if the top result is one of the webs associated with that raw query, it counts as a hit
```

## Idea
```
when building a query evaluate against dictionary of seen web terms
unseen tokens get wildcard treatment
wildcard treatment means insert * between each letter
e.g., FFST CAT FOOD -> F*F*S*T CAT FOOD
since cat and food exist in dictionary
```

In [32]:
df['raw'].head(10)

0           ESNT WATER
1           FIJI WATER
2            KRO WATER
3     ROBITUSSIN COUGH
4         CA REDEM VAL
5         CA REDEM VAL
6    ABOUND CAT LITTER
7           ARTICHOKES
8          ATKINS BARS
9         AVOCADO HASS
Name: raw, dtype: object

In [88]:
def normalize_word(word):
    """Lower-case ``word`` and keep only the letters a-z and hyphens.

    Every other character (digits, whitespace, punctuation, symbols such as
    the trademark sign) is dropped, so the result may be an empty string.
    """
    kept = [ch for ch in word.lower() if ch == '-' or 'a' <= ch <= 'z']
    return ''.join(kept)

In [95]:
def make_dictionary(df):
    """Collect the vocabulary of the 'web' column of ``df``.

    Each distinct 'web' string is split on whitespace and every piece is
    passed through ``normalize_word``; pieces that normalize to an empty
    string are discarded.

    Returns:
        A set of normalized tokens.
    """
    candidates = (
        normalize_word(piece)
        for sentence in df.web.unique()
        for piece in sentence.split()
    )
    return {token for token in candidates if token}


In [195]:
WORDS = make_dictionary(df)

In [98]:
def make_raw_to_web(df):
    """Group the 'web' values of ``df`` by their 'raw' value.

    Returns:
        A ``defaultdict(list)`` mapping each raw string to the list of web
        strings that appear with it, in row order (duplicates are kept).
    """
    from collections import defaultdict

    grouped = defaultdict(list)
    for _, record in df.iterrows():
        grouped[record['raw']].append(record['web'])
    return grouped

In [153]:
RAW_TO_WEB = make_raw_to_web(df)

## Scoring
### Interface
```
def is_hit(raw, config): -> bool

config:
  Index
  Searcher
  QueryMaker
```

Algorithm:
We look up the webs for the raw to generate the hit candidates
we process raw into a query using QueryMaker
We run the query using Searcher
Index may not be necessary
If the top result is in webs, we return true, otherwise false

In [105]:
class QueryMaker:
    """Base interface for turning a raw receipt string into a query string.

    Subclasses override ``make_query``; this base implementation does
    nothing and returns ``None``.
    """

    def make_query(self, raw):
        """Return the query string for ``raw`` (``None`` in this base class)."""
        # ``self`` was missing from the original signature, so calling the
        # method on an instance raised TypeError.
        return None

In [110]:
class SimpleQueryMaker(QueryMaker):
    """Query maker that uses the raw receipt text verbatim as the query."""

    def make_query(self, raw):
        """Return ``raw`` unchanged."""
        query = raw
        return query

In [111]:
qm = SimpleQueryMaker()

In [112]:
qm.make_query('FFST CAT')

'FFST CAT'

In [113]:
class Searcher:
    """Base interface for running a query; subclasses override ``search``."""

    def search(self, query):
        """Do nothing and return ``None`` in this base class."""
        return None


In [138]:
class SimpleSearcher(Searcher):
    """Searcher that parses query strings against the 'web' field."""

    def __init__(self, index_dir=None, max_hits=50):
        """Open the index and prepare a reusable query parser.

        Args:
            index_dir: Path of the index to open; defaults to the notebook's
                ``INDEX_DIR`` when omitted.
            max_hits: Maximum number of documents ``search`` returns.
        """
        if index_dir is None:
            index_dir = INDEX_DIR
        self.max_hits = max_hits
        # Keep the reader so close() can release the index files.
        self.reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir)))
        self.searcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        # Built once here instead of on every search() call.
        self.parser = QueryParser("web", self.analyzer)

    def search(self, qstring):
        """Return the stored documents of the top hits for ``qstring``.

        Args:
            qstring: Query in Lucene classic query-parser syntax.

        Returns:
            A list of at most ``max_hits`` documents, in the order the
            index searcher returned them.
        """
        query = self.parser.parse(qstring)
        scoreDocs = self.searcher.search(query, self.max_hits).scoreDocs
        return [self.searcher.doc(score_doc.doc) for score_doc in scoreDocs]

    def close(self):
        """Close the underlying index reader."""
        self.reader.close()

In [139]:
ss = SimpleSearcher()

In [140]:
docs = ss.search('FFST CAT')

In [141]:
d0=docs[0]

In [143]:
d0

<Document: Document<stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS<raw:ABOUND CAT LITTER> stored,indexed,tokenized<web:ABOUND™ Natural Clumping Cat Litter> stored<id:1111080010>>>

In [144]:
d0.get('id')

'1111080010'

In [145]:
d0.get('web')

'ABOUND™ Natural Clumping Cat Litter'

In [148]:
class Config:
    """Pairs the query maker and the searcher used for one evaluation run."""

    def __init__(self, query_maker, searcher):
        # Both collaborators are stored as given and exposed as attributes.
        self.query_maker, self.searcher = query_maker, searcher


In [149]:
simple_config = Config(qm, ss)

In [152]:
simple_config.searcher

<__main__.SimpleSearcher at 0x7f6f0c465b10>

In [180]:
def is_hit(raw, config, raw_to_web=None):
    """Tell whether the top search result for ``raw`` is an expected one.

    Args:
        raw: Raw receipt string to evaluate.
        config: Object exposing ``query_maker.make_query(raw)`` and
            ``searcher.search(query)``.
        raw_to_web: Optional mapping from raw string to the list of
            acceptable web strings. Defaults to the notebook-level
            ``RAW_TO_WEB`` table when omitted.

    Returns:
        True when the search returns at least one document and the 'web'
        value of the first one is listed for ``raw``; False otherwise.
    """
    expected = RAW_TO_WEB if raw_to_web is None else raw_to_web
    docs = config.searcher.search(config.query_maker.make_query(raw))
    if not docs:
        return False
    top_web = docs[0].get('web')
    # Membership test first: the default table is a defaultdict, and
    # indexing it with an unknown key would insert an empty list.
    if raw not in expected:
        return False
    return top_web in expected[raw]

In [181]:
is_hit('FFST CAT FOOD', simple_config)

True

In [182]:
is_hit('AVOCADO', simple_config)

False

In [183]:
def evaluate(queries, config):
    """Score ``config`` on a collection of raw queries.

    Args:
        queries: Iterable of raw receipt strings (list, array, generator).
        config: Configuration passed through to ``is_hit``.

    Returns:
        A ``(precision, missed_queries)`` tuple, where ``precision`` is the
        fraction of queries for which ``is_hit`` returned True and
        ``missed_queries`` lists the others in input order. An empty input
        yields ``(0.0, [])`` rather than dividing by zero.
    """
    # Materialize once so generators and arrays can be both counted and
    # iterated.
    queries = list(queries)
    num_queries = len(queries)
    if num_queries == 0:
        return 0.0, []
    missed_queries = [query for query in queries if not is_hit(query, config)]
    total_hits = num_queries - len(missed_queries)
    precision = total_hits / num_queries
    return precision, missed_queries

In [184]:
queries = ['AVOCADO', 'FFST CAT FOOD']

In [185]:
evaluate(queries, simple_config)

(0.5, ['AVOCADO'])

In [186]:
queries = df.raw.unique()

In [208]:
simple_score, simple_misses = evaluate(queries, simple_config)

In [189]:
len(queries)

69

In [191]:
# Number of raw strings the simple (verbatim) configuration failed to match.
# Uses simple_misses, the name the evaluate() cell above assigns; the bare
# `misses` is not defined anywhere in the notebook.
len(simple_misses)

26

In [192]:
# Raw strings the simple (verbatim) configuration failed to match. Uses
# simple_misses, the name the evaluate() cell above assigns; the bare
# `misses` is not defined anywhere in the notebook.
simple_misses

['KRO WATER',
 'CA REDEM VAL',
 'ARTICHOKES',
 'BYND SSG HT ITLN',
 'BRHD CHEESE',
 'CUCUMBERS',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO SOAP',
 'KRO CCNT MK',
 'MSHRM BYBL WHL',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'ASP ORG',
 'PPRS BL GRN ORGN',
 'RADISH ORG',
 'SQSH YLW ORG',
 'TOMATO ORGNC',
 'PRSL MPL TKY GNG',
 'PRSL MUENSTR',
 'STO CRT BABY ORGNC',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC',
 'STN CHCK RST',
 'SFTSOAP KTCHN FRSH']

In [193]:
# Scratch check of the wildcard expansion: follow every character with '*'.
s = 'abc'
s2 = ''.join(ch + '*' for ch in s)


In [194]:
s2

'a*b*c*'

In [196]:
def make_wildcard_query(q):
    """Turn a raw receipt string into a query with wildcards on unknown words.

    Each whitespace-separated word is normalized with ``normalize_word``.
    Words found in ``WORDS`` are used as-is; any other word gets a '*' after
    every character (e.g. 'ffst' becomes 'f*f*s*t*'). The resulting tokens
    are joined with single spaces.
    """
    def as_token(nword):
        # Known vocabulary words pass through; the rest are expanded.
        if nword in WORDS:
            return nword
        return ''.join(c + '*' for c in nword)

    return ' '.join(as_token(normalize_word(word)) for word in q.split())

In [199]:
make_wildcard_query('FFST CAT FOOD')

'f*f*s*t* cat food'

In [200]:
class WildQueryMaker(QueryMaker):
    """Query maker that delegates to ``make_wildcard_query``."""

    def make_query(self, raw):
        """Return the wildcard-expanded query string for ``raw``."""
        query = make_wildcard_query(raw)
        return query

In [201]:
wqm = WildQueryMaker()

In [204]:
wild_config = Config(wqm, ss)

In [209]:
wild_score, wild_misses = evaluate(queries, wild_config)

In [210]:
wild_score

0.8405797101449275

In [211]:
simple_misses

['KRO WATER',
 'CA REDEM VAL',
 'ARTICHOKES',
 'BYND SSG HT ITLN',
 'BRHD CHEESE',
 'CUCUMBERS',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO SOAP',
 'KRO CCNT MK',
 'MSHRM BYBL WHL',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'ASP ORG',
 'PPRS BL GRN ORGN',
 'RADISH ORG',
 'SQSH YLW ORG',
 'TOMATO ORGNC',
 'PRSL MPL TKY GNG',
 'PRSL MUENSTR',
 'STO CRT BABY ORGNC',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC',
 'STN CHCK RST',
 'SFTSOAP KTCHN FRSH']

In [212]:
wild_misses

['CA REDEM VAL',
 'ARTICHOKES',
 'BRHD CHEESE',
 'CUCUMBERS',
 'KRO SOAP',
 'LES PET CHEESE BAR',
 'BROWN ONIONS',
 'PRSL MUENSTR',
 'STO CCNT MILK',
 'STO BROTH',
 'STO CARROTS ORGNC']

In [213]:
set(simple_misses) - set(wild_misses)

{'ASP ORG',
 'BYND SSG HT ITLN',
 'FRGO STR CHS',
 'GLBNI STR CHS',
 'KRO CCNT MK',
 'KRO WATER',
 'MSHRM BYBL WHL',
 'PPRS BL GRN ORGN',
 'PRSL MPL TKY GNG',
 'RADISH ORG',
 'SFTSOAP KTCHN FRSH',
 'SQSH YLW ORG',
 'STN CHCK RST',
 'STO CRT BABY ORGNC',
 'TOMATO ORGNC'}