# Index Receipts


In [1]:
import pandas as pd
import sys, os, lucene, threading, time
from datetime import datetime

from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser

In [2]:
# Directory where the Lucene index is written by IndexFiles and later opened
# by search_loop.
INDEX_DIR = "indexes/receipts1"

In [3]:
# Load one joined receipt file. The indexing code reads the `raw`, `web` and
# `id` columns of this frame.
path = '../data/raw_web_joined/703_00198_2020-03-20_3_1391204_joined.json'
df = pd.read_json(path)

In [4]:
# The JVM can only be started once per kernel. Calling initVM() again with
# vmargs raises ValueError ("JVM is already running"), so reuse the running VM
# when there is one; this keeps the cell safe to re-run.
lucene.getVMEnv() or lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f6f31898530>

In [5]:
class Ticker(object):
    """Progress indicator that writes a dot to stdout once per second.

    Meant to be used as a thread target: pass ``run`` to a thread and set
    ``tick`` to ``False`` to make the loop exit at its next check.
    """

    def __init__(self):
        # Loop flag polled by run(); set to False to stop the dots.
        self.tick = True

    def run(self):
        """Emit '.' and sleep one second, repeatedly, while ``tick`` is true."""
        while True:
            if not self.tick:
                break
            stream = sys.stdout
            stream.write('.')
            stream.flush()
            time.sleep(1.0)

In [22]:
"""                                                                                                                           
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles.  Instead of walking a directory of files,
it takes a pandas DataFrame and indexes one document per row, using the
row's raw, web and id values as the document fields.  The resulting Lucene
index is written to the directory passed as storeDir.
"""
class IndexFiles(object):
    """Build a Lucene index with one document per row of a DataFrame.

    Loosely based on the Lucene (Java) demo class
    org.apache.lucene.demo.IndexFiles, but instead of walking a directory of
    files it indexes the ``raw``, ``web`` and ``id`` columns of a DataFrame.
    Constructing the object does all the work: the index in ``storeDir`` is
    created, every row is added, and the writer is committed and closed.
    """

    def __init__(self, df, storeDir, analyzer):
        """Create the index in ``storeDir`` from the rows of ``df``.

        Args:
            df: DataFrame providing ``raw``, ``web`` and ``id`` columns.
            storeDir: Directory that holds the index. It is created, parents
                included, when missing. The writer is opened in CREATE mode.
            analyzer: Lucene analyzer handed to the IndexWriterConfig.

        Raises:
            Any exception raised while indexing or committing. In that case
            the writer is rolled back first, so the write lock is released
            and no half-built index is committed.
        """
        # makedirs rather than mkdir: storeDir may be nested (for example
        # "indexes/receipts1") and mkdir fails when the parent is missing.
        os.makedirs(storeDir, exist_ok=True)

        store = SimpleFSDirectory(Paths.get(storeDir))
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = Ticker()
        # Daemon thread so a stuck ticker can never keep the process alive.
        ticker_thread = threading.Thread(target=ticker.run)
        ticker_thread.daemon = True
        try:
            self.indexDocs(df, writer)
            # No newline, so the ticker's dots continue on the same line.
            print('commit index', end='', flush=True)
            ticker_thread.start()
            writer.commit()
            writer.close()
        except BaseException:
            # Close the writer without committing; otherwise the write lock
            # stays held and the next run of this cell cannot open the index.
            writer.rollback()
            raise
        finally:
            # Always stop the dots, including when commit/close failed.
            ticker.tick = False
        print('done')

    def indexDocs(self, df, writer):
        """Add one document per row of ``df`` to ``writer``.

        Every document carries three fields taken from the row: ``raw`` and
        ``id`` are stored and indexed untokenized, ``web`` is stored and
        tokenized with positions. A row whose document cannot be built or
        added is reported on stdout and skipped; the remaining rows are still
        indexed.

        Args:
            df: DataFrame providing ``raw``, ``web`` and ``id`` columns.
            writer: Open IndexWriter that receives the documents.
        """
        # Stored, untokenized: the whole value is indexed as a single term.
        exact_type = FieldType()
        exact_type.setStored(True)
        exact_type.setTokenized(False)
        exact_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # Stored (so search results can display it) and tokenized.
        text_type = FieldType()
        text_type.setStored(True)
        text_type.setTokenized(True)
        text_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for raw, web, doc_id in zip(df['raw'], df['web'], df['id']):
            print("adding %s" % web)
            try:
                doc = Document()
                doc.add(Field("raw", raw, exact_type))
                doc.add(Field("web", web, text_type))
                doc.add(Field("id", doc_id, exact_type))
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: one bad row must not abort the whole index.
                print("Failed in indexDocs: %s" % e)


In [23]:
def index_test1():
    """Index the notebook-level ``df`` into ``INDEX_DIR`` and report timing.

    Prints the Lucene version, builds the index with a StandardAnalyzer and
    prints the elapsed wall-clock time. If indexing fails, the error is
    printed and then re-raised.
    """
    print('lucene version %s' % lucene.VERSION)
    began = datetime.now()
    try:
        IndexFiles(df, INDEX_DIR, StandardAnalyzer())
        elapsed = datetime.now() - began
        print('Elapsed: %s' % elapsed)
    except Exception as e:
        print("Failed: %s" % e)
        raise e

In [24]:
# Build the index from `df` in INDEX_DIR; prints one "adding ..." line per row.
index_test1()

lucene version 8.1.1
adding (LMTD QTY) Essentia Ionized Alkaline Water
adding (LMTD QTY) FIJI Natural Artesian Water
adding (LMTD QTY) Kroger® Spring Water
adding (LMTD QTY) Robitussin Max Strength Blue Raspberry Nighttime Cough DM Liquid
adding $000.10 CRV DEPOSIT
adding $000.10 CRV DEPOSIT
adding ABOUND™ Natural Clumping Cat Litter
adding Artichoke
adding Atkins Endulge Chocolate Coconut Treat Bar
adding Avocado - Small
adding Beef Choice For Stir Fry
adding Beyond Meat Hot Italian Plant-Based Sausage
adding Beyond Meat The Beyond Burger Plant-Based Burger Patties
adding Boar's Head Monterey Jack with Jalapeno Pre-Sliced Cheese
adding Broccoli
adding Cauliflower
adding Celery - Small
adding Cholula Original Hot Sauce
adding Cucumber - English
adding Del Cabo Cucumber Og 16 Oz
adding Earthwise Palm Trees Reusable Shopping Bag
adding Fancy Feast Classic Pate Cod Sole & Shrimp Feast Wet Cat Food
adding Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food
adding Fancy Feast

In [25]:
def search_loop():
    """Interactively query the ``web`` field of the index in ``INDEX_DIR``.

    Prompts for query strings until an empty line is entered. For each query
    it prints up to 50 hits as ``id : web | raw``. A query that Lucene rejects
    (for example a syntax error) is reported and the loop continues instead of
    ending the session. The index reader is closed when the loop ends, however
    it ends.
    """
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(INDEX_DIR)))
    try:
        searcher = IndexSearcher(reader)
        # The parser does not depend on the query text; build it once.
        parser = QueryParser("web", StandardAnalyzer())
        print("Hit enter with no input to quit.")
        while True:
            command = input("Query:")
            if command == '':
                return
            print("Searching for: %s" % command)
            try:
                query = parser.parse(command)
                scoreDocs = searcher.search(query, 50).scoreDocs
            except lucene.JavaError as e:
                # Java exceptions (e.g. a query parse error) surface in
                # Python as JavaError; report it and keep prompting.
                print("Query failed: %s" % e)
                continue
            # NOTE: this is the number of returned hits, capped at the 50
            # requested above, not the total number of matches in the index.
            print("%s total matching documents." % len(scoreDocs))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                print(doc.get("id"), ':', doc.get("web"), "|", doc.get("raw"))
    finally:
        # Release the reader's file handles on quit or on any error.
        reader.close()


In [33]:
# Interactive cell: prompts for queries until an empty line is entered.
search_loop()

Hit enter with no input to quit.
Query:ABOUND CAT LITTER
Searching for: ABOUND CAT LITTER
9 total matching documents.
1111080010 : ABOUND™ Natural Clumping Cat Litter | ABOUND CAT LITTER
5000042874 : Fancy Feast Flaked Fish & Shrimp Feast Wet Cat Food | FFST CAT FOOD
5000042894 : Fancy Feast Classic Pate Cod Sole & Shrimp Feast Wet Cat Food | FFST CAT FOOD
5000042904 : Fancy Feast Classic Pate Tender Liver & Chicken Feast Wet Cat Food | FFST CAT FOOD
5000042994 : Purina Fancy Feast Classic Pate Chicken Feast Wet Cat Food Can | FFST CAT FOOD
5000042944 : Purina Fancy Feast Classic Pate Savory Salmon Feast Wet Cat Food Can | FFST CAT FOOD
5000042954 : Purina Fancy Feast Classic Pate Tender Beef Feast Wet Cat Food Can | FFST CAT FOOD
5000043494 : Purina Fancy Feast Minced Turkey Feast in Sauce Wet Cat Food Can | FFST CAT FOOD
5000043464 : Purina Fancy Feast Sliced Chicken Hearts & Liver Feast in Gravy Wet Cat Food Can | FFST CAT FOOD
Query:ARTICHOKES
Searching for: ARTICHOKES
0 total matc

## TODO EVALUATE
```
Build a table of raw -> [web],
where [web] is the list of all web names that the raw string is matched with.
Example: 'FFST CAT FOOD' -> ['Fancy Feast Flaked Fish Cat Food', 'Purina Fancy Feast Chicken Cat Food']

Search with each raw string as the query (e.g. FFST CAT FOOD). If the top result is any of the web names associated with that raw string, it counts as a hit.
```

## Idea
```
When building a query, check each token against a dictionary of terms seen in the web names.
Unseen tokens get wildcard treatment.
Wildcard treatment means inserting * between each letter,
e.g., FFST CAT FOOD -> F*F*S*T CAT FOOD,
since CAT and FOOD exist in the dictionary but FFST does not.
```

In [32]:
# Peek at the first ten raw receipt strings.
df['raw'].head(10)

0           ESNT WATER
1           FIJI WATER
2            KRO WATER
3     ROBITUSSIN COUGH
4         CA REDEM VAL
5         CA REDEM VAL
6    ABOUND CAT LITTER
7           ARTICHOKES
8          ATKINS BARS
9         AVOCADO HASS
Name: raw, dtype: object