In [1]:
import pandas as pd
from os import listdir, getcwd
from os.path import join
from pathlib import Path
from datetime import datetime
from ir import IndexDataframe, make_dictionary, make_raw_to_web, evaluate, search_loop,\
Config, SimpleQueryMaker, SimpleSearcher, \
WildQueryMaker, MashedWildQueryMaker, BoostedNgramWildQueryMaker, FuzzyNgramWildQueryMaker
from org.apache.lucene.analysis.standard import StandardAnalyzer

## Setup

In [2]:
INDEX_DIR = "indexes/receipts2"

# The input files live under `data/raw_web_joined`, next to (not inside) the
# current working directory.
home = getcwd()
home_path = Path(home)
data_path = join(home_path.parent, 'data/raw_web_joined')
# os.listdir returns entries in arbitrary order; sort them so the row order of
# `df` (and anything derived from it) is the same on every run and machine.
files = [join(data_path, file) for file in sorted(listdir(data_path))]

# One DataFrame per JSON file, stacked into a single frame.
dfs = [pd.read_json(file) for file in files]
df = pd.concat(dfs)

def index_test(quiet=False):
    """Run ``IndexDataframe`` on the notebook-level ``df`` and report timing.

    Calls ``IndexDataframe(df, INDEX_DIR, StandardAnalyzer(), quiet)`` and, on
    success, prints the elapsed wall-clock time.

    Args:
        quiet: Forwarded unchanged as the last argument of ``IndexDataframe``.

    Raises:
        Exception: Whatever ``IndexDataframe`` raises. A ``Failed: ...`` line
            is printed first, then the same exception propagates.
    """
    start = datetime.now()
    try:
        IndexDataframe(df, INDEX_DIR, StandardAnalyzer(), quiet)
    except Exception as e:
        print("Failed: %s" % e)
        # Bare `raise` re-raises the active exception with its traceback intact.
        raise
    else:
        # Only reached when indexing succeeded.
        print('Elapsed: %s' % (datetime.now() - start))

# Run the indexing step first, passing quiet=True through to IndexDataframe.
index_test(quiet=True)

# Lookup structures derived from the full frame, used by the evaluation cells.
WORDS = make_dictionary(df)
RAW_TO_WEB = make_raw_to_web(df)
# Distinct values of the `raw` column serve as the query set.
queries = df['raw'].unique()

.done
Elapsed: 0:00:00.443070


## Baseline

In [3]:
# Baseline configuration: SimpleQueryMaker paired with a SimpleSearcher on INDEX_DIR.
# The searcher `ss` is reused by the later configurations in this notebook.
qm = SimpleQueryMaker()
ss = SimpleSearcher(INDEX_DIR)
simple_config = Config(qm, ss)
(simple_score, simple_misses) = evaluate(
    queries,
    simple_config,
    RAW_TO_WEB,
)

simple_score

0.46959459459459457

## Basic Wildcard

In [4]:
# WildQueryMaker built from WORDS, evaluated with the existing searcher `ss`.
wqm = WildQueryMaker(WORDS)
wild_config = Config(wqm, ss)
(wild_score, wild_misses) = evaluate(
    queries,
    wild_config,
    RAW_TO_WEB,
)

wild_score

0.7162162162162162

## Mashed Wildcard

In [5]:
# MashedWildQueryMaker built from WORDS, evaluated with the existing searcher `ss`.
mwqm = MashedWildQueryMaker(WORDS)
mashed_wild_config = Config(mwqm, ss)
(mashed_wild_score, mashed_wild_misses) = evaluate(
    queries,
    mashed_wild_config,
    RAW_TO_WEB,
)

mashed_wild_score

0.7601351351351351

## Ngrams

In [6]:
# BoostedNgramWildQueryMaker built from WORDS, evaluated with the existing searcher `ss`.
bnwqm = BoostedNgramWildQueryMaker(WORDS)
bngram_wild_config = Config(bnwqm, ss)
(bngram_wild_score, bngram_wild_misses) = evaluate(
    queries,
    bngram_wild_config,
    RAW_TO_WEB,
)

bngram_wild_score

0.7668918918918919

## Fuzzy Ngrams

In [7]:
# FuzzyNgramWildQueryMaker built from WORDS, evaluated with the existing searcher `ss`.
fnwqm = FuzzyNgramWildQueryMaker(WORDS)
fngram_wild_config = Config(fnwqm, ss)
(fngram_wild_score, fngram_wild_misses) = evaluate(
    queries,
    fngram_wild_config,
    RAW_TO_WEB,
)

fngram_wild_score

0.7871621621621622

## Summary

In [8]:
# Gather the score of every configuration into one table keyed by its label.
# Chaining set_index (instead of inplace=True) keeps this cell safe to re-run.
df2 = pd.DataFrame([{'label': 'Baseline', 'score': simple_score},
                    {'label': 'Basic Wildcard', 'score': wild_score},
                    {'label': 'Mashed Wildcard', 'score': mashed_wild_score},
                    {'label': 'Ngrams', 'score': bngram_wild_score},
                    {'label': 'Fuzzy Ngrams', 'score': fngram_wild_score}]).set_index('label')
# Use the fully-qualified option name: the bare 'precision' pattern is ambiguous
# on newer pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)
df2

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
Baseline,0.47
Basic Wildcard,0.72
Mashed Wildcard,0.76
Ngrams,0.77
Fuzzy Ngrams,0.79
