In [None]:
from strsimpy import (
    Levenshtein,
    NormalizedLevenshtein,
    WeightedLevenshtein,
    Damerau,
    OptimalStringAlignment,
    LongestCommonSubsequence,
    MetricLCS,
    NGram,
    QGram,
    Cosine,
    Jaccard,
    SorensenDice
)

In [None]:
# Registry of string-distance callables keyed by a short label. Every value is
# the bound `distance` method of a configured strsimpy metric instance.
dists = {
    'lev': Levenshtein().distance,
    'nlev': NormalizedLevenshtein().distance,
    # Weighted Levenshtein variants: each one overrides exactly one cost
    # function with a constant 2; the costs not passed here keep whatever
    # defaults the library applies.
    'wlev_ins': WeightedLevenshtein(insertion_cost_fn=lambda char: 2).distance,
    'wlev_del': WeightedLevenshtein(deletion_cost_fn=lambda char: 2).distance,
    'wlev_sub': WeightedLevenshtein(
        substitution_cost_fn=lambda char_a, char_b: 2
    ).distance,
    'dam': Damerau().distance,
    'osa': OptimalStringAlignment().distance,
    'lcs': LongestCommonSubsequence().distance,
    'mlcs': MetricLCS().distance,
    # Metrics constructed with an integer argument: one entry per value 1..4,
    # labelled '<prefix><value>' and grouped by metric (ng1..ng4, qg1..qg4, ...).
    **{
        f'{prefix}{size}': metric_cls(size).distance
        for prefix, metric_cls in (
            ('ng', NGram),
            ('qg', QGram),
            ('cos', Cosine),
            ('jac', Jaccard),
            ('dice', SorensenDice),
        )
        for size in range(1, 5)
    },
}

In [None]:
def fast_int_round(name, fn):
    """Wrap a two-argument distance function so its result is rounded.

    Args:
        name: Label of the metric; decides whether the value is scaled.
        fn: Callable taking two arguments and returning a number.

    Returns:
        A callable ``(a, b)`` that returns ``round(fn(a, b))`` when ``name`` is
        one of 'lev', 'dam', 'osa', 'lcs' or contains 'wlev' or 'qg', and
        ``round(100 * fn(a, b))`` for every other name.
    """
    # NOTE(review): presumably the unscaled group already yields count-like
    # distances while the rest yield fractional scores -- confirm per metric.
    unscaled = (
        name in {'lev', 'dam', 'osa', 'lcs'}
        or 'wlev' in name
        or 'qg' in name
    )

    if not unscaled:
        def as_percent(a, b):
            return round(100 * fn(a, b))
        return as_percent

    def as_is(a, b):
        return round(fn(a, b))
    return as_is

In [None]:
# Same labels as `dists`, with each metric wrapped by `fast_int_round` so that
# it returns a rounded value.
rounded_dists = dict(
    (metric_name, fast_int_round(metric_name, metric_fn))
    for metric_name, metric_fn in dists.items()
)

In [None]:
# Print, for every metric, the rounded distance from 'hellos' to 'hallos' and
# from 'hellos' to 'cobras', in that order.
for name in dists:
    print(
        name,
        *(rounded_dists[name]('hellos', other) for other in ('hallos', 'cobras'))
    )

In [1]:
from search import search, docs
import pandas as pd

In [4]:
# Example queries; each one gets a string id equal to its position in the list.
queries = [
    'hello', 'hey', 'good', 'God'
]

queries_df = pd.DataFrame({
    'query_id': [str(i) for i in range(len(queries))],
    'query': queries
})

# Run the search with the 'lev' metric. `search` returns a (tree, run) pair;
# the tree is fed back in on later runs of this cell.
# The previous `tree = tree` read a name that no earlier cell defines, so the
# cell raised NameError after Restart & Run All. Look the name up instead and
# fall back to None on a fresh kernel.
# NOTE(review): assumes `search` builds a new tree when given tree=None -- confirm.
tree, run = search(
    'lev',
    queries_df,
    min_R=1,
    max_R=100,
    max_K=1000,
    tree=globals().get('tree'),
)
run

Processed query 4/4. ETR: 0:00:00.589342.

Unnamed: 0,query_id,doc_id,score
0,0,223218,1.000000
1,0,60151,1.000000
2,0,203349,1.000000
3,0,39383,1.000000
4,0,96652,1.000000
...,...,...,...
3995,3,139553,0.333333
3996,3,177107,0.333333
3997,3,286451,0.333333
3998,3,110997,0.333333


In [5]:
# Attach the query columns (joined on 'query_id') and the document columns
# (joined on 'doc_id') to every row of the run; left joins keep all run rows.
(
    run
    .merge(queries_df, how='left', on='query_id')
    .merge(docs, how='left', on='doc_id')
)

Unnamed: 0,query_id,doc_id,score,query,doc,doc_lang
0,0,223218,1.000000,hello,heslo,slk
1,0,60151,1.000000,hello,heilo,nob
2,0,203349,1.000000,hello,ello,spa
3,0,39383,1.000000,hello,helpo,epo
4,0,96652,1.000000,hello,sello,fao
...,...,...,...,...,...,...
3995,3,139553,0.333333,God,bont,nld
3996,3,177107,0.333333,God,hnoj,bel
3997,3,286451,0.333333,God,cnoc,gle
3998,3,110997,0.333333,God,kote,kan
