# LSH Cascade Search - 基本検証

このノートブックでは、LSH (Locality Sensitive Hashing) を用いた3段階フィルタリング検索の基本動作を検証します。

## 目次
1. データの読み込みと確認
2. SimHashの動作確認
3. HNSW検索 vs LSH Cascade検索
4. パラメータ比較（LSH-4, LSH-8, LSH-16）

In [None]:
from pathlib import Path
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd

from src.lsh import SimHashGenerator, chunk_hash, hamming_distance
from src.db import VectorDatabase
from src.pipeline import LSHCascadeSearcher, HNSWSearcher

## 1. データの読み込みと確認

In [23]:
# DuckDBからデータを読み込み
db_path = Path('../data/sample_vectors.duckdb')  # 手元のWikipedaデータ（１万件）
db = VectorDatabase(db_path=db_path)
db.initialize()

print(f'Total documents: {db.count():,}')

# カラム情報を表示
print('\n=== テーブルスキーマ ===')
schema = db.conn.execute('DESCRIBE documents').fetchdf()
print(schema.to_string(index=False))

Total documents: 10,000

=== テーブルスキーマ ===
column_name column_type null key default extra
         id     INTEGER   NO PRI    None  None
       text     VARCHAR  YES NaN    None  None
     vector FLOAT[1024]  YES NaN    None  None
    simhash     VARCHAR  YES NaN    None  None
 lsh_chunks   VARCHAR[]  YES NaN    None  None


In [21]:
# サンプルデータを確認
df_sample = db.get_by_ids([0, 1, 2])
df_sample

Unnamed: 0,id,text,vector,simhash,lsh_chunks
0,0,2012 009 conclusion GOD'S WAR,"[0.013836248, -0.023572471, -0.043704864, -0.0...",BBABFCA5DB96BAF883038FB9CAD65CDB,"[c0_BB, c1_AB, c2_FC, c3_A5, c4_DB, c5_96, c6_..."
1,1,安部雄一,"[0.027291711, 0.001755841, 0.020223374, -0.045...",BA958C25B31EAEF8C903879DC87ECCD6,"[c0_BA, c1_95, c2_8C, c3_25, c4_B3, c5_1E, c6_..."
2,2,アシャンティの伝統的建築物群,"[0.041348234, -0.004756351, -0.01896887, -0.02...",AB990C2E9B5EECF8C1028D9DCA6E8C9A,"[c0_AB, c1_99, c2_0C, c3_2E, c4_9B, c5_5E, c6_..."


In [22]:
# # サンプルデータの確認（全カラム表示）
# df_sample = db.get_by_ids([0, 1, 2])
# print(df_sample)
# print("=====")

# print('=== サンプルデータ (3行) ===')
# for i, row in df_sample.iterrows():
#     print(f'\n--- Row {row["id"]} ---')
#     print(f'id: {row["id"]}')
#     print(f'text: {row["text"]}')
#     vec = row["vector"]
#     print(f'vector: [{vec[0]:.6f}, {vec[1]:.6f}, ...] (len={len(vec)})')
#     print(f'simhash: {row["simhash"]}')
#     chunks = row["lsh_chunks"]
#     print(f'lsh_chunks: {chunks[:4]}... (len={len(chunks)})')

## 2. SimHashの動作確認

In [5]:
# SimHashGeneratorの初期化
simhash_gen = SimHashGenerator(dim=1024, hash_bits=128, seed=42)

# サンプルベクトルでSimHashを生成
sample_vec = np.array(df_sample.iloc[0]['vector'], dtype=np.float32)
sample_hash = simhash_gen.hash(sample_vec)

print(f'Vector shape: {sample_vec.shape}')
print(f'SimHash (int): {sample_hash}')
print(f'SimHash (hex): {sample_hash:032X}')

Vector shape: (1024,)
SimHash (int): 249458642282415097800268828519741938907
SimHash (hex): BBABFCA5DB96BAF883038FB9CAD65CDB


In [6]:
# チャンク分割の確認
for num_chunks in [4, 8, 16]:
    chunks = chunk_hash(sample_hash, num_chunks)
    print(f'LSH-{num_chunks}: {chunks[:4]}... (total {len(chunks)} chunks)')

LSH-4: ['c0_BBABFCA5', 'c1_DB96BAF8', 'c2_83038FB9', 'c3_CAD65CDB']... (total 4 chunks)
LSH-8: ['c0_BBAB', 'c1_FCA5', 'c2_DB96', 'c3_BAF8']... (total 8 chunks)
LSH-16: ['c0_BB', 'c1_AB', 'c2_FC', 'c3_A5']... (total 16 chunks)


In [24]:
# ハミング距離の確認
vec1 = np.array(df_sample.iloc[0]['vector'], dtype=np.float32)
vec2 = np.array(df_sample.iloc[1]['vector'], dtype=np.float32)

hash1 = simhash_gen.hash(vec1)
hash2 = simhash_gen.hash(vec2)

dist = hamming_distance(hash1, hash2)
cosine_sim = np.dot(vec1, vec2)

print(f'Hamming distance: {dist} / 128 bits')
print(f'Cosine similarity: {cosine_sim:.4f}')

Hamming distance: 32 / 128 bits
Cosine similarity: 0.7265


## 3. HNSW検索 vs LSH Cascade検索

In [51]:
# 検索器の初期化
hnsw_searcher = HNSWSearcher(db)
lsh_searcher = LSHCascadeSearcher(
    db=db,
    simhash_generator=simhash_gen,
    num_chunks=16,
    # step2_top_n=100,
    step2_top_n=500,
)

In [52]:
# クエリベクトル（最初のドキュメントを使用）
query_vec = np.array(df_sample.iloc[0]['vector'], dtype=np.float32)

# HNSW検索
hnsw_results, hnsw_time = hnsw_searcher.search(query_vec, top_k=10)
print('=== HNSW Results ===')
print(f'Time: {hnsw_time:.2f} ms')
for r in hnsw_results[:5]:
    print(f'  id={r.id}, score={r.score:.4f}, text={r.text[:30]}...')

=== HNSW Results ===
Time: 18.71 ms
  id=0, score=1.0000, text=2012 009 conclusion GOD'S WAR...
  id=6138, score=0.8416, text=Wikipedia:Files for deletion/2...
  id=8425, score=0.8334, text=Category:913 endings...
  id=8546, score=0.8263, text=Category:1212 disestablishment...
  id=8229, score=0.8261, text=Wikipedia:WikiProject Video ga...


In [53]:
# LSH Cascade検索
lsh_results, lsh_metrics = lsh_searcher.search(query_vec, top_k=10)
print('=== LSH Cascade Results ===')
print(f'Total time: {lsh_metrics.total_time_ms:.2f} ms')
print(f'Step1 candidates: {lsh_metrics.step1_candidates}')
print(f'Step2 candidates: {lsh_metrics.step2_candidates}')
print()
for r in lsh_results[:5]:
    print(f'  id={r.id}, score={r.score:.4f}, text={r.text[:30]}...')

=== LSH Cascade Results ===
Total time: 252.51 ms
Step1 candidates: 8709
Step2 candidates: 500

  id=0, score=1.0000, text=2012 009 conclusion GOD'S WAR...
  id=6138, score=0.8416, text=Wikipedia:Files for deletion/2...
  id=8425, score=0.8334, text=Category:913 endings...
  id=5551, score=0.8187, text=Coup d'état of December Twelft...
  id=7494, score=0.8166, text=Wikipedia:Categories for discu...


In [54]:
# Recall計算
hnsw_ids = set(r.id for r in hnsw_results)
lsh_ids = set(r.id for r in lsh_results)
recall = len(hnsw_ids & lsh_ids) / len(hnsw_ids)

print(f'Recall@10: {recall:.2f}')

Recall@10: 0.40


In [55]:
print(f'Total docs: {lsh_metrics.total_docs}')
print(f'Step1 candidates: {lsh_metrics.step1_candidates}')
print(f'Step1 reduction: {lsh_metrics.step1_candidates / lsh_metrics.total_docs * 100:.1f}%')


Total docs: 10000
Step1 candidates: 8709
Step1 reduction: 87.1%


In [58]:
# Step1の候補に対して分析
query_hash = simhash_gen.hash(query_vec)
query_chunks = chunk_hash(query_hash, 16)
candidates = db.search_lsh_chunks(query_chunks)

results = []
for _, row in candidates.iterrows():
    doc_hash = int(row['simhash'], 16)
    doc_vec = np.array(row['vector'], dtype=np.float32)
    
    ham_dist = hamming_distance(query_hash, doc_hash)
    cos_sim = np.dot(query_vec, doc_vec)
    
    results.append({
        'id': row['id'],
        'hamming_dist': ham_dist,
        'cosine_sim': cos_sim,
    })

df_analysis = pd.DataFrame(results)

# 相関係数
corr = df_analysis['hamming_dist'].corr(df_analysis['cosine_sim'])
print(f'ハミング距離とコサイン類似度の相関係数: {corr:.3f}')

# HNSW正解のハミング距離順位を確認
df_sorted_ham = df_analysis.sort_values('hamming_dist')
df_sorted_ham['ham_rank'] = range(1, len(df_sorted_ham) + 1)

print('\n=== HNSW正解10件のハミング距離順位 ===')
hnsw_ids = {0, 8546, 1858, 8229, 5256, 8425, 9195, 5551, 1872, 6138}
for doc_id in hnsw_ids:
    row = df_sorted_ham[df_sorted_ham['id'] == doc_id].iloc[0]
    print(f'id={doc_id}: ハミング距離={row["hamming_dist"]}, 順位={row["ham_rank"]}/{len(df_sorted_ham)}, コサイン類似度={row["cosine_sim"]:.4f}')


ハミング距離とコサイン類似度の相関係数: -0.315

=== HNSW正解10件のハミング距離順位 ===
id=0: ハミング距離=0.0, 順位=1.0/8709, コサイン類似度=1.0000
id=8546: ハミング距離=29.0, 順位=2481.0/8709, コサイン類似度=0.8263
id=1858: ハミング距離=30.0, 順位=3030.0/8709, コサイン類似度=0.8217
id=8229: ハミング距離=30.0, 順位=2885.0/8709, コサイン類似度=0.8261
id=5256: ハミング距離=28.0, 順位=1503.0/8709, コサイン類似度=0.8199
id=8425: ハミング距離=20.0, 順位=8.0/8709, コサイン類似度=0.8334
id=9195: ハミング距離=30.0, 順位=2774.0/8709, コサイン類似度=0.8244
id=5551: ハミング距離=24.0, 順位=206.0/8709, コサイン類似度=0.8187
id=1872: ハミング距離=32.0, 順位=4972.0/8709, コサイン類似度=0.8199
id=6138: ハミング距離=23.0, 順位=95.0/8709, コサイン類似度=0.8416


In [57]:
# HNSWの正解ID
hnsw_ids = set(r.id for r in hnsw_results)
print(f'HNSW結果のID: {hnsw_ids}')

# Step1の候補ID
query_chunks = chunk_hash(simhash_gen.hash(query_vec), 16)
candidates = db.search_lsh_chunks(query_chunks)
candidate_ids = set(candidates['id'])

# 重複確認
overlap = hnsw_ids & candidate_ids
print(f'Step1候補に含まれるHNSW正解: {len(overlap)}/10')
print(f'含まれているID: {overlap}')
print(f'含まれていないID: {hnsw_ids - candidate_ids}')

HNSW結果のID: {0, 8546, 1858, 8229, 5256, 8425, 9195, 5551, 1872, 6138}
Step1候補に含まれるHNSW正解: 10/10
含まれているID: {0, 8546, 1858, 8229, 5256, 8425, 9195, 5551, 1872, 6138}
含まれていないID: set()


In [59]:
import random

# ランダムに5件のクエリを選択
random.seed(123)
all_docs = db.get_all()
query_indices = random.sample(range(len(all_docs)), 5)

for idx in query_indices:
    query_row = all_docs.iloc[idx]
    query_vec = np.array(query_row['vector'], dtype=np.float32)
    query_id = query_row['id']
    
    print(f'\n{"="*50}')
    print(f'Query ID: {query_id}, Text: {query_row["text"][:30]}...')
    print(f'{"="*50}')
    
    # HNSW検索
    hnsw_results, _ = hnsw_searcher.search(query_vec, top_k=10)
    hnsw_ids = set(r.id for r in hnsw_results)
    
    # Step1候補取得
    query_hash = simhash_gen.hash(query_vec)
    query_chunks = chunk_hash(query_hash, 16)
    candidates = db.search_lsh_chunks(query_chunks)
    
    # 相関分析
    results = []
    for _, row in candidates.iterrows():
        doc_hash = int(row['simhash'], 16)
        doc_vec = np.array(row['vector'], dtype=np.float32)
        ham_dist = hamming_distance(query_hash, doc_hash)
        cos_sim = np.dot(query_vec, doc_vec)
        results.append({'id': row['id'], 'hamming_dist': ham_dist, 'cosine_sim': cos_sim})
    
    df_analysis = pd.DataFrame(results)
    corr = df_analysis['hamming_dist'].corr(df_analysis['cosine_sim'])
    
    # HNSW正解の順位確認
    df_sorted = df_analysis.sort_values('hamming_dist')
    df_sorted['ham_rank'] = range(1, len(df_sorted) + 1)
    
    ranks_in_100 = 0
    ranks_in_500 = 0
    for doc_id in hnsw_ids:
        if doc_id in df_sorted['id'].values:
            rank = df_sorted[df_sorted['id'] == doc_id]['ham_rank'].values[0]
            if rank <= 100:
                ranks_in_100 += 1
            if rank <= 500:
                ranks_in_500 += 1
    
    print(f'Step1候補数: {len(candidates)}')
    print(f'相関係数: {corr:.3f}')
    print(f'HNSW正解がStep1に含まれる: {len(hnsw_ids & set(candidates["id"]))}/10')
    print(f'HNSW正解がTop100に含まれる: {ranks_in_100}/10')
    print(f'HNSW正解がTop500に含まれる: {ranks_in_500}/10')



Query ID: 857, Text: 安俊洙...
Step1候補数: 9258
相関係数: -0.427
HNSW正解がStep1に含まれる: 9/10
HNSW正解がTop100に含まれる: 3/10
HNSW正解がTop500に含まれる: 4/10

Query ID: 4385, Text: ウィタリアヌス (ローマ教皇)...
Step1候補数: 6397
相関係数: -0.231
HNSW正解がStep1に含まれる: 10/10
HNSW正解がTop100に含まれる: 6/10
HNSW正解がTop500に含まれる: 6/10

Query ID: 1428, Text: ミシケの戦い...
Step1候補数: 8585
相関係数: -0.482
HNSW正解がStep1に含まれる: 10/10
HNSW正解がTop100に含まれる: 4/10
HNSW正解がTop500に含まれる: 8/10

Query ID: 6672, Text: Wikipedia:Articles for deletio...
Step1候補数: 8093
相関係数: -0.581
HNSW正解がStep1に含まれる: 9/10
HNSW正解がTop100に含まれる: 4/10
HNSW正解がTop500に含まれる: 4/10

Query ID: 4367, Text: おぼん・こぼん...
Step1候補数: 8178
相関係数: -0.489
HNSW正解がStep1に含まれる: 10/10
HNSW正解がTop100に含まれる: 3/10
HNSW正解がTop500に含まれる: 7/10


In [60]:
# 複数のシード値で比較
seeds = [42, 123, 456, 789, 1000]

# 固定のクエリを使用（最初のドキュメント）
query_row = all_docs.iloc[0]
query_vec = np.array(query_row['vector'], dtype=np.float32)

print(f'Query ID: {query_row["id"]}, Text: {query_row["text"][:30]}...')
print()

for seed in seeds:
    # 新しいシードでSimHashGeneratorを作成
    test_gen = SimHashGenerator(dim=1024, hash_bits=128, seed=seed)
    
    # クエリのハッシュとチャンク
    query_hash = test_gen.hash(query_vec)
    
    # 全ドキュメントのハッシュを再計算して分析
    results = []
    for _, row in all_docs.iterrows():
        doc_vec = np.array(row['vector'], dtype=np.float32)
        doc_hash = test_gen.hash(doc_vec)
        
        ham_dist = hamming_distance(query_hash, doc_hash)
        cos_sim = np.dot(query_vec, doc_vec)
        
        results.append({
            'id': row['id'],
            'hamming_dist': ham_dist,
            'cosine_sim': cos_sim,
        })
    
    df_analysis = pd.DataFrame(results)
    corr = df_analysis['hamming_dist'].corr(df_analysis['cosine_sim'])
    
    # HNSW正解の順位確認
    hnsw_results, _ = hnsw_searcher.search(query_vec, top_k=10)
    hnsw_ids = set(r.id for r in hnsw_results)
    
    df_sorted = df_analysis.sort_values('hamming_dist')
    df_sorted['ham_rank'] = range(1, len(df_sorted) + 1)
    
    ranks_in_100 = sum(1 for doc_id in hnsw_ids 
                       if df_sorted[df_sorted['id'] == doc_id]['ham_rank'].values[0] <= 100)
    ranks_in_500 = sum(1 for doc_id in hnsw_ids 
                       if df_sorted[df_sorted['id'] == doc_id]['ham_rank'].values[0] <= 500)
    
    print(f'Seed={seed}: 相関係数={corr:.3f}, Top100={ranks_in_100}/10, Top500={ranks_in_500}/10')


Query ID: 0, Text: 2012 009 conclusion GOD'S WAR...

Seed=42: 相関係数=-0.319, Top100=2/10, Top500=4/10
Seed=123: 相関係数=-0.457, Top100=3/10, Top500=4/10
Seed=456: 相関係数=-0.440, Top100=2/10, Top500=5/10
Seed=789: 相関係数=-0.345, Top100=2/10, Top500=3/10
Seed=1000: 相関係数=-0.393, Top100=3/10, Top500=6/10


## 4. パラメータ比較（LSH-4, LSH-8, LSH-16）

In [62]:
# 複数クエリでの比較
import random

# ランダムに10件のクエリを選択
random.seed(42)
all_docs = db.get_all()
query_indices = random.sample(range(len(all_docs)), 10)

results_summary = []

for num_chunks in [4, 8, 16]:
    searcher = LSHCascadeSearcher(
        db=db,
        simhash_generator=simhash_gen,
        num_chunks=num_chunks,
        step2_top_n=100,
    )
    
    recalls = []
    latencies = []
    candidates = []
    
    for idx in query_indices:
        query_vec = np.array(all_docs.iloc[idx]['vector'], dtype=np.float32)
        
        # HNSW baseline
        hnsw_results, _ = hnsw_searcher.search(query_vec, top_k=10)
        hnsw_ids = set(r.id for r in hnsw_results)
        
        # LSH search
        lsh_results, metrics = searcher.search(query_vec, top_k=10)
        lsh_ids = set(r.id for r in lsh_results)
        
        recall = len(hnsw_ids & lsh_ids) / len(hnsw_ids) if hnsw_ids else 0
        recalls.append(recall)
        latencies.append(metrics.total_time_ms)
        candidates.append(metrics.step1_candidates)
    
    results_summary.append({
        'chunks': num_chunks,
        'bits_per_chunk': 128 // num_chunks,
        'avg_recall': np.mean(recalls),
        'avg_latency_ms': np.mean(latencies),
        'avg_candidates': np.mean(candidates),
        'reduction_rate': 1 - np.mean(candidates) / len(all_docs),
    })

df_results = pd.DataFrame(results_summary)
df_results

Unnamed: 0,chunks,bits_per_chunk,avg_recall,avg_latency_ms,avg_candidates,reduction_rate
0,4,32,0.0,9.341149,0.0,1.0
1,8,16,0.0,11.175816,0.0,1.0
2,16,8,0.45,208.753611,8026.3,0.19737


In [63]:
# デバッグ: 単一クエリで4チャンクのマッチングを確認
query_idx = 0
query_vec = np.array(all_docs.iloc[query_idx]['vector'], dtype=np.float32)
query_hash = simhash_gen.hash(query_vec)

print(f'Query ID: {all_docs.iloc[query_idx]["id"]}')
print(f'Query SimHash: {query_hash:032X}')

for num_chunks in [4, 8, 16]:
    query_chunks = set(chunk_hash(query_hash, num_chunks))
    print(f'\n=== {num_chunks}チャンク ===')
    print(f'Query chunks: {query_chunks}')
    
    # 自分自身とマッチするか確認
    doc_hash = int(all_docs.iloc[query_idx]['simhash'], 16)
    doc_chunks = set(chunk_hash(doc_hash, num_chunks))
    print(f'Doc chunks: {doc_chunks}')
    print(f'一致: {query_chunks & doc_chunks}')
    
    # 候補数をカウント
    match_count = 0
    for _, row in all_docs.head(100).iterrows():
        doc_hash = int(row['simhash'], 16)
        doc_chunks = set(chunk_hash(doc_hash, num_chunks))
        if query_chunks & doc_chunks:
            match_count += 1
    print(f'最初の100件中のマッチ数: {match_count}')


Query ID: 0
Query SimHash: BBABFCA5DB96BAF883038FB9CAD65CDB

=== 4チャンク ===
Query chunks: {'c1_DB96BAF8', 'c2_83038FB9', 'c3_CAD65CDB', 'c0_BBABFCA5'}
Doc chunks: {'c1_DB96BAF8', 'c2_83038FB9', 'c3_CAD65CDB', 'c0_BBABFCA5'}
一致: {'c1_DB96BAF8', 'c2_83038FB9', 'c3_CAD65CDB', 'c0_BBABFCA5'}
最初の100件中のマッチ数: 1

=== 8チャンク ===
Query chunks: {'c4_8303', 'c5_8FB9', 'c6_CAD6', 'c0_BBAB', 'c1_FCA5', 'c2_DB96', 'c3_BAF8', 'c7_5CDB'}
Doc chunks: {'c4_8303', 'c5_8FB9', 'c6_CAD6', 'c0_BBAB', 'c1_FCA5', 'c2_DB96', 'c3_BAF8', 'c7_5CDB'}
一致: {'c4_8303', 'c5_8FB9', 'c6_CAD6', 'c0_BBAB', 'c1_FCA5', 'c2_DB96', 'c3_BAF8', 'c7_5CDB'}
最初の100件中のマッチ数: 6

=== 16チャンク ===
Query chunks: {'c6_BA', 'c8_83', 'c3_A5', 'c12_CA', 'c2_FC', 'c13_D6', 'c15_DB', 'c7_F8', 'c9_03', 'c0_BB', 'c4_DB', 'c14_5C', 'c10_8F', 'c1_AB', 'c5_96', 'c11_B9'}
Doc chunks: {'c6_BA', 'c8_83', 'c3_A5', 'c12_CA', 'c2_FC', 'c13_D6', 'c15_DB', 'c7_F8', 'c9_03', 'c0_BB', 'c4_DB', 'c14_5C', 'c10_8F', 'c1_AB', 'c5_96', 'c11_B9'}
一致: {'c12_CA', 'c8_83'

In [65]:
# データベース接続を閉じる
db.close()
print('Done!')

Done!
