# 80. DF-LSH実験シリーズ データ準備

## 目的
- 英語・日本語Wikipedia 各10,000件のE5-base埋め込みを準備
- 既存ITQモデル（400K日本語で学習済み）を再利用してハッシュ・ピボット生成
- 実験81-84で共通利用するデータセット

## データ構成
| データセット | モデル | 次元数 | 件数 |
|-------------|--------|--------|------|
| 英語Wikipedia | multilingual-e5-base | 768 | 10,000 |
| 日本語Wikipedia | multilingual-e5-base | 768 | 10,000 |
| 英語Wikipedia | all-MiniLM-L6-v2 | 384 | 10,000（既存再利用） |

## 0. セットアップ

In [1]:
import sys
import numpy as np
import time
from pathlib import Path
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

sys.path.insert(0, '../src')
from itq_lsh import ITQLSH, hamming_distance, hamming_distance_batch

# Shared experiment settings: dataset size, hash width and pivot count.
N_DOCUMENTS = 10000
N_BITS = 128
N_PIVOTS = 8

# Every input and output artifact of this notebook lives in this directory.
DATA_DIR = Path('../data')

# Seed NumPy's legacy global random state.
np.random.seed(42)

# Echo the configuration so it is captured alongside the results.
print('Configuration:')
for setting_label, setting_value in (
    ('Documents', N_DOCUMENTS),
    ('ITQ bits', N_BITS),
    ('Pivots', N_PIVOTS),
):
    print(f'  {setting_label}: {setting_value}')

Configuration:
  Documents: 10000
  ITQ bits: 128
  Pivots: 8


## 1. 英語Wikipedia 10K + E5-base埋め込み

In [2]:
import pandas as pd

# Read the English Wikipedia article table from its parquet dump.
print('Loading English Wikipedia from parquet...')
df_en = pd.read_parquet(DATA_DIR / 'wikipedia_en_body_100k.parquet')

# Report the row count, the column names and a 100-character preview of the
# first article's text.
print(f'  Total rows: {df_en.shape[0]}')
print(f'  Columns: {df_en.columns.tolist()}')
print(f'  Sample: {df_en["text"].iloc[0][:100]}...')

Loading English Wikipedia from parquet...


  Total rows: 100000
  Columns: ['id', 'title', 'text', 'lang', 'source']
  Sample: {{short description|American college basketball season}}
{{Use mdy dates|date=August 2023}}
{{refimp...


In [3]:
# Collect the first N_DOCUMENTS usable article texts, in file order.
# Each text is cut to its first 500 characters and stripped; it is kept only
# if at least 50 characters remain (the length check applies to the truncated,
# stripped text, not to the full article).
# The 'text' column is iterated directly instead of DataFrame.iterrows():
# iterrows() materialises a Series for every row, which is needlessly slow
# when only a single column is read.
en_documents = []
for raw_text in df_en['text']:
    text = str(raw_text)[:500].strip()
    if len(text) >= 50:
        en_documents.append(text)
    # Stop as soon as enough documents have been gathered.
    if len(en_documents) >= N_DOCUMENTS:
        break

print(f'English documents: {len(en_documents)}')
print(f'Sample: {en_documents[0][:100]}...')

English documents: 10000
Sample: {{short description|American college basketball season}}
{{Use mdy dates|date=August 2023}}
{{refimp...


In [4]:
# Embed the English texts with multilingual-e5-base.
from sentence_transformers import SentenceTransformer

print('Loading multilingual-e5-base...')
e5_model = SentenceTransformer('intfloat/multilingual-e5-base', device='cpu')

# Every text is prepended with the "passage: " prefix before encoding.
en_texts_prefixed = ['passage: ' + t for t in en_documents]

print(f'Generating embeddings for {len(en_texts_prefixed)} English documents...')
start = time.time()
e5_en_embeddings = e5_model.encode(
    en_texts_prefixed,
    show_progress_bar=True,
    convert_to_numpy=True,
).astype(np.float32)
en_embed_time = time.time() - start

# Report the result shape and the wall-clock cost (total and per document).
print(f'  Shape: {e5_en_embeddings.shape}')
ms_per_doc = en_embed_time / len(en_documents) * 1000
print(f'  Time: {en_embed_time:.1f}s ({ms_per_doc:.1f} ms/doc)')

# Persist the embeddings for the follow-up experiments.
en_embeddings_path = DATA_DIR / '10k_e5_base_en_embeddings.npy'
np.save(en_embeddings_path, e5_en_embeddings)
print(f'  Saved: {en_embeddings_path}')

Loading multilingual-e5-base...


Generating embeddings for 10000 English documents...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

  Shape: (10000, 768)
  Time: 794.6s (79.5 ms/doc)
  Saved: ../data/10k_e5_base_en_embeddings.npy


## 2. 日本語Wikipedia 10K（400Kからサブサンプル）

In [5]:
# Draw a 10K subsample from the pre-computed 400K Japanese embeddings.
# NOTE(review): this notebook states that the reused ITQ model was trained on
# 400K Japanese documents — presumably this same set. If so, the Japanese
# subsample is in-sample for the hash function while the English data is not;
# keep that in mind when comparing the two languages. TODO confirm.
print('Loading Japanese Wikipedia 400K embeddings...')
# Memory-map the file rather than reading it whole: only the rows selected
# below are copied into RAM.
e5_ja_400k = np.load(
    DATA_DIR / 'wikipedia_400k_e5_base_embeddings.npy', mmap_mode='r'
)
print(f'  400K shape: {e5_ja_400k.shape}')

# Reproducible sample without replacement (seed 42). Sorting keeps the rows
# in their original file order.
rng = np.random.default_rng(42)
ja_indices = rng.choice(len(e5_ja_400k), N_DOCUMENTS, replace=False)
ja_indices.sort()

# Index-array selection copies the chosen rows out of the memory map;
# np.asarray yields a plain in-memory float32 ndarray.
e5_ja_embeddings = np.asarray(e5_ja_400k[ja_indices], dtype=np.float32)
print(f'  Subsampled shape: {e5_ja_embeddings.shape}')

# Persist the subsample for the follow-up experiments.
np.save(DATA_DIR / '10k_e5_base_ja_embeddings.npy', e5_ja_embeddings)
print(f'  Saved: {DATA_DIR / "10k_e5_base_ja_embeddings.npy"}')

# Drop the reference to the 400K array (releases the memory map).
del e5_ja_400k

Loading Japanese Wikipedia 400K embeddings...


  400K shape: (399029, 768)
  Subsampled shape: (10000, 768)
  Saved: ../data/10k_e5_base_ja_embeddings.npy


## 3. ITQハッシュ生成（既存モデル再利用）

In [6]:
# Load the existing ITQ model (described in this notebook as trained on the
# 400K Japanese set) and print its bit width and matrix shapes.
print('Loading pre-trained ITQ model...')
itq_model_path = DATA_DIR / 'itq_e5_base_128bits.pkl'
itq = ITQLSH.load(str(itq_model_path))
print(f'  n_bits: {itq.n_bits}')
for attr_name in ('mean_vector', 'pca_matrix', 'rotation_matrix'):
    print(f'  {attr_name} shape: {getattr(itq, attr_name).shape}')

Loading pre-trained ITQ model...
  n_bits: 128
  mean_vector shape: (768,)
  pca_matrix shape: (768, 128)
  rotation_matrix shape: (128, 128)


In [7]:
def _hash_and_store(embeddings, banner, filename):
    """Hash *embeddings* with the loaded ITQ model, print stats and save them.

    Prints *banner*, the hash array's shape and its mean value (labelled
    "Bit balance"), writes the hashes to ``DATA_DIR / filename`` and returns
    them.
    """
    print(banner)
    hashes = itq.transform(embeddings)
    print(f'  Shape: {hashes.shape}')
    print(f'  Bit balance: {hashes.mean():.3f} (0.5が理想)')

    np.save(DATA_DIR / filename, hashes)
    print('  Saved.')
    return hashes


# English hashes.
e5_en_hashes = _hash_and_store(
    e5_en_embeddings,
    'Generating ITQ hashes for English 10K...',
    '10k_e5_base_en_hashes_128bits.npy',
)

# Japanese hashes (the banner starts with a blank line to separate sections).
e5_ja_hashes = _hash_and_store(
    e5_ja_embeddings,
    '\nGenerating ITQ hashes for Japanese 10K...',
    '10k_e5_base_ja_hashes_128bits.npy',
)

Generating ITQ hashes for English 10K...
  Shape: (10000, 128)
  Bit balance: 0.495 (0.5が理想)
  Saved.

Generating ITQ hashes for Japanese 10K...
  Shape: (10000, 128)
  Bit balance: 0.500 (0.5が理想)
  Saved.


## 4. Pivot選択とピボット距離計算

In [8]:
def select_pivots_furthest_first(hashes, n_pivots, seed=42):
    """Select pivot rows with the furthest-first heuristic.

    The first pivot is drawn at random. Every following pivot is the row whose
    smallest Hamming distance to the pivots chosen so far is the largest.

    Args:
        hashes: Indexable collection of hash codes. ``hashes[i]`` is passed to
            ``hamming_distance_batch`` as the query and ``hashes`` as the batch.
        n_pivots: Number of pivots requested. The random first pivot is always
            returned, so values below 1 still yield one index.
        seed: Seed for the random choice of the first pivot.

    Returns:
        List of row indices into ``hashes`` in selection order.

    Raises:
        ValueError: If more pivots are requested than there are rows (distinct
            pivots would be impossible), or if ``hashes`` is empty.
    """
    rng = np.random.default_rng(seed)
    n_samples = len(hashes)
    if n_pivots > n_samples:
        raise ValueError(
            f'n_pivots ({n_pivots}) exceeds the number of hashes ({n_samples})'
        )

    # Pick the first pivot at random.
    pivot_indices = [rng.integers(n_samples)]

    # min_dists[i] is the distance from row i to its nearest selected pivot.
    # It is updated with the newest pivot only, instead of being recomputed
    # against every earlier pivot on each round.
    min_dists = np.full(n_samples, np.inf)
    for _ in range(n_pivots - 1):
        newest = pivot_indices[-1]
        dists = hamming_distance_batch(hashes[newest], hashes).astype(float)
        np.minimum(min_dists, dists, out=min_dists)

        # Mark the pivot with -1 so argmax never selects it again. The mark
        # survives later np.minimum updates as long as distances are
        # non-negative (assumed of hamming_distance_batch).
        min_dists[newest] = -1

        # The row furthest from all current pivots becomes the next pivot.
        pivot_indices.append(np.argmax(min_dists))

    return pivot_indices

In [9]:
# Choose the English pivots and look up their hash codes.
print('Selecting pivots for English 10K...')
en_pivot_indices = select_pivots_furthest_first(e5_en_hashes, N_PIVOTS)
en_pivots = e5_en_hashes[en_pivot_indices]
print(f'  Pivot indices: {en_pivot_indices}')
print(f'  Pivots shape: {en_pivots.shape}')

# Pairwise distances between pivots; only pairs with first index < 3 and
# second index < 5 are printed to keep the output short.
print('  Pivot inter-distances:')
for left in range(N_PIVOTS):
    for right in range(left + 1, N_PIVOTS):
        pair_distance = hamming_distance(en_pivots[left], en_pivots[right])
        if left >= 3 or right >= 5:
            continue
        print(f'    p{left}-p{right}: {pair_distance}')

en_pivots_path = DATA_DIR / 'pivots_8_e5_base_en.npy'
np.save(en_pivots_path, en_pivots)

# Distance from every document hash to every pivot, one column per pivot.
# Stored as uint8, so distances above 255 would not fit.
print('\nComputing pivot distances for English...')
en_pivot_distances = np.zeros((e5_en_hashes.shape[0], N_PIVOTS), dtype=np.uint8)
for col in range(N_PIVOTS):
    en_pivot_distances[:, col] = hamming_distance_batch(en_pivots[col], e5_en_hashes)
print(f'  Shape: {en_pivot_distances.shape}')

en_pivot_distances_path = DATA_DIR / '10k_e5_base_en_pivot_distances.npy'
np.save(en_pivot_distances_path, en_pivot_distances)
print('  Saved.')

Selecting pivots for English 10K...
  Pivot indices: [np.int64(892), np.int64(1162), np.int64(7228), np.int64(550), np.int64(1279), np.int64(7631), np.int64(3242), np.int64(1196)]
  Pivots shape: (8, 128)
  Pivot inter-distances:
    p0-p1: 71
    p0-p2: 64
    p0-p3: 63
    p0-p4: 63
    p1-p2: 67
    p1-p3: 62
    p1-p4: 60
    p2-p3: 63
    p2-p4: 61

Computing pivot distances for English...
  Shape: (10000, 8)
  Saved.


In [10]:
# Choose the Japanese pivots and look up their hash codes.
print('Selecting pivots for Japanese 10K...')
ja_pivot_indices = select_pivots_furthest_first(e5_ja_hashes, N_PIVOTS)
ja_pivots = e5_ja_hashes[ja_pivot_indices]
print(f'  Pivot indices: {ja_pivot_indices}')
print(f'  Pivots shape: {ja_pivots.shape}')

ja_pivots_path = DATA_DIR / 'pivots_8_e5_base_ja.npy'
np.save(ja_pivots_path, ja_pivots)

# Distance from every document hash to every pivot, one column per pivot.
# Stored as uint8, so distances above 255 would not fit.
print('\nComputing pivot distances for Japanese...')
ja_pivot_distances = np.zeros((e5_ja_hashes.shape[0], N_PIVOTS), dtype=np.uint8)
for col in range(N_PIVOTS):
    ja_pivot_distances[:, col] = hamming_distance_batch(ja_pivots[col], e5_ja_hashes)
print(f'  Shape: {ja_pivot_distances.shape}')

ja_pivot_distances_path = DATA_DIR / '10k_e5_base_ja_pivot_distances.npy'
np.save(ja_pivot_distances_path, ja_pivot_distances)
print('  Saved.')

Selecting pivots for Japanese 10K...
  Pivot indices: [np.int64(892), np.int64(5578), np.int64(2208), np.int64(9775), np.int64(6268), np.int64(8130), np.int64(2972), np.int64(5272)]
  Pivots shape: (8, 128)

Computing pivot distances for Japanese...


  Shape: (10000, 8)
  Saved.


## 5. データ品質検証

In [11]:
def evaluate_hash_quality(embeddings, hashes, label, n_queries=100):
    """Report how well Hamming distance between hashes tracks cosine similarity.

    Samples query rows with a fixed seed (42). For each query it pairs the
    Hamming distance to every other row with the cosine similarity to that
    row. It then prints the Spearman rank correlation over all pairs, together
    with the mean and standard deviation of both quantities.

    Args:
        embeddings: 2-D array of embedding vectors, one row per document.
        hashes: Hash codes aligned row-for-row with ``embeddings``.
        label: Heading printed above the statistics.
        n_queries: Number of query rows to sample. Capped at the number of
            rows, because sampling is done without replacement.

    Returns:
        The Spearman correlation between Hamming distance and cosine
        similarity.
    """
    rng = np.random.default_rng(42)
    n_items = len(embeddings)
    # Sampling without replacement cannot draw more queries than rows exist.
    n_queries = min(n_queries, n_items)
    query_indices = rng.choice(n_items, n_queries, replace=False)

    # Collect one array per query and concatenate once at the end. This avoids
    # pushing every value into a Python list as an individual scalar.
    hamming_parts = []
    cosine_parts = []

    for qi in query_indices:
        h_dists = hamming_distance_batch(hashes[qi], hashes).astype(float)
        c_sims = cosine_similarity(embeddings[qi:qi+1], embeddings)[0]

        # Exclude the query's comparison with itself.
        mask = np.ones(n_items, dtype=bool)
        mask[qi] = False
        hamming_parts.append(h_dists[mask])
        cosine_parts.append(c_sims[mask])

    all_hamming = np.concatenate(hamming_parts) if hamming_parts else np.array([])
    all_cosine = np.concatenate(cosine_parts) if cosine_parts else np.array([])

    correlation, pvalue = spearmanr(all_hamming, all_cosine)
    print(f'{label}:')
    print(f'  Hamming-Cosine Spearman: {correlation:.4f} (p={pvalue:.2e})')
    print(f'  Hamming mean: {np.mean(all_hamming):.1f}, std: {np.std(all_hamming):.1f}')
    print(f'  Cosine mean: {np.mean(all_cosine):.4f}, std: {np.std(all_cosine):.4f}')
    return correlation

In [12]:
# Hamming/cosine correlation for each dataset.
rule_60 = '=' * 60
print(rule_60)
print('Hash Quality Evaluation')
print(rule_60)

en_corr = evaluate_hash_quality(
    e5_en_embeddings, e5_en_hashes, 'E5-base English 10K'
)
print()
ja_corr = evaluate_hash_quality(
    e5_ja_embeddings, e5_ja_hashes, 'E5-base Japanese 10K'
)

# Same check on the pre-existing MiniLM embeddings and hashes.
print()
minilm_embeddings = np.load(DATA_DIR / '10k_minilm_embeddings.npy')
minilm_hashes = np.load(DATA_DIR / '10k_minilm_hashes_128bits.npy')
minilm_corr = evaluate_hash_quality(
    minilm_embeddings, minilm_hashes, 'MiniLM English 10K'
)

Hash Quality Evaluation


E5-base English 10K:
  Hamming-Cosine Spearman: -0.4717 (p=0.00e+00)
  Hamming mean: 52.4, std: 6.4
  Cosine mean: 0.7393, std: 0.0365



E5-base Japanese 10K:
  Hamming-Cosine Spearman: -0.5283 (p=0.00e+00)
  Hamming mean: 64.0, std: 7.5
  Cosine mean: 0.7556, std: 0.0305



MiniLM English 10K:
  Hamming-Cosine Spearman: -0.6897 (p=0.00e+00)
  Hamming mean: 64.0, std: 6.4
  Cosine mean: 0.0214, std: 0.0892


In [13]:
# Baseline Recall@K check.
def evaluate_baseline_recall(embeddings, hashes, label, n_queries=100, top_k=10, candidate_limits=(100, 500, 1000)):
    """Measure Recall@top_k of Hamming-distance candidate retrieval.

    Samples query rows with a fixed seed (42). For each query the ground truth
    is the ``top_k`` rows with the highest cosine similarity (query excluded).
    Candidates are the rows with the smallest Hamming distance (query
    excluded). Recall is the fraction of ground-truth rows found among the
    first ``limit`` candidates.

    Args:
        embeddings: 2-D array of embedding vectors, one row per document.
        hashes: Hash codes aligned row-for-row with ``embeddings``.
        label: Heading printed above the recall figures.
        n_queries: Number of query rows to sample. Capped at the number of
            rows, because sampling is done without replacement.
        top_k: Size of the cosine-similarity ground-truth set.
        candidate_limits: Candidate-list sizes to evaluate.

    Returns:
        Dict mapping each candidate limit to the list of per-query recalls.
    """
    # Materialise once so a one-shot iterable can be traversed repeatedly.
    candidate_limits = tuple(candidate_limits)

    rng = np.random.default_rng(42)
    n_items = len(embeddings)
    # Sampling without replacement cannot draw more queries than rows exist.
    query_indices = rng.choice(n_items, min(n_queries, n_items), replace=False)

    results = {limit: [] for limit in candidate_limits}

    for qi in query_indices:
        # Ground truth: top-K rows by cosine similarity. The query itself is
        # pushed to -inf; -1 is a valid cosine value and could tie with
        # genuine results.
        cos_sims = cosine_similarity(embeddings[qi:qi+1], embeddings)[0]
        cos_sims[qi] = -np.inf
        gt = set(np.argsort(cos_sims)[-top_k:])

        # Rank all rows by Hamming distance, with the query itself last.
        h_dists = hamming_distance_batch(hashes[qi], hashes).astype(float)
        h_dists[qi] = np.inf
        sorted_indices = np.argsort(h_dists)

        for limit in candidate_limits:
            candidates = set(sorted_indices[:limit])
            recall = len(gt & candidates) / top_k
            results[limit].append(recall)

    print(f'\n{label} - Baseline Recall@{top_k}:')
    for limit in candidate_limits:
        mean_recall = np.mean(results[limit])
        print(f'  candidates={limit}: {mean_recall*100:.1f}%')

    return results

In [14]:
# Baseline recall of Hamming-ranked candidates for each dataset.
rule_60 = '=' * 60
print(rule_60)
print('Baseline Recall@10 Evaluation')
print(rule_60)

en_recall = evaluate_baseline_recall(
    e5_en_embeddings, e5_en_hashes, 'E5-base English 10K'
)
ja_recall = evaluate_baseline_recall(
    e5_ja_embeddings, e5_ja_hashes, 'E5-base Japanese 10K'
)
minilm_recall = evaluate_baseline_recall(
    minilm_embeddings, minilm_hashes, 'MiniLM English 10K'
)

Baseline Recall@10 Evaluation



E5-base English 10K - Baseline Recall@10:
  candidates=100: 61.5%
  candidates=500: 84.0%
  candidates=1000: 91.6%



E5-base Japanese 10K - Baseline Recall@10:
  candidates=100: 83.1%
  candidates=500: 98.1%
  candidates=1000: 99.0%



MiniLM English 10K - Baseline Recall@10:
  candidates=100: 88.3%
  candidates=500: 98.2%
  candidates=1000: 99.3%


## 6. サマリー

In [15]:
# Final report: datasets, hash-quality correlations and output files.
rule_70 = '=' * 70
print(rule_70)
print('Data Preparation Summary')
print(rule_70)

# Dataset table: name, embedding model, dimensionality, document count.
print('\n【Datasets】')
print(f'{"Dataset":<25} {"Model":<25} {"Dim":>5} {"Docs":>6}')
print('-' * 65)
for dataset_name, model_name, dim in (
    ('English Wikipedia', 'multilingual-e5-base', 768),
    ('Japanese Wikipedia', 'multilingual-e5-base', 768),
    ('English Wikipedia', 'all-MiniLM-L6-v2', 384),
):
    print(f'{dataset_name:<25} {model_name:<25} {dim:>5} {N_DOCUMENTS:>6}')

# Correlations computed earlier; labels are padded so the values line up.
print('\n【Hash Quality (Hamming-Cosine Spearman)】')
for corr_label, corr_value in (
    ('E5-base English:', en_corr),
    ('E5-base Japanese:', ja_corr),
    ('MiniLM English:', minilm_corr),
):
    print(f'  {corr_label:<17} {corr_value:.4f}')

# List every artifact this notebook writes, with its size in MB, or flag it
# as missing.
print('\n【Output Files】')
files = [
    '10k_e5_base_en_embeddings.npy',
    '10k_e5_base_en_hashes_128bits.npy',
    '10k_e5_base_en_pivot_distances.npy',
    'pivots_8_e5_base_en.npy',
    '10k_e5_base_ja_embeddings.npy',
    '10k_e5_base_ja_hashes_128bits.npy',
    '10k_e5_base_ja_pivot_distances.npy',
    'pivots_8_e5_base_ja.npy',
]
for filename in files:
    artifact = DATA_DIR / filename
    if not artifact.exists():
        print(f'  {filename:<45} NOT FOUND')
        continue
    size_mb = artifact.stat().st_size / 1024 / 1024
    print(f'  {filename:<45} {size_mb:.1f} MB')

Data Preparation Summary

【Datasets】
Dataset                   Model                       Dim   Docs
-----------------------------------------------------------------
English Wikipedia         multilingual-e5-base        768  10000
Japanese Wikipedia        multilingual-e5-base        768  10000
English Wikipedia         all-MiniLM-L6-v2            384  10000

【Hash Quality (Hamming-Cosine Spearman)】
  E5-base English:  -0.4717
  E5-base Japanese: -0.5283
  MiniLM English:   -0.6897

【Output Files】
  10k_e5_base_en_embeddings.npy                 29.3 MB
  10k_e5_base_en_hashes_128bits.npy             1.2 MB
  10k_e5_base_en_pivot_distances.npy            0.1 MB
  pivots_8_e5_base_en.npy                       0.0 MB
  10k_e5_base_ja_embeddings.npy                 29.3 MB
  10k_e5_base_ja_hashes_128bits.npy             1.2 MB
  10k_e5_base_ja_pivot_distances.npy            0.1 MB
  pivots_8_e5_base_ja.npy                       0.0 MB
