In [1]:
import re
from pathlib import Path

import numpy as np
from scipy.stats import spearmanr
from sympy.combinatorics import Permutation

from santa.utils import get_token2id, tokens2order, load_file

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Problem id whose candidate solution files are inspected in this notebook.
target_id = 5

# NOTE(review): `dir` shadows the builtin `dir()` for the rest of the kernel
# session; the name is kept so any cell that reads it continues to work.
dir = Path("./output/")

# Collect every "id<target_id>_0*.txt" file in the output directory, in sorted path order.
files = list(dir.glob(f"id{target_id}_0*.txt"))
files.sort()

# Optional: also include the solutions kept in the backup directory.
# dir = Path("./output/id5-bk")
# files += sorted(dir.glob(f"id{target_id}_0*.txt"))

# Display how many solution files were found.
len(files)

224

In [9]:
# Parameter settings
index        = 0     # position in `files` of the solution used as the baseline
score_th     = None  # when set, solutions with score > score_th are skipped
filter_token = None  # when set, solutions whose first token equals it are skipped
corr_th      = None  # when set, solutions with corr > corr_th are skipped

# Evaluate the baseline solution
best_text, best_score = load_file(files[index])
best_tokens = best_text.split()
token2id = get_token2id(best_text)
id2token = {v: k for k, v in token2id.items()}
best_order = tokens2order(best_tokens, token2id)

# Compute similarity to the other solutions.
# `texts`, `scores` and `transpositions` are parallel lists: entry k of each one
# describes the same solution, and only solutions that pass every filter are stored.
texts, scores, transpositions = [], [], []
# Texts already examined, for O(1) duplicate detection
# (assumes `load_file` returns the text as a hashable str -- TODO confirm).
seen_texts = set()
for i, filename in enumerate(files):
    text, score = load_file(filename)
    if text in seen_texts:
        continue
    if score_th is not None and score > score_th:
        continue
    seen_texts.add(text)
    tokens = text.split()
    if filter_token is not None and filter_token == tokens[0]:
        continue
    order = tokens2order(tokens, token2id)
    # Spearman rank correlation between this ordering and the baseline ordering.
    corr, _ = spearmanr(order, best_order)
    if corr_th is not None and corr_th < corr:
        continue
    # Record the solution only once it has passed all filters, so the three
    # result lists stay index-aligned.
    texts.append(text)
    scores.append(score)
    p = Permutation(order)
    transpositions.append(p.transpositions())
    num_swap = len(transpositions[-1])
    # Number of positions holding exactly the same token as the baseline.
    num_match = np.sum(np.array(tokens) == np.array(best_tokens))
    print(
        f"[id {i:>03}] init token={tokens[0]:<8}, n_match={num_match:3d}, "
        f"corr={corr:.2f}, n_swaps={num_swap:>2}, "
        f"score={score:.5f}, diff={score - best_score:.5f}"
    )

[id 000] init token=of      , n_match=100, corr=1.00, n_swaps= 0, score=28.90722, diff=0.00000
[id 003] init token=of      , n_match= 75, corr=1.00, n_swaps=24, score=28.95948, diff=0.05226
[id 005] init token=of      , n_match= 39, corr=0.77, n_swaps=56, score=28.97113, diff=0.06391
[id 006] init token=of      , n_match= 39, corr=0.77, n_swaps=57, score=28.97472, diff=0.06750
[id 007] init token=of      , n_match= 98, corr=1.00, n_swaps= 1, score=28.99655, diff=0.08933
[id 008] init token=of      , n_match= 75, corr=1.00, n_swaps=24, score=28.99806, diff=0.09084
[id 010] init token=of      , n_match= 73, corr=1.00, n_swaps=25, score=29.01615, diff=0.10893
[id 013] init token=of      , n_match= 39, corr=0.78, n_swaps=56, score=29.03403, diff=0.12681
[id 014] init token=of      , n_match= 39, corr=0.78, n_swaps=57, score=29.03520, diff=0.12798
[id 015] init token=of      , n_match= 39, corr=0.77, n_swaps=57, score=29.04886, diff=0.14164
[id 016] init token=of      , n_match= 45, corr=0.

In [10]:
# For every collected solution, reduce each token to its first character and
# report the solutions that have a "w"-initial token somewhere in positions 60-69.
for text, score in zip(texts, scores, strict=True):
    initials = [word[0] for word in text.split()]
    if "w" in initials[60:70]:
        # Show the initials from position 20 onward, followed by the score.
        print(" ".join(initials[20:]), score)

b d e f g g h h j l n p p r s s s t u v w w y c c c c c e f f m n o p r s s s s s w p a a b b b c c c c c d d d e f g g g g h h j j k m m m n n n o p p w w w w 28.90722
b d e f g g h h j l n p p r s s s t u v w w y c c c c c c e f f m n o p r s s s s s w p a a b b b c c c c d d d e f g g g g h h j j k m m m n n n o p p w w w w 28.95948
b d e f g g h h j l n p p r s s s t u w w b c c c c d d d e f g g h j j k m m m n n n v w w w y a a b b c c c c c c e f f g g h m n o o p p p r s s s s s w w p 28.97113
b d e f g g h h j l n p p r s s s t u w w b c c c c d d d e f g g h j j k m m m n n n v w w w y a a b b c c c c c c e f f g g h m n o o p p p r s s s s s w w p 28.97472
b d e f g g h h j l n p p r s s s t u v w w y c c c c c e f f m n o p r s s s s s w p a a b b b c c c c c d d d e f g g g g h h j j k m m m n n n o p p w w w w 28.99655
b d e f g g h h j l n p p r s s s t u v w w y c c c c c c e f f m n o p r s s s s s w p a a b b b c c c c d d d e f g g g g h h j j k m m m n n n o p p w w