In [1]:
from pathlib import Path
import json
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import re
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, make_scorer

In [2]:
REPO_URL = "https://github.com/eugeniavd/magic_tagger.git"  # <-- EDIT if needed
!git clone {REPO_URL}


fatal: destination path 'magic_tagger' already exists and is not an empty directory.


In [3]:
PROJECT_ROOT = Path("/content/magic_tagger")

csv_path = PROJECT_ROOT / "data" / "processed" / "classify_data_normalized.csv"

# --- load ---
df = pd.read_csv(csv_path, encoding="utf-8")
print("Loaded:", csv_path)
print("Shape:", df.shape)
display(df.head(5))

Loaded: /content/magic_tagger/data/processed/classify_data_normalized.csv
Shape: (50, 14)


Unnamed: 0,tale_id,rights_status,content_description,set,sampling_version,type_count,collection,volume_no,source_ref,atu_labels_json,txt_path,text_raw,summary_norm,text_norm
0,era_vene_1_503_1,open,[Царевна-лягушка].,core,v1_20251230,3,"ERA, Vene",1,"ERA, Vene 1, 503/4 (1)","[""402""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Тили были царь с царицей у не\nбыло три сына. ...,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...
1,era_vene_1_515_1,open,"[По пьяни мужик спорит, что сможет принести но...",coverage,v1_20251230,1,"ERA, Vene",1,"ERA, Vene 1, 515/6 (1)","[""410""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Раз пяное, ребятище» подился.\nчто можит в 12 ...","по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч..."
2,era_vene_12_105_22,open,Снегурочка.,core,v1_20251230,3,"ERA, Vene",12,"ERA, Vene 12, 105 (22)","[""703*""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Сделали дети со снегу куклу.\nВ одного старина...,снегурочка.,сделали дети со снегу куклу. в одного старина ...
3,era_vene_12_137_98,open,Иван-дурак.,core,v1_20251230,4,"ERA, Vene",12,"ERA, Vene 12, 137/41 (98)","[""530""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Кил-был стажк. В яво бло\nтра сегна. Миша, Гри...",иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш..."
4,era_vene_12_189_1,open,Два брата.,core,v1_20251230,2,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)","[""735A""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Жили – брели два брата.\nи посла смерти отца о...,два брата.,жили — брели два брата. и посла смерти отца об...


In [5]:
col = "atu_labels_json"

def parse_labels(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    try:
        v = json.loads(s)
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]

        return [str(v).strip()]
    except Exception:

        return [t.strip() for t in s.split(",") if t.strip()]

df["labels"] = df[col].apply(parse_labels)

unique_labels = sorted({lab for labs in df["labels"] for lab in labs})
print("Unique labels:", len(unique_labels))
print("Example:", unique_labels[:20])


Unique labels: 37
Example: ['1000', '1060', '1168', '1174', '300', '300A', '301', '302C*', '302С*', '307', '313', '325', '327A', '331', '402', '410', '425C', '470', '480A', '480D*']


In [6]:
label_counts = pd.Series([lab for labs in df["labels"] for lab in labs]).value_counts()
display(label_counts)

Unnamed: 0,count
707,6
480D*,5
530,3
703*,3
402,3
552,3
480A,3
650A,3
307,3
301,2


In [7]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print("Train:", train_df.shape, "| Test:", test_df.shape)
print("Unique labels in train:", len(set(sum(train_df["labels"].tolist(), []))))
print("Unique labels in test:", len(set(sum(test_df["labels"].tolist(), []))))

Train: (40, 15) | Test: (10, 15)
Unique labels in train: 34
Unique labels in test: 11


In [8]:
out_dir = Path("data/processed/splits")
out_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(out_dir / "train.csv", index=False, encoding="utf-8")
test_df.to_csv(out_dir / "test.csv", index=False, encoding="utf-8")
print("Saved splits to:", out_dir)

Saved splits to: data/processed/splits


In [9]:
train_df.head(5)

Unnamed: 0,tale_id,rights_status,content_description,set,sampling_version,type_count,collection,volume_no,source_ref,atu_labels_json,txt_path,text_raw,summary_norm,text_norm,labels
12,era_vene_13_137_16,open,"[У попа и попадьи было много земли, не могут н...",core,v1_20251230,3,"ERA, Vene",13,"ERA, Vene 13, 137/50 (16)","[""650A"", ""1000"", ""1060""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,16. Жин ной с попадьей. Брыло в\nн много земли...,"у попа и попадьи было много земли, не могут на...","16. жин ной с попадьей. брыло в н много земли,...","[650A, 1000, 1060]"
4,era_vene_12_189_1,open,Два брата.,core,v1_20251230,2,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)","[""735A""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Жили – брели два брата.\nи посла смерти отца о...,два брата.,жили — брели два брата. и посла смерти отца об...,[735A]
37,rkm_vene_1_82_47,open,[Царевич Иван не может найти себе невесту. Три...,core,v1_20251230,6,"RKM, Vene",1,"RKM, Vene 1, 82/103 (47)","[""707""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"в этаким, этаким царстве,\nв этаким, этаким го...",царевич иван не может найти себе невесту. три ...,"в этаким, этаким царстве, в этаким, этаким гос...",[707]
8,era_vene_12_592_4,open,Снегурочка.,core,v1_20251230,3,"ERA, Vene",12,"ERA, Vene 12, 592/4 (4)","[""703*""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Жия музик да баба стара\n\n\nлюди были. Не бре...,снегурочка.,жия музик да баба стара люди были. не брело в ...,[703*]
3,era_vene_12_137_98,open,Иван-дурак.,core,v1_20251230,4,"ERA, Vene",12,"ERA, Vene 12, 137/41 (98)","[""530""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Кил-был стажк. В яво бло\nтра сегна. Миша, Гри...",иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш...",[530]


## Feature construction

In [10]:
# --- noise features (per-document) ---
RE_CYR = re.compile(r"[\u0400-\u04FF]")
RE_LAT = re.compile(r"[A-Za-z]")
RE_DIG = re.compile(r"\d")
RE_PUNCT = re.compile(r"[^\w\s]", flags=re.UNICODE)
RE_RARE = re.compile(r"[^\u0400-\u04FFA-Za-z0-9\s\.\,\!\?\:\;\-\—\(\)\"\'«»…]", flags=re.UNICODE)

def tokenize_simple(s: str):
    return [t for t in re.split(r"\s+", s.strip()) if t]

def compute_noise_features(text: str) -> dict:
    text = "" if pd.isna(text) else str(text)
    n = len(text)
    if n == 0:
        return dict(
            cyr_ratio=0.0, lat_ratio=0.0, digit_ratio=0.0, punct_ratio=0.0, rare_ratio=0.0,
            avg_token_len=0.0, onechar_token_ratio=0.0, n_tokens=0, n_chars=0
        )

    cyr = len(RE_CYR.findall(text))
    lat = len(RE_LAT.findall(text))
    dig = len(RE_DIG.findall(text))
    punct = len(RE_PUNCT.findall(text))
    rare = len(RE_RARE.findall(text))

    toks = tokenize_simple(text)
    tok_lens = [len(t) for t in toks] if toks else []
    avg_tok_len = float(np.mean(tok_lens)) if tok_lens else 0.0
    onechar_ratio = float(np.mean([1 if len(t) == 1 else 0 for t in toks])) if toks else 0.0

    return dict(
        cyr_ratio=cyr / n,
        lat_ratio=lat / n,
        digit_ratio=dig / n,
        punct_ratio=punct / n,
        rare_ratio=rare / n,
        avg_token_len=avg_tok_len,
        onechar_token_ratio=onechar_ratio,
        n_tokens=len(toks),
        n_chars=n
    )

NUM_SRC_COL = "text_norm"  # считаем метрики по основному входу модели (рекомендуется)

def add_noise_cols(df_in: pd.DataFrame, src_col: str) -> pd.DataFrame:
    df_out = df_in.copy()
    feats = df_out[src_col].apply(compute_noise_features).apply(pd.Series)
    feats = feats.add_prefix("noise__")
    return pd.concat([df_out.reset_index(drop=True), feats.reset_index(drop=True)], axis=1)

train_df = add_noise_cols(train_df, NUM_SRC_COL)
test_df = add_noise_cols(test_df, NUM_SRC_COL)

noise_cols = [c for c in train_df.columns if c.startswith("noise__")]
print("Noise feature cols:", noise_cols)

Noise feature cols: ['noise__cyr_ratio', 'noise__lat_ratio', 'noise__digit_ratio', 'noise__punct_ratio', 'noise__rare_ratio', 'noise__avg_token_len', 'noise__onechar_token_ratio', 'noise__n_tokens', 'noise__n_chars']


In [11]:
train_df.info()
display(train_df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tale_id                     40 non-null     object 
 1   rights_status               40 non-null     object 
 2   content_description         40 non-null     object 
 3   set                         40 non-null     object 
 4   sampling_version            40 non-null     object 
 5   type_count                  40 non-null     int64  
 6   collection                  40 non-null     object 
 7   volume_no                   40 non-null     int64  
 8   source_ref                  40 non-null     object 
 9   atu_labels_json             40 non-null     object 
 10  txt_path                    36 non-null     object 
 11  text_raw                    40 non-null     object 
 12  summary_norm                40 non-null     object 
 13  text_norm                   40 non-nu

Unnamed: 0,tale_id,rights_status,content_description,set,sampling_version,type_count,collection,volume_no,source_ref,atu_labels_json,...,labels,noise__cyr_ratio,noise__lat_ratio,noise__digit_ratio,noise__punct_ratio,noise__rare_ratio,noise__avg_token_len,noise__onechar_token_ratio,noise__n_tokens,noise__n_chars
0,era_vene_13_137_16,open,"[У попа и попадьи было много земли, не могут н...",core,v1_20251230,3,"ERA, Vene",13,"ERA, Vene 13, 137/50 (16)","[""650A"", ""1000"", ""1060""]",...,"[650A, 1000, 1060]",0.779062,0.001114,0.000835,0.042601,0.001949,4.665615,0.12224,1268.0,7183.0
1,era_vene_12_189_1,open,Два брата.,core,v1_20251230,2,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)","[""735A""]",...,[735A],0.770933,0.0,0.003037,0.037744,0.000868,4.301149,0.167816,435.0,2305.0
2,rkm_vene_1_82_47,open,[Царевич Иван не может найти себе невесту. Три...,core,v1_20251230,6,"RKM, Vene",1,"RKM, Vene 1, 82/103 (47)","[""707""]",...,[707],0.780635,0.0,0.000577,0.04623,0.001401,4.79284,0.149403,2095.0,12135.0
3,era_vene_12_592_4,open,Снегурочка.,core,v1_20251230,3,"ERA, Vene",12,"ERA, Vene 12, 592/4 (4)","[""703*""]",...,[703*],0.784195,0.0,0.0,0.033435,0.001013,4.458564,0.154696,181.0,987.0
4,era_vene_12_137_98,open,Иван-дурак.,core,v1_20251230,4,"ERA, Vene",12,"ERA, Vene 12, 137/41 (98)","[""530""]",...,[530],0.780444,0.0,0.002222,0.035111,0.000889,4.476886,0.138686,411.0,2250.0


In [12]:
import json
import ast
import numpy as np
import pandas as pd

def ensure_list(x):
    """labels должны быть list[str]. Если вдруг строка — распарсим."""
    if isinstance(x, list):
        return [str(t).strip() for t in x if str(t).strip()]
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    # пробуем json
    try:
        v = json.loads(s)
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]
        return [str(v).strip()]
    except Exception:
        pass
    # пробуем python literal
    try:
        v = ast.literal_eval(s)
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]
    except Exception:
        pass
    # fallback: csv-like
    return [t.strip() for t in s.split(",") if t.strip()]

def prepare_audit_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # строки
    for col in ["tale_id", "rights_status", "content_description", "set", "sampling_version",
                "collection", "source_ref", "txt_path", "text_raw", "summary_norm", "text_norm", "atu_labels_json"]:
        if col in df.columns:
            df[col] = df[col].fillna("").astype(str)

    # числа
    for col in ["type_count", "volume_no"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    # labels
    if "labels" in df.columns:
        df["labels"] = df["labels"].apply(ensure_list)
    elif "atu_labels_json" in df.columns:
        df["labels"] = df["atu_labels_json"].apply(ensure_list)
    else:
        raise ValueError("No labels column found (labels / atu_labels_json).")

    # sanity
    df["text_norm"] = df.get("text_norm", "").fillna("").astype(str)
    df["summary_norm"] = df.get("summary_norm", "").fillna("").astype(str)

    # удалим явные пустые строки по id
    df = df[df["tale_id"].str.strip().ne("")].copy()

    return df

audit_cols = [
    "tale_id",
    "rights_status", "collection", "volume_no", "source_ref",
    "content_description", "set", "sampling_version", "type_count",
    "atu_labels_json", "labels",
    "txt_path", "text_raw",
    "summary_norm", "text_norm",
] + [c for c in train_df.columns if c.startswith("noise__")]

df_train_audit = prepare_audit_df(train_df)

# оставим только те колонки, которые реально есть
audit_cols_existing = [c for c in audit_cols if c in df_train_audit.columns]
df_train_audit = df_train_audit[audit_cols_existing].copy()

print("Audit DF:", df_train_audit.shape)
display(df_train_audit.head(3))


Audit DF: (40, 24)


Unnamed: 0,tale_id,rights_status,collection,volume_no,source_ref,content_description,set,sampling_version,type_count,atu_labels_json,...,text_norm,noise__cyr_ratio,noise__lat_ratio,noise__digit_ratio,noise__punct_ratio,noise__rare_ratio,noise__avg_token_len,noise__onechar_token_ratio,noise__n_tokens,noise__n_chars
0,era_vene_13_137_16,open,"ERA, Vene",13,"ERA, Vene 13, 137/50 (16)","[У попа и попадьи было много земли, не могут н...",core,v1_20251230,3,"[""650A"", ""1000"", ""1060""]",...,"16. жин ной с попадьей. брыло в н много земли,...",0.779062,0.001114,0.000835,0.042601,0.001949,4.665615,0.12224,1268.0,7183.0
1,era_vene_12_189_1,open,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)",Два брата.,core,v1_20251230,2,"[""735A""]",...,жили — брели два брата. и посла смерти отца об...,0.770933,0.0,0.003037,0.037744,0.000868,4.301149,0.167816,435.0,2305.0
2,rkm_vene_1_82_47,open,"RKM, Vene",1,"RKM, Vene 1, 82/103 (47)",[Царевич Иван не может найти себе невесту. Три...,core,v1_20251230,6,"[""707""]",...,"в этаким, этаким царстве, в этаким, этаким гос...",0.780635,0.0,0.000577,0.04623,0.001401,4.79284,0.149403,2095.0,12135.0


In [13]:
USE_NOISE_IN_MODEL = False

noise_cols = [c for c in df_train_audit.columns if c.startswith("noise__")]

model_cols = ["tale_id", "text_norm", "summary_norm", "labels"]
if USE_NOISE_IN_MODEL:
    model_cols += noise_cols

df_train_model = df_train_audit[model_cols].copy()

# фильтры качества для обучения
df_train_model = df_train_model[
    df_train_model["text_norm"].str.strip().ne("") &
    df_train_model["labels"].map(len).gt(0)
].copy()

# на всякий случай пустые summary допустимы
df_train_model["summary_norm"] = df_train_model["summary_norm"].fillna("").astype(str)

print("Model DF:", df_train_model.shape)
print("Docs:", df_train_model["tale_id"].nunique())
print("Avg labels/doc:", df_train_model["labels"].map(len).mean())
display(df_train_model.head(3))


Model DF: (40, 4)
Docs: 40
Avg labels/doc: 1.325


Unnamed: 0,tale_id,text_norm,summary_norm,labels
0,era_vene_13_137_16,"16. жин ной с попадьей. брыло в н много земли,...","у попа и попадьи было много земли, не могут на...","[650A, 1000, 1060]"
1,era_vene_12_189_1,жили — брели два брата. и посла смерти отца об...,два брата.,[735A]
2,rkm_vene_1_82_47,"в этаким, этаким царстве, в этаким, этаким гос...",царевич иван не может найти себе невесту. три ...,[707]


## baseline model

In [42]:
import numpy as np
from collections import Counter
from sklearn.model_selection import KFold

def topk_parents_by_freq(labels_parent_lists, k=3):
    """
    labels_parent_lists: list[list[str]] - parent labels for each doc in TRAIN
    Returns: list[str] top-k parent labels by document-level frequency.
    """
    cnt = Counter()
    for labs in labels_parent_lists:
        # считаем по документам, чтобы multi-label не "раздувал" слишком сильно
        for lab in set(labs):
            cnt[lab] += 1
    return [lab for lab, _ in cnt.most_common(k)]

def parent_hit_at_k_constant_preds(y_true_parent_lists, pred_parents_topk):
    """
    y_true_parent_lists: list[list[str]]
    pred_parents_topk: list[str] length k
    """
    pred_set = set(pred_parents_topk)
    hits = [(1 if (pred_set & set(true_labs)) else 0) for true_labs in y_true_parent_lists]
    return float(np.mean(hits))

def cv_frequency_baseline_parent_hit_at_k(df, label_col="labels_parent", k=3, n_splits=3, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df = df.reset_index(drop=True)

    scores = []
    for fold, (tr, va) in enumerate(cv.split(df), start=1):
        y_tr = df.iloc[tr][label_col].tolist()
        y_va = df.iloc[va][label_col].tolist()

        topk = topk_parents_by_freq(y_tr, k=k)
        score = parent_hit_at_k_constant_preds(y_va, topk)

        scores.append(score)
        print(f"Fold {fold} Freq-baseline Top{k}={topk}  Parent-Hit@{k}: {score:.3f}")

    scores = np.array(scores)
    print(f"CV Freq-baseline Parent-Hit@{k}: mean={scores.mean():.3f} std={scores.std():.3f} scores={scores}")
    return scores

# ====== Использование ======
# Предполагается, что у вас есть train_df/test_df и в них есть колонка labels_parent (list[str])

# 1) CV baseline на train_df
cv_base = cv_frequency_baseline_parent_hit_at_k(
    train_df, label_col="labels_parent", k=3, n_splits=3, random_state=42
)

# 2) Baseline на hold-out test: top-3 по всему train, оценка на test
top3_train = topk_parents_by_freq(train_df["labels_parent"].tolist(), k=3)
test_base = parent_hit_at_k_constant_preds(test_df["labels_parent"].tolist(), top3_train)

print("Train top-3 parents by freq:", top3_train)
print(f"Test Freq-baseline Parent-Hit@3: {test_base:.3f}")


KeyError: 'labels_parent'

In [14]:
X_train = train_df
X_test = test_df

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df["labels"])
y_test = mlb.transform(test_df["labels"])

# TF-IDF: ограничиваем словарь, чтобы CV не тормозил
text_char = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=2,
    max_features=50000,     # можно 20000–80000
    sublinear_tf=True
)

summary_word = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=1,
    max_features=20000,
    sublinear_tf=True
)

preprocess = ColumnTransformer(
    transformers=[
        ("char_tfidf", text_char, "text_norm"),
        ("sum_tfidf", summary_word, "summary_norm"),
        ("noise", StandardScaler(with_mean=False), noise_cols),  # Pipeline не нужен
    ],
    remainder="drop",
    sparse_threshold=0.3
)

clf = OneVsRestClassifier(
    LogisticRegression(
        max_iter=2000,
        solver="liblinear"   # быстрее/стабильнее на малых данных
    ),
    n_jobs=-1               # параллелим по классам
)

model = Pipeline([
    ("features", preprocess),
    ("clf", clf),
])

model



In [15]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def f1_macro_multilabel(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro", zero_division=0)

scorer = make_scorer(f1_macro_multilabel)

scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scorer)
print("CV F1-macro:", scores, "mean=", scores.mean(), "std=", scores.std())

CV F1-macro: [0.02941176 0.         0.         0.         0.        ] mean= 0.0058823529411764705 std= 0.011764705882352941


In [16]:
model.fit(X_train, y_train)
pred = model.predict(X_train)
print("Avg positives per sample:", pred.sum(axis=1).mean())
print("Total predicted positives:", pred.sum())


Avg positives per sample: 0.325
Total predicted positives: 13


In [17]:
import numpy as np

model.fit(X_train, y_train)

# scores: (n_samples, n_classes)
# Для LogisticRegression в OVR predict_proba должен быть доступен
proba = model.predict_proba(X_train)

topk = 3
topk_idx = np.argsort(-proba, axis=1)[:, :topk]

# истинные классы в индексы
true_idx = [set(np.where(row == 1)[0]) for row in y_train]

hits = []
for i in range(len(true_idx)):
    hits.append(1 if any(j in true_idx[i] for j in topk_idx[i]) else 0)

print(f"Train Hit@{topk}: {np.mean(hits):.3f}")


Train Hit@3: 1.000


In [18]:
import numpy as np

def hit_at_k_from_proba(y_true_bin, proba, k=3):
    """
    y_true_bin: np.array shape (n_samples, n_classes), 0/1
    proba: np.array shape (n_samples, n_classes), float
    """
    topk_idx = np.argsort(-proba, axis=1)[:, :k]
    true_idx = [set(np.where(row == 1)[0]) for row in y_true_bin]

    hits = []
    for i in range(len(true_idx)):
        hits.append(1 if any(j in true_idx[i] for j in topk_idx[i]) else 0)
    return float(np.mean(hits))


In [19]:
from sklearn.model_selection import KFold
from sklearn.base import clone

def cv_hit_at_k(model, X, y, k=3, n_splits=3, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    X = X.reset_index(drop=True)  # важно: корректная индексация
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X), start=1):
        m = clone(model)
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        m.fit(X_tr, y_tr)
        proba = m.predict_proba(X_va)   # ранжируем по вероятностям
        score = hit_at_k_from_proba(y_va, proba, k=k)
        scores.append(score)
        print(f"Fold {fold} Hit@{k}: {score:.3f}")

    scores = np.array(scores)
    print(f"CV Hit@{k}: mean={scores.mean():.3f} std={scores.std():.3f} scores={scores}")
    return scores

# запуск
cv_scores = cv_hit_at_k(model, X_train, y_train, k=3, n_splits=3, random_state=42)


Fold 1 Hit@3: 0.429
Fold 2 Hit@3: 0.154
Fold 3 Hit@3: 0.154
CV Hit@3: mean=0.245 std=0.130 scores=[0.42857143 0.15384615 0.15384615]


In [20]:
# обучаем на всем train
model.fit(X_train, y_train)

# оцениваем на test
proba_test = model.predict_proba(X_test)
test_hit3 = hit_at_k_from_proba(y_test, proba_test, k=3)
print(f"Test Hit@3: {test_hit3:.3f}")


Test Hit@3: 0.400


метод 2

In [21]:
import re

RE_ATU_PARENT = re.compile(r"(\d{1,4})")  # берем ведущие цифры

def atu_parent(label: str) -> str:
    """
    Примеры:
      "530" -> "530"
      "ATU_530A" -> "530"
      "530A" -> "530"
      "ATU 327A" -> "327"
    """
    if label is None:
        return ""
    s = str(label)
    m = RE_ATU_PARENT.search(s)
    return m.group(1) if m else s.strip()


In [22]:
import pandas as pd

# train_df/test_df уже есть, и в них есть колонка labels = список строк (оригинальные ATU)
# пример: ["ATU_327A", "530"]

def to_parent_set(labels):
    return sorted({atu_parent(x) for x in labels if str(x).strip()})

train_df = train_df.copy()
test_df = test_df.copy()

train_df["labels_parent"] = train_df["labels"].apply(to_parent_set)
test_df["labels_parent"] = test_df["labels"].apply(to_parent_set)


In [23]:
import numpy as np
from collections import Counter

def stratified_multilabel_split_by_parent(
    df: pd.DataFrame,
    label_col: str = "labels_parent",
    test_size: float = 0.2,
    random_state: int = 42
):
    rng = np.random.RandomState(random_state)
    df = df.reset_index(drop=True).copy()

    n = len(df)
    n_test = max(1, int(round(n * test_size)))

    # counts of parent labels across documents
    all_counts = Counter(lab for labs in df[label_col] for lab in labs)

    # singleton labels cannot be placed in test (otherwise they vanish from train)
    # rule: we only move a doc to test if for all its labels count_remaining[label] >= 2
    remaining = Counter(all_counts)

    test_idx = []
    covered_test = set()

    candidates = list(range(n))
    rng.shuffle(candidates)

    def is_safe(i):
        labs = df.at[i, label_col]
        return all(remaining[lab] >= 2 for lab in labs)

    def gain(i):
        labs = set(df.at[i, label_col])
        return len(labs - covered_test)

    # Greedy selection: maximize new label coverage, keep train coverage safe
    while len(test_idx) < n_test:
        safe = [i for i in candidates if i not in test_idx and is_safe(i)]
        if not safe:
            break

        # pick the safe doc with best gain; tie-breaker random
        gains = np.array([gain(i) for i in safe])
        best_gain = gains.max()
        best = [safe[j] for j in np.where(gains == best_gain)[0]]
        chosen = rng.choice(best)

        test_idx.append(chosen)
        for lab in df.at[chosen, label_col]:
            remaining[lab] -= 1
            covered_test.add(lab)

    # If we did not reach desired test size, fill with any remaining safe docs randomly
    if len(test_idx) < n_test:
        safe_rest = [i for i in range(n) if i not in test_idx and is_safe(i)]
        rng.shuffle(safe_rest)
        need = n_test - len(test_idx)
        test_idx.extend(safe_rest[:need])

    test_idx = sorted(set(test_idx))
    train_idx = [i for i in range(n) if i not in test_idx]

    train_df = df.iloc[train_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].reset_index(drop=True)

    return train_df, test_df, all_counts

# Применение
df_split_src = df.copy()  # ваш исходный df с колонками text_norm/summary_norm/labels и т.д.
df_split_src["labels_parent"] = df_split_src["labels"].apply(to_parent_set)

train_df, test_df, parent_counts = stratified_multilabel_split_by_parent(
    df_split_src,
    label_col="labels_parent",
    test_size=0.2,
    random_state=42
)

print("Train:", train_df.shape, "Test:", test_df.shape)

# Диагностика покрытия
train_parents = set(lab for labs in train_df["labels_parent"] for lab in labs)
test_parents  = set(lab for labs in test_df["labels_parent"] for lab in labs)

print("Unique parent labels total:", len(set(parent_counts)))
print("Unique parent labels in train:", len(train_parents))
print("Unique parent labels in test:", len(test_parents))
print("Parents only in train (singletons etc.):", len(train_parents - test_parents))
print("Parents only in test (should be 0 ideally):", len(test_parents - train_parents))


Train: (40, 16) Test: (10, 16)
Unique parent labels total: 32
Unique parent labels in train: 32
Unique parent labels in test: 11
Parents only in train (singletons etc.): 21
Parents only in test (should be 0 ideally): 0


In [24]:
train_df = add_noise_cols(train_df, src_col="text_norm")
test_df  = add_noise_cols(test_df,  src_col="text_norm")

# 2) Обновляем список колонок (важно!)
noise_cols = [c for c in train_df.columns if c.startswith("noise__")]
print("noise cols:", noise_cols)


noise cols: ['noise__cyr_ratio', 'noise__lat_ratio', 'noise__digit_ratio', 'noise__punct_ratio', 'noise__rare_ratio', 'noise__avg_token_len', 'noise__onechar_token_ratio', 'noise__n_tokens', 'noise__n_chars']


In [25]:
from sklearn.preprocessing import StandardScaler

preprocess = ColumnTransformer(
    transformers=[
        ("char_tfidf", text_char, "text_norm"),
        ("sum_tfidf", summary_word, "summary_norm"),
        ("noise", StandardScaler(with_mean=False), noise_cols),
    ],
    remainder="drop"
)

model = Pipeline([("features", preprocess), ("clf", clf)])


In [26]:
import numpy as np

def hit_at_k_parent_any(y_true_labels_parent, proba, classes, k=3):
    """
    y_true_labels_parent: list[list[str]] - истинные parent-метки на документ
    proba: np.ndarray shape (n_samples, n_classes)
    classes: array-like, mlb.classes_ (оригинальные классы)
    """
    classes_parent = np.array([atu_parent(c) for c in classes])

    topk_idx = np.argsort(-proba, axis=1)[:, :k]
    hits = []

    for i, true_parents in enumerate(y_true_labels_parent):
        true_set = set(true_parents)
        pred_parent_set = set(classes_parent[topk_idx[i]])
        hits.append(1 if (true_set & pred_parent_set) else 0)

    return float(np.mean(hits))


In [27]:
from sklearn.model_selection import KFold
from sklearn.base import clone

def cv_hit_at_k_parent(model, X, y_labels_parent, mlb, k=3, n_splits=3, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    X = X.reset_index(drop=True)

    scores = []
    for fold, (tr, va) in enumerate(cv.split(X), start=1):
        m = clone(model)
        m.fit(X.iloc[tr], mlb.transform(X.iloc[tr]["labels"]))  # если labels лежит в X
        proba = m.predict_proba(X.iloc[va])

        score = hit_at_k_parent_any(
            y_true_labels_parent=[y_labels_parent[i] for i in va],
            proba=proba,
            classes=mlb.classes_,
            k=k
        )
        scores.append(score)
        print(f"Fold {fold} Parent-Hit@{k}: {score:.3f}")

    scores = np.array(scores)
    print(f"CV Parent-Hit@{k}: mean={scores.mean():.3f} std={scores.std():.3f} scores={scores}")
    return scores

# 1) готовим X/y
X_train = train_df.copy()
X_test  = test_df.copy()

In [28]:


# y для обучения модели — по исходным классам (как раньше)
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df["labels"])
y_test = mlb.transform(test_df["labels"])

# 2) CV по родительской метрике (на train)
cv_scores = cv_hit_at_k_parent(
    model=model,
    X=X_train,
    y_labels_parent=X_train["labels_parent"].tolist(),
    mlb=mlb,
    k=3,
    n_splits=3,
    random_state=42
)

# 3) Fit на train и оценка на test по родительской метрике
model.fit(X_train, y_train)
proba_test = model.predict_proba(X_test)

test_parent_hit3 = hit_at_k_parent_any(
    y_true_labels_parent=X_test["labels_parent"].tolist(),
    proba=proba_test,
    classes=mlb.classes_,
    k=3
)
print(f"Test Parent-Hit@3: {test_parent_hit3:.3f}")


Fold 1 Parent-Hit@3: 0.357
Fold 2 Parent-Hit@3: 0.308
Fold 3 Parent-Hit@3: 0.077
CV Parent-Hit@3: mean=0.247 std=0.122 scores=[0.35714286 0.30769231 0.07692308]
Test Parent-Hit@3: 0.400


In [29]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import clone
from collections import Counter

def topk_parents_by_freq(labels_parent_lists, k=3):
    cnt = Counter()
    for labs in labels_parent_lists:
        for lab in set(labs):
            cnt[lab] += 1
    return [lab for lab, _ in cnt.most_common(k)]

def hits_constant_topk(y_true_parent_lists, pred_topk):
    pred_set = set(pred_topk)
    return np.array([1 if (pred_set & set(true_labs)) else 0 for true_labs in y_true_parent_lists], dtype=int)

def hits_model_topk_parent(proba, classes, y_true_parent_lists, k=3):
    classes_parent = np.array([atu_parent(c) for c in classes])
    topk_idx = np.argsort(-proba, axis=1)[:, :k]
    hits = []
    for i, true_labs in enumerate(y_true_parent_lists):
        pred_set = set(classes_parent[topk_idx[i]])
        hits.append(1 if (pred_set & set(true_labs)) else 0)
    return np.array(hits, dtype=int)

def cv_compare_model_vs_freq_baseline(model, df, mlb, k=3, n_splits=3, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df = df.reset_index(drop=True)

    all_model_hits = []
    all_base_hits = []

    for fold, (tr, va) in enumerate(cv.split(df), start=1):
        df_tr = df.iloc[tr].reset_index(drop=True)
        df_va = df.iloc[va].reset_index(drop=True)

        # baseline top-k from TRAIN fold
        base_topk = topk_parents_by_freq(df_tr["labels_parent"].tolist(), k=k)
        base_hits = hits_constant_topk(df_va["labels_parent"].tolist(), base_topk)

        # model
        m = clone(model)
        y_tr = mlb.transform(df_tr["labels"])
        m.fit(df_tr, y_tr)
        proba = m.predict_proba(df_va)
        model_hits = hits_model_topk_parent(proba, mlb.classes_, df_va["labels_parent"].tolist(), k=k)

        all_model_hits.append(model_hits)
        all_base_hits.append(base_hits)

        print(f"Fold {fold}: model Hit@{k}={model_hits.mean():.3f} | baseline Hit@{k}={base_hits.mean():.3f} | top{k}={base_topk}")

    all_model_hits = np.concatenate(all_model_hits)
    all_base_hits = np.concatenate(all_base_hits)

    win = np.sum((all_model_hits == 1) & (all_base_hits == 0))
    lose = np.sum((all_model_hits == 0) & (all_base_hits == 1))
    tie = np.sum(all_model_hits == all_base_hits)

    print("\nPaired comparison over all CV validation examples:")
    print("Model wins (model=1, base=0):", win)
    print("Model loses (model=0, base=1):", lose)
    print("Ties:", tie)
    print("Mean model hit:", all_model_hits.mean(), "Mean baseline hit:", all_base_hits.mean())

# запуск (train_df должен содержать labels и labels_parent)
cv_compare_model_vs_freq_baseline(model, train_df, mlb, k=3, n_splits=3, random_state=42)


Fold 1: model Hit@3=0.357 | baseline Hit@3=0.500 | top3=['707', '480', '402']
Fold 2: model Hit@3=0.308 | baseline Hit@3=0.231 | top3=['480', '707', '552']
Fold 3: model Hit@3=0.077 | baseline Hit@3=0.077 | top3=['480', '402', '703']

Paired comparison over all CV validation examples:
Model wins (model=1, base=0): 1
Model loses (model=0, base=1): 2
Ties: 37
Mean model hit: 0.25 Mean baseline hit: 0.275


In [31]:
import pandas as pd

df_model = df_train_model.copy()

# 1) гарантируем строки
df_model["text_norm"] = df_model["text_norm"].fillna("").astype(str)
df_model["summary_norm"] = df_model["summary_norm"].fillna("").astype(str)

# 2) fallback: если summary пустой, берём первые N символов текста
N_FALLBACK = 800
df_model["summary_fallback"] = df_model["summary_norm"].where(
    df_model["summary_norm"].str.strip().ne(""),
    df_model["text_norm"].str.slice(0, N_FALLBACK)
)

# проверка
display(df_model[["tale_id", "summary_norm", "summary_fallback"]].head(5))


Unnamed: 0,tale_id,summary_norm,summary_fallback
0,era_vene_13_137_16,"у попа и попадьи было много земли, не могут на...","у попа и попадьи было много земли, не могут на..."
1,era_vene_12_189_1,два брата.,два брата.
2,rkm_vene_1_82_47,царевич иван не может найти себе невесту. три ...,царевич иван не может найти себе невесту. три ...
3,era_vene_12_592_4,снегурочка.,снегурочка.
4,era_vene_12_137_98,иван-дурак.,иван-дурак.


In [32]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

text_char = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=1,             # для малого корпуса часто лучше 1
    max_features=50000,   # ограничиваем размерность
    sublinear_tf=True
)

summary_word = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=1,
    max_features=20000,
    sublinear_tf=True
)

preprocess = ColumnTransformer(
    transformers=[
        ("char_tfidf", text_char, "text_norm"),
        ("sum_tfidf", summary_word, "summary_fallback"),
    ],
    remainder="drop"
)

clf = OneVsRestClassifier(
    LogisticRegression(max_iter=2000, solver="liblinear"),
    n_jobs=-1
)

model = Pipeline([
    ("features", preprocess),
    ("clf", clf),
])

model


In [33]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# y
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_model["labels"])

print("Docs:", len(df_model))
print("Unique labels:", len(mlb.classes_))

# parent labels for evaluation
import re
RE_ATU_PARENT = re.compile(r"(\d{1,4})")

def atu_parent(label: str) -> str:
    if label is None:
        return ""
    s = str(label)
    m = RE_ATU_PARENT.search(s)
    return m.group(1) if m else s.strip()

def to_parent_set(labels):
    return sorted({atu_parent(x) for x in labels if str(x).strip()})

df_model["labels_parent"] = df_model["labels"].apply(to_parent_set)


Docs: 40
Unique labels: 34


In [35]:
from collections import Counter

def stratified_multilabel_split_by_parent(
    df,
    label_col="labels_parent",
    test_size=0.2,
    random_state=42
):
    rng = np.random.RandomState(random_state)
    df = df.reset_index(drop=True).copy()

    n = len(df)
    n_test = max(1, int(round(n * test_size)))

    all_counts = Counter(lab for labs in df[label_col] for lab in labs)
    remaining = Counter(all_counts)

    test_idx = []
    covered_test = set()

    candidates = list(range(n))
    rng.shuffle(candidates)

    def is_safe(i):
        labs = df.at[i, label_col]
        return all(remaining[lab] >= 2 for lab in labs)

    def gain(i):
        labs = set(df.at[i, label_col])
        return len(labs - covered_test)

    while len(test_idx) < n_test:
        safe = [i for i in candidates if i not in test_idx and is_safe(i)]
        if not safe:
            break
        gains = np.array([gain(i) for i in safe])
        best_gain = gains.max()
        best = [safe[j] for j in np.where(gains == best_gain)[0]]
        chosen = rng.choice(best)

        test_idx.append(chosen)
        for lab in df.at[chosen, label_col]:
            remaining[lab] -= 1
            covered_test.add(lab)

    if len(test_idx) < n_test:
        safe_rest = [i for i in range(n) if i not in test_idx and is_safe(i)]
        rng.shuffle(safe_rest)
        need = n_test - len(test_idx)
        test_idx.extend(safe_rest[:need])

    test_idx = sorted(set(test_idx))
    train_idx = [i for i in range(n) if i not in test_idx]

    return df.iloc[train_idx].reset_index(drop=True), df.iloc[test_idx].reset_index(drop=True)

train_df, test_df = stratified_multilabel_split_by_parent(df_model, test_size=0.2, random_state=42)
print("Train:", train_df.shape, "| Test:", test_df.shape)



Train: (32, 6) | Test: (8, 6)


In [36]:
X_train = train_df
X_test = test_df

y_train = mlb.transform(train_df["labels"])
y_test = mlb.transform(test_df["labels"])

print("Unique labels in train:", len({lab for labs in train_df["labels"] for lab in labs}))
print("Unique labels in test:", len({lab for labs in test_df["labels"] for lab in labs}))


Unique labels in train: 34
Unique labels in test: 9


In [37]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import clone

def hit_at_k_parent_any(y_true_labels_parent, proba, classes, k=3):
    classes_parent = np.array([atu_parent(c) for c in classes])
    topk_idx = np.argsort(-proba, axis=1)[:, :k]
    hits = []
    for i, true_parents in enumerate(y_true_labels_parent):
        true_set = set(true_parents)
        pred_set = set(classes_parent[topk_idx[i]])
        hits.append(1 if (true_set & pred_set) else 0)
    return float(np.mean(hits))

def cv_parent_hit_at_k(model, X, y_bin, y_parent_lists, mlb, k=3, n_splits=3, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    X = X.reset_index(drop=True)
    scores = []
    for fold, (tr, va) in enumerate(cv.split(X), start=1):
        m = clone(model)
        m.fit(X.iloc[tr], y_bin[tr])
        proba = m.predict_proba(X.iloc[va])
        score = hit_at_k_parent_any(
            y_true_labels_parent=[y_parent_lists[i] for i in va],
            proba=proba,
            classes=mlb.classes_,
            k=k
        )
        scores.append(score)
        print(f"Fold {fold} Parent-Hit@{k}: {score:.3f}")
    scores = np.array(scores)
    print(f"CV Parent-Hit@{k}: mean={scores.mean():.3f} std={scores.std():.3f} scores={scores}")
    return scores

# CV на train
cv_scores = cv_parent_hit_at_k(
    model=model,
    X=X_train,
    y_bin=y_train,
    y_parent_lists=X_train["labels_parent"].tolist(),
    mlb=mlb,
    k=3,
    n_splits=3,
    random_state=42
)


Fold 1 Parent-Hit@3: 0.182
Fold 2 Parent-Hit@3: 0.273
Fold 3 Parent-Hit@3: 0.200
CV Parent-Hit@3: mean=0.218 std=0.039 scores=[0.18181818 0.27272727 0.2       ]


In [38]:
from collections import Counter

def topk_parents_by_freq(labels_parent_lists, k=3):
    cnt = Counter()
    for labs in labels_parent_lists:
        for lab in set(labs):
            cnt[lab] += 1
    return [lab for lab, _ in cnt.most_common(k)]

def parent_hit_at_k_constant_preds(y_true_parent_lists, pred_parents_topk):
    pred_set = set(pred_parents_topk)
    hits = [(1 if (pred_set & set(true_labs)) else 0) for true_labs in y_true_parent_lists]
    return float(np.mean(hits))

top3_train = topk_parents_by_freq(X_train["labels_parent"].tolist(), k=3)
cv_base = parent_hit_at_k_constant_preds(X_train["labels_parent"].tolist(), top3_train)

print("Train top-3 parents by freq:", top3_train)
print(f"Train Freq-baseline Parent-Hit@3 (on train, naive): {cv_base:.3f}")

# baseline на test
test_base = parent_hit_at_k_constant_preds(X_test["labels_parent"].tolist(), top3_train)
print(f"Test Freq-baseline Parent-Hit@3: {test_base:.3f}")


Train top-3 parents by freq: ['480', '707', '703']
Train Freq-baseline Parent-Hit@3 (on train, naive): 0.312
Test Freq-baseline Parent-Hit@3: 0.375


In [39]:
# Fit
model.fit(X_train, y_train)

# Test Parent-Hit@3
proba_test = model.predict_proba(X_test)
test_parent_hit3 = hit_at_k_parent_any(
    y_true_labels_parent=X_test["labels_parent"].tolist(),
    proba=proba_test,
    classes=mlb.classes_,
    k=3
)
print(f"Test Parent-Hit@3: {test_parent_hit3:.3f}")


Test Parent-Hit@3: 0.500


In [41]:
seeds = [41,42,43,44,45,46,47,48,49,50]
results = []

for rs in seeds:
    train_df, test_df = stratified_multilabel_split_by_parent(df_model, test_size=0.2, random_state=rs)
    X_train, X_test = train_df, test_df
    y_train = mlb.transform(train_df["labels"])
    y_test = mlb.transform(test_df["labels"])

    model.fit(X_train, y_train)
    proba_test = model.predict_proba(X_test)

    hit3 = hit_at_k_parent_any(
        y_true_labels_parent=X_test["labels_parent"].tolist(),
        proba=proba_test,
        classes=mlb.classes_,
        k=3
    )
    results.append(hit3)

results = np.array(results)
print("Repeated hold-out Test Parent-Hit@3 across seeds:")
print("mean=", results.mean(), "std=", results.std(), "min=", results.min(), "max=", results.max())
print("scores=", results)


Repeated hold-out Test Parent-Hit@3 across seeds:
mean= 0.4625 std= 0.08003905296791061 min= 0.375 max= 0.625
scores= [0.375 0.5   0.625 0.375 0.5   0.5   0.375 0.375 0.5   0.5  ]


In [42]:
import numpy as np
import pandas as pd

scores = np.array([0.375, 0.5, 0.625, 0.375, 0.5, 0.5, 0.375, 0.375, 0.5, 0.5])
seeds  = [41,42,43,44,45,46,47,48,49,50]  # если это ваши seeds

df_eval = pd.DataFrame({"seed": seeds, "test_parent_hit3": scores})
display(df_eval)

print("Repeated hold-out Test Parent-Hit@3 across seeds:")
print("mean=", df_eval["test_parent_hit3"].mean())
print("std=", df_eval["test_parent_hit3"].std(ddof=0))
print("min=", df_eval["test_parent_hit3"].min())
print("max=", df_eval["test_parent_hit3"].max())


Unnamed: 0,seed,test_parent_hit3
0,41,0.375
1,42,0.5
2,43,0.625
3,44,0.375
4,45,0.5
5,46,0.5
6,47,0.375
7,48,0.375
8,49,0.5
9,50,0.5


Repeated hold-out Test Parent-Hit@3 across seeds:
mean= 0.4625
std= 0.08003905296791061
min= 0.375
max= 0.625


In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

def build_model(
    use_word_summary=True,
    use_word_text=False,
    char_analyzer="char",        # попробуйте "char_wb"
    char_ngram=(3,5),
    word_ngram=(1,2),
    char_max_features=50000,
    word_max_features=20000,
):
    transformers = []

    # char TF-IDF по тексту
    transformers.append((
        "char_tfidf",
        TfidfVectorizer(
            analyzer=char_analyzer,
            ngram_range=char_ngram,
            min_df=1,
            max_features=char_max_features,
            sublinear_tf=True,
            lowercase=False,  # у вас уже text_norm lowercase
        ),
        "text_norm"
    ))

    # word TF-IDF по summary_fallback
    if use_word_summary:
        transformers.append((
            "sum_word",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=word_ngram,
                min_df=1,
                max_features=word_max_features,
                sublinear_tf=True,
                lowercase=False,
            ),
            "summary_fallback"
        ))

    # word TF-IDF по text_norm (дополнительно)
    if use_word_text:
        transformers.append((
            "text_word",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=word_ngram,
                min_df=1,
                max_features=word_max_features,
                sublinear_tf=True,
                lowercase=False,
            ),
            "text_norm"
        ))

    preprocess = ColumnTransformer(transformers=transformers, remainder="drop")

    clf = OneVsRestClassifier(
        LogisticRegression(max_iter=2000, solver="liblinear"),
        n_jobs=-1
    )

    return Pipeline([("features", preprocess), ("clf", clf)])


In [44]:
import numpy as np

def repeated_holdout_parent_hit3(df_model, mlb, seeds, model_builder, test_size=0.2):
    scores = []
    for rs in seeds:
        tr, te = stratified_multilabel_split_by_parent(df_model, test_size=test_size, random_state=rs)

        X_train, X_test = tr, te
        y_train = mlb.transform(tr["labels"])

        model = model_builder()
        model.fit(X_train, y_train)

        proba = model.predict_proba(X_test)
        score = hit_at_k_parent_any(
            y_true_labels_parent=X_test["labels_parent"].tolist(),
            proba=proba,
            classes=mlb.classes_,
            k=3
        )
        scores.append(score)

    scores = np.array(scores, dtype=float)
    return scores

seeds = [41,42,43,44,45,46,47,48,49,50]

experiments = {
    # Ваша текущая логика: char(text) + word(summary_fallback)
    "A_char+word_summary": lambda: build_model(use_word_summary=True, use_word_text=False, char_analyzer="char"),

    # Добавляем word(text) тоже: char(text) + word(summary) + word(text)
    "B_char+word_summary+word_text": lambda: build_model(use_word_summary=True, use_word_text=True, char_analyzer="char"),

    # Пробуем char_wb (часто лучше на шумных текстах): char_wb(text) + word(summary)
    "C_charWB+word_summary": lambda: build_model(use_word_summary=True, use_word_text=False, char_analyzer="char_wb"),

    # Самый “богатый”: char_wb(text) + word(summary) + word(text)
    "D_charWB+word_summary+word_text": lambda: build_model(use_word_summary=True, use_word_text=True, char_analyzer="char_wb"),
}

results = {}
for name, builder in experiments.items():
    scores = repeated_holdout_parent_hit3(df_model, mlb, seeds, builder, test_size=0.2)
    results[name] = scores
    print(f"{name}: mean={scores.mean():.4f} std={scores.std():.4f} min={scores.min():.3f} max={scores.max():.3f} scores={scores}")

# Сводная таблица
import pandas as pd
summary = pd.DataFrame({
    "model": list(results.keys()),
    "mean": [results[k].mean() for k in results],
    "std":  [results[k].std() for k in results],
    "min":  [results[k].min() for k in results],
    "max":  [results[k].max() for k in results],
}).sort_values("mean", ascending=False)

display(summary)


A_char+word_summary: mean=0.4625 std=0.0800 min=0.375 max=0.625 scores=[0.375 0.5   0.625 0.375 0.5   0.5   0.375 0.375 0.5   0.5  ]
B_char+word_summary+word_text: mean=0.5125 std=0.1305 min=0.375 max=0.750 scores=[0.375 0.625 0.75  0.375 0.625 0.625 0.375 0.375 0.5   0.5  ]
C_charWB+word_summary: mean=0.4750 std=0.0935 min=0.375 max=0.625 scores=[0.375 0.625 0.625 0.375 0.5   0.5   0.375 0.375 0.5   0.5  ]
D_charWB+word_summary+word_text: mean=0.5375 std=0.1256 min=0.375 max=0.750 scores=[0.375 0.625 0.75  0.375 0.625 0.625 0.375 0.625 0.5   0.5  ]


Unnamed: 0,model,mean,std,min,max
3,D_charWB+word_summary+word_text,0.5375,0.125623,0.375,0.75
1,B_char+word_summary+word_text,0.5125,0.130504,0.375,0.75
2,C_charWB+word_summary,0.475,0.093541,0.375,0.625
0,A_char+word_summary,0.4625,0.080039,0.375,0.625



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [54]:
df_cal = df_cal.copy()
df_cal["score"] = score_calibrator.predict_proba(df_cal[["p1","p2","delta"]])[:, 1]

rows = []
for thr in np.linspace(0.1, 0.9, 17):
    sel = df_cal["score"] >= thr
    coverage = float(sel.mean())
    precision = float(df_cal.loc[sel, "y_correct"].mean()) if sel.any() else np.nan
    rows.append({"thr": thr, "coverage": coverage, "precision": precision})

display(pd.DataFrame(rows))
print(df_cal["score"].describe(percentiles=[.1,.25,.5,.75,.9,.95]))


Unnamed: 0,thr,coverage,precision
0,0.1,0.8125,0.192308
1,0.15,0.46875,0.333333
2,0.2,0.15625,0.6
3,0.25,0.09375,1.0
4,0.3,0.09375,1.0
5,0.35,0.09375,1.0
6,0.4,0.09375,1.0
7,0.45,0.09375,1.0
8,0.5,0.09375,1.0
9,0.55,0.0625,1.0


count    32.000000
mean      0.188577
std       0.177597
min       0.052527
10%       0.081770
25%       0.108084
50%       0.140444
75%       0.189182
90%       0.213476
95%       0.561204
max       0.924723
Name: score, dtype: float64


In [55]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import clone
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

def build_score_calibration_dataset_hit3(df_train, mlb, ranker_model, k=3, n_splits=5, random_state=42):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df_train = df_train.reset_index(drop=True).copy()

    rows = []
    for tr, va in cv.split(df_train):
        m = clone(ranker_model)
        y_tr = mlb.transform(df_train.iloc[tr]["labels"])
        m.fit(df_train.iloc[tr], y_tr)

        proba = m.predict_proba(df_train.iloc[va])
        classes_parent = np.array([atu_parent(c) for c in mlb.classes_])

        topk_idx = np.argsort(-proba, axis=1)[:, :k]

        for i, row_idx in enumerate(va):
            gold_parents = set(df_train.at[row_idx, "labels_parent"])

            p_sorted = np.sort(proba[i])[::-1]
            p1 = float(p_sorted[0])
            p2 = float(p_sorted[1]) if len(p_sorted) > 1 else 0.0
            delta = p1 - p2

            pred_parent_set = set(classes_parent[topk_idx[i]])
            y_hit3 = 1 if (gold_parents & pred_parent_set) else 0

            rows.append({"p1": p1, "p2": p2, "delta": delta, "y_correct": int(y_hit3)})

    return pd.DataFrame(rows)

# 1) строим мета-датасет для калибратора (OOF по ранжирующей модели)
df_cal = build_score_calibration_dataset_hit3(train_df, mlb, model, k=3, n_splits=5, random_state=42)

X_cal = df_cal[["p1", "p2", "delta"]]
y_cal = df_cal["y_correct"].values

print("Meta-dataset:", df_cal.shape)
print("Pos rate (Hit@3):", y_cal.mean())
print("Class counts:", Counter(y_cal))


Meta-dataset: (32, 4)
Pos rate (Hit@3): 0.21875
Class counts: Counter({np.int64(0): 25, np.int64(1): 7})


In [56]:
# 2) калибратор: Stratified CV (важно при дисбалансе)
min_class = min(Counter(y_cal).values())
n_splits_cal = min(3, min_class)  # чтобы не упасть, если позитивов мало
if n_splits_cal < 2:
    n_splits_cal = 2  # крайний случай

cv_cal = StratifiedKFold(n_splits=n_splits_cal, shuffle=True, random_state=42)

score_calibrator = CalibratedClassifierCV(
    estimator=LinearSVC(class_weight="balanced"),
    method="sigmoid",
    cv=cv_cal
)

score_calibrator.fit(X_cal, y_cal)
print("Calibrator fitted with n_splits =", n_splits_cal)

Calibrator fitted with n_splits = 3


In [61]:
from sklearn.metrics import brier_score_loss, log_loss

def expected_calibration_error(y_true, y_prob, n_bins=5):
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    bin_ids = np.digitize(y_prob, bins) - 1
    bin_ids = np.clip(bin_ids, 0, n_bins - 1)

    ece = 0.0
    for b in range(n_bins):
        mask = bin_ids == b
        if not np.any(mask):
            continue
        acc = y_true[mask].mean()
        conf = y_prob[mask].mean()
        ece += (mask.mean()) * abs(acc - conf)
    return float(ece)

def reliability_table(y_true, y_prob, n_bins=10):
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    df = pd.DataFrame({"y": y_true, "p": y_prob})
    df["bin"] = pd.cut(df["p"], bins=bins, include_lowest=True)

    tab = df.groupby("bin").agg(
        n=("y", "size"),
        mean_score=("p", "mean"),
        emp_rate=("y", "mean"),
    ).reset_index()
    return tab

def plot_reliability(tab, out_path=None):
    tab_nonempty = tab[tab["n"] > 0].copy()

    plt.figure(figsize=(6, 6))
    plt.plot([0, 1], [0, 1])  # perfect calibration line
    plt.scatter(tab_nonempty["mean_score"], tab_nonempty["emp_rate"])
    plt.xlabel("Mean predicted SCORE")
    plt.title("Reliability diagram (SCORE calibration)")
    plt.grid(True)

    if out_path:
        plt.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.show()

# ---- CV evaluation of the calibrator itself ----
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

briers, lls, eces = [], [], []
all_probs = np.zeros(len(y_cal), dtype=float)

for tr, va in cv.split(X_cal, y_cal):
    cal = CalibratedClassifierCV(
        estimator=LinearSVC(class_weight="balanced"),
        method="sigmoid",
        cv=3
    )
    cal.fit(X_cal.iloc[tr], y_cal[tr])

    p = cal.predict_proba(X_cal.iloc[va])[:, 1]
    all_probs[va] = p

    briers.append(brier_score_loss(y_cal[va], p))
    lls.append(log_loss(y_cal[va], p, labels=[0,1]))
    eces.append(expected_calibration_error(y_cal[va], p, n_bins=10))

print("Calibrator CV Brier: mean=", np.mean(briers), "std=", np.std(briers))
print("Calibrator CV LogLoss: mean=", np.mean(lls), "std=", np.std(lls))

Calibrator CV Brier: mean= 0.12822853149839838 std= 0.03473842395810324
Calibrator CV LogLoss: mean= 0.41763050510001837 std= 0.0917967508528818


In [62]:
p0 = float(y_cal.mean())
probs_base = np.full_like(y_cal, fill_value=p0, dtype=float)

from sklearn.metrics import brier_score_loss, log_loss
print("Baseline (constant) Brier:", brier_score_loss(y_cal, probs_base))
print("Baseline (constant) LogLoss:", log_loss(y_cal, probs_base, labels=[0,1]))


Baseline (constant) Brier: 0.1708984375
Baseline (constant) LogLoss: 0.525321319515595
