In [45]:
from pathlib import Path
import json
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import re
import numpy as np
from collections import Counter

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, make_scorer

import joblib
from typing import Tuple, Optional
from datetime import datetime, timezone
import subprocess


In [4]:
REPO_URL = "https://github.com/eugeniavd/magic_tagger.git"  # <-- EDIT if needed
!git clone {REPO_URL}


fatal: destination path 'magic_tagger' already exists and is not an empty directory.


In [5]:
PROJECT_ROOT = Path("/Users/eugenia/Desktop/thesis/magic_tagger")

csv_path = PROJECT_ROOT / "data" / "processed" / "classify_data_normalized.csv"

# --- load ---
df = pd.read_csv(csv_path, encoding="utf-8")
print("Loaded:", csv_path)
print("Shape:", df.shape)
display(df.head(5))

Loaded: /Users/eugenia/Desktop/thesis/magic_tagger/data/processed/classify_data_normalized.csv
Shape: (50, 14)


Unnamed: 0,tale_id,rights_status,content_description,set,sampling_version,type_count,collection,volume_no,source_ref,atu_labels_json,txt_path,text_raw,summary_norm,text_norm
0,era_vene_1_503_1,open,[Царевна-лягушка].,core,v1_20251230,3,"ERA, Vene",1,"ERA, Vene 1, 503/4 (1)","[""402""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Тили были царь с царицей у не\nбыло три сына. ...,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...
1,era_vene_1_515_1,open,"[По пьяни мужик спорит, что сможет принести но...",coverage,v1_20251230,1,"ERA, Vene",1,"ERA, Vene 1, 515/6 (1)","[""410""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Раз пяное, ребятище» подился.\nчто можит в 12 ...","по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч..."
2,era_vene_12_105_22,open,Снегурочка.,core,v1_20251230,3,"ERA, Vene",12,"ERA, Vene 12, 105 (22)","[""703*""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Сделали дети со снегу куклу.\nВ одного старина...,снегурочка.,сделали дети со снегу куклу. в одного старина ...
3,era_vene_12_137_98,open,Иван-дурак.,core,v1_20251230,4,"ERA, Vene",12,"ERA, Vene 12, 137/41 (98)","[""530""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Кил-был стажк. В яво бло\nтра сегна. Миша, Гри...",иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш..."
4,era_vene_12_189_1,open,Два брата.,core,v1_20251230,2,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)","[""735A""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Жили – брели два брата.\nи посла смерти отца о...,два брата.,жили — брели два брата. и посла смерти отца об...


In [6]:
col = "atu_labels_json"

def parse_labels(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    try:
        v = json.loads(s)
        if isinstance(v, list):
            return [str(t).strip() for t in v if str(t).strip()]

        return [str(v).strip()]
    except Exception:

        return [t.strip() for t in s.split(",") if t.strip()]

df["labels"] = df[col].apply(parse_labels)

unique_labels = sorted({lab for labs in df["labels"] for lab in labs})
print("Unique labels:", len(unique_labels))
print("Example:", unique_labels[:20])


Unique labels: 37
Example: ['1000', '1060', '1168', '1174', '300', '300A', '301', '302C*', '302С*', '307', '313', '325', '327A', '331', '402', '410', '425C', '470', '480A', '480D*']


In [7]:
label_counts = pd.Series([lab for labs in df["labels"] for lab in labs]).value_counts()
display(label_counts)

707      6
480D*    5
402      3
552      3
703*     3
530      3
307      3
650A     3
480A     3
301      2
425C     2
300      2
410      2
550      2
700      2
580      1
735A     1
556F*    1
554      1
302C*    1
313      1
1168     1
1174     1
331      1
300A     1
709      1
530A     1
302С*    1
325      1
1000     1
1060     1
511      1
470      1
849*     1
706      1
556А*    1
327A     1
Name: count, dtype: int64

In [8]:
DROP_COLS = [
    "rights_status",
    "content_description",
    "sampling_version",
    "type_count",
    "collection",
    "volume_no",
    "source_ref",
    "atu_labels_json",
    "txt_path",
    "text_raw",
    "set"
]

df = df.drop(columns=[c for c in DROP_COLS if c in df.columns]).copy()
df.head(5)


Unnamed: 0,tale_id,summary_norm,text_norm,labels
0,era_vene_1_503_1,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...,[402]
1,era_vene_1_515_1,"по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч...",[410]
2,era_vene_12_105_22,снегурочка.,сделали дети со снегу куклу. в одного старина ...,[703*]
3,era_vene_12_137_98,иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш...",[530]
4,era_vene_12_189_1,два брата.,жили — брели два брата. и посла смерти отца об...,[735A]


### Multi-label encoding and parent-level labels for evaluation

Because each tale in our corpus can be assigned to **one or more ATU types**, we treat ATU prediction as a **multi-label classification** task. We first convert the human-assigned label sets into a machine-learning friendly representation using `MultiLabelBinarizer`. This step builds a fixed label vocabulary from the training data and transforms each tale’s label list into a **multi-hot binary vector**. This representation is required by standard multi-label classifiers (e.g., One-vs-Rest logistic regression) and ensures a reproducible mapping between labels and output dimensions.

In addition to the original ATU labels, we derive a **parent-level label set** for evaluation. ATU types frequently include suffixes or modifiers (e.g., `327A`, `480D*`), while the **leading numeric component** (e.g., `327`, `480`) captures a higher-level category that is often more stable under noisy HTR conditions and small-data regimes. We therefore extract the first 1–4 digits from each ATU label via a simple regular expression and assign the resulting parent codes as `labels_parent`. This enables evaluation at a coarser granularity (e.g., Parent-Hit@3), which better reflects the intended use of the system as a **decision-support tool**: even when the model fails to predict the exact subtype, correctly retrieving the parent class can still provide a meaningful shortlist for expert review.


In [9]:
RE_ATU_PARENT = re.compile(r"^\s*(?:ATU[_\s-]*)?(\d{1,4})")

def atu_parent(label: str) -> str:
    if label is None:
        return ""
    s = str(label).strip()
    if not s:
        return ""
    m = RE_ATU_PARENT.search(s)
    return m.group(1) if m else ""

In [10]:

# -------------------------
# 2) Robust label utilities
# -------------------------
def is_missing(x) -> bool:
    if x is None:
        return True

    # If x is array-like/container (list/tuple/ndarray/Series), treat as NOT-missing container.
    # Missingness is handled elementwise later.
    if isinstance(x, (list, tuple, set, dict, np.ndarray, pd.Series, pd.Index)):
        return False

    try:
        m = pd.isna(x)
        # pd.isna(scalar) -> bool; pd.isna(array-like) -> array (handled above)
        if isinstance(m, (bool, np.bool_)):
            return bool(m)
        return False
    except Exception:
        return False

def to_parent_set(labels) -> list[str]:
    """
    Convert a list of ATU labels to a sorted list of unique parent codes.
    Robust to:
      - labels=None / labels=NaN
      - NaN elements inside the list
      - empty/whitespace strings
    """
    if is_missing(labels):
        return []

    out: set[str] = set()
    for x in labels:
        if is_missing(x):
            continue
        s = str(x).strip()
        if not s:
            continue
        p = atu_parent(s)
        if p:
            out.add(p)

    return sorted(out)

def clean_label_list(labels) -> list[str]:
    """
    Normalize and deduplicate a label list.
    - Accepts None/NaN
    - Accepts a single string -> [string]
    - Accepts list-like -> list[str]
    """
    if is_missing(labels):
        return []
    if isinstance(labels, str):
        s = labels.strip()
        return [s] if s else []

    out: list[str] = []
    for x in labels:
        if is_missing(x):
            continue
        s = str(x).strip()
        if s:
            out.append(s)

    # stable dedup
    seen: set[str] = set()
    dedup: list[str] = []
    for s in out:
        if s not in seen:
            seen.add(s)
            dedup.append(s)
    return dedup



In [11]:
# ---------------------------------------
# 3) Parent-Hit@k from scores/proba (any match)
# ---------------------------------------
def parent_hit_at_k_from_proba(
    y_true_parent_lists: list[list[str]],
    proba: np.ndarray,
    classes: np.ndarray,
    k: int = 3
) -> float:
    """
    Parent-Hit@k (any match):
    success if at least one gold parent code appears among parent codes
    of the model's top-k predicted labels.
    """
    if k <= 0:
        raise ValueError("k must be >= 1")
    if proba.shape[0] != len(y_true_parent_lists):
        raise ValueError("n_samples mismatch between y_true_parent_lists and proba")
    if proba.shape[1] != len(classes):
        raise ValueError("proba columns != classes length (alignment issue)")

    classes_parent = np.array([atu_parent(c) for c in classes], dtype=object)
    topk_idx = np.argsort(-proba, axis=1)[:, :k]

    hits: list[int] = []
    for i, gold_parents in enumerate(y_true_parent_lists):
        gold_set = set(gold_parents or [])
        pred_parent_set = set(classes_parent[topk_idx[i]])
        pred_parent_set.discard("")  # defensive
        hits.append(1 if (gold_set & pred_parent_set) else 0)

    return float(np.mean(hits))


# -----------------------------------
# 4) Exact-Hit@k from scores/proba (any match)
# -----------------------------------
def exact_hit_at_k_from_proba(
    y_true_labels_lists: list[list[str]],
    proba: np.ndarray,
    classes: np.ndarray,
    k: int = 3
) -> float:
    """
    Exact-Hit@k (any match):
    success if at least one exact gold label appears among the model's top-k labels.
    """
    if k <= 0:
        raise ValueError("k must be >= 1")
    if proba.shape[0] != len(y_true_labels_lists):
        raise ValueError("n_samples mismatch between y_true_labels_lists and proba")
    if proba.shape[1] != len(classes):
        raise ValueError("proba columns != classes length (alignment issue)")

    topk_idx = np.argsort(-proba, axis=1)[:, :k]

    hits: list[int] = []
    for i, gold_labels in enumerate(y_true_labels_lists):
        gold_set = set(clean_label_list(gold_labels))
        pred_set = set(classes[topk_idx[i]])
        hits.append(1 if (gold_set & pred_set) else 0)

    return float(np.mean(hits))


In [12]:
# -----------------------------------------
# 6) Model wrappers (scores adapter)
# -----------------------------------------
def _stack_proba_list(proba_list) -> np.ndarray:
    """
    Convert list-of-arrays from some multilabel wrappers into (n_samples, n_classes).
    Typical forms:
      - each element is (n_samples, 2) -> take [:, 1]
      - each element is (n_samples,)  -> use as-is
    """
    cols = []
    for p in proba_list:
        p = np.asarray(p)
        if p.ndim == 2 and p.shape[1] == 2:
            cols.append(p[:, 1])
        elif p.ndim == 2 and p.shape[1] == 1:
            cols.append(p[:, 0])
        elif p.ndim == 1:
            cols.append(p)
        else:
            # last-resort flatten
            cols.append(p.reshape(-1))
    return np.column_stack(cols)

def _get_model_scores(model, X) -> np.ndarray:
    """
    Return a 2D array (n_samples, n_classes) used for top-k ranking.
    Works with:
      - predict_proba returning ndarray or list-of-arrays
      - decision_function returning ndarray
    """
    if hasattr(model, "predict_proba"):
        p = model.predict_proba(X)
        if isinstance(p, list):
            p = _stack_proba_list(p)
        else:
            p = np.asarray(p)

        if p.ndim == 1:
            p = p.reshape(-1, 1)
        return p

    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        s = np.asarray(s)
        if s.ndim == 1:
            s = s.reshape(-1, 1)
        return s

    raise AttributeError("Model must implement predict_proba or decision_function")

def parent_hit_at_k_model(
    model,
    X,
    y_true_parent_lists: list[list[str]],
    mlb,
    k: int = 3
) -> float:
    scores = _get_model_scores(model, X)
    classes = np.asarray(mlb.classes_)
    return parent_hit_at_k_from_proba(y_true_parent_lists, scores, classes, k=k)

def exact_hit_at_k_model(
    model,
    X,
    y_true_labels_lists: list[list[str]],
    mlb,
    k: int = 3
) -> float:
    scores = _get_model_scores(model, X)
    classes = np.asarray(mlb.classes_)
    return exact_hit_at_k_from_proba(y_true_labels_lists, scores, classes, k=k)

In [13]:
# -----------------------------------------
# 7) Safe X construction
# -----------------------------------------
def build_X(
    df: pd.DataFrame,
    text_cols: tuple[str, ...] = ("summary_norm", "text_norm"),
    fillna: str = "",
) -> pd.DataFrame:
    """
    One canonical X builder.

    Returns a DataFrame with ONLY the requested text columns (in the given order).
    This is compatible with ColumnTransformer that references columns by name.
    Prevents leakage through id/metadata columns by construction.

    Usage:
      X_train = build_X(train_df)
      X_test  = build_X(test_df)
    """
    cols = [c for c in text_cols if c in df.columns]
    if not cols:
        raise ValueError(f"None of text_cols {text_cols} found in df. Available: {list(df.columns)}")

    X = df[cols].copy()
    # enforce string dtype for vectorizers; avoids pd.NA issues
    for c in cols:
        X[c] = X[c].fillna(fillna).astype(str)

    return X


### Stratified train/test split for multi-label data using parent ATU codes

To evaluate the classifier on a held-out set while preserving label coverage, we implement a **custom stratified split** tailored to **multi-label** data. Standard stratification methods assume a single label per instance and do not directly support the setting where each tale can have **multiple ATU types**. This is especially problematic in our small corpus, where many labels are rare: a naive random split can easily place the only example of a label in the test set, leaving the model with **zero training examples** for that label.

Our function `stratified_multilabel_split_by_parent()` performs an approximate stratification over `labels_parent` (coarser ATU parent codes), with the goal of constructing a test subset of size `test_size` while ensuring that the training set retains minimal coverage for the labels present.

The procedure is as follows:

1. **Target test size.** We compute the desired number of test documents (`n_test = round(n * test_size)`).

2. **Label availability tracking.** We count how many documents contain each parent label (`all_counts`) and keep a mutable counter `remaining` to track how many examples of each label would remain in the training pool if we move documents into the test set.

3. **Safety constraint.** A document is considered *safe* to move into the test set if doing so does not exhaust any of its labels in the remaining pool:
   - `is_safe(i)` returns `True` only if for every label in document `i`, `remaining[label] ≥ 2`.
   This conservative rule ensures that after selecting the test items, each label included in a moved document still has at least one example left for training (and avoids dropping a label entirely from the training set).

4. **Coverage-oriented greedy selection.** We iteratively build the test set using a greedy criterion:
   - `gain(i)` measures how many **new** parent labels a candidate document would contribute to the current test set (labels not yet covered in `covered_test`).
   At each step we choose, among safe candidates, a document with maximal gain (ties are broken randomly). This increases the diversity of labels represented in the test split without violating the safety constraint.

5. **Fallback filling.** If we cannot reach the desired test size using the greedy coverage criterion (because the safety constraint becomes too restrictive), we fill the remaining slots by randomly selecting from the remaining safe documents.

Finally, we return two dataframes: `train_df` and `test_df`. This split is designed to be **more stable and fair** than a naive random split for small multi-label datasets, because it reduces the risk of creating “unseen-in-training” labels and improves the interpretability of downstream evaluation (e.g., Parent-Hit@3).


In [14]:
df = df.copy()
df["labels"] = df["labels"].apply(clean_label_list)
df["labels_parent"] = df["labels"].apply(to_parent_set)


In [15]:
df.head()

Unnamed: 0,tale_id,summary_norm,text_norm,labels,labels_parent
0,era_vene_1_503_1,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...,[402],[402]
1,era_vene_1_515_1,"по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч...",[410],[410]
2,era_vene_12_105_22,снегурочка.,сделали дети со снегу куклу. в одного старина ...,[703*],[703]
3,era_vene_12_137_98,иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш...",[530],[530]
4,era_vene_12_189_1,два брата.,жили — брели два брата. и посла смерти отца об...,[735A],[735]


In [16]:
def stratified_multilabel_split_by_parent(
    df: pd.DataFrame,
    label_col: str = "labels_parent",
    test_size: float = 0.2,
    random_state: int = 42,
    min_train_count_per_label: int = 1,
    require_nonempty_labels: bool = False,
):
    """
    Greedy multi-label split by parent labels (heuristic).
    - Tries to reach ~test_size for test
    - Maximizes parent-label coverage in test
    - Keeps at least min_train_count_per_label occurrences of each label in train when possible
    """
    if not (0.0 < test_size < 1.0):
        raise ValueError("test_size must be in (0, 1)")
    if label_col not in df.columns:
        raise KeyError(f"Column '{label_col}' not found. Build it before splitting.")

    rng = np.random.RandomState(random_state)

    # Work on a copy; reset index for stable positional indexing
    df = df.reset_index(drop=True).copy()

    # Use your existing clean_label_list() to normalize list[str] per row
    df[label_col] = df[label_col].apply(clean_label_list)

    if require_nonempty_labels:
        df = df[df[label_col].map(len) > 0].reset_index(drop=True)

    n = len(df)
    if n < 2:
        raise ValueError("Need at least 2 rows to split")

    n_test = int(round(n * test_size))
    n_test = max(1, min(n - 1, n_test))  # leave at least 1 row in train

    # label frequencies
    all_counts = Counter(lab for labs in df[label_col] for lab in labs)
    remaining = Counter(all_counts)

    test_idx = []
    covered_test = set()

    candidates = list(range(n))
    rng.shuffle(candidates)

    def is_safe(i: int) -> bool:
        labs = df.at[i, label_col]
        if not labs:
            return True
        # ensure train keeps >= min_train_count_per_label after moving row i to test
        return all((remaining[lab] - 1) >= min_train_count_per_label for lab in labs)

    def gain(i: int) -> int:
        labs = set(df.at[i, label_col])
        return len(labs - covered_test)

    # Greedy selection: maximize new-label coverage subject to safety
    while len(test_idx) < n_test:
        safe = [i for i in candidates if (i not in test_idx and is_safe(i))]
        if not safe:
            break

        gains = np.array([gain(i) for i in safe], dtype=int)
        best_gain = gains.max()
        best = [safe[j] for j in np.where(gains == best_gain)[0]]
        chosen = int(rng.choice(best))

        test_idx.append(chosen)
        for lab in df.at[chosen, label_col]:
            remaining[lab] -= 1
            covered_test.add(lab)

    # Backfill with any remaining safe rows (random order)
    if len(test_idx) < n_test:
        safe_rest = [i for i in range(n) if (i not in test_idx and is_safe(i))]
        rng.shuffle(safe_rest)
        need = n_test - len(test_idx)
        test_idx.extend(safe_rest[:need])

    # Last resort: fill randomly if still short (may violate constraints for ultra-rare labels)
    if len(test_idx) < n_test:
        rest = [i for i in range(n) if i not in test_idx]
        rng.shuffle(rest)
        need = n_test - len(test_idx)
        test_idx.extend(rest[:need])

    # Dedup + ensure not all docs in test
    test_idx = sorted(set(test_idx))
    if len(test_idx) >= n:
        test_idx = test_idx[: n - 1]

    train_idx = [i for i in range(n) if i not in test_idx]

    train_df = df.iloc[train_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].reset_index(drop=True)

    return train_df, test_df

In [17]:
# usage
train_df, test_df = stratified_multilabel_split_by_parent(
    df,
    label_col="labels_parent",
    test_size=0.2,
    random_state=42,
    min_train_count_per_label=1,
    require_nonempty_labels=False
)

print("Train:", train_df.shape, "| Test:", test_df.shape)
print("Unique parents train:", len({l for labs in train_df["labels_parent"] for l in labs}))
print("Unique parents test:", len({l for labs in test_df["labels_parent"] for l in labs}))

Train: (40, 5) | Test: (10, 5)
Unique parents train: 32
Unique parents test: 11


In [18]:
train_df.head(5)

Unnamed: 0,tale_id,summary_norm,text_norm,labels,labels_parent
0,era_vene_1_503_1,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...,[402],[402]
1,era_vene_1_515_1,"по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч...",[410],[410]
2,era_vene_12_105_22,снегурочка.,сделали дети со снегу куклу. в одного старина ...,[703*],[703]
3,era_vene_12_137_98,иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш...",[530],[530]
4,era_vene_12_189_1,два брата.,жили — брели два брата. и посла смерти отца об...,[735A],[735]


In [19]:
train_labels = set(l for labs in train_df["labels"] for l in labs)
test_labels  = set(l for labs in test_df["labels"] for l in labs)
unknown_in_test = sorted(test_labels - train_labels)

print("Labels only in test (will be ignored by mlb):", len(unknown_in_test))


Labels only in test (will be ignored by mlb): 0


In [20]:
X_train = build_X(train_df, ("summary_norm", "text_norm"))
X_test  = build_X(test_df,  ("summary_norm", "text_norm"))
y_train_list = train_df["labels"]
y_test_list  = test_df["labels"]

In [21]:
print(X_train.iloc[1][:500])

summary_norm    по пьяни мужик спорит, что сможет принести ноч...
text_norm       раз пяное, ребятище» подился. что можит в 12 ч...
Name: 1, dtype: object


In [22]:
mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(y_train_list)
Y_test  = mlb.transform(y_test_list)

In [23]:
print("X_train:", X_train.shape, "y_train:", Y_train.shape)
print("X_test :", X_test.shape,  "y_test :", Y_test.shape)


X_train: (40, 2) y_train: (40, 37)
X_test : (10, 2) y_test : (10, 37)


### Feature extraction and multi-label classifier (TF-IDF + One-vs-Rest Logistic Regression)

This block defines the **final text-based baseline model** as a single scikit-learn `Pipeline` that (i) converts textual inputs into numerical features and (ii) trains a **multi-label** classifier over ATU types.

**1) Character-level TF-IDF on OCR/HTR text (`text_norm`).**  
We build a TF-IDF representation using **character n-grams (3–5)**. Character n-grams are a common and effective choice for noisy OCR/HTR corpora because they remain informative even when word boundaries or spellings are corrupted. We enable `sublinear_tf=True` (log-scaled term frequency) and cap the vocabulary with `max_features=50,000` to control dimensionality on a small dataset.

**2) Word-level TF-IDF on summaries (`summary_norm`).**  
In parallel, we build a TF-IDF representation using **word n-grams (1–2)** from the tale summary. Summaries typically contain less OCR noise and capture higher-level semantics, which complements the robustness of character n-grams. We similarly apply log-scaled TF and cap the vocabulary at `max_features=20,000`.

**3) Feature concatenation via `ColumnTransformer`.**  
The `ColumnTransformer` applies each vectorizer to its corresponding dataframe column and **concatenates** the resulting sparse vectors into a single feature space. All other dataframe columns are dropped (`remainder="drop"`), ensuring that only text-derived signals enter the model.

**4) Multi-label classification with One-vs-Rest Logistic Regression.**  
Because a tale can legitimately have **multiple ATU assignments**, we use a `OneVsRestClassifier(LogisticRegression)` scheme: a separate binary logistic regression is trained for each ATU label, producing a score per label. Logistic regression is fast, stable on sparse TF-IDF features, and provides well-behaved ranking scores for Top-k recommendation.

**5) End-to-end pipeline.**  
Finally, we wrap preprocessing and classification into a single `Pipeline` so that the same transformations are consistently applied at training and inference time. This also simplifies serialization and deployment (e.g., saving the pipeline as a single artifact for the Streamlit application).


In [24]:

def build_model(
    use_word_summary: bool = True,
    use_word_text: bool = False,
    char_analyzer: str = "char",              # "char" or "char_wb"
    char_ngram: Tuple[int, int] = (3, 5),
    word_ngram: Tuple[int, int] = (1, 2),
    char_max_features: int = 50000,
    word_max_features: int = 20000,
    C: float = 2.0,
    class_weight: Optional[str] = None,       # usually None for multi-label
    random_state: int = 42,
) -> Pipeline:
    transformers = [
        (
            "char_tfidf_text",
            TfidfVectorizer(
                analyzer=char_analyzer,
                ngram_range=char_ngram,
                min_df=1,
                max_features=char_max_features,
                sublinear_tf=True,
                lowercase=False,
            ),
            "text_norm",
        )
    ]

    if use_word_summary:
        transformers.append(
            (
                "word_tfidf_summary",
                TfidfVectorizer(
                    analyzer="word",
                    ngram_range=word_ngram,
                    min_df=1,
                    max_features=word_max_features,
                    sublinear_tf=True,
                    lowercase=False,
                ),
                "summary_norm",
            )
        )

    if use_word_text:
        transformers.append(
            (
                "word_tfidf_text",
                TfidfVectorizer(
                    analyzer="word",
                    ngram_range=word_ngram,
                    min_df=1,
                    max_features=word_max_features,
                    sublinear_tf=True,
                    lowercase=False,
                ),
                "text_norm",
            )
        )

    features = ColumnTransformer(
        transformers=transformers,
        remainder="drop",
        sparse_threshold=0.3,
    )

    base_lr = LogisticRegression(
        solver="liblinear",
        max_iter=2000,
        C=C,
        class_weight="balanced",
        random_state=random_state,
    )

    clf = OneVsRestClassifier(base_lr, n_jobs=1)

    return Pipeline([
        ("features", features),
        ("clf", clf),
    ])

In [25]:
print(type(X_train), getattr(X_train, "shape", None), getattr(X_train, "columns", None))


<class 'pandas.core.frame.DataFrame'> (40, 2) Index(['summary_norm', 'text_norm'], dtype='object')


In [26]:
y_parent_train = train_df["labels_parent"].tolist()
classes = np.asarray(mlb.classes_)

# --- CV function (train only) ---
def cv_parent_hit_at_k(
    X_train: pd.DataFrame,
    Y_train: np.ndarray,
    y_parent_train: list[list[str]],
    classes: np.ndarray,
    model_builder,
    k: int = 3,
    n_splits: int = 5,
    cv_seed: int = 123,
    verbose: bool = True,
    debug_features: bool = False,   # <-- добавьте флаг
    debug_n: int = 3,
) -> np.ndarray:
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)
    scores = []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_train), start=1):
        model = model_builder()
        model.fit(X_train.iloc[tr_idx], Y_train[tr_idx])

        # =========================
        # DEBUG INSERT HERE
        # =========================
        if debug_features:
            # 1) shape/sparsity of features on VAL (do not touch labels)
            Xt = model.named_steps["features"].transform(X_train.iloc[va_idx])
            nnz = Xt.nnz if hasattr(Xt, "nnz") else None
            print(f"[DEBUG] Fold {fold}: Xt shape={Xt.shape} nnz={nnz}")

        proba = model.predict_proba(X_train.iloc[va_idx])

        # normalize possible list-of-arrays to 2D array
        if isinstance(proba, list):
            cols = []
            for p in proba:
                p = np.asarray(p)
                if p.ndim == 2 and p.shape[1] == 2:
                    cols.append(p[:, 1])
                elif p.ndim == 2 and p.shape[1] == 1:
                    cols.append(p[:, 0])
                else:
                    cols.append(p.reshape(-1))
            proba = np.column_stack(cols)
        else:
            proba = np.asarray(proba)

        if debug_features:
            # 2) show top-k predicted labels for a few VAL samples
            topk_idx = np.argsort(-proba, axis=1)[:, :k]
            show = min(debug_n, topk_idx.shape[0])
            preds = [[classes[j] for j in topk_idx[i]] for i in range(show)]
            pred_parents = [[atu_parent(l) for l in preds[i]] for i in range(show)]
            print(f"[DEBUG] Fold {fold}: top-{k} labels (first {show} val): {preds}")
            print(f"[DEBUG] Fold {fold}: top-{k} parents (first {show} val): {pred_parents}")


        score = parent_hit_at_k_from_proba(
            y_true_parent_lists=[y_parent_train[i] for i in va_idx],
            proba=proba,
            classes=classes,
            k=k,
        )
        scores.append(score)

        if verbose:
            print(f"  Fold {fold}: Parent-Hit@{k}={score:.3f}")

    return np.asarray(scores, dtype=float)

# --- experiments ---
experiments = {
    "A_char + word_summary": lambda: build_model(use_word_summary=True,  use_word_text=False, char_analyzer="char"),
    "B_char + word_summary + word_text": lambda: build_model(use_word_summary=True,  use_word_text=True,  char_analyzer="char"),
    "C_charWB + word_summary": lambda: build_model(use_word_summary=True,  use_word_text=False, char_analyzer="char_wb"),
    "D_charWB + word_summary + word_text": lambda: build_model(use_word_summary=True,  use_word_text=True,  char_analyzer="char_wb"),
}

# --- run CV & pick best ---
cv_results = {}
for name, builder in experiments.items():
    print("\n" + "=" * 70)
    print("CV on TRAIN:", name)

    scores = cv_parent_hit_at_k(
        X_train=X_train,
        Y_train=Y_train,
        y_parent_train=y_parent_train,
        classes=classes,
        model_builder=builder,
        k=3,
        n_splits=5,
        cv_seed=123,
        verbose=True,
        debug_features=True,   # <-- включить
        debug_n=3,
    )
    cv_results[name] = scores
    print(f"  CV mean={scores.mean():.4f} std={scores.std(ddof=0):.4f} scores={scores}")

cv_summary = (
    pd.DataFrame({
        "model": list(cv_results.keys()),
        "mean": [cv_results[m].mean() for m in cv_results],
        "std":  [cv_results[m].std(ddof=0) for m in cv_results],
    })
    .sort_values("mean", ascending=False)
    .reset_index(drop=True)
)

display(cv_summary)

best_name = cv_summary.loc[0, "model"]
best_builder = experiments[best_name]
print("\nBEST by CV:", best_name)


CV on TRAIN: A_char + word_summary




[DEBUG] Fold 1: Xt shape=(8, 52815) nnz=37478
[DEBUG] Fold 1: top-3 labels (first 3 val): [['480A', '552', '650A'], ['703*', '480A', '552'], ['480D*', '552', '480A']]
[DEBUG] Fold 1: top-3 parents (first 3 val): [['480', '552', '650'], ['703', '480', '552'], ['480', '552', '480']]
  Fold 1: Parent-Hit@3=0.375




[DEBUG] Fold 2: Xt shape=(8, 52394) nnz=42503
[DEBUG] Fold 2: top-3 labels (first 3 val): [['707', '552', '402'], ['707', '307', '849*'], ['707', '307', '552']]
[DEBUG] Fold 2: top-3 parents (first 3 val): [['707', '552', '402'], ['707', '307', '849'], ['707', '307', '552']]
  Fold 2: Parent-Hit@3=0.500




[DEBUG] Fold 3: Xt shape=(8, 53439) nnz=26051
[DEBUG] Fold 3: top-3 labels (first 3 val): [['552', '707', '650A'], ['552', '707', '307'], ['707', '552', '307']]
[DEBUG] Fold 3: top-3 parents (first 3 val): [['552', '707', '650'], ['552', '707', '307'], ['707', '552', '307']]
  Fold 3: Parent-Hit@3=0.125




[DEBUG] Fold 4: Xt shape=(8, 53614) nnz=31545
[DEBUG] Fold 4: top-3 labels (first 3 val): [['707', '530', '402'], ['480D*', '707', '650A'], ['707', '552', '650A']]
[DEBUG] Fold 4: top-3 parents (first 3 val): [['707', '530', '402'], ['480', '707', '650'], ['707', '552', '650']]
  Fold 4: Parent-Hit@3=0.625




[DEBUG] Fold 5: Xt shape=(8, 53450) nnz=30198
[DEBUG] Fold 5: top-3 labels (first 3 val): [['703*', '707', '402'], ['707', '650A', '402'], ['480A', '707', '402']]
[DEBUG] Fold 5: top-3 parents (first 3 val): [['703', '707', '402'], ['707', '650', '402'], ['480', '707', '402']]
  Fold 5: Parent-Hit@3=0.625
  CV mean=0.4500 std=0.1871 scores=[0.375 0.5   0.125 0.625 0.625]

CV on TRAIN: B_char + word_summary + word_text




[DEBUG] Fold 1: Xt shape=(8, 72815) nnz=39069
[DEBUG] Fold 1: top-3 labels (first 3 val): [['480A', '552', '707'], ['703*', '480A', '552'], ['480D*', '552', '480A']]
[DEBUG] Fold 1: top-3 parents (first 3 val): [['480', '552', '707'], ['703', '480', '552'], ['480', '552', '480']]
  Fold 1: Parent-Hit@3=0.375




[DEBUG] Fold 2: Xt shape=(8, 72394) nnz=44202
[DEBUG] Fold 2: top-3 labels (first 3 val): [['707', '402', '552'], ['707', '307', '849*'], ['707', '307', '849*']]
[DEBUG] Fold 2: top-3 parents (first 3 val): [['707', '402', '552'], ['707', '307', '849'], ['707', '307', '849']]
  Fold 2: Parent-Hit@3=0.375




[DEBUG] Fold 3: Xt shape=(8, 73439) nnz=27063
[DEBUG] Fold 3: top-3 labels (first 3 val): [['552', '707', '650A'], ['552', '707', '307'], ['707', '552', '307']]
[DEBUG] Fold 3: top-3 parents (first 3 val): [['552', '707', '650'], ['552', '707', '307'], ['707', '552', '307']]
  Fold 3: Parent-Hit@3=0.125




[DEBUG] Fold 4: Xt shape=(8, 73614) nnz=32775
[DEBUG] Fold 4: top-3 labels (first 3 val): [['707', '530', '402'], ['480D*', '707', '402'], ['707', '552', '650A']]
[DEBUG] Fold 4: top-3 parents (first 3 val): [['707', '530', '402'], ['480', '707', '402'], ['707', '552', '650']]
  Fold 4: Parent-Hit@3=0.625




[DEBUG] Fold 5: Xt shape=(8, 73450) nnz=31343
[DEBUG] Fold 5: top-3 labels (first 3 val): [['703*', '707', '402'], ['707', '650A', '402'], ['480A', '707', '402']]
[DEBUG] Fold 5: top-3 parents (first 3 val): [['703', '707', '402'], ['707', '650', '402'], ['480', '707', '402']]
  Fold 5: Parent-Hit@3=0.625
  CV mean=0.4250 std=0.1871 scores=[0.375 0.375 0.125 0.625 0.625]

CV on TRAIN: C_charWB + word_summary




[DEBUG] Fold 1: Xt shape=(8, 52815) nnz=25445
[DEBUG] Fold 1: top-3 labels (first 3 val): [['480A', '552', '650A'], ['703*', '480A', '552'], ['480D*', '552', '480A']]
[DEBUG] Fold 1: top-3 parents (first 3 val): [['480', '552', '650'], ['703', '480', '552'], ['480', '552', '480']]
  Fold 1: Parent-Hit@3=0.375




[DEBUG] Fold 2: Xt shape=(8, 52394) nnz=29070
[DEBUG] Fold 2: top-3 labels (first 3 val): [['707', '552', '402'], ['707', '307', '849*'], ['707', '307', '552']]
[DEBUG] Fold 2: top-3 parents (first 3 val): [['707', '552', '402'], ['707', '307', '849'], ['707', '307', '552']]
  Fold 2: Parent-Hit@3=0.500




[DEBUG] Fold 3: Xt shape=(8, 53439) nnz=17769
[DEBUG] Fold 3: top-3 labels (first 3 val): [['552', '707', '650A'], ['552', '707', '307'], ['707', '552', '307']]
[DEBUG] Fold 3: top-3 parents (first 3 val): [['552', '707', '650'], ['552', '707', '307'], ['707', '552', '307']]
  Fold 3: Parent-Hit@3=0.125




[DEBUG] Fold 4: Xt shape=(8, 53614) nnz=21456
[DEBUG] Fold 4: top-3 labels (first 3 val): [['707', '402', '530'], ['480D*', '707', '402'], ['707', '552', '650A']]
[DEBUG] Fold 4: top-3 parents (first 3 val): [['707', '402', '530'], ['480', '707', '402'], ['707', '552', '650']]
  Fold 4: Parent-Hit@3=0.625




[DEBUG] Fold 5: Xt shape=(8, 53450) nnz=20401
[DEBUG] Fold 5: top-3 labels (first 3 val): [['703*', '707', '402'], ['707', '650A', '402'], ['480A', '707', '402']]
[DEBUG] Fold 5: top-3 parents (first 3 val): [['703', '707', '402'], ['707', '650', '402'], ['480', '707', '402']]
  Fold 5: Parent-Hit@3=0.625
  CV mean=0.4500 std=0.1871 scores=[0.375 0.5   0.125 0.625 0.625]

CV on TRAIN: D_charWB + word_summary + word_text




[DEBUG] Fold 1: Xt shape=(8, 72815) nnz=27036
[DEBUG] Fold 1: top-3 labels (first 3 val): [['480A', '552', '707'], ['703*', '480A', '552'], ['480D*', '552', '480A']]
[DEBUG] Fold 1: top-3 parents (first 3 val): [['480', '552', '707'], ['703', '480', '552'], ['480', '552', '480']]
  Fold 1: Parent-Hit@3=0.375




[DEBUG] Fold 2: Xt shape=(8, 72394) nnz=30769
[DEBUG] Fold 2: top-3 labels (first 3 val): [['707', '402', '552'], ['707', '307', '849*'], ['707', '307', '849*']]
[DEBUG] Fold 2: top-3 parents (first 3 val): [['707', '402', '552'], ['707', '307', '849'], ['707', '307', '849']]
  Fold 2: Parent-Hit@3=0.375




[DEBUG] Fold 3: Xt shape=(8, 73439) nnz=18781
[DEBUG] Fold 3: top-3 labels (first 3 val): [['552', '707', '650A'], ['552', '707', '307'], ['707', '552', '307']]
[DEBUG] Fold 3: top-3 parents (first 3 val): [['552', '707', '650'], ['552', '707', '307'], ['707', '552', '307']]
  Fold 3: Parent-Hit@3=0.125




[DEBUG] Fold 4: Xt shape=(8, 73614) nnz=22686
[DEBUG] Fold 4: top-3 labels (first 3 val): [['707', '530', '402'], ['480D*', '707', '402'], ['707', '552', '650A']]
[DEBUG] Fold 4: top-3 parents (first 3 val): [['707', '530', '402'], ['480', '707', '402'], ['707', '552', '650']]
  Fold 4: Parent-Hit@3=0.625




[DEBUG] Fold 5: Xt shape=(8, 73450) nnz=21546
[DEBUG] Fold 5: top-3 labels (first 3 val): [['703*', '707', '402'], ['707', '650A', '402'], ['480A', '707', '402']]
[DEBUG] Fold 5: top-3 parents (first 3 val): [['703', '707', '402'], ['707', '650', '402'], ['480', '707', '402']]
  Fold 5: Parent-Hit@3=0.625
  CV mean=0.4250 std=0.1871 scores=[0.375 0.375 0.125 0.625 0.625]




Unnamed: 0,model,mean,std
0,A_char + word_summary,0.45,0.187083
1,C_charWB + word_summary,0.45,0.187083
2,B_char + word_summary + word_text,0.425,0.187083
3,D_charWB + word_summary + word_text,0.425,0.187083



BEST by CV: A_char + word_summary


In a repeated hold-out evaluation (10 random seeds; 80/20 split using the parent-aware multi-label splitter), the TF-IDF + OvR Logistic Regression classifier substantially outperformed the naïve frequency baseline under the project’s primary metric, Parent-Hit@3 (success if at least one gold ATU parent code appears among the parents of the model’s Top-3 predicted fine-grained types). The frequency baseline reaches only ~0.15 Parent-Hit@3 (≈1–2 hits per 10 tales), whereas the proposed text-based models achieve 0.43–0.46 on average (≈4–5 hits per 10 tales), indicating that textual features provide strong predictive signal beyond label priors. Across ablations, combining character TF-IDF on text_norm with word TF-IDF on the summary fallback is already effective (mean 0.43–0.45), while adding word TF-IDF on text_norm yields a small but consistent improvement and better stability (mean 0.46, minimum 0.40 across seeds). Using char_wb instead of char did not materially change performance in this setting. Based on these results, the mixed vector representation (char + word summary + optional word text) is retained as the default baseline model for subsequent experiments and UI integration.

## Baseline Model

In [None]:


def cv_baseline_majority_parent_hit_at_k(
    y_parent_train: list[list[str]],
    k: int = 3,
    n_splits: int = 5,
    cv_seed: int = 123,
    verbose: bool = True,
) -> np.ndarray:
    """
    Baseline: in each fold, predict the top-k most frequent PARENTS from the TRAIN fold.
    Score: Parent-Hit@k on the VAL fold (any match).
    """
    n = len(y_parent_train)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)
    scores = []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(np.arange(n)), start=1):
        # parent frequency on TRAIN fold only
        cnt = Counter(p for i in tr_idx for p in (y_parent_train[i] or []))
        topk_parents = [p for p, _ in cnt.most_common(k)]

        # evaluate on VAL
        hits = []
        for i in va_idx:
            gold = set(y_parent_train[i] or [])
            hits.append(1 if gold.intersection(topk_parents) else 0)

        score = float(np.mean(hits)) if hits else 0.0
        scores.append(score)

        if verbose:
            print(f"  Fold {fold}: baseline-majority Parent-Hit@{k}={score:.3f} | topk={topk_parents}")

    return np.asarray(scores, dtype=float)

# run
baseline_scores = cv_baseline_majority_parent_hit_at_k(
    y_parent_train=y_parent_train,
    k=3,
    n_splits=5,
    cv_seed=123,
    verbose=True
)
print(f"Baseline majority: mean={baseline_scores.mean():.4f} std={baseline_scores.std(ddof=0):.4f} scores={baseline_scores}")


  Fold 1: baseline-majority Parent-Hit@3=0.375 | topk=['480', '552', '707']
  Fold 2: baseline-majority Parent-Hit@3=0.250 | topk=['480', '707', '402']
  Fold 3: baseline-majority Parent-Hit@3=0.125 | topk=['480', '707', '552']
  Fold 4: baseline-majority Parent-Hit@3=0.375 | topk=['707', '480', '703']
  Fold 5: baseline-majority Parent-Hit@3=0.375 | topk=['480', '707', '402']
Baseline majority: mean=0.3000 std=0.1000 scores=[0.375 0.25  0.125 0.375 0.375]


In [None]:


def cv_baseline_random_label_parent_hit_at_k(
    y_parent_train: list[list[str]],
    classes: np.ndarray,   # fine labels, e.g. mlb.classes_
    k: int = 3,
    n_splits: int = 5,
    cv_seed: int = 123,
    verbose: bool = True,
) -> np.ndarray:
    """
    Baseline: random top-k fine labels (uniform) per sample in VAL fold,
    then map to parents via atu_parent and compute Parent-Hit@k.
    """
    n = len(y_parent_train)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)
    scores = []

    # map fine label -> parent once
    classes_parent = np.array([atu_parent(c) for c in classes], dtype=object)

    rng = np.random.RandomState(cv_seed)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(np.arange(n)), start=1):
        hits = []
        for i in va_idx:
            gold = set(y_parent_train[i] or [])
            if not gold:
                hits.append(0)
                continue

            # random k distinct fine-label indices
            if len(classes) <= k:
                sampled = np.arange(len(classes))
            else:
                sampled = rng.choice(len(classes), size=k, replace=False)

            pred_parents = set(classes_parent[sampled])
            pred_parents.discard("")
            hits.append(1 if gold.intersection(pred_parents) else 0)

        score = float(np.mean(hits)) if hits else 0.0
        scores.append(score)

        if verbose:
            print(f"  Fold {fold}: baseline-random Parent-Hit@{k}={score:.3f}")

    return np.asarray(scores, dtype=float)

# run
random_scores = cv_baseline_random_label_parent_hit_at_k(
    y_parent_train=y_parent_train,
    classes=classes,
    k=3,
    n_splits=5,
    cv_seed=123,
    verbose=True
)
print(f"Baseline random: mean={random_scores.mean():.4f} std={random_scores.std(ddof=0):.4f} scores={random_scores}")




  Fold 1: baseline-random Parent-Hit@3=0.125
  Fold 2: baseline-random Parent-Hit@3=0.000
  Fold 3: baseline-random Parent-Hit@3=0.250
  Fold 4: baseline-random Parent-Hit@3=0.000
  Fold 5: baseline-random Parent-Hit@3=0.125
Baseline random: mean=0.1000 std=0.0935 scores=[0.125 0.    0.25  0.    0.125]


In [29]:
cv_results_with_baselines = dict(cv_results)
cv_results_with_baselines["BASE_majority_parent"] = baseline_scores
cv_results_with_baselines["BASE_random_fine"] = random_scores

cv_summary2 = (
    pd.DataFrame({
        "model": list(cv_results_with_baselines.keys()),
        "mean": [cv_results_with_baselines[m].mean() for m in cv_results_with_baselines],
        "std":  [cv_results_with_baselines[m].std(ddof=0) for m in cv_results_with_baselines],
    })
    .sort_values("mean", ascending=False)
    .reset_index(drop=True)
)

display(cv_summary2)


Unnamed: 0,model,mean,std
0,A_char + word_summary,0.45,0.187083
1,C_charWB + word_summary,0.45,0.187083
2,B_char + word_summary + word_text,0.425,0.187083
3,D_charWB + word_summary + word_text,0.425,0.187083
4,BASE_majority_parent,0.3,0.1
5,BASE_random_fine,0.1,0.093541


Parent-Hit@3 ≈ 0.45 means that, on average, in 45% of cases the model’s top-3 fine-grained ATU predictions include at least one label whose numeric parent code (digits only, e.g., 480 for 480A/480D*) matches one of the gold parent codes for that tale.

A majority-parent baseline of 0.30 shows that simply predicting the most frequent parent codes already yields 30% hits under this metric. However, the best model variants improve this to 0.45, i.e., an absolute gain of +0.15 over the frequency baseline.

In [30]:
# If you want to force the best model explicitly (recommended for reproducibility):
best_builder = lambda: build_model(use_word_summary=True, use_word_text=False, char_analyzer="char")  # Model A

K = 3

Y_train = mlb.fit_transform(y_train_list)
classes = np.asarray(mlb.classes_)

# ---- train best model on FULL TRAIN ----
best_model = best_builder()
best_model.fit(X_train, Y_train)

# ---- predict on TEST ----
proba = best_model.predict_proba(X_test)

# normalize possible list-of-arrays to 2D array (defensive)
if isinstance(proba, list):
    cols = []
    for p in proba:
        p = np.asarray(p)
        if p.ndim == 2 and p.shape[1] == 2:
            cols.append(p[:, 1])
        elif p.ndim == 2 and p.shape[1] == 1:
            cols.append(p[:, 0])
        else:
            cols.append(p.reshape(-1))
    proba = np.column_stack(cols)
else:
    proba = np.asarray(proba)

# ---- metrics on TEST (no fitting here) ----
test_parent_hit3 = parent_hit_at_k_from_proba(
    y_true_parent_lists=test_df["labels_parent"].tolist(),
    proba=proba,
    classes=classes,
    k=K,
)

test_exact_hit3 = exact_hit_at_k_from_proba(
    y_true_labels_lists=y_test_list,
    proba=proba,
    classes=classes,
    k=K,
)

print(f"TEST Parent-Hit@{K}: {test_parent_hit3:.4f}")
print(f"TEST Exact-Hit@{K}:  {test_exact_hit3:.4f}")

# ---- optional: per-item Top-3 report (useful for QA/UI) ----
topk_idx = np.argsort(-proba, axis=1)[:, :K]
pred_topk = [[classes[j] for j in row] for row in topk_idx]
pred_topk_parent = [[atu_parent(lab) for lab in row] for row in pred_topk]

report = pd.DataFrame({
    "tale_id": test_df["tale_id"] if "tale_id" in test_df.columns else np.arange(len(test_df)),
    "gold_labels": y_test_list,
    "gold_parents": test_df["labels_parent"].tolist(),
    "pred_top3_labels": pred_topk,
    "pred_top3_parents": pred_topk_parent,
})

display(report)

TEST Parent-Hit@3: 0.6000
TEST Exact-Hit@3:  0.6000


Unnamed: 0,tale_id,gold_labels,gold_parents,pred_top3_labels,pred_top3_parents
0,era_vene_12_541_1,[700],[700],"[700, 707, 552]","[700, 707, 552]"
1,era_vene_12_592_4,[703*],[703],"[703*, 707, 552]","[703, 707, 552]"
2,era_vene_12_97_19,[480D*],[480],"[480D*, 552, 707]","[480, 552, 707]"
3,era_vene_13_106_14,"[307, 410]","[307, 410]","[707, 552, 650A]","[707, 552, 650]"
4,era_vene_14_451_7,[650A],[650],"[707, 402, 480A]","[707, 402, 480]"
5,era_vene_16_744_22,[300],[300],"[707, 402, 552]","[707, 402, 552]"
6,era_vene_2_622_5,[425C],[425],"[707, 552, 425C]","[707, 552, 425]"
7,era_vene_7_71_1,[301],[301],"[707, 552, 650A]","[707, 552, 650]"
8,rkm_vene_1_82_47,[707],[707],"[707, 650A, 552]","[707, 650, 552]"
9,tru_vkk_5_36_20,[530],[530],"[707, 530, 552]","[707, 530, 552]"


In [31]:
from collections import Counter

gold_parent_flat = [p for ps in test_df["labels_parent"].tolist() for p in ps]
pred_parent_flat = [p for row in pred_topk_parent for p in row]

print("Gold parents in TEST:", Counter(gold_parent_flat))
print("Pred parents in top-3:", Counter(pred_parent_flat))


Gold parents in TEST: Counter({'700': 1, '703': 1, '480': 1, '307': 1, '410': 1, '650': 1, '300': 1, '425': 1, '301': 1, '707': 1, '530': 1})
Pred parents in top-3: Counter({'707': 10, '552': 9, '650': 3, '480': 2, '402': 2, '700': 1, '703': 1, '425': 1, '530': 1})


Each parent appears once: 700, 703, 480, 307, 410, 650, 300, 425, 301, 707, 530. This means the test set covers 11 distinct parents with no repetition, so performance is very sensitive to whether the model can generalize to rare parents.

Hits mostly occur when the gold parent is among the “default set” the model frequently predicts (e.g., 707, 480) or when the exact label is seen strongly enough to surface (e.g., 703, 530). Rare parents outside that set are consistently missed.

*Potential improvement (future work).* The current Top-3 predictions are strongly concentrated on a few high-frequency parents (e.g., 707/552/480), suggesting a class-imbalance effect. We will add more texts to increase type coverage in the Top-k list and potentially improve Parent-Hit@3. For the present milestone, we keep the metric and model configuration fixed to establish a stable baseline for the end-to-end system.

## Best model saving

In [48]:
def get_git_sha_short(default: str = "unknown") -> str:
    """Пробует взять git sha (если проект в git и git доступен)."""
    try:
        sha = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            stderr=subprocess.DEVNULL,
            text=True
        ).strip()
        return sha or default
    except Exception:
        return default

We additionally refit the final classifier on the full labeled corpus (train + test) to maximize class coverage, so the model is exposed to ATU types that may be absent or underrepresented in the training split; however, all reported performance metrics are computed strictly on the frozen test split before this refit, and the full-corpus model is used only for deployment and interactive exploration.

In [49]:
K = 3

# ------------------------------------------------------------
# Resolve repo root robustly (works in scripts and notebooks)
# ------------------------------------------------------------
def find_repo_root(start: Path) -> Path:
    """
    Walk up from `start` until we find the repo root markers:
    - folder 'models' (your case), and/or
    - folder 'src'
    Adjust markers if needed.
    """
    cur = start.resolve()
    for _ in range(8):  # enough for typical repo depth
        if (cur / "models").is_dir() and (cur / "src").is_dir():
            return cur
        cur = cur.parent
    # fallback: use start
    return start.resolve()

try:
    # if running as a .py file
    ROOT_DIR = Path(__file__).resolve().parents[1]  # .../magic_tagger (from src/*)
except NameError:
    # running in notebook / REPL
    ROOT_DIR = find_repo_root(Path.cwd())

MODELS_DIR = ROOT_DIR / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print("ROOT_DIR:", ROOT_DIR)
print("MODELS_DIR:", MODELS_DIR)

ROOT_DIR: /Users/eugenia/Desktop/thesis/magic_tagger
MODELS_DIR: /Users/eugenia/Desktop/thesis/magic_tagger/models


In [50]:
# ------------------------------------------------------------
# Train FINAL model on FULL dataset (train + test merged)
#   - Use only text columns (no IDs / metadata)
#   - Fit MLB on FULL labels (so the model "sees" more types)
#   - Train best model (A + balanced)
# ------------------------------------------------------------

# 0) Merge
full_df = pd.concat([train_df, test_df], ignore_index=True)

# 1) Build X only from text columns (no IDs / metadata)
X_full = build_X(full_df, ("summary_norm", "text_norm"))

# 2) Prepare labels and fit MLB on FULL dataset
y_full_list = [clean_label_list(x) for x in full_df["labels"].tolist()]

mlb_full = MultiLabelBinarizer()
Y_full = mlb_full.fit_transform(y_full_list)
classes_full = [str(c) for c in mlb_full.classes_]
# 3) Train best model (A + balanced)
final_model = build_model(
    use_word_summary=True,
    use_word_text=False,
    char_analyzer="char",
    class_weight="balanced",
    random_state=42,
)

final_model.fit(X_full, Y_full)

print("Trained final_model on FULL dataset.")
print("X_full shape:", X_full.shape)
print("Y_full shape:", Y_full.shape)
print("n_classes:", len(classes_full))

Trained final_model on FULL dataset.
X_full shape: (50, 2)
Y_full shape: (50, 37)
n_classes: 37


In [51]:

# ------------------------------------------------------------
# 4) Save artifacts to magic_tagger/models/
# ------------------------------------------------------------
joblib.dump(best_model, MODELS_DIR / "model.joblib")

with open(MODELS_DIR / "labels.json", "w", encoding="utf-8") as f:
    json.dump(classes_full, f, ensure_ascii=False, indent=2)

meta = {
    "task": "ATU multilabel classification (Top-3) + parent match",
    "k": K,
    "text_cols": ["summary_norm", "text_norm"],
    "model_name": "A_char + word_summary (balanced)",
    "note": "predict_proba columns correspond to labels.json order",
    "model_version": get_git_sha_short(),
    "generated_at": datetime.now(timezone.utc).replace(microsecond=0).isoformat()
}
with open(MODELS_DIR / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved:",
      MODELS_DIR / "model.joblib",
      MODELS_DIR / "labels.json",
      MODELS_DIR / "meta.json")

Saved: /Users/eugenia/Desktop/thesis/magic_tagger/models/model.joblib /Users/eugenia/Desktop/thesis/magic_tagger/models/labels.json /Users/eugenia/Desktop/thesis/magic_tagger/models/meta.json


In [41]:
def proba_to_2d(proba) -> np.ndarray:
    """
    Нормализует выход final_model.predict_proba:
    - если list-of-arrays (часто для OVR/мульти-голов) -> (n_samples, n_classes)
    - если уже ndarray -> просто np.asarray
    """
    if isinstance(proba, list):
        cols = []
        for p in proba:
            p = np.asarray(p)
            # бинарный классификатор: берем proba класса "1"
            if p.ndim == 2 and p.shape[1] == 2:
                cols.append(p[:, 1])
            elif p.ndim == 2 and p.shape[1] == 1:
                cols.append(p[:, 0])
            else:
                cols.append(p.reshape(-1))
        return np.column_stack(cols)
    return np.asarray(proba)

def make_json_safe(x):
    """Приводит numpy-типы/массивы к сериализуемым."""
    if isinstance(x, dict):
        return {k: make_json_safe(v) for k, v in x.items()}
    if isinstance(x, list):
        return [make_json_safe(v) for v in x]
    if isinstance(x, tuple):
        return [make_json_safe(v) for v in x]
    if isinstance(x, (np.floating,)):
        return float(x)
    if isinstance(x, (np.integer,)):
        return int(x)
    if isinstance(x, (np.ndarray,)):
        return x.tolist()
    return x

def build_result_from_model(model, X_one, classes, k=3, sample_id=None):
    """
    X_one: DataFrame из 1 строки (важно!)
    classes: список классов в том же порядке, что proba-колонки
    """
    proba = model.predict_proba(X_one)
    proba = proba_to_2d(proba)               # (1, n_classes)
    p = proba[0]

    topk_idx = np.argsort(-p)[:k]
    candidates = [{"atu": str(classes[j]), "score": float(p[j])} for j in topk_idx]

    result = {
        "id": sample_id,
        "meta": {
            "k": int(k),
            "n_classes": int(len(classes)),
        },
        "candidates": candidates,
    }
    return result

In [44]:
# 1) выберите индекс примера
idx = 33  # поменяйте на любой (например, 10, 57, ...)

# 2) сформируйте X_one (ВАЖНО: 2D, поэтому двойные скобки)
X_one = X_full.iloc[[idx]]

# 3) попробуем сделать sample_id из full_df, если есть tale_id, иначе row_{idx}
if "tale_id" in full_df.columns:
    sample_id = str(full_df.loc[idx, "tale_id"])
else:
    sample_id = f"row_{idx}"

# 4) построить result
result = build_result_from_model(
    model=final_model,
    X_one=X_one,
    classes=classes_full,
    k=3,
    sample_id=sample_id
)

# 5) сохранить
out_dir = Path("debug")
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / f"predict_result_{sample_id}.json"
out_path.write_text(
    json.dumps(make_json_safe(result), ensure_ascii=False, indent=2),
    encoding="utf-8"
)

print("Saved:", out_path.resolve())
print(json.dumps(result, ensure_ascii=False, indent=2))

Saved: /Users/eugenia/Desktop/thesis/magic_tagger/notebooks/debug/predict_result_tru_vkk_13_59_4.json
{
  "id": "tru_vkk_13_59_4",
  "meta": {
    "k": 3,
    "n_classes": 37
  },
  "candidates": [
    {
      "atu": "554",
      "score": 0.9447313578419488
    },
    {
      "atu": "556F*",
      "score": 0.9447313578419488
    },
    {
      "atu": "302C*",
      "score": 0.9447313578419488
    }
  ]
}
