# Type Anchors for Classifier

## 1. Setup and Paths

In [29]:
# --- Imports ---

# Standard library
import ast
import json
import os
import re
import sys
import unicodedata
from collections import Counter
from datetime import datetime
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from transliterate import translit
from unidecode import unidecode



In [38]:
# Project root. Set the MAGIC_TAGGER_ROOT environment variable to run the
# notebook on another machine; when it is unset the original local path is used,
# so existing behaviour is unchanged.
PROJECT_ROOT = Path(
    os.environ.get("MAGIC_TAGGER_ROOT", "/Users/eugenia/Desktop/thesis/magic_tagger")
).expanduser()

# Processed train/test splits and the directory that receives model artifacts.
TRAIN = PROJECT_ROOT / "data" / "processed" / "train.csv"
TEST = PROJECT_ROOT / "data" / "processed" / "test.csv"
models_dir = PROJECT_ROOT / "models"


In [None]:
# Load the training split from TRAIN and preview its first rows.
train_df = pd.read_csv(TRAIN, encoding="utf-8")
train_df.head()

Unnamed: 0,tale_id,summary_norm,text_norm,labels,labels_parent
0,era_vene_1_503_1,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...,['402'],['402']
1,era_vene_1_515_1,"по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч...",['410'],['410']
2,era_vene_12_105_22,снегурочка.,сделали дети со снегу куклу. в одного старина ...,['703*'],['703']
3,era_vene_12_137_98,иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш...",['530'],['530']
4,era_vene_12_189_1,два брата.,жили — брели два брата. и посла смерти отца об...,['735A'],['735']
5,era_vene_12_440_19,снегурочка.,вили дед да бада и стали ен замой со снегу дев...,['703*'],['703']
6,era_vene_12_501_1,морозко.,дрил мужем да баба было в и двя дочери. една б...,['480D*'],['480']
7,era_vene_12_95_18,мальчик-с-пальчик. мужик бросает отрубленный п...,"бубил мужик дрова и отру был сей пальчила, бро...",['700'],['700']
8,era_vene_13_137_16,"у попа и попадьи было много земли, не могут на...","16. жин ной с попадьей. брыло в н много земли,...","['650A', '1000', '1060']","['1000', '1060', '650']"
9,era_vene_13_15_1,морозко.,1-2 авдотья ершова — 76 лет; деревья старый из...,['480D*'],['480']


In [None]:
# Column of train_df whose cells are parsed into per-tale type labels below.
LABEL_COL = "labels" 

def parse_labels_cell(x):
    """Normalize one raw label cell into a list of non-empty label strings.

    Handled inputs:
      * ``None``, float NaN, an empty string or the text ``"nan"`` -> ``[]``
      * a ``list`` or ``tuple`` -> its stripped, non-empty items
      * a string containing a bracketed list, written either as a Python
        literal (``"['402', '510A']"``) or as JSON (``'["402", null]'``)
        -> the items of that list
      * any other string -> a one-element list with the stripped string
      * any other scalar -> ``[str(x).strip()]`` unless that is blank

    ``None`` items inside a list are skipped so that they do not turn into
    the spurious label ``"None"``.
    """

    def _items(values):
        # Stringify, strip, and drop missing/blank entries.
        return [str(v).strip() for v in values if v is not None and str(v).strip()]

    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if isinstance(x, (list, tuple)):
        return _items(x)
    if isinstance(x, str):
        s = x.strip()
        if not s or s.lower() == "nan":
            return []

        if s.startswith("[") and s.endswith("]"):
            # Try the Python-literal form first, then JSON (needed for
            # null/true/false, which are not valid Python literals).
            for loader in (ast.literal_eval, json.loads):
                try:
                    parsed = loader(s)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    # Not parseable in this format; try the next one.
                    continue
                if isinstance(parsed, (list, tuple)):
                    return _items(parsed)

        # Not a list representation: treat the whole string as one label.
        return [s]

    text = str(x).strip()
    return [text] if text else []

# Parse every label cell of the training split into a list of type strings.
labels_series = train_df[LABEL_COL].apply(parse_labels_cell)

# Tally how often each type occurs across all tales.
cnt = Counter()
for row_labels in labels_series:
    cnt.update(row_labels)

# Ten most frequent types as (type, count) pairs.
top10 = cnt.most_common(10)


Top-10 types:


[('707', 5),
 ('480D*', 4),
 ('402', 3),
 ('552', 3),
 ('480A', 3),
 ('703*', 2),
 ('530', 2),
 ('650A', 2),
 ('307', 2),
 ('550', 2)]

In [33]:
# Keep only the type identifiers, in ranking order.
top10_types = [pair[0] for pair in top10]

print("Top-10 types:")
display(pd.DataFrame(data=top10, columns=["type", "count"]))

print("Top-10 type IDs (for focus_types):")
display(top10_types)

Top-10 types:


Unnamed: 0,type,count
0,707,6
1,480D*,5
2,402,3
3,703*,3
4,530,3
5,307,3
6,650A,3
7,552,3
8,480A,3
9,410,2


Top-10 type IDs (for focus_types):


['707', '480D*', '402', '703*', '530', '307', '650A', '552', '480A', '410']

In [15]:
# Path to classify_data_normalized.csv, which is read into df_all below.
csv_path = PROJECT_ROOT / "data" / "processed" / "classify_data_normalized.csv"

In [20]:
# Load the CSV at csv_path and preview its first rows.
df_all = pd.read_csv(csv_path, encoding="utf-8")
df_all.head()

Unnamed: 0,tale_id,rights_status,content_description,set,sampling_version,type_count,collection,volume_no,source_ref,atu_labels_json,txt_path,text_raw,summary_norm,text_norm
0,era_vene_1_503_1,open,[Царевна-лягушка].,core,v1_20251230,3,"ERA, Vene",1,"ERA, Vene 1, 503/4 (1)","[""402""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Тили были царь с царицей у не\nбыло три сына. ...,царевна-лягушка.,тили были царь с царицей у не было три сына. ц...
1,era_vene_1_515_1,open,"[По пьяни мужик спорит, что сможет принести но...",coverage,v1_20251230,1,"ERA, Vene",1,"ERA, Vene 1, 515/6 (1)","[""410""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Раз пяное, ребятище» подился.\nчто можит в 12 ...","по пьяни мужик спорит, что сможет принести ноч...","раз пяное, ребятище» подился. что можит в 12 ч..."
2,era_vene_12_105_22,open,Снегурочка.,core,v1_20251230,3,"ERA, Vene",12,"ERA, Vene 12, 105 (22)","[""703*""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Сделали дети со снегу куклу.\nВ одного старина...,снегурочка.,сделали дети со снегу куклу. в одного старина ...
3,era_vene_12_137_98,open,Иван-дурак.,core,v1_20251230,4,"ERA, Vene",12,"ERA, Vene 12, 137/41 (98)","[""530""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,"Кил-был стажк. В яво бло\nтра сегна. Миша, Гри...",иван-дурак.,"кил-был стажк. в яво бло тра сегна. миша, гриш..."
4,era_vene_12_189_1,open,Два брата.,core,v1_20251230,2,"ERA, Vene",12,"ERA, Vene 12, 189/94 (1)","[""735A""]",/Users/eugenia/Desktop/thesis/magic_tagger/dat...,Жили – брели два брата.\nи посла смерти отца о...,два брата.,жили — брели два брата. и посла смерти отца об...


In [21]:
# Parse the label column of df_all into lists of type strings.
labels_series = df_all["atu_labels_json"].apply(parse_labels_cell)

# Tally type frequencies over all rows of df_all.
cnt = Counter()
for row_labels in labels_series:
    cnt.update(row_labels)

# Ten most frequent types as (type, count) pairs.
top10_all = cnt.most_common(10)

print("Top-10 types:")
display(top10_all)

Top-10 types:


[('707', 6),
 ('480D*', 5),
 ('402', 3),
 ('703*', 3),
 ('530', 3),
 ('307', 3),
 ('650A', 3),
 ('552', 3),
 ('480A', 3),
 ('410', 2)]

In [24]:
# Russian function words (conjunctions, prepositions, pronouns, particles).
# Passed as stop_words to the TfidfVectorizer in build_anchors_chi2_from_df.
RU_STOP = {
    "и","в","во","не","что","он","на","я","с","со","как","а","то","все","она","так",
    "его","но","да","ты","к","у","же","вы","за","бы","по","ее","мне","было","вот",
    "от","меня","еще","нет","о","из","ему","теперь","когда","даже","ну","вдруг",
    "ли","если","уже","или","ни","быть","был","него","до","вас","нибудь","опять",
}

In [25]:
def parse_labels_cell(x):
    """Convert a raw labels cell into a list of non-empty label strings.

    Supported inputs:
      * ``None``, float NaN, an empty string or the text ``"nan"`` -> ``[]``
      * a ``list`` or ``tuple`` -> its stripped, non-empty items
      * a string holding a bracketed list, as a Python literal
        (``"['402']"``) or as JSON (``'["402", null]'``) -> that list's items
      * any other string -> a single label (the stripped string)
      * any other scalar -> ``[str(x).strip()]`` unless that is blank

    ``None`` items (including JSON ``null``) are skipped rather than being
    stringified into the label ``"None"``.
    """

    def _items(values):
        # Stringify, strip, and drop missing/blank entries.
        return [str(v).strip() for v in values if v is not None and str(v).strip()]

    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if isinstance(x, (list, tuple)):
        return _items(x)
    if isinstance(x, str):
        s = x.strip()
        # A missing value that was stringified upstream arrives as "nan".
        if not s or s.lower() == "nan":
            return []
        if s.startswith("[") and s.endswith("]"):
            # The data is usually a Python literal; JSON is the fallback.
            for loader in (ast.literal_eval, json.loads):
                try:
                    parsed = loader(s)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    # Not parseable in this format; try the next one.
                    continue
                if isinstance(parsed, (list, tuple)):
                    return _items(parsed)
        # Single value.
        return [s]

    text = str(x).strip()
    return [text] if text else []

In [26]:
def build_text(row):
    """Concatenate a row's summary and body text into one string.

    Reads ``summary_norm`` and ``text_norm`` through ``row.get`` (so a dict or
    a pandas Series both work); ``text_ru`` is used when ``text_norm`` is
    missing or empty. Missing values (``None``, NaN, ``pd.NA``) count as empty
    text, so they do not leak into the result as the literal word "nan".

    Returns the two parts joined by a single space, with outer whitespace
    stripped; an empty string when both parts are missing.
    """

    def _as_text(value) -> str:
        if value is None or value is pd.NA:
            return ""
        # NaN is truthy, so `value or ""` would keep it and str() gives "nan".
        if isinstance(value, float) and pd.isna(value):
            return ""
        return str(value) if value else ""

    summary = _as_text(row.get("summary_norm"))
    body = _as_text(row.get("text_norm")) or _as_text(row.get("text_ru"))
    return (summary + " " + body).strip()


In [41]:
def build_anchors_chi2_from_df(
    train_df: pd.DataFrame,
    out_json: "str | Path",
    focus_types: list[str],
    label_col: str = "labels",
    topk: int = 50,
    min_pos: int = 2,
):
    """Rank TF-IDF n-grams per type with a one-vs-rest chi-squared test and save them as JSON.

    Steps:
      1. Build one text per row with ``build_text`` and parse ``label_col``
         with ``parse_labels_cell``.
      2. Fit a TF-IDF vectorizer (uni- and bigrams, ``RU_STOP`` removed) on all rows.
      3. For each type in ``focus_types``, score every feature with ``chi2``
         against the binary target "row carries this type", keep the ``topk``
         highest positive scores and normalise them to sum to 1.

    Parameters
    ----------
    train_df : DataFrame with the text columns read by ``build_text`` and the
        label column ``label_col``. It is copied, not modified.
    out_json : path of the JSON file to write; missing parent directories are created.
    focus_types : type identifiers to build anchors for.
    label_col : name of the column holding the labels.
    topk : maximum number of anchors kept per type.
    min_pos : minimum number of positive rows a type needs; types below this
        threshold are skipped and do not appear in the result.

    Returns
    -------
    dict mapping each retained type to a list of
    ``{"pattern": str, "w": float, "src": "chi2"}`` entries, best first.
    """
    df = train_df.copy()
    df["__text"] = df.apply(build_text, axis=1)
    df["__labels"] = df[label_col].apply(parse_labels_cell)

    vec = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.6,
        sublinear_tf=True,
        lowercase=True,
        stop_words=list(RU_STOP),
    )
    X = vec.fit_transform(df["__text"])
    vocab = np.array(vec.get_feature_names_out())

    anchors = {}
    for t in focus_types:
        # One-vs-rest target: 1 when t is among the row's labels.
        y = df["__labels"].apply(lambda labs: int(t in labs)).to_numpy()
        if y.sum() < min_pos:
            continue

        scores, _ = chi2(X, y)
        # argsort places NaN last, so reversing it would put NaN scores first
        # and let them use up top-k slots. Zero out unusable scores before
        # ranking so only finite, positive ones can be selected.
        usable = np.where(np.isfinite(scores) & (scores > 0), scores, 0.0)
        idx = np.argsort(usable)[::-1][:topk]
        idx = idx[usable[idx] > 0]
        # Guard against an empty selection (sum == 0) to avoid dividing by zero.
        denom = float(usable[idx].sum()) or 1.0

        anchors[t] = [
            {"pattern": str(vocab[i]), "w": float(usable[i] / denom), "src": "chi2"}
            for i in idx
        ]

    out_path = Path(out_json)
    # Make sure the target directory exists before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(anchors, f, ensure_ascii=False, indent=2)

    return anchors

In [42]:
# Build anchors for the most frequent training types and save them under models_dir.
focus_types = top10_types
anchors = build_anchors_chi2_from_df(
    train_df=train_df,
    out_json=models_dir / "anchors_auto_train.json",
    focus_types=focus_types,
    topk=60,
)
print("Built anchors for:", len(anchors), "types")


Built anchors for: 9 types
