In [None]:
import os
import pathlib
import sys

# Treat the kernel's current working directory as the project root.
PROJECT_ROOT = pathlib.Path.cwd()

# Other cells in this notebook import from the project-local `src` package.
# Put the root on sys.path explicitly so those imports do not depend on how
# the kernel was launched; the membership test keeps re-runs idempotent.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

PROJECT_ROOT

In [None]:
import platform
import torch

# Report the interpreter and PyTorch environment, one "label value" line each.
environment_report = (
    ("Python:", platform.python_version()),
    ("Torch:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
)
for label, value in environment_report:
    print(label, value)

In [None]:
from src.data.download_parallel import save_kz_ru

# Ask save_kz_ru for the "train" split, targeting data/parallel/kazparc_kz_ru.csv.
# The return value is kept in PARALLEL_PATH and shown as the cell output
# (presumably the path of the written CSV -- confirm in src.data.download_parallel).
PARALLEL_PATH = save_kz_ru(
    split="train",
    out_dir="data/parallel",
    out_name="kazparc_kz_ru.csv",
)
PARALLEL_PATH

In [None]:
from src.align.mutual_align import EmbeddingAligner
from src.pipeline.build_silver_labels import main as build_silver_labels

# Default the aligner to the CPU and switch to the GPU only when CUDA is usable.
aligner_device = "cpu"
if torch.cuda.is_available():
    aligner_device = "cuda"
custom_aligner = EmbeddingAligner(device=aligner_device)

# Run the silver-label pipeline entry point on the parallel CSV, handing it the
# aligner built above; the returned value is displayed as the cell output.
SILVER_PATH = build_silver_labels(
    parallel_csv=PARALLEL_PATH,
    aligner=custom_aligner,
)
SILVER_PATH

In [None]:
import pandas as pd

# Load the silver-label table produced by the pipeline cell.
silver_df = pd.read_csv(SILVER_PATH)

# Fail fast with a readable message: the prediction cell reads the first row
# and its `kaz_item` / `rus_item` columns, so an empty or differently shaped
# file would otherwise surface later as an opaque IndexError / KeyError.
assert not silver_df.empty, f"No rows found in {SILVER_PATH}"
assert {"kaz_item", "rus_item"} <= set(silver_df.columns), (
    f"Expected columns 'kaz_item' and 'rus_item', got {list(silver_df.columns)}"
)

# Show the size plus a small preview instead of dumping the whole frame.
print("silver_df shape:", silver_df.shape)
silver_df.head()

In [None]:
from src.text.predict_text import predict_text_cefr

# Take the first row of the silver-label table and run the text-level CEFR
# predictor on its `kaz_item` value, passing the `rus_item` value as russian_text.
sample_row = silver_df.iloc[0]
prediction = predict_text_cefr(sample_row["kaz_item"], russian_text=sample_row["rus_item"])  # predict
prediction


In [None]:
import numpy as np
import torch
from pathlib import Path

from src.models.predict_transformer_word import load_model, WordRecord, compute_features, UNK_TOKEN, CEFR_LEVELS
from src.utils import cefr_id_to_label  # only if you want the top class name

word = "легко"
model_path = Path("models/simple_word_cefr/simple_word_cefr.pt")  # adjust if needed

# An empty word would produce a zero-length character sequence; reject it up
# front with a clear message instead of failing somewhere inside the model.
if not word:
    raise ValueError("word must be a non-empty string")

# Load the model and the statistics stored alongside it in the artifact.
artifact, model, device = load_model(model_path)
char2idx = artifact["char2idx"]
feature_mean = np.array(artifact["feature_mean"], dtype=np.float32)
feature_std = np.array(artifact["feature_std"], dtype=np.float32)
total_frequency = float(artifact["total_frequency"])

# Build the record for the query word and standardise its features.
# label / frequency / rank_fraction are fixed values here (presumably
# placeholders for a word without corpus statistics -- TODO confirm).
rec = WordRecord(word=word.lower(), label=0, frequency=1, rank_fraction=0.5)
features = compute_features(rec, total_frequency)
# np.nan_to_num maps NaN to 0 and +/-inf to the largest finite floats, which
# covers divisions by a zero standard deviation.
features = np.nan_to_num((features - feature_mean) / feature_std)

# Encode characters; anything missing from the vocabulary maps to UNK_TOKEN.
# The fallback index is looked up once rather than once per character.
unk_idx = char2idx[UNK_TOKEN]
char_ids = torch.tensor(
    [char2idx.get(ch, unk_idx) for ch in rec.word], dtype=torch.long
).unsqueeze(0)
lengths = torch.tensor([char_ids.size(1)], dtype=torch.long)
features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

# Move every input to the device returned by load_model.
char_ids = char_ids.to(device)
lengths = lengths.to(device)
features_tensor = features_tensor.to(device)

# Forward pass without gradient tracking.
model.eval()
with torch.no_grad():
    logits = model(char_ids, lengths, features_tensor)
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

# zip() would silently drop entries if the model's class count and the label
# list disagreed, mislabelling the output; make that mismatch loud instead.
if len(probs) != len(CEFR_LEVELS):
    raise ValueError(
        f"Model produced {len(probs)} class probabilities but CEFR_LEVELS has "
        f"{len(CEFR_LEVELS)} entries"
    )

# Display distribution
distribution = dict(zip(CEFR_LEVELS, probs))
print(f"CEFR distribution for '{word}':")
for level, prob in distribution.items():
    print(f"  {level}: {prob:.4f}")

top_idx = int(np.argmax(probs))
print(f"\nTop prediction: {CEFR_LEVELS[top_idx]} ({probs[top_idx]:.4f})")
