In [2]:
import os
import pathlib
import sys

# Anchor for project paths: the kernel's current working directory.
# NOTE(review): later cells use relative paths ("data/...", "models/...") and
# `src.*` imports, so this presumably must be the repository root — confirm.
PROJECT_ROOT = pathlib.Path(os.getcwd())

# Bare expression: echo the path so the reader can see where the notebook runs.
PROJECT_ROOT

PosixPath('/Users/zhantore/Documents/cefr-kk-ru')

In [3]:
import platform
import torch

# Environment report: interpreter version, PyTorch version, and whether
# PyTorch can see a CUDA device. Printed as "<label> <value>", one per line.
for _label, _value in (
    ("Python:", platform.python_version()),
    ("Torch:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(_label, _value)

Python: 3.10.19
Torch: 2.7.1
CUDA available: False


In [5]:
from src.data.download_parallel import save_kz_ru

# Fetch the Kazakh–Russian parallel data and write it to a CSV file.
# The split expression limits the download to 100 rows of the train split
# (the captured output reports "rows: 100"), which keeps this demo fast.
PARALLEL_PATH = save_kz_ru(
    out_name="kazparc_kz_ru.csv",
    out_dir="data/parallel",
    split="train[:100]",
)

# Bare expression: show the path returned by the download helper.
PARALLEL_PATH

Generating train split: 100%|██████████| 371902/371902 [00:02<00:00, 137309.00 examples/s]

Saved: data/parallel/kazparc_kz_ru.csv rows: 100





PosixPath('data/parallel/kazparc_kz_ru.csv')

In [7]:
from src.align.mutual_align import EmbeddingAligner
from src.pipeline.build_silver_labels import main as build_silver_labels

# Pick the aligner's device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
# `torch` is imported in an earlier cell, so that cell must have been run first.
if torch.cuda.is_available():
    aligner_device = "cuda"
else:
    aligner_device = "cpu"
custom_aligner = EmbeddingAligner(device=aligner_device)

# Build the silver word-level labels from the parallel CSV with the aligner
# configured above; the call returns the path of the CSV it wrote.
SILVER_PATH = build_silver_labels(
    aligner=custom_aligner,
    parallel_csv=PARALLEL_PATH,
)

# Bare expression: show where the labels were saved.
SILVER_PATH

Some weights of BertModel were not initialized from the model checkpoint at aneuraz/awesome-align-with-co and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved: data/labels/silver_word_labels.csv rows=928 skipped_sentences=0


PosixPath('data/labels/silver_word_labels.csv')

In [22]:
import pandas as pd

# Read the silver-label CSV produced by the previous step into a DataFrame.
silver_df = pd.read_csv(filepath_or_buffer=SILVER_PATH)

# Bare expression: let the notebook render the (truncated) table.
silver_df

Unnamed: 0,kaz_item,rus_item,cefr,kaz_sent,rus_sent
0,кезінде,При,B1,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
1,трансшекаралық,трансграничной,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
2,тасымалдау,перевозке,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
3,Қауіпті,опасные,B2,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
4,қалдықтар,отходы,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
...,...,...,...,...,...
923,өте,Очень,A1,Суда жатқан өте үлкен әдемі піл.,"Очень большой красивый слон, лежащий в воде."
924,үлкен,большой,A1,Суда жатқан өте үлкен әдемі піл.,"Очень большой красивый слон, лежащий в воде."
925,әдемі,красивый,A2,Суда жатқан өте үлкен әдемі піл.,"Очень большой красивый слон, лежащий в воде."
926,піл.,"слон,",Unknown,Суда жатқан өте үлкен әдемі піл.,"Очень большой красивый слон, лежащий в воде."


In [None]:
from src.text.predict_text import predict_text_cefr

# Use the first aligned word pair of the silver labels as a smoke-test input.
sample_row = silver_df.iloc[0]

# Predict the CEFR level for the Kazakh item. The aligned Russian item is passed
# as `russian_text`; judging by the captured output it is used as the translation.
prediction = predict_text_cefr(sample_row["kaz_item"], russian_text=sample_row["rus_item"])

# Bare expression: display the full prediction object.
prediction


TextCefrPrediction(translation='При', distribution={'A1': 0.0, 'A2': 0.0, 'B1': 1.0, 'B2': 0.0, 'C1': 0.0, 'C2': 0.0}, average_level='B1', phrase_alignments=[PhraseAlignment(kazakh_phrase='кезінде', russian_token='При', kazakh_span=(0,), russian_index=0)])

In [36]:
import numpy as np
import torch
from pathlib import Path

from src.models.predict_transformer_word import load_model, WordRecord, compute_features, UNK_TOKEN, CEFR_LEVELS
from src.utils import cefr_id_to_label  # only if you want the top class name

# ---------------------------------------------------------------------------
# Single-word CEFR inference with the saved word-level model.
# ---------------------------------------------------------------------------
word = "легко"
model_path = Path("models/simple_word_cefr/simple_word_cefr.pt")  # adjust if needed

# Restore the model together with the statistics stored alongside it.
artifact, model, device = load_model(model_path)
char2idx = artifact["char2idx"]
total_frequency = float(artifact["total_frequency"])
feature_mean, feature_std = (
    np.array(artifact[key], dtype=np.float32)
    for key in ("feature_mean", "feature_std")
)

# Describe the query word. `label`, `frequency` and `rank_fraction` are fixed
# placeholder values here.
# NOTE(review): confirm these placeholders match how the model expects
# out-of-lexicon words to be represented.
rec = WordRecord(word=word.lower(), label=0, frequency=1, rank_fraction=0.5)

# Hand-crafted features, standardised with the stored mean/std; any NaN/inf
# produced by the division is replaced by `np.nan_to_num`.
features = compute_features(rec, total_frequency)
features = (features - feature_mean) / feature_std
features = np.nan_to_num(features)

# Character indices as a batch of one; characters missing from the vocabulary
# map to the UNK index.
char_ids = torch.tensor(
    [[char2idx.get(ch, char2idx[UNK_TOKEN]) for ch in rec.word]],
    dtype=torch.long,
)
lengths = torch.tensor([char_ids.shape[1]], dtype=torch.long)
features_tensor = torch.tensor(features, dtype=torch.float32)[None]

# Move every input onto the device the model was loaded on.
char_ids, lengths, features_tensor = (
    t.to(device) for t in (char_ids, lengths, features_tensor)
)

# Forward pass in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    logits = model(char_ids, lengths, features_tensor)
    # Softmax over dim 1, then take the single batch row as a NumPy vector.
    probs = logits.softmax(dim=1)[0].cpu().numpy()

# Pair each CEFR level with its probability and print the distribution.
distribution = dict(zip(CEFR_LEVELS, probs))
print(f"CEFR distribution for '{word}':")
for level, prob in distribution.items():
    print(f"  {level}: {prob:.4f}")

# Report the most probable level.
top_idx = int(probs.argmax())
print(f"\nTop prediction: {CEFR_LEVELS[top_idx]} ({probs[top_idx]:.4f})")


CEFR distribution for 'легко':
  A1: 0.4665
  A2: 0.2356
  B1: 0.1440
  B2: 0.0671
  C1: 0.0501
  C2: 0.0367

Top prediction: A1 (0.4665)
