In [2]:
import os
import pathlib
import sys

PROJECT_ROOT = pathlib.Path.cwd()

PROJECT_ROOT

PosixPath('/Users/galymzhantore/cefr-classification-kk')

In [3]:
import platform
import torch

print("Python:", platform.python_version())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

Python: 3.10.18
Torch: 2.5.1
CUDA available: False


In [4]:
from src.data.download_parallel import save_kz_ru

PARALLEL_PATH = save_kz_ru(split="train[:2000]", out_dir="data/parallel", out_name="kazparc_kz_ru.csv")
PARALLEL_PATH

Saved: data/parallel/kazparc_kz_ru.csv rows: 2000


PosixPath('data/parallel/kazparc_kz_ru.csv')

In [5]:
from src.align.mutual_align import EmbeddingAligner
from src.pipeline.build_silver_labels import main as build_silver_labels

# Use GPU explicitly if available
aligner_device = "cuda" if torch.cuda.is_available() else "cpu"
custom_aligner = EmbeddingAligner(device=aligner_device)

SILVER_PATH = build_silver_labels(parallel_csv=PARALLEL_PATH, aligner=custom_aligner)
SILVER_PATH

Some weights of BertModel were not initialized from the model checkpoint at aneuraz/awesome-align-with-co and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved: data/labels/silver_word_labels.csv rows=18485 skipped_sentences=0


PosixPath('data/labels/silver_word_labels.csv')

In [6]:
import pandas as pd

silver_df = pd.read_csv(SILVER_PATH)
silver_df

Unnamed: 0,kaz_item,rus_item,cefr,kaz_sent,rus_sent
0,кезінде,При,B1,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
1,трансшекаралық,трансграничной,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
2,тасымалдау,перевозке,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
3,Қауіпті,опасные,B2,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
4,қалдықтар,отходы,Unknown,Қауіпті қалдықтар трансшекаралық тасымалдау ке...,При трансграничной перевозке опасные отходы до...
...,...,...,...,...,...
18480,Піл,Слон,C2,Піл қалың бұтада суретке қарап тұр.,"Слон стоит в зарослях, пристально смотря на чт..."
18481,бұтада,в,A2,Піл қалың бұтада суретке қарап тұр.,"Слон стоит в зарослях, пристально смотря на чт..."
18482,бұтада,"зарослях,",Unknown,Піл қалың бұтада суретке қарап тұр.,"Слон стоит в зарослях, пристально смотря на чт..."
18483,қарап,смотря,A2,Піл қалың бұтада суретке қарап тұр.,"Слон стоит в зарослях, пристально смотря на чт..."


In [7]:
from src.align.analysis import (
    align_with_probabilities,
    informative_link_share,
    fraction_above_threshold,
    is_informative,
)


In [8]:
sample = silver_df.iloc[0]
kz_words = tuple(sample['kaz_sent'].split())
ru_words = tuple(sample['rus_sent'].split())
details = align_with_probabilities(
    custom_aligner,
    kz_words,
    ru_words,
    layer=8,
    thresh=0.05,
)
details_matrix = details.to_dataframe(kz_words, ru_words)
details_matrix.head()


Unnamed: 0,kaz_index,kaz_token,rus_index,rus_token,p_ru_given_kz,p_kz_given_ru,joint_prob,is_link
0,0,Қауіпті,0,При,1.265354e-37,1.331234e-43,1.331234e-43,False
1,0,Қауіпті,1,трансграничной,6.273898e-10,1.390772e-26,1.390772e-26,False
2,0,Қауіпті,2,перевозке,1.395833e-41,0.0,0.0,False
3,0,Қауіпті,3,опасные,1.0,1.0,1.0,True
4,0,Қауіпті,4,отходы,1.278244e-38,4.169283e-41,4.169283e-41,False


In [9]:
informative_matrix = details_matrix[details_matrix['rus_token'].apply(is_informative)].copy()
informative_matrix.head()


Unnamed: 0,kaz_index,kaz_token,rus_index,rus_token,p_ru_given_kz,p_kz_given_ru,joint_prob,is_link
0,0,Қауіпті,0,При,1.265354e-37,1.331234e-43,1.331234e-43,False
1,0,Қауіпті,1,трансграничной,6.273898e-10,1.390772e-26,1.390772e-26,False
2,0,Қауіпті,2,перевозке,1.395833e-41,0.0,0.0,False
3,0,Қауіпті,3,опасные,1.0,1.0,1.0,True
4,0,Қауіпті,4,отходы,1.278244e-38,4.169283e-41,4.169283e-41,False


In [10]:
share_correct = informative_link_share(details, kz_words, ru_words)
share_correct


1.0

In [11]:
alignment_indices = {
    'kz_indices': details.kz_keep,
    'ru_indices': details.ru_keep,
}
alignment_indices


{'kz_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22],
 'ru_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23]}

In [12]:
prob_threshold = 0.3
sample_size = 200
sample_records = silver_df.head(sample_size)[['kaz_sent', 'rus_sent']].to_dict('records')
fraction_above_threshold(
    sample_records,
    custom_aligner,
    layer=8,
    thresh=0.05,
    prob_threshold=prob_threshold,
)


0.9729866195405201

In [None]:
# Example ensemble inference (requires trained Russian sentence model)
from src.translation.translator import get_translator
from src.ru_sentence_model.model import RuSentenceCefrModel
from src.text.predict_text import predict_text_cefr_ensemble

ru_model = RuSentenceCefrModel.from_pretrained("src/models/ru_cefr_sentence")
translator = get_translator()
ensemble_result = predict_text_cefr_ensemble(
    sample['kaz_sent'],
    translator=translator,
    aligner=custom_aligner,
    russian_text=sample['rus_sent'],
    russian_model=ru_model,
    russian_weight=0.6,
)
ensemble_result


OSError: models/ru_cefr_sentence is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`