In [28]:
%load_ext autoreload
%autoreload 2
import yaml
from pathlib import Path
import sys
import os
import pprint
import torch

def find_project_root(start: Path) -> Path:
    """Return the project root directory for a notebook started in ``start``.

    Resolution order:
      1. ``start`` is named ``LLMPolReasonEval`` (Jupyter Lab opened at the
         repository root): ``start`` itself.
      2. ``start`` is named ``content`` (Google Colab):
         ``start / "llm_pol_reason_eval"``.
      3. Otherwise (e.g. PyCharm, which runs the notebook from its own folder):
         the nearest directory, beginning with ``start`` and walking upwards,
         that contains ``src/llm_pol_reason_eval``.
      4. If no such directory exists: the third ancestor of ``start`` (the
         previous hard-coded behaviour), or ``start`` itself when the path is
         too shallow to have one.

    Args:
        start: Absolute directory the notebook kernel is running in.

    Returns:
        The directory to treat as the project root. It is not guaranteed to
        exist or to contain ``src``.
    """
    if start.name == "LLMPolReasonEval":  # running in Jupyter Lab
        return start
    if start.name == "content":  # running in Google Colab
        return start / "llm_pol_reason_eval"

    # Running from a nested folder (e.g. PyCharm): locate the root by its
    # contents rather than by assuming a fixed nesting depth.
    for candidate in (start, *start.parents):
        if (candidate / "src" / "llm_pol_reason_eval").is_dir():
            return candidate

    # Nothing matched: keep the old "three levels up" guess when the path is
    # deep enough; indexing parents[2] on a shallow path would raise IndexError.
    ancestors = start.parents
    return ancestors[2] if len(ancestors) > 2 else start


current_dir = Path(os.getcwd()).resolve()
print(f"Current dir: {current_dir.name}")

project_root = find_project_root(current_dir)
print(f"Project root: {project_root}")

# Make the project's packages importable by putting <root>/src first on sys.path.
src_dir = project_root / "src"
if not src_dir.exists():
    # Say so here; otherwise the imports below fail with a bare ModuleNotFoundError.
    print(f"Warning: {src_dir} does not exist; project imports will likely fail")
elif str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
    print(f"Directory {src_dir} added to sys.path")

from llm_pol_reason_eval.qa_engine.llm_qa_engine import LLMQAEngine
from llm_pol_reason_eval.qa_engine.inference_client import HuggingFaceClient

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current dir: environment_test_local_cuda
Project root: C:\Users\piotr\PycharmProjects\LLMPolReasonEval


In [29]:
# Print the name of CUDA device 0 when PyTorch can see a GPU, otherwise a
# short notice that CUDA is unavailable.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CUDA not enabled")

NVIDIA GeForce GTX 1660 Ti


In [30]:
# Ask PyTorch to empty its CUDA cache before the model is loaded; skipped
# entirely on machines without CUDA.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.empty_cache()
    print("Pamięć podręczna CUDA została wyczyszczona.")

Pamięć podręczna CUDA została wyczyszczona.


In [41]:
# Key of the experiment to run; it is looked up under `experiments` in
# RUN_CONFIG_FILE. Alternative keys noted by the author:
# "bielik-small-matura", "qwen-small-th-fs-cot-matura".
EXPERIMENT_NAME = "bielik-small-fs-cot-matura"
# Both config paths are relative to `project_root`.
RUN_CONFIG_FILE = "config/runs/qa_polski_matura_dataset.yaml"
MODELS_CONFIG_FILE = "config/models.yaml"

with open(project_root / RUN_CONFIG_FILE, 'r', encoding='utf-8') as f:
    experiments = yaml.safe_load(f)['experiments']

# Switching EXPERIMENT_NAME is the main way this notebook is reused, so a typo
# should report the valid keys rather than a bare KeyError with no context.
if EXPERIMENT_NAME not in experiments:
    raise KeyError(
        f"Experiment {EXPERIMENT_NAME!r} not found in {RUN_CONFIG_FILE}; "
        f"available: {list(experiments)}"
    )
run_config = experiments[EXPERIMENT_NAME]

with open(project_root / MODELS_CONFIG_FILE, 'r', encoding='utf-8') as f:
    models_config = yaml.safe_load(f)

print(f"Uruchamiam eksperyment: {run_config.get('task_name')}")

Uruchamiam eksperyment: Bielik 1.5B - Few-Shot & CoT - Matura


In [43]:
# Look up the model entry that the selected experiment refers to.
model_key = run_config['model']
model_cfg = models_config[model_key]

# A YAML key that is present but empty (`param_overrides:` or `default:`)
# loads as None, which `.get(..., {})` does not replace; `or {}` covers both
# the missing-key and the null-value case.
run_overrides = run_config.get("param_overrides") or {}

# Effective generation parameters: the model's own defaults, with the
# experiment's "default" overrides applied on top. Built as a new dict so the
# loaded models config is left unmodified.
final_gen_params = {
    **model_cfg['generation_params'],
    **(run_overrides.get('default') or {}),
}

inference_client = HuggingFaceClient(
    model_path=model_cfg['path'],
    default_generation_params=final_gen_params
)

qa_engine = LLMQAEngine(
    model_name=model_key,
    model_path=model_cfg['path'],
    inference_client=inference_client
)

HuggingFaceClient: Inicjalizacja modelu speakleash/Bielik-1.5B-v3.0-Instruct na urządzeniu: cuda
HuggingFaceClient: Domyślna konfiguracja generowania: GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": [
    4,
    2
  ],
  "max_new_tokens": 512,
  "pad_token_id": 2,
  "temperature": 0.3
}



In [44]:
# Resolve the experiment's dataset file and results directory against the
# project root (both values come from the run config).
input_dataset_path = project_root.joinpath(run_config['input_dataset'])
output_dir = project_root.joinpath(run_config['output_dir'])

# `iterations` falls back to 1 when the run config does not set it.
iterations = run_config.get('iterations', 1)

# No query is set at this point.
query = None

In [45]:
# This notebook exercises a single run, stored under <output_dir>/run_1.
run_id = 1
run_output_dir = output_dir.joinpath(f"run_{run_id}")
# Create the run folder (and any missing parents); re-running the cell is harmless.
run_output_dir.mkdir(parents=True, exist_ok=True)

# File that will receive the answers for this experiment.
answers_output_path = run_output_dir.joinpath(f"answers_{EXPERIMENT_NAME}.json")

In [46]:
# Step 1 of driving the engine by hand: start logging for this run. The
# recorded output shows a log file being created under the run directory.
# NOTE(review): this is an underscore-prefixed (private) method; presumably it
# also resets `qa_engine.results`, as its name suggests — confirm in LLMQAEngine.
qa_engine._setup_logger_and_reset_results(input_dataset_path, answers_output_path)

Ścieżka logów: C:\Users\piotr\PycharmProjects\LLMPolReasonEval\results\bielik-small-fs-cot-matura\run_1\logs\polski_matura_dataset_2025-06-14T20-00-06Z_bielik-1-5b-v3-instruct_2025-06-15T23-58-34Z.log
2025-06-15T23:58:34.364008+00:00 INFO: Logger uruchomiony.


In [47]:
# Step 2: load the questions and contexts from the dataset file into the
# engine (the recorded log reports the counts that were loaded).
qa_engine._load_dataset(input_dataset_path)

2025-06-15T23:58:36.131538+00:00 INFO: Ładowanie datasetu z: C:\Users\piotr\PycharmProjects\LLMPolReasonEval\data\dataset\polski_matura_dataset_2025-06-14T20-00-06Z.json
2025-06-15T23:58:36.143042+00:00 INFO: Załadowano 267 pytań i 104 kontekstów.


In [48]:
# Sanity check: show the loaded dataset's question counts per category and
# per question type.
qa_engine.dataset_manager.get_stats()

{'question_category_stats': [('matura_język_polski', 267)],
 'question_type_stats': [('open_text', 179),
  ('closed_MTF', 28),
  ('closed_MCQ', 16),
  ('open_essay', 13),
  ('open_summary', 10),
  ('open_poetry_interpretation', 8),
  ('closed_MRQ', 7),
  ('open_synthesis', 6)]}

In [49]:
# Prompt-composition settings for this experiment; an empty dict when the run
# config does not define "prompt_composition".
prompt_composition = run_config.get("prompt_composition", {})

In [50]:
# Step 3: build the question-batch iterator. The first argument is the batch
# size (the recorded log reports a batch size of 1).
# NOTE(review): `query=None` is hard-coded here although a `query` variable is
# defined in an earlier cell — confirm which one is intended.
batch_generator = qa_engine._create_question_batch_iterator(1,query=None)

2025-06-15T23:58:42.195448+00:00 INFO: Tworzenie iteratora batchy pytań z rozmiarem batcha: 1.


In [51]:
# Step 4: run the model over the batches, limited to 5 questions via
# `max_questions` for this local smoke test.
# NOTE(review): the fourth positional argument is an empty dict, and the
# recorded log shows the prompt being sent with parameters `{}`. It looks like
# the experiment's `run_overrides` are not forwarded here — confirm against the
# `_process_batches` signature.
qa_engine._process_batches(batch_generator, model_cfg, prompt_composition, {}, answers_output_path, max_questions=5)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


2025-06-15T23:58:49.729519+00:00 INFO: Przetwarzanie batcha 1 zawierającego 1 pytań.
2025-06-15T23:58:49.729519+00:00 INFO: --- Rozpoczęcie batcha 1 | Kategoria: matura_język_polski, Typ (z metadanych): closed_MTF, Liczba pytań: 1 ---
2025-06-15T23:58:49.730656+00:00 INFO: Przetwarzanie pytania (ID: EPOP-P1-100-2105_zad1) z batcha 1.
2025-06-15T23:58:49.744707+00:00 INFO: Finalny prompt dla Q_ID EPOP-P1-100-2105_zad1 (pierwsze 200 znaków): <s><|im_start|>system
Twoja rola to wzorowy uczeń liceum, który rozwiązuje zadania z arkusza maturalnego z języka polskiego. Analizuj polecenia i załączone teksty z najwyższą starannością.

Twoje odpo...
2025-06-15T23:58:49.744707+00:00 INFO: ... (całkowita długość promptu: 15580 znaków)
2025-06-15T23:58:49.745711+00:00 INFO: Wysyłanie promptu do modelu z parametrami: {}
2025-06-15T23:59:20.516571+00:00 INFO: Otrzymano surową odpowiedź od modelu (pierwsze 200 znaków): <answer>
1. P
2. F
3. P
</answer>...
2025-06-15T23:59:20.517572+00:00 INFO: Sparsow

In [52]:
# Inspect the answer records collected by the engine (a list of dicts, one
# per answered question, in the recorded output).
qa_engine.results

[{'model_answer_id': 'ans_089fc16c-cf40-4a74-b9b8-629832c4dcc6',
  'question_id': 'EPOP-P1-100-2105_zad1',
  'model_answer_raw_text': '<answer>\n1. P\n2. F\n3. P\n</answer>',
  'model_answer_clean_text': '1. P\n2. F\n3. P',
  'generated_by': 'bielik-1-5b-v3-instruct (speakleash/Bielik-1.5B-v3.0-Instruct)',
  'generation_date': '2025-06-15T23:59:20.518563+00:00',
  'model_configuration': '{"model_config": {"name": "bielik-1-5b-v3-instruct", "path": "speakleash/Bielik-1.5B-v3.0-Instruct", "family": "bielik", "generation_params": {"max_new_tokens": 512, "temperature": 0.3, "do_sample": true}}, "prompt_composition": {"main_template": "base_question_prompt.jinja2", "components": ["few_shot", "chain_of_thought"], "template_params": {}}, "generation_parameters": {}, "tokenizer_arguments": {"enable_thinking": false}}'},
 {'model_answer_id': 'ans_ae3267e5-4bf9-4270-ac31-3459d013d69e',
  'question_id': 'EPOP-P1-100-2105_zad8',
  'model_answer_raw_text': '<answer>\n1. P\n2. F\n3. P\n</answer>',
 

In [None]:
# NOTE(review): incomplete expression in a never-executed cell — it looks like
# a leftover from tab-completion. Unless the engine defines an attribute named
# `_`, this raises AttributeError on "Run All"; complete or remove it.
qa_engine._