In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
from pathlib import Path
from datetime import datetime, UTC
from dotenv import load_dotenv

# Locate the repository root. When the notebook is launched from Jupyter Lab
# the working directory is the repository itself; when it is run from PyCharm
# the working directory is nested inside it. Searching the working directory
# and its ancestors by name covers both cases without assuming a fixed depth.
PROJECT_DIR_NAME = "LLMPolReasonEval"
current_dir = Path.cwd().resolve()
project_root = next(
    (
        candidate
        for candidate in (current_dir, *current_dir.parents)
        if candidate.name == PROJECT_DIR_NAME
    ),
    None,
)
if project_root is None:
    # No ancestor carries the expected name (e.g. the checkout was renamed):
    # fall back to the previous assumption that the root is three levels up.
    project_root = current_dir.parents[2]
print(f"Project root: {project_root}")

# Make the project's packages importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
from llm_pol_reason_eval.question_processing.dataset_manager import DatasetManager

# Load environment variables from the repository-level .env file and report
# it only when load_dotenv signals success.
env_file = project_root / ".env"
if load_dotenv(env_file):
    print(f"Loaded environment variables from {env_file}")

Project root: C:\Users\piotr\PycharmProjects\LLMPolReasonEval
Loaded environment variables from C:\Users\piotr\PycharmProjects\LLMPolReasonEval\.env


In [10]:
# Directory that is scanned for raw JSON files, and the directory that
# receives the merged dataset (created on demand, including parents).
dataset_raw_path = project_root.joinpath("data", "dataset_raw")
output_dir = project_root.joinpath("data", "dataset")
output_dir.mkdir(parents=True, exist_ok=True)

In [11]:
# File name that is skipped while loading the raw files.
EXCLUDED_RAW_FILE = "MPOP-P1-100-2412-gemini25pro-2025-06-14T20-55-00Z.json"

# Initialise the manager and load every raw JSON file found recursively.
# The paths are sorted so that the load order (and therefore the merged
# result) does not depend on the order in which the file system lists them.
dataset_manager = DatasetManager()
for json_file in sorted(dataset_raw_path.rglob("*.json")):
    # Compare the file name exactly instead of searching the whole path
    # string, so a similarly named directory or file cannot match by accident.
    if json_file.name == EXCLUDED_RAW_FILE:
        print(f"Pomijam plik: {json_file}")
        continue
    dataset_manager.add_data_from_json_file(str(json_file))

# Build the output path with a UTC timestamp at second resolution, formatted
# as YYYY-MM-DDTHH-MM-SSZ (ISO-8601-like, with '-' in place of ':' in the
# time part — presumably to keep the file name valid on Windows).
iso_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
output_filename = f"polski_matura_dataset_{iso_timestamp}.json"
output_path = output_dir / output_filename

# Write the merged dataset to disk.
dataset_manager.save_all_data_to_json_file(str(output_path))
print(f"Zapisano dataset do: {output_path}")

In [12]:
# Display the question counts per category and per question type for the
# merged dataset held by dataset_manager.
dataset_manager.get_stats()

{'question_category_stats': [('matura_język_polski', 267)],
 'question_type_stats': [('open_text', 179),
  ('closed_MTF', 28),
  ('closed_MCQ', 16),
  ('open_essay', 13),
  ('open_summary', 10),
  ('open_poetry_interpretation', 8),
  ('closed_MRQ', 7),
  ('open_synthesis', 6)]}

In [64]:
# Create the batch iterable consumed by the inspection loop in the next cell.
# NOTE(review): going by the keyword names, this asks for one question per
# batch, with contexts attached, grouped by category and by question type —
# confirm against DatasetManager.generate_question_batches.
question_batch_generator = dataset_manager.generate_question_batches(
    batch_size=1,
    with_contexts=True,
    by_q_category=True,
    by_q_type=True,
)


In [65]:
from pprint import pprint

# Print a one-line summary and the question texts of the first batch only.
for batch in question_batch_generator:
    questions = batch.get("questions", {})
    contexts = batch.get("contexts", {})
    metadata = batch.get("metadata", {})
    batch_summary = (
        f"Batch size: {len(questions)} questions, {len(contexts)} contexts, "
        f"question_type: {metadata.get('question_type')}, "
        f"question_category: {metadata.get('category')}"
    )
    print(batch_summary)
    for question, question_entry in questions.items():
        print(question_entry["question_text"])
        # Optional: uncomment to also print the contexts linked to the question.
        # for context_id in question_entry["context_ids"]:
        #     print(f"  Context ID: {context_id}")
        #     print(f"  Context Text: {contexts[context_id].get('context_content')}")
    break  # stop after the first batch; this cell is only a quick test

Batch size: 1 questions, 1 contexts, question_type: closed_MTF, question_category: matura_język_polski
Oceń prawdziwość poniższych stwierdzeń odnoszących się do tekstu Ewy Kołodziejek. Zaznacz P, jeśli stwierdzenie jest prawdziwe, albo F - jeśli jest fałszywe.


In [3]:
# A separate, empty manager and the path of the single MPOP-P1-100-2412 file
# that will be loaded into it.
dataset_manager_mpop_2412 = DatasetManager()
mpop_2412_json_path = project_root.joinpath(
    "data",
    "dataset",
    "MPOP-P1-100-2412-gemini25pro-2025-06-14T20-55-00Z.json",
)

In [4]:
# Load the single file into its dedicated manager. The path is converted to a
# string so that this call matches the other add_data_from_json_file call in
# this notebook, which also passes str(path).
dataset_manager_mpop_2412.add_data_from_json_file(str(mpop_2412_json_path))

In [5]:
# Display the question counts per category and per question type for the
# data loaded into dataset_manager_mpop_2412.
dataset_manager_mpop_2412.get_stats()

{'question_category_stats': [('matura_język_polski', 16)],
 'question_type_stats': [('open_text', 13),
  ('closed_MTF', 2),
  ('open_synthesis', 1)]}