In [None]:
import os
import json
import csv
import uuid
from pathlib import Path


def convert_conversations_to_csv(
    input_dir="/home/ckittask/ria/Dataset-Generator/output_dataset_gemma12",
    output_file="conversations.csv",
    max_turns=2,
    single_entry_agencies=("output_Politsei-_ja_Piirivalveamet",),
):
    """
    Convert conversations.json files from nested directory structure to CSV format.

    Every directory below ``input_dir`` that contains a ``conversations.json``
    file is read. Each entry of that file that has a ``"messages"`` key becomes
    one conversation with a freshly generated UUID, and each of its messages
    becomes one CSV row with the columns ``conversation_id``, ``turn``
    (1-based), ``speaker`` (the message's ``"role"``), ``text`` (the message's
    ``"content"``) and ``agency``.

    The agency is the name of the parent of the directory holding the JSON
    file (expected layout: ``<input_dir>/<agency>/<topic>/conversations.json``).

    Files that cannot be read or processed are reported on stdout and skipped;
    rows already written for such a file are kept.

    Args:
        input_dir (str): Root directory containing agency folders
        output_file (str): Output CSV file name
        max_turns (int | None): Maximum number of messages written per
            conversation. ``None`` writes every message. Defaults to 2.
        single_entry_agencies (Iterable[str] | str | None): Agency names for
            which only the first entry of each conversations.json file is
            considered. ``None`` or an empty collection disables the limit.

    Returns:
        int: Number of conversations written to the CSV file.
    """
    # A bare string would otherwise be split into single characters below.
    if isinstance(single_entry_agencies, str):
        single_entry_agencies = (single_entry_agencies,)
    limited_agencies = frozenset(single_entry_agencies or ())

    fieldnames = ["conversation_id", "turn", "speaker", "text", "agency"]
    total_conversations = 0

    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for root, _dirs, files in os.walk(input_dir):
            if "conversations.json" not in files:
                continue

            json_path = os.path.join(root, "conversations.json")

            # Assuming structure: output_dataset/agency1/topic1/
            path_parts = Path(root).parts
            agency = path_parts[-2] if len(path_parts) >= 2 else "unknown"
            only_first_entry = agency in limited_agencies

            try:
                with open(json_path, "r", encoding="utf-8") as jsonfile:
                    data = json.load(jsonfile)

                for index, conversation in enumerate(data):
                    # For limited agencies only the first entry of the file is
                    # looked at, whether or not it carries any messages.
                    if only_first_entry and index >= 1:
                        break
                    if "messages" not in conversation:
                        continue

                    conversation_id = str(uuid.uuid4())
                    total_conversations += 1

                    for turn, message in enumerate(conversation["messages"], 1):
                        if max_turns is not None and turn > max_turns:
                            break
                        writer.writerow(
                            {
                                "conversation_id": conversation_id,
                                "turn": turn,
                                "speaker": message.get("role", ""),
                                "text": message.get("content", ""),
                                "agency": agency,
                            }
                        )

            except json.JSONDecodeError as e:
                print(f"Error reading JSON file {json_path}: {e}")
            except Exception as e:
                # Deliberately broad: one malformed file must not abort the
                # whole export, so the problem is reported and the walk goes on.
                print(f"Error processing file {json_path}: {e}")

        print(f"Total conversations processed: {total_conversations}")
    print(f"Conversion complete! Output saved to {output_file}")
    return total_conversations


def get_statistics(
    input_dir="/home/ckittask/ria/Dataset-Generator/output_dataset_gemma12",
):
    """
    Print statistics about the dataset structure.

    Walks ``input_dir`` and, for every directory containing a
    ``conversations.json`` file, counts the file, records the parent
    directory name as an agency and the directory's own name as a topic.
    The summary is printed to stdout; nothing is returned.

    Args:
        input_dir (str): Root directory containing agency folders
    """
    agency_names = set()
    topic_names = set()
    file_count = 0

    for current_dir, _subdirs, filenames in os.walk(input_dir):
        if "conversations.json" not in filenames:
            continue

        file_count += 1
        parts = Path(current_dir).parts
        depth = len(parts)
        # The second-to-last path component is the agency, the last is the topic.
        if depth >= 2:
            agency_names.add(parts[-2])
        if depth >= 3:
            topic_names.add(parts[-1])

    report_lines = [
        "Dataset Statistics:",
        f"- Total conversations.json files: {file_count}",
        f"- Number of agencies: {len(agency_names)}",
        f"- Number of unique topics: {len(topic_names)}",
        f"- Agencies found: {sorted(agency_names)}",
        f"- Topics found: {sorted(topic_names)}",
    ]
    for line in report_lines:
        print(line)


if __name__ == "__main__":
    # Summarize the dataset layout before exporting anything.
    get_statistics()
    print("\n" + "=" * 50 + "\n")

    # Export all conversations using the default input and output locations.
    convert_conversations_to_csv()

    # Show the header plus the first four data rows (five CSV rows in total)
    # of the file written by the default export.
    print("\nPreview of generated CSV:")
    try:
        with open("conversations.csv", "r", encoding="utf-8") as preview_file:
            for row_index, row in enumerate(csv.reader(preview_file)):
                if row_index >= 5:
                    break
                print(row)
    except FileNotFoundError:
        print("CSV file not found.")

Dataset Statistics:
- Total conversations.json files: 473
- Number of agencies: 3
- Number of unique topics: 473
- Agencies found: ['output_ID.ee', 'output_Politsei-_ja_Piirivalveamet', 'output_Tarbijakaitse_ja_Tehnilise_Jarelevalve_Amet']
- Topics found: ['2014-2021_-_Ajutise_reisidokumendi_naÌ\x88idised_-_Politsei-_ja_Piirivalveamet', '2019_-_Korruptsiooniga_seotud_kohtulahendid_-_Politsei-_ja_Piirivalveamet', '2021_-_Korruptsiooniga_seotud_kohtulahendid_-_Politsei-_ja_Piirivalveamet', '2023_-_Korruptsiooniga_seotud_kohtulahendid_-_Politsei-_ja_Piirivalveamet', '5G_Tarbijakaitse_ja_Tehnilise_JaÌ\x88relevalve_Amet', 'Ahistava_jaÌ\x88litamise_kampaania_-_Ennetusprojektid_ja_kampaaniad_-_Politsei-_ja_Piirivalveamet', 'Ajutine_reisidokument_-_RiigiloÌ\x83ivude_maÌ\x88aÌ\x88rad_-_Politsei-_ja_Piirivalveamet', 'Ajutise_ja_rahvusvahelise_kaitse_taotlejate_arv_-_Politsei-_ja_Piirivalveamet', 'AmatoÌ\x88oÌ\x88rraadioside_Tarbijakaitse_ja_Tehnilise_JaÌ\x88relevalve_Amet', 'Autentimine_riiklike