In [None]:

import numpy as np
import matplotlib.pyplot as plt

import json
from tqdm.notebook import tqdm

from scipy import stats

import glob

from combra import data, angles
import os

import glob
import multiprocessing

import numpy as np
from pathlib import Path
from mpire import WorkerPool
from skimage import io, color, filters, morphology, util
import cv2
import re
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import polars as pl

from combra import data, angles, mvee

In [None]:
# Display label (a grain-size description, in Russian) for each alloy class name.
types_dict = {
    "Ultra_Co11": "средние зерна",
    "Ultra_Co25": "мелкие зерна",
    "Ultra_Co8": "средне-мелкие зерна",
    "Ultra_Co6_2": "крупные зерна",
    "Ultra_Co15": "средне-мелкие зерна",
}

# Dataset location. Other sources that were used with this cell:
#   '/home/david/mnt/ssd_2_sata/python/phd/datasets/original/o_bc_left_3'
#   './data/san_256x256_N100_000.h5'
#   './data/san_512x512_N100_000.h5'
#   './data/o_bc_left_3.h5'
#   './data/gen_diff_768x768_N5000'
path = '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/separeted/gen_diff_512x512_N500'

# Output location. Other values that were used with this cell:
#   'san_512x512_N100_000_msl0'
#   'gen_diff_768x768_N5000_msl5'
# NOTE(review): the name below refers to the o_bc_left_3 dataset while `path`
# points at gen_diff_512x512_N500 - confirm this pairing is intended.
save_path = 'o_bc_left_3_msl5'

dataset = data.PobeditDataset(path=path, max_images_num_per_class=1000)

out = dataset.generate_angles(
    save_path=save_path,
    types_dict=types_dict,
    step=[0.1, 0.5, 1, 2, 3, 4, 5],
    workers=20,
    angles_tol=3,
    min_segment_len=5.0,
)

print("Angles outputs:", out)


In [None]:
# Parquet file to plot. Other files that were inspected with this cell:
#   "./o_bc_left_3_msl0/angles_n90.parquet"
#   "./o_bc_left_3_msl5/angles_n90.parquet"
#   "./o_bc_left_3_msl10/angles_n90.parquet"
#   "./san_512x512_N100_000_msl0/angles_n100.parquet"
#   './gen_diff_768x768_N5000_msl0/angles_n100.parquet'
path = './gen_diff_768x768_N5000_msl5/angles_n100.parquet'

angles.angles_plot_base(
    parquet_path=path,
    N=10,
    M=7,
    font_size=20,
    scatter_size=5,
    step=0.5,
    save=False,
    ylim=[0, 0.01],
)

# Grid data generation

In [None]:
# Display label (a grain-size description, in Russian) for each alloy class name.
types_dict = {
    "Ultra_Co11": "средние зерна",
    "Ultra_Co25": "мелкие зерна",
    "Ultra_Co8": "средне-мелкие зерна",
    "Ultra_Co6_2": "крупные зерна",
    "Ultra_Co15": "средне-мелкие зерна",
}

# Per-class image caps to sweep over.
max_img_per_class_list = [500, 5_000, 10_000]

# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (non-existent) path.
sources = [
    # diff
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/combined/gen_diff_768x768_N5000.h5',
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/separeted/gen_diff_512x512_N500',
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/separeted/gen_diff_256x256_N500',
    # san
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/combined/gen_san_512x512_N100_000.h5',
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/combined/gen_san_256x256_N100_000.h5',
    # orig
    '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/separeted/orig_bc_left',
]

# Held constant across the whole grid; also used to build the output folder name.
min_segment_len = 5.0

for max_img_per_class in tqdm(max_img_per_class_list):

    for source_path in tqdm(sources):

        # One output folder per source (e.g. 'gen_diff_768x768_N5000_msl5') so
        # that different datasets no longer write into the same folder.
        # NOTE(review): presumably generate_angles encodes the image count in
        # the file name (cf. 'angles_n100.parquet' in other cells), so runs with
        # different max_img_per_class do not collide - TODO confirm.
        save_path = f"{Path(source_path).stem}_msl{min_segment_len:g}"

        dataset = data.PobeditDataset(path=source_path, max_images_num_per_class=max_img_per_class)

        out = dataset.generate_angles(
            save_path=save_path,
            types_dict=types_dict,
            step=[0.1, 0.5, 1, 2, 3, 4, 5],
            workers=20,
            angles_tol=3,
            min_segment_len=min_segment_len,
        )

        print("Angles outputs:", out)


# Grid of plots

In [None]:
def _load_angles_parquet(path):
    """Read an angles parquet file into a plain dict of Python lists.

    Only the "meta" and "prep" columns are requested from the file; any other
    columns it contains are not loaded.

    Args:
        path: location of the parquet file, passed straight to ``pl.read_parquet``.

    Returns:
        dict mapping each loaded column name to that column's values as a list.
    """
    wanted_columns = ["meta", "prep"]
    frame = pl.read_parquet(path, columns=wanted_columns)

    loaded = {}
    for column_name in frame.columns:
        loaded[column_name] = frame[column_name].to_list()
    return loaded

def angles_plot_base(
    rows=None, save_name=None,
    N=20, M=20, save=False, indices=None,
    font_size=20, scatter_size=20, xlim=None, ylim=None,
    parquet_path=None, step=None, show=True
):
    """Plot the angle traces of the selected rows in a single Plotly figure.

    The traces themselves are produced by ``_build_traces_batch``; this function
    selects the rows, builds the title/layout, optionally saves and shows the
    figure, and finally prints ``rows["prep"][i]["angles_legend"]`` for every
    selected row.

    Args:
        rows: dict of lists with at least the keys "meta" and "prep" (each
            "meta" entry must have a "step" key). Ignored and replaced when
            ``parquet_path`` is given.
        save_name: title text and file-name part. Defaults to the parquet file
            stem, or "angles_plot" when no ``parquet_path`` is given.
        N: figure width in units of 80 px.
        M: figure height in units of 80 px.
        save: if True, write the figure to "<name>.png" (scale=2) in the
            current working directory.
        indices: optional iterable of row indices to plot. Any iterable is
            accepted (it is materialised once). Combined with ``step`` it acts
            as an additional filter.
        font_size: font size of the axis titles.
        scatter_size: forwarded to ``_build_traces_batch``.
        xlim: x-axis range, or None for automatic range.
        ylim: y-axis range, or None for automatic range.
        parquet_path: path to an angles parquet file (alternative to ``rows``).
        step: if provided, only rows whose ``meta["step"]`` equals this value
            are plotted.
        show: if False, skip ``fig.show()`` (useful for batch processing).

    Raises:
        ValueError: if neither ``rows`` nor ``parquet_path`` is provided.
    """
    # NOTE(review): `go` (presumably plotly.graph_objects) and
    # `_build_traces_batch` are neither imported nor defined in the notebook
    # cells visible here; they must already exist in the kernel namespace.
    folder_name = None
    if parquet_path is not None:
        pq_path = Path(parquet_path)
        rows = _load_angles_parquet(parquet_path)
        folder_name = pq_path.parent.name
        if save_name is None:
            save_name = pq_path.stem

    if rows is None:
        raise ValueError("Either 'rows' or 'parquet_path' must be provided")
    if save_name is None:
        save_name = "angles_plot"

    meta = rows["meta"]

    # Materialise `indices` exactly once: iterating it twice would leave a
    # one-shot iterable (generator, map, ...) empty on the second pass.
    selected = list(indices) if indices is not None else None

    if step is not None:
        alloys_indices = [i for i, entry in enumerate(meta) if entry["step"] == step]
        if selected is not None:
            allowed = set(selected)  # set membership keeps the filter linear
            alloys_indices = [i for i in alloys_indices if i in allowed]
    elif selected is not None:
        alloys_indices = selected
    else:
        alloys_indices = list(range(len(meta)))

    # Build all traces in one go and hand them to the figure constructor.
    traces = _build_traces_batch(rows, alloys_indices, scatter_size)

    # Step shown in the title / file name: the requested one, otherwise the
    # step of the first plotted row, otherwise nothing.
    if step is not None:
        step_val = step
    elif alloys_indices:
        step_val = meta[alloys_indices[0]]["step"]
    else:
        step_val = None

    title_parts = [folder_name] if folder_name else []
    title_parts.append(save_name)
    if step_val is not None:
        title_parts.append(f"(step={step_val})")
    title_text = " ".join(title_parts)

    fig = go.Figure(data=traces)

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=15), x=0.5, xanchor="center", y=0.95, yanchor="top"),
        margin=dict(t=50),
        xaxis=dict(
            title=dict(text="углы, градусы", font=dict(size=font_size)),
            tickvals=[0, 60, 120, 180, 240, 300, 360],
            range=xlim,
            showline=True, linecolor="black", linewidth=1, mirror=True,
            showgrid=True, gridcolor="lightgray", gridwidth=0.5,
        ),
        yaxis=dict(
            title=dict(text="p(x)", font=dict(size=font_size)),
            range=ylim,
            showline=True, linecolor="black", linewidth=1, mirror=True,
            showgrid=True, gridcolor="lightgray", gridwidth=0.5,
        ),
        legend=dict(
            x=0.02,
            y=0.98,
            xanchor="left",
            yanchor="top",
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="rgba(0,0,0,0.3)",
            borderwidth=1,
        ),
        width=N * 80,
        height=M * 80,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    if save:
        parts = [folder_name] if folder_name else []
        parts.append(save_name)
        if step_val is not None:
            parts.append(f"step{step_val}")
        file_name = "_".join(parts)
        fig.write_image(f"{file_name}.png", scale=2)

    if show:
        fig.show()

    # Print the legend entry stored for every plotted row.
    for i in alloys_indices:
        print(rows["prep"][i]["angles_legend"])



In [None]:
# %%time
# ---- read parquet files and plot each one ----
# Single files that were inspected earlier:
#   "./san_512x512_N100_000/angles_n10.parquet"
#   "./san_256x256_N100_000/angles_n100000.parquet"

# Every angles_n<count>.parquet file of both folders, 512x512 folder first.
paths = [
    f"./{folder}/angles_n{count}.parquet"
    for folder in ("san_512x512_N100_000", "san_256x256_N100_000")
    for count in (10, 100, 1000, 10000, 100000)
]

# names_dict (currently unused) mapped the alloy classes to grain-size labels:
#   Ultra_Co11 -> medium grains, Ultra_Co25 -> fine grains,
#   Ultra_Co8 -> medium-fine grains, Ultra_Co6_2 -> coarse grains,
#   Ultra_Co15 -> medium-fine grains

# Notes for step 1 (presumably the y-limits that were used per image count):
#   10, 100, 1_000, 10_000, 100_000 -> [0, 0.011]

for path in paths:
    angles.angles_plot_base(
        parquet_path=path,
        N=10,
        M=7,
        font_size=20,
        scatter_size=5,
        step=5,
        save=True,
        ylim=[0, 0.05],
    )


In [None]:
# NOTE(review): `rows` is not assigned in any notebook cell visible here, so
# this cell depends on leftover kernel state - confirm where it should come from.
mvee.plot_beam_base(
    rows,
    save_name='biba',
    step=5,
    N=7,
    M=7,
    save=False,
    indices=None,
    font_size=20,
    scatter_size=20,
)

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path

# Placeholder class name -> alloy name used to rewrite meta["name"].
NAME_MAP = dict(
    class_0="Ultra_Co25",
    class_1="Ultra_Co11",
    class_2="Ultra_Co6_2",
)

def update_meta_names(pq_path):
    """Rename ``meta["name"]`` values in a parquet file according to NAME_MAP.

    The "meta" column is read on its own first, so files without any matching
    name are skipped without loading the remaining columns. When a rewrite is
    needed, the new table is written to a temporary sibling file and then
    moved over the original, so a failed write cannot leave a truncated
    parquet file behind.

    Args:
        pq_path: path (str or Path) of the parquet file to update in place.

    Returns:
        True if the file was rewritten, False if no name needed updating.
    """
    import os  # local import keeps this cell runnable on its own

    def _needs_rename(entry):
        # Null meta entries and entries without a "name" key are left untouched.
        return entry is not None and entry.get("name") in NAME_MAP

    # Read only the meta column to decide whether anything has to change.
    meta_list = pq.read_table(pq_path, columns=["meta"])["meta"].to_pylist()
    if not any(_needs_rename(entry) for entry in meta_list):
        return False

    # Now read the full file and swap in the renamed meta column.
    table = pq.read_table(pq_path)
    new_meta = [
        {**entry, "name": NAME_MAP[entry["name"]]} if _needs_rename(entry) else entry
        for entry in meta_list
    ]
    meta_array = pa.array(new_meta, type=table.schema.field("meta").type)
    col_idx = table.schema.get_field_index("meta")
    table = table.set_column(col_idx, "meta", meta_array)

    # Write next to the target, then replace it in a single rename. The ".tmp"
    # suffix keeps the temporary file out of "*.parquet" globs.
    target = Path(pq_path)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        pq.write_table(table, str(tmp_path))
        os.replace(tmp_path, target)
    except BaseException:
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    return True

# Directories whose parquet files get their meta names rewritten.
folders = [Path("./san_512x512_N100_000"), Path("./san_256x256_N100_000")]

for folder in folders:
    # Sorted so the files are processed (and reported) in a stable order.
    parquets = sorted(folder.glob("*.parquet"))
    print(f"\nProcessing {len(parquets)} files in {folder}")

    for pq_path in parquets:
        updated = update_meta_names(pq_path)
        if updated:
            status = "Updated"
        else:
            status = "Skipped (no matching names)"
        print(f"  {status}: {pq_path.name}")

print("\nDone!")
