In [None]:

import numpy as np
import matplotlib.pyplot as plt

import json
from tqdm.notebook import tqdm

from scipy import stats

import glob

from combra import data, angles
import os

import glob
import multiprocessing

import numpy as np
from pathlib import Path
from mpire import WorkerPool
from skimage import io, color, filters, morphology, util
import cv2
import re
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import polars as pl

from combra import data, angles, mvee

In [None]:
# Russian grain-size description for each class key; passed to
# generate_angles() as `types_dict`.
types_dict = {
    "Ultra_Co11": "средние зерна",
    "Ultra_Co25": "мелкие зерна",
    "Ultra_Co8": "средне-мелкие зерна",
    "Ultra_Co6_2": "крупные зерна",
    "Ultra_Co15": "средне-мелкие зерна",
}

# Dataset to process. Inputs used in other runs of this cell:
#   '/home/david/mnt/ssd_2_sata/python/phd/datasets/original/o_bc_left_3'
#   './data/san_256x256_N100_000.h5'
#   './data/san_512x512_N100_000.h5'
#   './data/o_bc_left_3.h5'
#   './data/gen_diff_768x768_N5000'
path = '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data/separeted/gen_diff_512x512_N500'

# Forwarded to generate_angles(min_segment_len=...) and encoded in the output
# folder name as the "_msl<N>" suffix.
min_segment_len = 5.0

# Derive the output folder from the dataset name so results of one dataset can
# never land in a folder named after another one. Previously this was a
# hand-edited literal ('o_bc_left_3_msl5') that did not match the active `path`.
# Same "<dataset stem>_msl<N>" pattern as the grid-generation cell below.
save_path = f"{Path(path).stem}_msl{int(min_segment_len)}"

dataset = data.PobeditDataset(path=path, max_images_num_per_class=1000)

# NOTE(review): the meaning of `step` and `angles_tol` is defined by
# combra.data.PobeditDataset.generate_angles and is not visible here.
out = dataset.generate_angles(
    save_path=save_path,
    types_dict=types_dict,
    step=[0.1, 0.5, 1, 2, 3, 4, 5],
    workers=20,
    angles_tol=3,
    min_segment_len=min_segment_len,
)

print("Angles outputs:", out)


In [None]:
# Parquet file with precomputed angle data to plot.
# Other files inspected with this cell:
#   "./o_bc_left_3_msl0/angles_n90.parquet"
#   "./o_bc_left_3_msl5/angles_n90.parquet"
#   "./o_bc_left_3_msl10/angles_n90.parquet"
#   "./san_512x512_N100_000_msl0/angles_n100.parquet"
#   './gen_diff_768x768_N5000_msl0/angles_n100.parquet'
path = './gen_diff_768x768_N5000_msl5/angles_n100.parquet'

angles.angles_plot_base(
    parquet_path=path,
    N=10,
    M=7,
    font_size=20,
    scatter_size=5,
    step=0.5,
    save=False,
    ylim=[0, 0.01],
)

# Grid data generation

In [None]:
# Russian grain-size description for each class key; passed to
# generate_angles() as `types_dict`.
types_dict = {
    "Ultra_Co11": "средние зерна",
    "Ultra_Co25": "мелкие зерна",
    "Ultra_Co8": "средне-мелкие зерна",
    "Ultra_Co6_2": "крупные зерна",
    "Ultra_Co15": "средне-мелкие зерна",
}

# Per-class image caps to sweep over, and the segment-length setting that is
# also encoded in each output folder name ("_msl<N>").
max_img_per_class_list = [500, 5_000, 10_000]
min_segment_len = 5.0

output_dir = Path("./grid_results")
output_dir.mkdir(exist_ok=True)

# All input datasets live under one root; only the tail differs.
DATA_ROOT = '/home/david/mnt/ssd_2_sata/python/phd/wc_cv/co_angles/data'
sources = [
    # diff
    f'{DATA_ROOT}/combined/gen_diff_768x768_N5000.h5',
    f'{DATA_ROOT}/separeted/gen_diff_512x512_N500',
    f'{DATA_ROOT}/separeted/gen_diff_256x256_N500',
    # san
    f'{DATA_ROOT}/combined/gen_san_512x512_N100_000.h5',
    f'{DATA_ROOT}/combined/gen_san_256x256_N100_000.h5',
    # orig
    f'{DATA_ROOT}/separeted/orig_bc_left',
]

# Arguments that are identical for every (cap, source) combination.
angle_kwargs = dict(
    types_dict=types_dict,
    step=[0.1, 0.5, 1, 2, 3, 4, 5],
    workers=20,
    angles_tol=3,
    min_segment_len=min_segment_len,
)

for max_img_per_class in tqdm(max_img_per_class_list):
    for source_path in tqdm(sources):
        # Output folder is "<dataset stem>_msl<N>" inside ./grid_results.
        source_name = Path(source_path).stem
        save_path = output_dir / f"{source_name}_msl{int(min_segment_len)}"

        dataset = data.PobeditDataset(
            path=source_path,
            max_images_num_per_class=max_img_per_class,
        )
        out = dataset.generate_angles(save_path=str(save_path), **angle_kwargs)

        print(f"[{source_name}, N={max_img_per_class}] -> {out}")

# Grid of plots

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import polars as pl
from pathlib import Path


def _load_angles_parquet(path):
    """Read the ``meta`` and ``prep`` columns of an angles parquet file.

    Only these two columns are requested from the file; any other column
    (e.g. ``raw``) is not loaded.

    Returns:
        dict mapping each loaded column name to a Python list of its values.
    """
    frame = pl.read_parquet(path, columns=["meta", "prep"])
    columns = {}
    for name in frame.columns:
        columns[name] = frame[name].to_list()
    return columns


# SAN parquets use class_0/class_1/class_2 instead of Ultra_Co* names
_SAN_NAME_MAP = {
    "Ultra_Co25": "class_0",
    "Ultra_Co11": "class_1",
    "Ultra_Co6_2": "class_2",
}

# Folder name for each (source_type, resolution)
_FOLDER_MAP = {
    ("diff", 256): "gen_diff_256x256_N500_msl5",
    ("diff", 512): "gen_diff_512x512_N500_msl5",
    ("diff", 768): "gen_diff_768x768_N5000_msl5",
    ("gan", 256):  "gen_san_256x256_N100_000_msl5",
    ("gan", 512):  "gen_san_512x512_N100_000_msl5",
    ("orig", None): "orig_bc_left_msl5",
}

# Visual style per source type
_SOURCE_STYLES = {
    "orig": {"color": "blue",   "marker": "circle"},
    "gan":  {"color": "orange", "marker": "square"},
    "diff": {"color": "green",  "marker": "triangle-down"},
}


def _find_row(rows, step_val, class_name):
    """Find row index matching step and class name."""
    for i, m in enumerate(rows["meta"]):
        if m["step"] == step_val and m["name"] == class_name:
            return i
    return None


def _add_cell_traces(fig, row, col, rows, data_idx, source_label, style, scatter_size):
    """Draw one source into a subplot cell: Gaussian curve plus density points.

    Args:
        fig: figure the traces are added to.
        row, col: subplot cell passed through to ``fig.add_trace``.
        rows: dict with a ``"prep"`` list of per-row plotting data.
        data_idx: index into ``rows["prep"]`` of the row to draw.
        source_label: trace name; also the prefix of the legend group.
        style: mapping with ``"color"`` and ``"marker"`` entries.
        scatter_size: marker size of the density points.
    """
    prep = rows["prep"][data_idx]
    trace_color = style["color"]
    trace_symbol = style["marker"]

    # Name and legend group are shared by both traces; the group is unique
    # per (source, cell).
    shared = dict(
        name=source_label,
        legendgroup=f"{source_label}_r{row}c{col}",
    )

    # 1) Gaussian fit, drawn as a line (carries the legend entry).
    gauss_line = go.Scatter(
        x=prep["angles_gauss_x"],
        y=prep["angles_gauss_y"],
        mode="lines",
        line=dict(color=trace_color, width=1.5),
        showlegend=True,
        **shared,
    )
    fig.add_trace(gauss_line, row=row, col=col)

    # 2) Density values, drawn as markers (no separate legend entry).
    density_points = go.Scatter(
        x=prep["angles_density_x"],
        y=prep["angles_density_y"],
        mode="markers",
        marker=dict(symbol=trace_symbol, size=scatter_size, color=trace_color),
        showlegend=False,
        **shared,
    )
    fig.add_trace(density_points, row=row, col=col)


# HTML entities used in the in-plot legend, keyed by plotly marker symbol name.
_LEGEND_GLYPHS = {
    "circle": "&#9679;",
    "square": "&#9632;",
    "triangle-down": "&#9660;",
}


def _legend_html(source_labels):
    """Build the coloured legend text for the given source labels.

    Colours and marker shapes come from ``_SOURCE_STYLES`` so the legend cannot
    drift from what is actually drawn.
    """
    parts = []
    for label in source_labels:
        style = _SOURCE_STYLES[label]
        glyph = _LEGEND_GLYPHS.get(style["marker"], "&#9679;")
        parts.append(f'<span style="color:{style["color"]}">{glyph} {label}</span>')
    return "  ".join(parts)


def angles_plot_grid(
    grid_results_dir="./grid_results",
    step=2,
    scatter_size=5,
    font_size=12,
    save=False,
    show=True,
    ylim=None,
):
    """Plot grids of angle distributions comparing orig, GAN and diff data.

    One figure is produced per max-images-per-class value in
    ``[500, 5000, 10000]``. Rows are resolutions (256, 512, 768) and columns
    are grain classes (Ultra_Co25, Ultra_Co11, Ultra_Co6_2). There is no GAN
    folder for resolution 768. The original (real) distribution is always read
    from ``angles_n90.parquet`` and therefore identical in every figure.

    Args:
        grid_results_dir: directory containing the per-source result folders
            listed in ``_FOLDER_MAP``.
        step: step value selecting which rows of each parquet file are drawn.
        scatter_size: marker size of the density points.
        font_size: base font size of the figure (tick labels etc.). The default
            equals plotly's default, so default output is unchanged.
        save: if true, write each figure to
            ``angles_grid_n<N>_step<step>.png`` in the working directory.
        show: if true, display each figure.
        ylim: optional ``[min, max]`` applied to every y axis; ``None`` leaves
            the range automatic.

    Sources whose parquet file or matching row is missing are skipped; the
    in-plot legend of a cell lists only the sources actually drawn there, and
    a warning is issued when a whole figure ends up empty.
    """
    import warnings

    grid_dir = Path(grid_results_dir)
    step_val = float(step)

    max_img_per_class_list = [500, 5_000, 10_000]
    resolutions = [256, 512, 768]
    grain_classes = [
        ("Ultra_Co25", "мелкие зерна"),
        ("Ultra_Co11", "средние зерна"),
        ("Ultra_Co6_2", "крупные зерна"),
    ]
    n_rows = len(resolutions)
    n_cols = len(grain_classes)

    # Titles (row-major) and the orig file do not depend on the image count,
    # so they are computed once for all figures.
    subplot_titles = [
        f"{res}×{res} — {class_label}"
        for res in resolutions
        for _, class_label in grain_classes
    ]
    real_path = grid_dir / _FOLDER_MAP[("orig", None)] / "angles_n90.parquet"

    _cache = {}

    def load_cached(path):
        """Load a parquet file once and reuse it for every cell and figure."""
        key = str(path)
        if key not in _cache:
            _cache[key] = _load_angles_parquet(key)
        return _cache[key]

    def plot_source(fig, row, col, parquet_path, label, class_names):
        """Draw one source into a cell; return True if something was drawn."""
        if not parquet_path.exists():
            return False
        rows_data = load_cached(parquet_path)
        # Try each accepted class name in order and draw the first hit.
        for class_name in class_names:
            idx = _find_row(rows_data, step_val, class_name)
            if idx is not None:
                _add_cell_traces(
                    fig, row, col, rows_data, idx,
                    label, _SOURCE_STYLES[label], scatter_size,
                )
                return True
        return False

    for max_n in max_img_per_class_list:
        fig = make_subplots(
            rows=n_rows, cols=n_cols,
            subplot_titles=subplot_titles,
            vertical_spacing=0.08,
            horizontal_spacing=0.06,
        )

        # (row, col) -> labels of the sources actually drawn in that cell.
        plotted = {}

        for r_idx, res in enumerate(resolutions):
            for c_idx, (class_key, _) in enumerate(grain_classes):
                row = r_idx + 1
                col = c_idx + 1
                default_name = f"class_{class_key}"

                # Draw order is orig, gan, diff (same as the legend order).
                candidates = [("orig", real_path, [default_name])]

                # _FOLDER_MAP has no ("gan", 768) entry, so GAN is skipped there.
                gan_folder = _FOLDER_MAP.get(("gan", res))
                if gan_folder:
                    # GAN files use class_0/1/2 naming. Fallbacks cover files
                    # whose names were already rewritten, either to the
                    # "class_Ultra_*" form or to the bare "Ultra_*" form.
                    candidates.append((
                        "gan",
                        grid_dir / gan_folder / f"angles_n{max_n}.parquet",
                        [_SAN_NAME_MAP[class_key], default_name, class_key],
                    ))

                diff_folder = _FOLDER_MAP.get(("diff", res))
                if diff_folder:
                    candidates.append((
                        "diff",
                        grid_dir / diff_folder / f"angles_n{max_n}.parquet",
                        [default_name],
                    ))

                drawn = []
                for label, parquet_path, class_names in candidates:
                    if plot_source(fig, row, col, parquet_path, label, class_names):
                        drawn.append(label)
                plotted[(row, col)] = drawn

        if not any(plotted.values()):
            warnings.warn(
                f"No angle data found under '{grid_dir}' for N={max_n}, "
                f"step={step}; the figure will be empty."
            )

        # --- Layout ---
        # The global legend is hidden: per-cell annotations replace it.
        fig.update_layout(
            title=dict(
                text=f"Распределения углов (step={step}, N изобр. на класс={max_n})",
                font=dict(size=16), x=0.5, xanchor="center",
            ),
            font=dict(size=font_size),
            height=350 * n_rows,
            width=450 * n_cols,
            plot_bgcolor="white",
            paper_bgcolor="white",
            showlegend=False,
        )

        # Style all subplot axes. The first axis pair has no numeric suffix.
        tick_vals = [0, 60, 120, 180, 240, 300, 360]
        for i in range(1, n_rows * n_cols + 1):
            x_key = f"xaxis{i}" if i > 1 else "xaxis"
            y_key = f"yaxis{i}" if i > 1 else "yaxis"
            fig.layout[x_key].update(
                tickvals=tick_vals,
                showline=True, linecolor="black", linewidth=1, mirror=True,
                showgrid=True, gridcolor="lightgray", gridwidth=0.5,
            )
            fig.layout[y_key].update(
                range=ylim,
                showline=True, linecolor="black", linewidth=1, mirror=True,
                showgrid=True, gridcolor="lightgray", gridwidth=0.5,
            )

        # --- Per-cell legend as an annotation inside each subplot ---
        for r_idx in range(n_rows):
            for c_idx in range(n_cols):
                drawn = plotted[(r_idx + 1, c_idx + 1)]
                if not drawn:
                    # Nothing was plotted here, so there is nothing to label.
                    continue

                # Axis references are numbered row-major, first one unsuffixed.
                ax_idx = r_idx * n_cols + c_idx + 1
                xref = f"x{ax_idx} domain" if ax_idx > 1 else "x domain"
                yref = f"y{ax_idx} domain" if ax_idx > 1 else "y domain"

                fig.add_annotation(
                    text=_legend_html(drawn),
                    xref=xref, yref=yref,
                    x=0.02, y=0.98,
                    xanchor="left", yanchor="top",
                    showarrow=False,
                    font=dict(size=10),
                    bgcolor="rgba(255,255,255,0.8)",
                    bordercolor="rgba(0,0,0,0.3)",
                    borderwidth=1,
                    borderpad=3,
                )

        if save:
            fig.write_image(f"angles_grid_n{max_n}_step{step}.png", scale=2)

        if show:
            fig.show()

In [None]:
# Render the comparison grids for step 2 and write each figure to a PNG file.
angles_plot_grid(
    save=True,
    step=2,
)

In [None]:
# ---- read parquet files and plot ----
# Every SAN result file: both resolutions (512 first, then 256), each at
# every image count.
paths = [
    f"./san_{size}x{size}_N100_000/angles_n{n_images}.parquet"
    for size in (512, 256)
    for n_images in (10, 100, 1000, 10000, 100000)
]

# Author's note kept from earlier runs: for step 1 the y range [0, 0.011] was
# recorded for each of N = 10, 100, 1_000, 10_000 and 100_000.

for path in paths:
    angles.angles_plot_base(
        parquet_path=path,
        N=10,
        M=7,
        font_size=20,
        scatter_size=5,
        step=5,
        save=True,
        ylim=[0, 0.05],
    )


In [None]:
# NOTE(review): `rows` is not defined in any cell visible in this notebook, so
# on a fresh kernel (Restart & Run All) this line raises NameError. It
# presumably relied on a variable left over from a deleted cell — define or
# load `rows` explicitly before this call. TODO confirm what it should hold.
mvee.plot_beam_base(rows,save_name='biba',step=5,N=7,M=7,  save=False,indices=None, font_size=20,scatter_size=20)

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path

# Generic class labels found in the SAN parquet files -> Ultra_Co* class name
# they are renamed to.
NAME_MAP = dict(
    class_0="Ultra_Co25",
    class_1="Ultra_Co11",
    class_2="Ultra_Co6_2",
)

def update_meta_names(pq_path):
    """Rename ``meta.name`` values in a parquet file according to ``NAME_MAP``.

    The ``meta`` column is read on its own first, so files that need no change
    are never fully loaded or rewritten.

    Args:
        pq_path: path (``str`` or ``Path``) of the parquet file to update.

    Returns:
        ``True`` if the file was rewritten, ``False`` if no name needed
        renaming.

    The updated table is written to a temporary file next to the original and
    then moved over it, so an interrupted or failed write cannot leave a
    truncated parquet file in place of the only copy of the data.
    """
    import os

    pq_path = Path(pq_path)

    def needs_rename(meta):
        # Null rows (and rows without a name) are left untouched.
        return meta is not None and meta.get("name") in NAME_MAP

    # Read only the meta column to decide whether anything has to change.
    meta_list = pq.read_table(pq_path, columns=["meta"])["meta"].to_pylist()
    if not any(needs_rename(m) for m in meta_list):
        return False

    # Now read the full file and build the replacement meta column.
    table = pq.read_table(pq_path)
    new_meta = [
        {**m, "name": NAME_MAP[m["name"]]} if needs_rename(m) else m
        for m in meta_list
    ]

    # Reuse the original column type so the schema of the file is unchanged.
    meta_array = pa.array(new_meta, type=table.schema.field("meta").type)
    col_idx = table.schema.get_field_index("meta")
    table = table.set_column(col_idx, "meta", meta_array)

    # Write to a sibling temp file, then replace the original in one step.
    tmp_path = pq_path.with_name(pq_path.name + ".tmp")
    try:
        pq.write_table(table, tmp_path)
        os.replace(tmp_path, pq_path)
    finally:
        # Only present here if the write or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()
    return True

# Result folders whose parquet files should have their class names rewritten.
folders = [Path(f"./san_{size}_N100_000") for size in ("512x512", "256x256")]

for folder in folders:
    # Sorted so the per-file report below comes out in a stable order.
    parquets = sorted(folder.glob("*.parquet"))
    print(f"\nProcessing {len(parquets)} files in {folder}")

    for pq_path in parquets:
        updated = update_meta_names(pq_path)
        if updated:
            status = "Updated"
        else:
            status = "Skipped (no matching names)"
        print(f"  {status}: {pq_path.name}")

print("\nDone!")
