# Dataset Summary for .las Files

This notebook recursively scans a dataset directory for `.las` files, infers each file's species from the name of its immediate parent folder (files whose parent folder is not a known species are labelled `unknown`), and reports file counts by species, by larger dataset folder (the parent of each species folder), and by dataset folder × species.


In [2]:
from __future__ import annotations
from pathlib import Path
from typing import Iterable

# File-name suffix that selects which files the scan picks up.
LAS_SUFFIX = ".las"

# Root directory of the dataset. This is a machine-specific absolute path:
# edit it to point at your local copy before running the notebook.
DATASET_ROOT = Path("/home/gleb/dev/tree-cluster/dataset")
# is_dir() is already False for a missing path, so one check covers both cases.
assert DATASET_ROOT.is_dir(), "Dataset root must exist and be a directory"


In [3]:
from enum import Enum
from pydantic import BaseModel

class Species(str, Enum):
    """Species label attached to each scanned `.las` file.

    Mixes in `str`, so every member is also a string equal to its value.
    """
    # Values are lower-case because `infer_species` lower-cases the parent
    # folder name before looking it up among these values.
    birch = "birch"
    cedar = "cedar"
    fir = "fir"
    larch = "larch"
    pine = "pine"
    spruce = "spruce"
    # Fallback returned by `infer_species` when the folder name matches no value.
    unknown = "unknown"

class LasFileRecord(BaseModel):
    """One `.las` file found by the scan, plus the labels derived from its location."""
    path: str  # file path stored as a string; use `path_obj` for a `Path`
    species: Species  # species inferred from the file's parent folder name
    dataset_folder: str  # name of the immediate higher-level folder above the species folder

    @property
    def path_obj(self) -> Path:
        """Return `path` wrapped in a `pathlib.Path`."""
        return Path(self.path)


In [4]:
from collections import defaultdict
from typing import List

# Set of every Species value (including "unknown"); infer_species tests
# lower-cased folder names for membership in it.
SPECIES_NAMES = {s.value for s in Species}

def infer_species(path: Path) -> Species:
    """Derive the species of a file from the name of the folder containing it.

    The folder name is lower-cased before the lookup. A name that is not one
    of the `Species` values yields `Species.unknown`.
    """
    folder_name = path.parent.name.lower()
    return Species(folder_name) if folder_name in SPECIES_NAMES else Species.unknown  # type: ignore[arg-type]


def infer_dataset_folder(path: Path) -> str:
    """Return the name of the folder two levels above the file.

    For a layout like ``<dataset>/<species>/<file>.las`` this is ``<dataset>``.
    When the path is too short to have such a folder, the result is the
    empty string (the name of ``.`` or of the filesystem root).
    """
    return path.parent.parent.name


def scan_las_files(root: Path) -> List[LasFileRecord]:
    """Recursively collect a record for every file under `root` matching `*{LAS_SUFFIX}`.

    Matches that are not regular files (for example a directory whose name
    ends with the suffix) are skipped. Each record stores the path as a
    string together with the species and dataset folder inferred from it.
    Records are returned in the order `Path.rglob` yields the paths.
    """
    las_paths = (
        candidate
        for candidate in root.rglob(f"*{LAS_SUFFIX}")
        if candidate.is_file()
    )
    return [
        LasFileRecord(
            path=str(las_path),
            species=infer_species(las_path),
            dataset_folder=infer_dataset_folder(las_path),
        )
        for las_path in las_paths
    ]

# Scan the configured dataset root; the cell's displayed value is the number
# of `.las` files found.
records = scan_las_files(DATASET_ROOT)
len(records)


718

In [5]:
from collections import Counter
from itertools import groupby

# Summary by species: one line per species that occurs in `records`,
# ordered alphabetically by species value.
species_counts = Counter(r.species for r in records)
print("Counts by species:")
for sp, count in sorted(species_counts.items(), key=lambda item: item[0].value):
    print(f"  {sp.value}: {count}")

# Summary by dataset folder: one line per folder name, ordered alphabetically.
folder_counts = Counter(r.dataset_folder for r in records)
print("\nCounts by dataset folder:")
for folder, count in sorted(folder_counts.items()):
    print(f"  {folder}: {count}")

# Cross-tab: folder x species.
# Tally every (folder, species) pair in a single pass over `records` instead
# of re-filtering the full list once per folder. A Counter returns 0 for a
# missing key, so absent combinations still print as "=0".
pair_counts = Counter((r.dataset_folder, r.species) for r in records)
print("\nFolder x species:")
folders = sorted(folder_counts)
# Every Species member appears in each row, including ones with no files.
all_species = sorted(Species, key=lambda s: s.value)
for folder in folders:
    row = ", ".join(f"{sp.value}={pair_counts[(folder, sp)]}" for sp in all_species)
    print(f"  {folder}: {row}")


Counts by species:
  birch: 114
  cedar: 59
  fir: 59
  larch: 183
  pine: 190
  spruce: 112
  unknown: 1

Counts by dataset folder:
  PROBA1_RESEG_dataset_861de863: 298
  PROBA2_RESEG_dataset_c38db91a: 82
  PROBA3_RESEG_разобрано: 338

Folder x species:
  PROBA1_RESEG_dataset_861de863: birch=100, cedar=50, fir=59, larch=24, pine=5, spruce=60, unknown=0
  PROBA2_RESEG_dataset_c38db91a: birch=4, cedar=0, fir=0, larch=31, pine=23, spruce=23, unknown=1
  PROBA3_RESEG_разобрано: birch=10, cedar=9, fir=0, larch=128, pine=162, spruce=29, unknown=0
