In [None]:
import polars as pl
from pathlib import Path
import h5py
from dataclasses import dataclass


In [3]:
@dataclass
class LazyDatasetReference:
    """Lightweight pointer to one dataset inside an HDF5 file.

    Creating an instance performs no I/O; the file is only opened when
    ``load_on_demand`` is called.
    """

    # Folder that holds the HDF5 file.
    directory: Path
    # Name of the HDF5 file inside ``directory``.
    file_name: str
    # Key of the dataset inside the HDF5 file.
    dataset_name: str

    def load_on_demand(self):
        """Open the referenced file and read the complete dataset.

        Returns:
            ``None`` when the dataset holds no elements, a plain Python list
            for 1-D data, or the array exactly as read for 2-D data.

        Raises:
            ValueError: for non-empty data that is neither 1-D nor 2-D.
        """
        with h5py.File(self.directory / self.file_name, "r") as handle:
            values = handle[self.dataset_name][:]

        # The emptiness check comes first, so an empty dataset of any rank yields None.
        if values.size == 0:
            return None

        rank = values.ndim
        if rank == 1:
            # Converted to a list (intended to be stored as a Polars List).
            return values.tolist()
        if rank == 2:
            # Returned unchanged (intended to be stored as a Polars Array).
            return values

        raise ValueError(f"Unexpected shape {values.shape} in dataset {self.dataset_name}")

In [4]:
# --- 1. Create LazyFrame storing metadata only ---
def func_df(
    directory="/Users/lotzegud/P08/test_folder2",
    file_names=None,
    dataset_name="/scan/instrument/collection/exp_t01",
):
    """Build a LazyFrame that describes where each dataset lives (no data is read).

    Called without arguments it returns the same five-row frame as before.
    The arguments let the notebook run against another folder or file list
    without editing this function.

    Args:
        directory: Folder containing the files (str or Path); repeated on
            every row. The default is a machine-specific absolute path.
        file_names: Iterable of file names, one row per entry. ``None``
            selects the five default ``h2o_2024_10_16_0104x.nxs`` files.
        dataset_name: Dataset path inside each file; repeated on every row.

    Returns:
        A ``pl.LazyFrame`` with the string columns ``directory``,
        ``file_name`` and ``dataset_name``.
    """
    if file_names is None:
        file_names = [
            "h2o_2024_10_16_01043.nxs",
            "h2o_2024_10_16_01044.nxs",
            "h2o_2024_10_16_01045.nxs",
            "h2o_2024_10_16_01046.nxs",
            "h2o_2024_10_16_01047.nxs",
        ]
    else:
        # Materialise once so generators work and the length is known.
        file_names = list(file_names)

    # Derive the repeat count from the file list so the columns can never
    # end up with mismatched lengths.
    n_files = len(file_names)

    return pl.LazyFrame({
        "directory": [str(directory)] * n_files,
        "file_name": file_names,
        "dataset_name": [dataset_name] * n_files,
    })

# Build the metadata frame and inspect it without reassigning `df`.
df = func_df()

# Header line followed by the optimized query plan.
print("This is a lazyframe", df.explain(optimized=True), sep="\n")

# Only a three-row preview is collected here.
print(df.limit(3).collect())

# `df` was never rebound above, so its type is still reported as-is.
print("DF remains", type(df), sep="\n")

# Drop the name so the following cells start from a fresh frame.
del df

This is a lazyframe
DF ["directory", "file_name", "dataset_name"]; PROJECT */3 COLUMNS
shape: (3, 3)
┌─────────────────────────────────┬──────────────────────────┬─────────────────────────────────┐
│ directory                       ┆ file_name                ┆ dataset_name                    │
│ ---                             ┆ ---                      ┆ ---                             │
│ str                             ┆ str                      ┆ str                             │
╞═════════════════════════════════╪══════════════════════════╪═════════════════════════════════╡
│ /Users/lotzegud/P08/test_folde… ┆ h2o_2024_10_16_01043.nxs ┆ /scan/instrument/collection/ex… │
│ /Users/lotzegud/P08/test_folde… ┆ h2o_2024_10_16_01044.nxs ┆ /scan/instrument/collection/ex… │
│ /Users/lotzegud/P08/test_folde… ┆ h2o_2024_10_16_01045.nxs ┆ /scan/instrument/collection/ex… │
└─────────────────────────────────┴──────────────────────────┴─────────────────────────────────┘
DF remains
<class 'polars.

In [5]:
df = func_df()

# Pack the three metadata columns into one struct so the callable receives
# the whole row as a single mapping, then wrap it in a LazyDatasetReference.
df = df.with_columns(
    pl.struct(["directory", "file_name", "dataset_name"])
    .map_elements(
        lambda fields: LazyDatasetReference(
            directory=Path(fields["directory"]),
            file_name=fields["file_name"],
            dataset_name=fields["dataset_name"],
        ),
        return_dtype=pl.Object,  # the column stores arbitrary Python objects
    )
    .alias("references")
)

print(df.limit(3).collect())

# Failed attempt, kept for reference: calling map_elements on a single column
# with a three-argument lambda raised
#   ComputeError: TypeError: <lambda>() missing 2 required positional
#   arguments: 'file_name' and 'dataset_name'
#
#   pl.col("directory").map_elements(
#       lambda directory, file_name, dataset_name: LazyDatasetReference(...),
#       return_dtype=pl.Object,
#   ).alias("references2")
#
# The struct-based expression above is the variant that works.

# Resolve each stored reference into its data and keep it in a new column.
df = df.with_columns(
    pl.col("references")
    .map_elements(LazyDatasetReference.load_on_demand, return_dtype=pl.Object)
    .alias("references loaded")
)

print(df.limit(3).collect())

print(f"df is {type(df)}")

del df

shape: (3, 4)
┌────────────────────────┬────────────────────────┬────────────────────────┬───────────────────────┐
│ directory              ┆ file_name              ┆ dataset_name           ┆ references            │
│ ---                    ┆ ---                    ┆ ---                    ┆ ---                   │
│ str                    ┆ str                    ┆ str                    ┆ object                │
╞════════════════════════╪════════════════════════╪════════════════════════╪═══════════════════════╡
│ /Users/lotzegud/P08/te ┆ h2o_2024_10_16_01043.n ┆ /scan/instrument/colle ┆ LazyDatasetReference( │
│ st_folde…              ┆ xs                     ┆ ction/ex…              ┆ directory…            │
│ /Users/lotzegud/P08/te ┆ h2o_2024_10_16_01044.n ┆ /scan/instrument/colle ┆ LazyDatasetReference( │
│ st_folde…              ┆ xs                     ┆ ction/ex…              ┆ directory…            │
│ /Users/lotzegud/P08/te ┆ h2o_2024_10_16_01045.n ┆ /scan/instrument/colle ┆ 

I think the cell below shows what I want: I can store the dataclass objects in a column, copy them to other columns, and load the data they reference.

In [None]:
# Experiment A: keep unloaded references in one column next to an unrelated
# constant column. A row of strawberries marks the start of the output.
print(30*"\N{strawberry}")
df = func_df()


print(f"Df is currently {type(df)}")

# --- 2. Use map_elements to build one reference object per row ---
# The lambda only constructs a LazyDatasetReference; it does not call
# load_on_demand(), so no dataset is read by this expression.
df = df.with_columns(
    pl.struct(["directory", "file_name", "dataset_name"])
    .map_elements(lambda row: LazyDatasetReference(
        directory=Path(row["directory"]),
        file_name=row["file_name"],
        dataset_name=row["dataset_name"]
    ),   # appending .load_on_demand() here would load the dataset instead of storing the reference
    return_dtype=pl.Object)  # Object dtype because the values are arbitrary Python objects
    .alias("data"),
    
     # Add a dummy column with a constant string
    pl.lit("/path/to/dummy").alias("dummy_path")
)

# --- 3. Inspect the frame ---
# print(df) on a LazyFrame only prints the query plan; the collect() on the
# three-row preview below is what actually evaluates anything.
print(df)
print(1*'\n')  # visual spacer: a newline string plus print's own newline
print(df.limit(3).collect())


# `df` was rebound to the result of with_columns; report its type after the preview.
print(f"df remains {type(df)}")

del df 

# Experiment B: copy the reference column to a second column, then replace
# that copy with the loaded data.
print(30*"\N{strawberry}")

df = func_df()

df = df.with_columns(
    pl.struct(["directory", "file_name", "dataset_name"])
    .map_elements(lambda row: LazyDatasetReference(
        directory=Path(row["directory"]),
        file_name=row["file_name"],
        dataset_name=row["dataset_name"]
    ), return_dtype=pl.Object)  # Object dtype because the values are arbitrary Python objects
    .alias("data")
)
# Copy the references into a second column under a new name.
df = df.with_columns(
    pl.col("data").alias("dummy_path")  # "dummy_path" now holds the same references as "data"
)

print(df.limit(3).collect())

# Overwrite "dummy_path" with the result of load_on_demand() for each reference.
df = df.with_columns(pl.col("dummy_path").map_elements(lambda ref: ref.load_on_demand(), return_dtype=pl.Object).alias("dummy_path"))


# NOTE: `df` is intentionally or accidentally NOT deleted here, so it stays
# in the kernel after this cell finishes.
print(df.limit(3).collect())

🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓🍓
Df is currently <class 'polars.lazyframe.frame.LazyFrame'>
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 WITH_COLUMNS:
 [col("directory").as_struct([col("file_name"), col("dataset_name")]).map_list().alias("data"), String(/path/to/dummy).alias("dummy_path")] 
  DF ["directory", "file_name", "dataset_name"]; PROJECT */3 COLUMNS


shape: (3, 5)
┌────────────────────┬────────────────────┬───────────────────┬───────────────────┬────────────────┐
│ directory          ┆ file_name          ┆ dataset_name      ┆ data              ┆ dummy_path     │
│ ---                ┆ ---                ┆ ---               ┆ ---               ┆ ---            │
│ str                ┆ str                ┆ str               ┆ object            ┆ str            │
╞════════════════════╪════════════════════╪═══════════════════╪═══════════════════╪════════════════╡
│ /Users/lotzegud/P0 ┆ h2o_2024_10_16_010 ┆ /scan/instrument/ ┆ LazyDatasetRefere ┆ /

In [7]:
# NOTE(review): this cell never creates `df`; it extends whatever `df` is
# currently alive in the kernel, so it fails on a fresh kernel unless an
# earlier cell that leaves `df` defined has been run first.
# NOTE(review): the recorded plan for this cell shows the "data" and
# "dummy_path" alias steps but no load step on "dummy_path" - this looks like
# out-of-order execution; confirm with Restart & Run All.
print(30*"\N{hot pepper}", '\n')

# Step 1: add a column of references built from the three metadata columns.
df = df.with_columns(
    pl.struct(["directory", "file_name", "dataset_name"])
    .map_elements(lambda row: LazyDatasetReference(
        directory=Path(row["directory"]),
        file_name=row["file_name"],
        dataset_name=row["dataset_name"]
    ), return_dtype=pl.Object)
    .alias("data_ref")  # Store reference, NOT loaded data
)
print(type(df))
print(df)  # prints the query plan of the LazyFrame, not its rows

print(df.limit(3).collect())
print(30*"\N{green apple}", '\n')

# Step 2: add a column that calls load_on_demand() on every stored reference.
df = df.with_columns(
    pl.col("data_ref").map_elements(lambda ref: ref.load_on_demand(), return_dtype=pl.Object)
    .alias("data2")
)

print(type(df))

# Unlike the previews above, this collects the whole frame (no limit), so
# load_on_demand() is evaluated for every row.
print(df.collect())


🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶🌶 

<class 'polars.lazyframe.frame.LazyFrame'>
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 WITH_COLUMNS:
 [col("directory").as_struct([col("file_name"), col("dataset_name")]).map_list().alias("data_ref")] 
   WITH_COLUMNS:
   [col("data").alias("dummy_path")] 
     WITH_COLUMNS:
     [col("directory").as_struct([col("file_name"), col("dataset_name")]).map_list().alias("data")] 
      DF ["directory", "file_name", "dataset_name"]; PROJECT */3 COLUMNS
shape: (3, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ directory      ┆ file_name      ┆ dataset_name   ┆ data          ┆ dummy_path    ┆ data_ref      │
│ ---            ┆ ---            ┆ ---            ┆ ---           ┆ ---           ┆ ---           │
│ str            ┆ str            ┆ str            ┆ object        ┆ object        ┆ object        │
╞════════════════╪════════════════╪════════════════╪═══════════