In [None]:
#|default_exp from_tables

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
from fastcore.test import test_eq

# `from_tables`

> Get image and vector locations, and optionally metadata, from one or more table inputs

- Allows you to create embeddings with any external program
- Simplifies matching embeddings, images, and metadata to each other

In [None]:
#| export

from glob import glob
from pathlib import Path

import pandas as pd
import polars as pl

In [None]:
#| export

def cat_tables(table_paths: list[Path]) -> pl.DataFrame:
    '''
    Read every table in `table_paths` and concatenate them into one DataFrame.

    All paths must share a single extension, either `.csv` or `.parquet`
    (compared case-insensitively). Entries may be `Path` objects or plain
    strings. The tables are concatenated in the order given, using polars'
    `diagonal_relaxed` strategy.

    Raises `ValueError` if no paths are given, if the extensions are mixed,
    or if the extension is not one of the two supported ones.
    '''
    # Materialise once: the paths are traversed twice (suffix check, then read),
    # which would silently exhaust a generator. Path() also lets str entries through.
    paths = [Path(p) for p in table_paths]
    if not paths:
        raise ValueError("No table paths given; expected at least one .csv or .parquet file.")
    extensions = {p.suffix.lower() for p in paths}
    if extensions not in [{".csv"}, {".parquet"}]:
        raise ValueError(f"All tables must have same extension, either .csv or .parquet. Got: {extensions}")
    # The check above guarantees exactly one supported extension, so pick the reader once.
    read_table = pl.read_csv if extensions == {".csv"} else pl.read_parquet
    return pl.concat((read_table(p) for p in paths), how="diagonal_relaxed")

In [None]:
#| export

def glob_to_tables(pattern: str) -> pl.DataFrame:
    '''
    Expand a glob pattern, read every matching table, and return them
    concatenated into a single DataFrame (see `cat_tables`).

    `pattern` is expanded with `recursive=True`, so `**` matches nested
    directories. Matches are read in sorted path order.

    Raises `FileNotFoundError` if the pattern matches nothing.
    '''
    # glob returns matches in arbitrary order; sort them so the row order of
    # the concatenated result is reproducible across runs and machines.
    table_paths = [Path(p) for p in sorted(glob(pattern, recursive=True))]
    if len(table_paths) == 0:
        raise FileNotFoundError("No tables matched.")
    return cat_tables(table_paths)


In [None]:
#| hide

def test_parquet():
    '''
    Round-trip check for `glob_to_tables` on parquet input.

    Writes two small parquet files holding the same columns in different
    orders to the working directory, reads them back through the glob
    pattern, and returns the concatenated frame. The files are removed
    afterwards even if writing or reading fails.
    '''
    pattern = "DELETEME_*.parquet"
    test_paths = [Path(pattern.replace("*",str(i))) for i in range(2)]
    try:
        for i, p in enumerate(test_paths):
            df = pl.DataFrame({"a": [0,1], "c":[5,7], "b": [12,5]})
            # change column ordering to test concatenation
            if i == 0:
                df = df.select(["a", "b", "c"])
            df.write_parquet(p)
        return glob_to_tables(pattern)
    finally:
        # missing_ok: cleanup must not raise for a file that was never written,
        # otherwise it would mask the original error.
        for p in test_paths:
            p.unlink(missing_ok=True)

# The two files hold identical rows with their columns in different orders;
# the expected frame assumes columns are matched by name and come out as a, b, c.
test_eq(test_parquet(), pl.DataFrame({"a": [0,1,0,1], "b": [12,5,12,5], "c": [5,7,5,7]}))

In [None]:
#| export

def table_to_meta(table: pl.DataFrame) -> tuple[list, list]:
    '''
    Split a table into its metadata column names and per-row metadata records.

    The `image_filename` column is renamed to `filename`; every column other
    than `image_path` and `hidden_vectors` is then treated as metadata.

    Returns `(meta_columns, records)`: the metadata column names in the
    table's own column order, and one dict per row from `to_dicts()`.

    NOTE(review): assumes `image_filename` is always present; confirm how
    `rename` should behave for tables without it.
    '''
    # viewer expects filename column
    table = table.rename({"image_filename": "filename"})
    non_meta = {"image_path", "hidden_vectors"}
    # Filter table.columns instead of subtracting sets: set iteration order
    # varies between runs, which made the returned column order unstable.
    meta_columns = [c for c in table.columns if c not in non_meta]
    df_meta = table[meta_columns]
    return meta_columns, df_meta.to_dicts()

In [None]:
#|hide
# Run nbdev's export step, which writes `#| export` cells out to the package's Python modules.
import nbdev; nbdev.nbdev_export()