In [None]:
#|default_exp from_tables

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *
from fastcore.test import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# `from_tables`

> Get image and vector locations, and optionally metadata, from one or more table inputs

- Allows you to create embeddings with any external program
- Simplifies matching embeddings, images, and metadata to each other

In [None]:
#| export
from __future__ import annotations

import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path

In [None]:
#| export

def glob_to_tables(pattern: str) -> pd.DataFrame:
    '''
    Expand a glob of tables, read in the tables,
    and output them as one concatenated DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern for the table files. Relative patterns are expanded
        from the current working directory; absolute patterns are expanded
        from their filesystem anchor.

    Returns
    -------
    pd.DataFrame
        Rows of every matched table. CSV inputs are concatenated in sorted
        path order with a fresh integer index; parquet inputs are handed to
        `pq.ParquetDataset` in that same sorted order.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no files.
    ValueError
        If the matched files are not all `.csv` or all `.parquet`
        (the suffix comparison is case-sensitive).
    '''
    pattern_path = Path(pattern)
    if pattern_path.is_absolute():
        # Path.glob rejects non-relative patterns, so glob from the anchor
        # (e.g. "/" or "C:\\") using the remainder of the pattern.
        root = Path(pattern_path.anchor)
        relative_pattern = str(pattern_path.relative_to(root))
    else:
        root, relative_pattern = Path(), pattern

    # glob yields matches in arbitrary filesystem order; sort so the row
    # order of the result is reproducible across runs and platforms.
    table_paths = sorted(root.glob(relative_pattern))
    if len(table_paths) == 0: raise FileNotFoundError("No tables matched.")

    extensions = {p.suffix for p in table_paths}
    if extensions == {".csv"}:
        dataset = [pd.read_csv(t) for t in table_paths]
        return pd.concat(dataset, ignore_index=True)
    elif extensions == {".parquet"}:
        dataset = pq.ParquetDataset(table_paths)
        return dataset.read().to_pandas()
    else:
        raise ValueError(f"Unsupported table extensions: {extensions}")


In [None]:
#| hide

def test_parquet():
    '''
    Write two identical small parquet files into the current directory,
    read them back through `glob_to_tables`, and return the combined frame.
    The temporary files are removed even if writing or reading fails.
    '''
    pattern = "DELETEME_*.parquet"
    test_paths = [Path(pattern.replace("*", str(i))) for i in range(2)]
    try:
        for p in test_paths:
            df = pd.DataFrame({"a": [0, 1], "b": [5, 7]})
            df.to_parquet(p)
        return glob_to_tables(pattern)
    finally:
        # Clean up fixtures unconditionally so a failed run leaves no stray files.
        for p in test_paths:
            if p.exists():
                p.unlink()

# Must run at cell level: inside the function, after the return, it would never execute.
test_eq(test_parquet(), pd.DataFrame({"a": [0,1,0,1], "b": [5,7,5,7]}))

In [None]:
#|hide
# Run nbdev's export step for this project.
import nbdev
nbdev.nbdev_export()