In [25]:
#|default_exp from_tables

In [26]:
#| hide

%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *
from fastcore.test import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# `from_tables`

> Get image and vector locations, and optionally metadata, from one or more table inputs

- Allows you to create embeddings with any external program
- Simplifies matching embeddings, images, and metadata to each other

In [27]:
#| export

import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path
from glob import glob
from typing import Tuple, List

In [28]:
#| export

def cat_tables(table_paths: list[Path]) -> pd.DataFrame:
    '''
    read and concatenate tables from list of paths

    Every table must share one extension, either `.csv` or `.parquet`
    (the comparison is case-insensitive).

    Args:
        table_paths: paths of the tables to read. Plain `str` entries are
            accepted as well and converted to `Path`.

    Returns:
        One DataFrame with the contents of all tables. CSV files are read
        one by one and concatenated with a fresh integer index; parquet
        files are read together as a single `pq.ParquetDataset`.

    Raises:
        ValueError: if `table_paths` is empty, or if the extensions are
            mixed or are anything other than `.csv` / `.parquet`.
    '''
    # coerce up front so string paths (e.g. raw `glob` results) work too,
    # and so a one-shot iterable is only consumed once
    paths = [Path(p) for p in table_paths]
    if not paths:
        raise ValueError("No table paths given.")
    extensions = {p.suffix.lower() for p in paths}
    if extensions not in [{".csv"}, {".parquet"}]:
        raise ValueError(f"All tables must have same extension, either .csv or .parquet. Got: {extensions}")
    if extensions == {".csv"}:
        return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)
    # the check above leaves {".parquet"} as the only remaining possibility
    return pq.ParquetDataset(paths).read().to_pandas()

In [29]:
#| export

def glob_to_tables(pattern: str) -> pd.DataFrame:
    '''
    expand a glob of tables, read in the tables,
    and output as concatenated DataFrame

    Args:
        pattern: glob pattern for the table files; `**` is expanded
            recursively.

    Returns:
        Whatever `cat_tables` returns for the matched paths, which are
        passed to it in sorted order.

    Raises:
        FileNotFoundError: if the pattern matches no files.
    '''
    # `glob` yields matches in arbitrary order; sort them so the row order
    # of the concatenated table is the same on every run and machine
    table_paths = sorted(Path(p) for p in glob(pattern, recursive=True))
    if not table_paths:
        raise FileNotFoundError("No tables matched.")
    return cat_tables(table_paths)


In [30]:
#| hide

def test_parquet():
    '''
    Round-trip helper for `glob_to_tables` on parquet input.

    Writes two small parquet files named `DELETEME_<i>.parquet` into the
    current working directory, reads them back through `glob_to_tables`,
    and returns the concatenated DataFrame. The files are removed again
    even if writing or reading raises.
    '''
    pattern = "DELETEME_*.parquet"
    test_paths = [Path(pattern.replace("*", str(i))) for i in range(2)]
    try:
        for i, p in enumerate(test_paths):
            df = pd.DataFrame({"a": [0,1], "c":[5,7], "b": [12,5]})
            # parquet dataset can handle different column ordering
            # NOTE(review): sort_index() with its default axis sorts the row
            # index, so both files are written with the same column order.
            # If reordering columns was the intent, this needs
            # sort_index(axis=1) -- confirm, and re-check the expected frame
            # in the test_eq call that follows this function.
            if i == 0:
                df = df.sort_index()
            df.to_parquet(p)
        return glob_to_tables(pattern)
    finally:
        # a file may be missing if the write failed part-way through
        for p in test_paths:
            p.unlink(missing_ok=True)

# both files hold the same two rows, so the concatenation should contain each row twice
test_eq(test_parquet(), pd.DataFrame({"a": [0,1,0,1], "c": [5,7,5,7], "b": [12,5,12,5]}))

In [31]:
# re-run the parquet round trip and keep the concatenated frame around as `df`
df = test_parquet()

In [32]:
#| export

def table_to_meta(table: pd.DataFrame) -> Tuple[List, List]:
    '''convert table to metadata columns and list

    Args:
        table: input table. An `image_filename` column, if present, is
            exposed as `filename`; the `image_path` and `embed_path` columns
            are left out of the metadata. The caller's frame is not modified.

    Returns:
        `(meta_columns, rows)`: the metadata column names in the table's own
        column order, and one `{column: value}` dict per table row.
    '''
    # viewer expects filename column
    table = table.rename(columns={"image_filename": "filename"})
    location_columns = {"image_path", "embed_path"}
    # walk the columns in table order (dropping repeated labels) rather than
    # going through a set, whose iteration order for strings can change
    # between interpreter runs
    meta_columns = [c for c in dict.fromkeys(table.columns) if c not in location_columns]
    # orient='records' gives one dict per row directly and, unlike
    # orient='index', does not require the table's index to be unique
    return meta_columns, table[meta_columns].to_dict(orient='records')

In [33]:
#|hide
# run nbdev's export step, which writes the `#| export` cells out to the library module
import nbdev; nbdev.nbdev_export()