## Imports

In [1]:
# standard lib
import os, pwd, sys, json, yaml, atexit, tempfile, inspect

# for data-science
import pandas as pd, numpy as np, quadfeather
from pyarrow import feather

# for plotting
import matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns

# for cellular-data
import scprep, scanpy as sc, anndata as ad

## Setup

In [2]:
# reproducibility: seed NumPy's global RNG once, before any data is drawn
SEED = 3
np.random.seed(SEED)

# dataset sizing -- all deliberately small for the demo:
# deepscatter can handle far more points than N_POINTS, and the default
# tile size is 50,000
N_POINTS = 10_000
TILE_SIZE = 1_000
N_GENES = 100

# absolute path of the kernel's current working directory
# (os.path.abspath('') resolves to the cwd, not to the notebook file itself)
FILE = os.path.abspath('')

# the sveltekit project you might be working on / want to deploy
# (taken to be the parent of the working directory)
SVELTEKIT_DIR = os.path.join(FILE, os.pardir)

# static assets directory of the sveltekit project, where files are hosted
STATIC_ASSETS_DIR = os.path.join(SVELTEKIT_DIR, 'static')

# several datasets may be hosted side by side, one sub-directory each
DATASETS_DIR = os.path.join(STATIC_ASSETS_DIR, 'datasets')

# name and location of the dataset built by this notebook
DATASET_NAME = 'cell'
DEMO_DATASET_DIR = os.path.join(DATASETS_DIR, DATASET_NAME)

# NOTE: this is the unique ID that will be used to map additional columns to the dataset
LABEL_NAME = 'barcodes'

# column names of the fake embedding before they are renamed to x / y / z
POINT_COLUMNS = ['fake-SNE X', 'fake-SNE Y', 'fake-SNE Z']

In [12]:
# you can switch TARGET_DIR with whatever dataset you want to work with
TARGET_DIR = DEMO_DATASET_DIR

# create the dataset directory (and any missing parents); exist_ok avoids the
# check-then-create race of a separate isdir() test
os.makedirs(TARGET_DIR, exist_ok=True)

# NOTE: you can use a temp directory, but this is so you can view the files and confirm they are deleted
TMP_DIR = os.path.expanduser('~/Downloads')

# ~/Downloads is not guaranteed to exist (e.g. on a headless machine), and the
# temporary files created later in the notebook are placed inside it
os.makedirs(TMP_DIR, exist_ok=True)

## Utils

In [13]:
# login name of the user running this kernel, read from the password-database
# entry of the current uid (POSIX only: the pwd module is unavailable on Windows)
usr = pwd.getpwuid(os.getuid())[0]

def collapse_user(path: str) -> str:
    """Abbreviate the current user's home directory in ``path`` to ``~``.

    This is the inverse of ``os.path.expanduser`` for the current user.

    Parameters
    ----------
    path : str
        A filesystem path.

    Returns
    -------
    str
        ``path`` with a leading home-directory prefix replaced by ``~``.
        Paths that do not live under the home directory are returned
        unchanged.
    """
    # Compare against the real home directory rather than splitting on the
    # user name: the name may be absent from the path, or appear several times.
    home = os.path.expanduser('~').rstrip(os.sep)
    # require a separator after the prefix so '/home/ann' does not match '/home/anna'
    if path == home or path.startswith(home + os.sep):
        return '~' + path[len(home):]
    return path

In [14]:
def make_temp_file(**kwargs) -> tempfile.NamedTemporaryFile:
    """Open a named temporary file that is closed at interpreter exit.

    Every keyword argument is forwarded unchanged to
    ``tempfile.NamedTemporaryFile``; the open file object is returned.
    """
    handle = tempfile.NamedTemporaryFile(**kwargs)
    # register the close up front so the file is released even if the caller
    # never closes it (with the default delete=True, closing removes the file)
    atexit.register(handle.close)
    return handle

In [15]:
# where we will store points
parquet_points = make_temp_file(suffix='.parquet', dir=os.path.expanduser(TMP_DIR))

# where we will store additional information
parquet_sidecar = make_temp_file(suffix='.parquet', dir=os.path.expanduser(TMP_DIR))

# where we will store additional information as feather file
feather_sidecar = make_temp_file(suffix='.feather', dir=os.path.expanduser(TMP_DIR))

# where we store raw labels
csv_labels = os.path.join(TARGET_DIR, 'labels.csv')

## Fake Data

In [16]:
# one barcode string per point: 'Barcode 0', 'Barcode 1', ...
point_numbers = pd.Series(np.arange(N_POINTS), name=LABEL_NAME)
labels = point_numbers.map(lambda n: f'Barcode {n}')

# each barcode is assigned to one of four conditions by its number modulo 4;
# the resulting series is indexed by the barcode itself
conditions = labels.map(lambda e: f'Condition {int(e.split()[1]) % 4}').rename('conditions')
conditions.index = labels

In [17]:
gene_cols = [f'gene {n}' for n in range(1, N_GENES + 1)]

# NOTE: the three draws stay in this order (embedding, counts, normed) so that
# a given SEED keeps producing the same values
emb_values = np.random.randn(N_POINTS, 3)
counts_values = np.random.randn(N_POINTS, N_GENES)
normed_values = np.random.randn(N_POINTS, N_GENES)

# 3-D embedding per barcode, with the condition attached as an extra column
df_emb = pd.DataFrame(emb_values, columns=POINT_COLUMNS, index=labels).join(conditions)

# two barcode x gene matrices sharing the same labels
df_counts = pd.DataFrame(counts_values, columns=gene_cols, index=labels)
df_normed = pd.DataFrame(normed_values, columns=gene_cols, index=labels)

In [18]:
# gene symbols, used as the index of the per-gene annotation below
hvg_names = pd.Series(list(gene_cols), name='gene_symbol')

# fake per-gene 'HVG' value: the gene's position modulo 5
hvg_genes = pd.Series([position % 5 for position, _ in enumerate(gene_cols)], name='HVG')
hvg_genes.index = hvg_names

In [19]:
# Assemble the fake data into one AnnData object.
# The id column is addressed through LABEL_NAME (the name given to the label
# series, and therefore to df_emb's index) instead of a repeated string
# literal, so changing the constant no longer raises a KeyError here.
adata = ad.AnnData(
    X=df_counts,
    # per-cell annotations: the id is kept both as the index and as a column
    obs=df_emb.reset_index()[[LABEL_NAME, 'conditions']].set_index(LABEL_NAME, drop=False),
    # per-gene annotations
    var=pd.DataFrame(hvg_genes, index=hvg_names),
    layers={'X_norm':df_normed},
    # embedding coordinates only (the condition column lives in obs)
    obsm={'X_emb':df_emb.drop(columns='conditions')}
)

In [20]:
# display the AnnData summary (shape plus the obs / var / obsm / layers keys)
adata

AnnData object with n_obs × n_vars = 10000 × 100
    obs: 'barcodes', 'conditions'
    var: 'HVG'
    obsm: 'X_emb'
    layers: 'X_norm'

In [21]:
# keys used to look things up on `adata`:
#   EMB   -> entry of adata.obsm holding the embedding
#   LAYER -> entry of adata.layers holding the normalised matrix
EMB, LAYER = 'X_emb', 'X_norm'

In [22]:
# Sidecar frame: one row per barcode, one column per gene, taken from the
# chosen layer.
# Index with [] rather than .get(): .get() returns None for an unknown layer,
# and pd.DataFrame(None, ...) would silently build an all-NaN frame.
df_sidecar = pd.DataFrame(
    adata.layers[LAYER],
    columns=adata.var.index,
    index=adata.obs.index
)
df_sidecar.head()

gene_symbol,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10,...,gene 91,gene 92,gene 93,gene 94,gene 95,gene 96,gene 97,gene 98,gene 99,gene 100
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Barcode 0,1.075366,0.795157,0.612961,0.342845,1.400385,-0.000778,0.142779,0.933984,1.360402,0.77382,...,-0.246998,-0.937849,-0.920359,-1.529624,-0.329658,-0.517582,1.108534,-0.571888,-0.703872,0.817223
Barcode 1,2.02358,0.924715,0.94985,0.586443,-1.083444,2.179629,-0.900052,0.51731,0.53157,0.440979,...,-0.035071,0.188018,1.537435,-1.349599,-0.492892,0.048944,-1.745872,-0.82341,0.90209,-1.183776
Barcode 2,-0.085821,-2.146595,0.68156,1.093608,1.632251,0.548563,-1.469223,1.925281,0.430757,0.264783,...,1.483044,0.08276,0.414493,-1.472177,0.200582,-1.85295,-1.166627,-0.797802,-1.979365,1.078885
Barcode 3,0.269811,0.055121,1.256017,-0.405762,0.438296,-1.374546,-0.030749,0.136598,-0.342092,1.373554,...,-0.724343,0.063594,-0.455092,0.053669,0.492894,3.163143,1.381083,1.117245,-0.53803,0.324325
Barcode 4,0.839897,2.70896,0.773679,0.139735,0.951294,0.115525,-1.225863,-0.336039,2.352683,-0.99834,...,-1.611008,0.943233,-0.193525,0.558426,-0.361419,-2.221528,-0.499174,-1.334981,0.606741,0.020852


In [23]:
# Points frame: the embedding renamed to the x / y / z fields the tiler looks
# for, plus the condition of each barcode.
# The rename map is derived from POINT_COLUMNS so it cannot drift from the
# names the embedding was built with (a stale map is a silent no-op and the
# tiler then fails with 'Field "x" does not exist in table schema').
_axis_names = dict(zip(POINT_COLUMNS, ('x', 'y', 'z')))
df_points = (
    adata.obsm[EMB]  # [] instead of .get(): an unknown key fails with a clear KeyError
    .rename(columns=_axis_names)
    .assign(conditions=adata.obs['conditions'])  # aligned on the barcode index
)
df_points.head()

Unnamed: 0_level_0,x,y,z,conditions
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Barcode 0,-0.762859,-0.045113,0.639153,Condition 0
Barcode 1,-1.158903,-0.640037,-0.357941,Condition 1
Barcode 2,-0.571891,1.262319,1.020283,Condition 2
Barcode 3,1.437083,1.325585,-0.045854,Condition 3
Barcode 4,0.449052,-0.134211,-0.332791,Condition 0


In [24]:
# persist both frames to their temporary parquet files (points first)
for _frame, _target in ((df_points, parquet_points), (df_sidecar, parquet_sidecar)):
    _frame.to_parquet(_target.name)

In [25]:
# read the points file back to confirm the parquet round-trip
pd.read_parquet(parquet_points.name)

Unnamed: 0_level_0,x,y,z,conditions
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Barcode 0,-0.762859,-0.045113,0.639153,Condition 0
Barcode 1,-1.158903,-0.640037,-0.357941,Condition 1
Barcode 2,-0.571891,1.262319,1.020283,Condition 2
Barcode 3,1.437083,1.325585,-0.045854,Condition 3
Barcode 4,0.449052,-0.134211,-0.332791,Condition 0
...,...,...,...,...
Barcode 9995,0.533238,-0.135075,-1.157779,Condition 3
Barcode 9996,-1.075248,-1.544876,2.691278,Condition 0
Barcode 9997,1.549915,0.258567,-0.100722,Condition 1
Barcode 9998,2.480939,0.296565,-0.271057,Condition 2


## Workflow

### 1) create tiles

```sh
# NOTE: if run before renaming

$ !quadfeather --files {csv_points.name} --tile_size {TILE_SIZE} --destination {os.path.join(TARGET_DIR, 'tiles')}

Traceback (most recent call last):
  File "/Users/solst/mambaforge/envs/quadfeather/bin/quadfeather", line 8, in <module>
    sys.exit(main())
  File "/Users/solst/mambaforge/envs/quadfeather/lib/python3.10/site-packages/quadfeather/tiler.py", line 223, in main
    rewritten_files, extent, raw_schema = rewrite_in_arrow_format(args.files, schema_safe, schema, csv_block_size)
  File "/Users/solst/mambaforge/envs/quadfeather/lib/python3.10/site-packages/quadfeather/tiler.py", line 180, in rewrite_in_arrow_format
    col = data.column(dim)
  File "pyarrow/table.pxi", line 4292, in pyarrow.lib.Table.column
  File "pyarrow/table.pxi", line 4233, in pyarrow.lib.Table._ensure_integer_index
KeyError: 'Field "x" does not exist in table schema'
```

In [26]:
# build the tile tree from the points file
# NOTE: the interpolated paths are quoted so that a directory name containing
# spaces is not split into several shell arguments
!quadfeather --files "{parquet_points.name}" \
             --tile_size {TILE_SIZE} \
             --destination "{os.path.join(TARGET_DIR, 'tiles')}"

### 2) make single file

Create a single file that contains all the data you want to add, but none of the data that’s already there except for your unique id field (`barcodes` in this notebook, see `LABEL_NAME`). 

NOTE: the unique id field must have the same name and data type as in your primary file.

In [27]:
# index-aligned join: the point columns first, followed by the sidecar columns
df_all = df_points.join(df_sidecar)

In [28]:
# preview the combined frame
df_all.head()

Unnamed: 0_level_0,x,y,z,conditions,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,...,gene 91,gene 92,gene 93,gene 94,gene 95,gene 96,gene 97,gene 98,gene 99,gene 100
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Barcode 0,-0.762859,-0.045113,0.639153,Condition 0,1.075366,0.795157,0.612961,0.342845,1.400385,-0.000778,...,-0.246998,-0.937849,-0.920359,-1.529624,-0.329658,-0.517582,1.108534,-0.571888,-0.703872,0.817223
Barcode 1,-1.158903,-0.640037,-0.357941,Condition 1,2.02358,0.924715,0.94985,0.586443,-1.083444,2.179629,...,-0.035071,0.188018,1.537435,-1.349599,-0.492892,0.048944,-1.745872,-0.82341,0.90209,-1.183776
Barcode 2,-0.571891,1.262319,1.020283,Condition 2,-0.085821,-2.146595,0.68156,1.093608,1.632251,0.548563,...,1.483044,0.08276,0.414493,-1.472177,0.200582,-1.85295,-1.166627,-0.797802,-1.979365,1.078885
Barcode 3,1.437083,1.325585,-0.045854,Condition 3,0.269811,0.055121,1.256017,-0.405762,0.438296,-1.374546,...,-0.724343,0.063594,-0.455092,0.053669,0.492894,3.163143,1.381083,1.117245,-0.53803,0.324325
Barcode 4,0.449052,-0.134211,-0.332791,Condition 0,0.839897,2.70896,0.773679,0.139735,0.951294,0.115525,...,-1.611008,0.943233,-0.193525,0.558426,-0.361419,-2.221528,-0.499174,-1.334981,0.606741,0.020852


In [29]:
# NOTE: this is the same as df_sidecar
df_all = df_all.drop(columns=df_points.columns)
df_all.head()

Unnamed: 0_level_0,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10,...,gene 91,gene 92,gene 93,gene 94,gene 95,gene 96,gene 97,gene 98,gene 99,gene 100
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Barcode 0,1.075366,0.795157,0.612961,0.342845,1.400385,-0.000778,0.142779,0.933984,1.360402,0.77382,...,-0.246998,-0.937849,-0.920359,-1.529624,-0.329658,-0.517582,1.108534,-0.571888,-0.703872,0.817223
Barcode 1,2.02358,0.924715,0.94985,0.586443,-1.083444,2.179629,-0.900052,0.51731,0.53157,0.440979,...,-0.035071,0.188018,1.537435,-1.349599,-0.492892,0.048944,-1.745872,-0.82341,0.90209,-1.183776
Barcode 2,-0.085821,-2.146595,0.68156,1.093608,1.632251,0.548563,-1.469223,1.925281,0.430757,0.264783,...,1.483044,0.08276,0.414493,-1.472177,0.200582,-1.85295,-1.166627,-0.797802,-1.979365,1.078885
Barcode 3,0.269811,0.055121,1.256017,-0.405762,0.438296,-1.374546,-0.030749,0.136598,-0.342092,1.373554,...,-0.724343,0.063594,-0.455092,0.053669,0.492894,3.163143,1.381083,1.117245,-0.53803,0.324325
Barcode 4,0.839897,2.70896,0.773679,0.139735,0.951294,0.115525,-1.225863,-0.336039,2.352683,-0.99834,...,-1.611008,0.943233,-0.193525,0.558426,-0.361419,-2.221528,-0.499174,-1.334981,0.606741,0.020852


NOTE: **All** columns that you want to show up in the data should ideally be `float32()` type, although doubles might not be the end of the world.

In [30]:
# downcast every remaining column to 32-bit floats before writing the sidecar
df_all = df_all.astype(np.float32)
df_all.head()

Unnamed: 0_level_0,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10,...,gene 91,gene 92,gene 93,gene 94,gene 95,gene 96,gene 97,gene 98,gene 99,gene 100
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Barcode 0,1.075366,0.795157,0.612961,0.342845,1.400385,-0.000778,0.142779,0.933984,1.360402,0.77382,...,-0.246998,-0.937849,-0.920359,-1.529624,-0.329658,-0.517582,1.108534,-0.571888,-0.703872,0.817223
Barcode 1,2.02358,0.924715,0.94985,0.586443,-1.083444,2.179629,-0.900052,0.51731,0.53157,0.440979,...,-0.035071,0.188018,1.537435,-1.349599,-0.492892,0.048944,-1.745872,-0.823409,0.90209,-1.183776
Barcode 2,-0.085821,-2.146595,0.68156,1.093608,1.632251,0.548563,-1.469223,1.925281,0.430757,0.264783,...,1.483044,0.08276,0.414493,-1.472178,0.200582,-1.85295,-1.166627,-0.797802,-1.979365,1.078885
Barcode 3,0.269811,0.055121,1.256017,-0.405762,0.438296,-1.374546,-0.030749,0.136598,-0.342092,1.373554,...,-0.724343,0.063594,-0.455092,0.053669,0.492894,3.163143,1.381083,1.117245,-0.53803,0.324325
Barcode 4,0.839897,2.70896,0.773679,0.139735,0.951294,0.115525,-1.225863,-0.336039,2.352683,-0.99834,...,-1.611008,0.943233,-0.193525,0.558426,-0.361419,-2.221528,-0.499174,-1.334981,0.606741,0.020852


The file must be a [feather file][feather file], not parquet. 

```python 
from pyarrow import feather, parquet

# if converting from parquet
feather.write_feather(parquet.read_table('fin.parquet'), 'fout.feather')

# if converting pandas
feather.write_feather(df, 'fout.feather')
```

[feather file]: https://arrow.apache.org/docs/python/feather.html

In [31]:
# write df_all to the temporary feather file used as the sidecar
feather.write_feather(df_all, feather_sidecar.name)

In [32]:
# read the feather file back and check its shape as a sanity check
df_feather = feather.read_feather(feather_sidecar.name)
df_feather.shape

(10000, 100)

### 3) run `add_sidecars.py`

In [33]:
# attach the sidecar columns to every tile, matching rows on LABEL_NAME
# NOTE: sys.executable runs the script with the kernel's own interpreter (a
# bare `python3` may resolve to a different environment), and the interpolated
# paths are quoted so that spaces do not split them into several arguments
!"{sys.executable}" add_sidecars.py --tileset "{os.path.join(TARGET_DIR, 'tiles')}" \
                         --sidecar "{feather_sidecar.name}" --key "{LABEL_NAME}";
!clear

/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/0/0/0.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/1/0/1.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/1/0/0.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/1/1/1.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/1/1/0.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/4/5.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/4/4.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/4/2.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/4/3.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/5/5.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/5/4.feather
/Users/solst/Projects/featherplot/nbs/../static/datasets/cell/tiles/3/5/2.feather
/Users/solst/Pro

## Meta Data

In [44]:
def extract_column_metadata(
    df:pd.DataFrame, is_sidecar:bool=False, do_rename:bool=True, 
    copy:bool=False, _use_all_assumed_cols:bool=False
) -> tuple[pd.DataFrame, dict]:
    """Summarise every column of ``df`` as a plain-Python metadata dict.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are described. It is never modified.
    is_sidecar : bool
        Recorded verbatim in every column's metadata; when True no column is
        renamed.
    do_rename : bool
        When True (and ``is_sidecar`` is False), columns that are not one of
        ``x`` / ``y`` / ``z`` are renamed, in column order, to the coordinate
        names that are missing. This may not be the desired effect, so pass
        ``do_rename=False`` for frames that already carry their final names.
    copy : bool
        Work on a copy of ``df`` instead of ``df`` itself.
    _use_all_assumed_cols : bool
        Treat ``z`` as required in addition to ``x`` and ``y`` when working
        out which coordinate names are missing.

    Returns
    -------
    (pd.DataFrame, dict)
        The (possibly renamed) frame, and a dict keyed by final column name.
        Each entry holds ``field``, ``human`` (the original column name),
        ``type`` (``'number'``, ``'category'``, ``'bool'`` or the dtype name),
        ``min``, ``max``, ``domain`` and ``is_sidecar``. For categorical
        columns min / max are taken over the integer category codes.

    Raises
    ------
    ValueError, TypeError
        If a column's min / max cannot be converted to a number (e.g. an
        empty categorical column, or a datetime column).
    """
    df_cur = df.copy() if copy else df

    # NOTE: strictly required
    required_columns = ['x', 'y']
    # NOTE: assumed to be present
    assumed_columns = required_columns + ['z']

    # NOTE: first we check which of the required columns are absent
    wanted = assumed_columns if _use_all_assumed_cols else required_columns
    missing = sorted(set(wanted) - set(df_cur.columns))

    # NOTE: the first non-coordinate columns take over the missing names
    rename_map = {}
    if do_rename and not is_sidecar:
        for cname in df_cur.columns:
            if not missing:
                break
            if cname not in assumed_columns:
                rename_map[cname] = missing.pop(0)

    dtypes = pd.api.types
    meta = {}
    for cname in df_cur.columns:
        col = df_cur[cname]
        dtype = col.dtype

        # text columns are summarised through their category codes; this covers
        # the pandas string dtypes as well as plain object columns
        if dtypes.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            col = col.astype('category')
            dtype = col.dtype

        if isinstance(dtype, pd.CategoricalDtype):
            codes = col.cat.codes
            _min, _max = int(codes.min()), int(codes.max())
            _type = 'category'
        elif dtypes.is_bool_dtype(dtype):
            _min, _max = 0, 1
            _type = 'bool'
        else:
            # float() turns numpy scalars into plain Python floats
            _min, _max = float(col.min()), float(col.max())
            # dtype-based check: a substring test on the dtype name misses the
            # capitalised nullable dtypes such as 'Int64' / 'Float64'
            _type = 'number' if dtypes.is_numeric_dtype(dtype) else dtype.name

        field = rename_map.get(cname, cname)
        meta[field] = dict(
            field=str(field), human=str(cname), type=_type,
            min=_min, max=_max, domain=[_min, _max],
            is_sidecar=is_sidecar,
        )

    # rename once at the end; with nothing to rename the input frame (or its
    # copy) is returned as is
    if rename_map:
        df_cur = df_cur.rename(columns=rename_map)
    return df_cur, meta

In [45]:
# points frame: x / y / z are all expected, and nothing is renamed
df_p, meta_p = extract_column_metadata(
    df_points, is_sidecar=False, do_rename=False, _use_all_assumed_cols=True,
)
# sidecar frame: flagged as sidecar, nothing is renamed
df_s, meta_s = extract_column_metadata(df_sidecar, is_sidecar=True, do_rename=False)

# one lookup table for all columns; a sidecar entry wins on a name clash
column_meta = dict(meta_p)
column_meta.update(meta_s)

In [47]:
# dataset description; both paths are stored with the static-assets prefix removed
meta = {
    'index': LABEL_NAME,
    'n_points': N_POINTS,
    'seed': SEED,
    'target_dir': TARGET_DIR.replace(STATIC_ASSETS_DIR, ''),
    'tile_size': TILE_SIZE,
    'tiles_dir': os.path.join(TARGET_DIR, 'tiles').replace(STATIC_ASSETS_DIR, ''),

    # every distinct column name across both frames, sorted
    'columns': sorted(set(df_points.columns) | set(df_sidecar.columns)),
    'embedding_columns': df_p.columns.tolist(),
    'sidecar_columns': df_s.columns.tolist(),
    'columns_metadata': column_meta,
}

In [None]:
# Reduced dataset description: run parameters and paths only.
# NOTE(review): this rebinds `meta`, replacing the dict built in the previous
# cell. When the notebook is run top to bottom, the meta.yml written below
# therefore has no column lists and no per-column metadata, and uses the key
# `label_name` where the previous cell used `index` -- confirm which of the two
# layouts the consumer of meta.yml expects.
meta = dict(
    seed=SEED, n_points=N_POINTS, tile_size=TILE_SIZE, 
    dataset_name=DATASET_NAME, label_name=LABEL_NAME,
    target_dir=TARGET_DIR.replace(STATIC_ASSETS_DIR, ''), 
    tiles_dir=os.path.join(TARGET_DIR, 'tiles').replace(STATIC_ASSETS_DIR, ''),

    # embedding_columns=df_p.columns.values.tolist(),
    # sidecar_columns=df_s.columns.values.tolist(),
    # column_metadata=column_meta,
)

In [48]:
# serialise `meta` as YAML next to the tiles
meta_path = os.path.join(TARGET_DIR, 'meta.yml')
with open(meta_path, 'w') as stream:
    yaml.dump(meta, stream)

## Cleanup

NOTE: these files will automatically be deleted when the kernel stops, but we delete them here for good practice

In [49]:
# close the temporary files explicitly, in the order they were created
for _tmp in (parquet_points, parquet_sidecar, feather_sidecar):
    _tmp.close()