In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and local edits are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# standard lib
import os, pwd, sys, json, yaml, atexit, tempfile, inspect
from pathlib import Path

# for data-science
import pandas as pd, numpy as np, quadfeather
from pyarrow import feather

# for plotting
import matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns

# for cellular-data
import scprep, scanpy as sc, anndata as ad

In [30]:
from featherplot.utils import MockSingleCellData, AnnDataProcessor, QuadFeatherRenamer
from featherplot.utils import SeriesToChannel, DataFrameToMetadata

from featherplot.utils import collapse_user
from featherplot.deepscatter import Tileset

In [4]:
# Build a mock single-cell dataset and keep a handle on its AnnData object,
# which is the input for the rest of the notebook.
mocker = MockSingleCellData()
adata = mocker.adata

In [5]:
# Show the AnnData summary (shape plus the obs / var / obsm / layers keys).
adata

AnnData object with n_obs × n_vars = 1000 × 100
    obs: 'barcodes', 'conditions'
    var: 'is_hvg'
    obsm: 'X_mock'
    layers: 'X_norm'

### Create Processor
> this will help us extract the embedding layer and the gene expression layer

In [6]:
# 'X_mock' and 'X_norm' are the obsm and layers keys listed in the AnnData
# summary; presumably they select the embedding and the expression layer, in
# that order -- TODO confirm against AnnDataProcessor's signature.
pipe = AnnDataProcessor(adata, 'X_mock', 'X_norm')

#### sidecars

Deepscatter calls additional columns `sidecars`, in our case those are the columns of gene expression. We place these values in `df_s`.

In [7]:
# Sidecars frame: one row per barcode, one column per gene symbol.
df_s = pipe.get_sidecars()
df_s.head()

gene_symbols,gene_symbol 0,gene_symbol 1,gene_symbol 2,gene_symbol 3,gene_symbol 4,gene_symbol 5,gene_symbol 6,gene_symbol 7,gene_symbol 8,gene_symbol 9,...,gene_symbol 90,gene_symbol 91,gene_symbol 92,gene_symbol 93,gene_symbol 94,gene_symbol 95,gene_symbol 96,gene_symbol 97,gene_symbol 98,gene_symbol 99
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
barcode 0,0.659122,0.003639,-0.015749,-0.603925,0.25277,0.269117,-0.277937,-1.141009,0.246501,-0.764321,...,0.057746,1.708629,0.018431,0.961223,0.406869,1.002873,-0.670653,0.102878,0.15317,-0.048077
barcode 1,-0.402907,1.20493,-0.274225,-1.037198,0.418239,-1.241845,1.069331,0.203468,-0.540119,-0.079864,...,-0.696836,-1.429597,0.051032,-0.589662,-1.615216,-0.542598,0.689434,-0.318802,0.996157,-0.197688
barcode 2,-1.024078,-2.104927,0.151219,-0.855913,-1.109425,0.712447,-0.406566,-0.856289,0.143649,-1.00506,...,0.077651,-0.471763,-0.850139,0.462508,-0.896641,-0.725573,0.15794,0.81264,-1.503678,0.550671
barcode 3,0.566234,0.63218,-0.082447,1.087428,0.160232,2.333781,1.101377,0.20227,-0.623662,0.554229,...,-0.759271,-0.204689,1.402447,-0.877854,0.916562,-0.946905,0.661854,1.676934,0.298044,-1.028242
barcode 4,0.870096,-0.666743,0.23156,-1.01514,-0.637058,1.318197,-0.777007,-1.204698,1.581399,1.070643,...,-0.409691,1.363445,-0.772657,0.156357,-0.116396,0.323422,1.9e-05,0.673207,0.118049,-0.205185


#### points

If our gene expression features are called `sidecars`, then what is the embedding layer called? Well it is just the "points" of the plot, so we will store these values in `df_p`.

**NOTE**: we also store conditions with `df_p` as whatever is in this DataFrame will be loaded by `Deepscatter` automatically. 

In [8]:
# Points frame: embedding coordinates with the per-barcode condition label
# joined on by index, built in a single expression.
df_p = pipe.get_embedding().join(pipe.adata.obs['conditions'])
df_p.head()

Unnamed: 0_level_0,MOCK_0,MOCK_1,MOCK_2,conditions
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
barcode 0,1.896752,-1.114631,-0.246413,condition 0
barcode 1,1.736171,-0.340127,-0.127783,condition 1
barcode 2,0.850548,-0.049597,-1.234048,condition 2
barcode 3,0.768318,1.213939,0.328942,condition 3
barcode 4,-0.437877,-1.125257,-1.094229,condition 0


#### Combined
Now we combine `df_p` (points + condition) with `df_s` ("sidecars", i.e. gene expression). This is necessary because a later script needs to add the sidecars to the point data that has already been tiled by `quadfeather`.

In [9]:
# Index-aligned join: points + conditions on the left, gene columns on the right.
df_all = df_p.join(df_s)
df_all.head()

Unnamed: 0_level_0,MOCK_0,MOCK_1,MOCK_2,conditions,gene_symbol 0,gene_symbol 1,gene_symbol 2,gene_symbol 3,gene_symbol 4,gene_symbol 5,...,gene_symbol 90,gene_symbol 91,gene_symbol 92,gene_symbol 93,gene_symbol 94,gene_symbol 95,gene_symbol 96,gene_symbol 97,gene_symbol 98,gene_symbol 99
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
barcode 0,1.896752,-1.114631,-0.246413,condition 0,0.659122,0.003639,-0.015749,-0.603925,0.25277,0.269117,...,0.057746,1.708629,0.018431,0.961223,0.406869,1.002873,-0.670653,0.102878,0.15317,-0.048077
barcode 1,1.736171,-0.340127,-0.127783,condition 1,-0.402907,1.20493,-0.274225,-1.037198,0.418239,-1.241845,...,-0.696836,-1.429597,0.051032,-0.589662,-1.615216,-0.542598,0.689434,-0.318802,0.996157,-0.197688
barcode 2,0.850548,-0.049597,-1.234048,condition 2,-1.024078,-2.104927,0.151219,-0.855913,-1.109425,0.712447,...,0.077651,-0.471763,-0.850139,0.462508,-0.896641,-0.725573,0.15794,0.81264,-1.503678,0.550671
barcode 3,0.768318,1.213939,0.328942,condition 3,0.566234,0.63218,-0.082447,1.087428,0.160232,2.333781,...,-0.759271,-0.204689,1.402447,-0.877854,0.916562,-0.946905,0.661854,1.676934,0.298044,-1.028242
barcode 4,-0.437877,-1.125257,-1.094229,condition 0,0.870096,-0.666743,0.23156,-1.01514,-0.637058,1.318197,...,-0.409691,1.363445,-0.772657,0.156357,-0.116396,0.323422,1.9e-05,0.673207,0.118049,-0.205185


### QuadFeatherRenamer

Note: `quadfeather` and `deepscatter` are both under active development so things change all the time. At the moment `quadfeather` requires that `x` and `y` be in your DataFrame (it doesn't mind if `z` is there too). So this will handle the renaming of our columns.

In [10]:
# Wrap the combined frame so its embedding columns can be renamed to the
# names quadfeather requires.
qfr = QuadFeatherRenamer(df_all)

In [11]:
# rename() returns the renamed frame together with the {old: new} column
# mapping; the mapping is displayed so the renaming is visible.
df_q, renamed = qfr.rename()
renamed

{'MOCK_0': 'x', 'MOCK_1': 'y', 'MOCK_2': 'z'}

### DataFrameToMetadata
 
`Deepscatter` is a really nice library; however, it also prefers to have its `plotAPI` method called with as much information as possible. This is a bit of a shame, as it means that once you load your data with `deepscatter` you can't compute derived properties (e.g. the domain of your data to scale the plot, which sidecars are available, etc.).

The solution to this is simple. In order to have this information available to us, we will just calculate it now (including which columns were renamed) and store it as metadata to use later.

In [36]:
# Describe the renamed frame: the embedding columns are listed explicitly and
# `alt_names` maps each new column name back to its original name.
d2m = DataFrameToMetadata(
    df_q,
    include_index=True,
    embedding=['x', 'y', 'z', 'conditions'],
    alt_names={new_name: old_name for old_name, new_name in renamed.items()},
)

In [37]:
# convert() returns two collections -- presumably the columns that were
# converted successfully and those that failed; TODO confirm. Their sizes are
# displayed as a quick sanity check.
succ, fail = d2m.convert()
len(succ), len(fail)

(105, 0)

In [38]:
# Export the computed metadata as a mapping; it is extended with the tile
# paths and written to YAML at the end of the workflow.
meta = d2m.to_meta()

## Quadfeather Workflow

Now we can run through the `quadfeather` workflow right here in the notebook.

### 0) setup

In [39]:
# dump everything to downloads for easy access
outdir = os.path.expanduser('~/Downloads/featherplot')
qf_dir = os.path.join(outdir, 'tiles')
# exist_ok=True creates `outdir` and `qf_dir` in one call and keeps the cell
# safe to re-run, without a separate isdir check that could race with creation
os.makedirs(qf_dir, exist_ok=True)

# point data written as parquet; this is the file handed to quadfeather
p_file = os.path.join(outdir, 'points.parquet')
# NOTE: we never use s_file
# s_file = os.path.join(outdir, 'extras.parquet')
# sidecar columns as a single feather file, stored inside the tiles directory
f_file = os.path.join(qf_dir, 'sidecars.feather')
# YAML file that receives the dataset metadata
m_file = os.path.join(outdir, 'meta.yml')

# value passed to quadfeather's --tile_size option
tile_size = 1000

### 1) create tiles

In [40]:
# Write only the point data (everything except the gene-expression columns)
# to parquet; this file is the input handed to quadfeather.
d2m.df.drop(columns=df_s.columns).to_parquet(p_file)
# Unused alternative kept for reference (s_file is never defined):
# d2m.df.drop(columns=d2m.embedding).to_parquet(s_file)

In [41]:
# Tile the points parquet into `qf_dir` with the quadfeather CLI.
# NOTE(review): the interpolated paths are not quoted, so a path containing
# spaces would be split by the shell.
!quadfeather --files {p_file} \
             --tile_size {tile_size} \
             --destination {qf_dir}

### 2) make single file

In [42]:
# Write every non-embedding column (the sidecars) to a single feather file.
feather.write_feather(d2m.df.drop(columns=d2m.embedding), f_file)

### 3) run `add_sidecars.py`

In [43]:
# Open the tile directory and add the sidecar file to it; the DataFrame's
# index name is passed as the key (presumably the join column -- confirm).
tileset = Tileset(Path(qf_dir))
tileset.add_sidecars(f_file, d2m.df.index.name)

Note: we copied `add_sidecars.py` so you can use it directly from this library.

In [44]:
# Command-line form of the add-sidecars step.
# NOTE(review): this repeats the step already performed through
# Tileset.add_sidecars in the previous cell; presumably only one of the two
# variants needs to run -- confirm that re-adding sidecars is harmless.
!featherplot add-sidecars --tileset {qf_dir}\
                         --sidecar {f_file} --key {d2m.df.index.name};

```sh
featherplot-py featherplot add-sidecars --help

Usage: featherplot add-sidecars 
[OPTIONS]
--tileset          PATH  Path to the tileset to add sidecars to.
--sidecar          PATH  Path to the new data to add to the tileset.
--key              TEXT  key to use for joining; must exist in both tables
--verbose  -v            Print verbose output.
--help                   Show this message and exit.
```

Alternatively, you can run the script from wherever you saved it.

In [None]:
# Standalone-script form of the same step; expects add_sidecars.py in the
# current working directory.
# NOTE(review): this cell has no execution count, so it was apparently not run;
# it is a third variant of the add-sidecars step shown above.
!python3 add_sidecars.py --tileset {qf_dir}\
                         --sidecar {f_file} --key {d2m.df.index.name};

### 4) update metadata with directory

In [45]:
# Inspect which top-level fields the metadata contains.
meta.keys()

dict_keys(['index', 'n_points', 'embedding', 'sidecars', 'columns_metadata', 'tiles_dir'])

In [46]:
# relative path to tiles
# NOTE(review): str.replace removes every occurrence of `outdir` and the result
# keeps the leading path separator (e.g. '/tiles'); confirm that consumers
# expect this form rather than os.path.relpath(qf_dir, outdir).
meta['tiles_dir'] = qf_dir.replace(outdir, '')
# full path to tiles
# presumably collapse_user abbreviates the home directory back to '~' -- confirm
meta['full_path'] = collapse_user(qf_dir)

In [47]:
# Serialise the metadata mapping straight into the YAML file.
with open(m_file, 'w') as meta_stream:
    yaml.dump(meta, meta_stream)

### 5) cleanup