# `datamate` Examples

Basic examples demonstrating `datamate` functionality.

<div style="position: relative; display: inline-block;">
    <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Brainbow_%28Smith_2007%29.jpg/715px-Brainbow_%28Smith_2007%29.jpg" alt="Brainbow (Smith 2007)" style="max-width: 100%; height: auto;">
    <div style="font-size: 12px; color: #555; margin-top: 5px;">
        Image: "Mouse neurons labeled with fluorescent tags" by Stephen J Smith (2007), licensed under 
        <a href="https://creativecommons.org/licenses/by/3.0/" target="_blank" style="color: #007BFF;">CC BY 3.0</a>.
    </div>
</div>

## Filesystem as memory

In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from datamate import Directory, set_root_dir

%load_ext autoreload
%autoreload 2

In [None]:
# we set the root directory
root_dir = Path(".") / "data"
set_root_dir(root_dir)

In [None]:
# we erase data from earlier execution of this notebook -- ignore this cell
if root_dir.exists():
    import shutil

    shutil.rmtree(root_dir)

In [None]:
# we create a Directory instance
cell_measurements = Directory()
cell_measurements

We 'measure' cell attributes: identity, x- and y-coordinates, and colors.

In [None]:
# we store data by setting attributes
n_cells = 100
cell_measurements.cell_id = np.arange(n_cells)
cell_measurements.x = np.random.normal(0, 1, size=n_cells)
cell_measurements.y = np.random.normal(0, 1, size=n_cells)
cell_measurements.colors = np.random.rand(n_cells, 3)

In [None]:
# we verify files with the tree-view method
# (automatically called)
cell_measurements

In [None]:
# we access data as attributes
plt.scatter(cell_measurements.x, cell_measurements.y, c=cell_measurements.colors, s=10)
plt.xlabel("cell location in x")
plt.ylabel("cell location in y")
plt.title(f"Locations and colors of {n_cells} cells")

In [None]:
# we index h5-arrays from disk without fully loading them to reduce memory load
start_cell_id = 0
end_cell_id = 50
plt.scatter(
    cell_measurements.x[start_cell_id:end_cell_id],
    cell_measurements.y[start_cell_id:end_cell_id],
    c=cell_measurements.colors[start_cell_id:end_cell_id],
    s=10,
)
plt.xlabel("cell location in x")
plt.ylabel("cell location in y")
plt.title(f"Locations and colors of {end_cell_id - start_cell_id} cells")

In [None]:
# we use the directory name to point to the same directory again
cell_measurements = Directory("Directory_0000")

# this also works when the root directory is given explicitly
# cell_measurements = Directory(root_dir / "Directory_0000")

cell_measurements

## Hierarchical data organization

In [None]:
# we navigate upwards on the filesystem hierarchy
cell_measurements.parent

In [None]:
# we navigate upwards twice
cell_measurements.parent.parent

In [None]:
# we create a pointer to a child Directory
# (as long as no file/attribute with this name already exists)
cell_measurements.connections

In [None]:
# we `measure` a random connectivity matrix
connectivity_matrix = np.random.randn(n_cells, n_cells) > 2
plt.imshow(connectivity_matrix)
plt.xlabel("postsynaptic cell id")
plt.ylabel("presynaptic cell id")
plt.title("connectivity matrix")

In [None]:
# we store the connectivity as a graph (i.e. edges) because it's sparse
# np.where returns row indices first, then column indices; rows are the
# presynaptic cells (y-axis of the plot above), columns the postsynaptic ones
pre_cell_id, post_cell_id = np.where(connectivity_matrix)
cell_measurements.connections.pre_cell_id = pre_cell_id
cell_measurements.connections.post_cell_id = post_cell_id

In [None]:
# the connections are now stored in our directory
cell_measurements

In [None]:
# we access them later from the same directory
cell_measurements.connections

In [None]:
# with attribute-style access to the h5-array
cell_measurements.connections.pre_cell_id[:]

In [None]:
# or, if you prefer, by composing the path with the pathlib-style "/" syntax
(cell_measurements / "connections/pre_cell_id")[:]

## Configuration-based compilation of data

We wrap up the code above into a coherent object that can be configured and compiled to a `Directory`.

In [None]:
from time import sleep
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from datamate import Directory, root

data_dir = Path(".") / "data"


@root(data_dir)  # this optional decorator defines the root directory
class CellMeasurements(Directory):
    """Synthetic cell measurements plus a random connectivity graph."""

    def __init__(self, n_cells=100, seed=0):
        """Generate the measurements and store them as attributes.

        Args:
            n_cells: number of cells to generate.
            seed: value passed to ``np.random.seed`` before sampling.
        """
        print("Loading connectome ...")
        sleep(5)
        np.random.seed(seed)

        # sample the per-cell attributes; the sampling order is kept fixed so
        # that a given seed always yields the same values
        ids = np.arange(n_cells)
        x_locations = np.random.normal(0, 1, size=n_cells)
        y_locations = np.random.normal(0, 1, size=n_cells)
        rgb = np.random.rand(n_cells, 3)

        # store cell attributes
        self.cell_id = ids
        self.x = x_locations
        self.y = y_locations
        self.colors = rgb

        # threshold a random matrix and keep only the resulting edges:
        # row indices are stored as presynaptic ids, column indices as
        # postsynaptic ids
        weights = np.random.randn(n_cells, n_cells)
        rows, cols = np.where(weights > 2)
        self.connections.pre_cell_id = rows
        self.connections.post_cell_id = cols
        print("Stored connectome!")

In [None]:
# we init 'CellMeasurements'
# __init__ is only run if a directory of this type and config does not yet exist
cell_measurements = CellMeasurements()

In [None]:
# we verify contents written by __init__
cell_measurements

In [None]:
# we verify config written by __init__
cell_measurements.meta

In [None]:
# we change the seed
# we automatically get a second directory of the same type (but with different data)
cell_measurements_2 = CellMeasurements(n_cells=100, seed=42)

In [None]:
# we verify contents written by __init__
cell_measurements_2

In [None]:
# we verify config written by __init__
cell_measurements_2.meta

## Memory persistence

After restarting the kernel, we run the same code again and retrieve the stored data quickly, without recomputing it.

In [None]:
from time import sleep
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from datamate import Directory, root

data_dir = Path(".") / "data"


@root(data_dir)
class CellMeasurements(Directory):
    """Synthetic cell measurements plus a random connectivity graph."""

    def __init__(self, n_cells=100, seed=0):
        """Generate the measurements and store them as attributes.

        Args:
            n_cells: number of cells to generate.
            seed: value passed to ``np.random.seed`` before sampling.
        """
        print("Loading connectome ...")
        sleep(5)
        np.random.seed(seed)

        # sample the per-cell attributes; the sampling order is kept fixed so
        # that a given seed always yields the same values
        ids = np.arange(n_cells)
        x_locations = np.random.normal(0, 1, size=n_cells)
        y_locations = np.random.normal(0, 1, size=n_cells)
        rgb = np.random.rand(n_cells, 3)

        # store cell attributes
        self.cell_id = ids
        self.x = x_locations
        self.y = y_locations
        self.colors = rgb

        # threshold a random matrix and keep only the resulting edges:
        # row indices are stored as presynaptic ids, column indices as
        # postsynaptic ids
        weights = np.random.randn(n_cells, n_cells)
        rows, cols = np.where(weights > 2)
        self.connections.pre_cell_id = rows
        self.connections.post_cell_id = cols
        print("Stored connectome!")

In [None]:
# fast init because this points to the existing directories with the same type and configuration
cell_measurements = CellMeasurements(n_cells=100, seed=0)
cell_measurements_2 = CellMeasurements(n_cells=100, seed=42)

In [None]:
cell_measurements.config

In [None]:
cell_measurements_2.config

### Pandas integration

We load the h5 data to a pandas dataframe for further processing.

In [None]:
cells = cell_measurements.to_df()
connections = cell_measurements.connections.to_df()

In [None]:
cells

In [None]:
connections

We load the metadata into a pandas dataframe.

In [None]:
cell_measurements.meta.to_df(name="measurements 1")

In [None]:
cell_measurements_2.meta.to_df(name="measurements 2")

We tabularize experiment configurations.

In [None]:
configs = cell_measurements.meta.to_df(name="measurements 1").join(
    cell_measurements_2.meta.to_df(name="measurements 2")
)
configs

Or, vice versa, we create a directory from a pandas DataFrame (note: you must provide [h5py compatible type information](https://docs.h5py.org/en/stable/faq.html)):

In [None]:
configs

In [None]:
dtypes = {"measurements 1": "S50", "measurements 2": "S50"}

In [None]:
# we create a directory from the dataframe of configs
directory = Directory.from_df(configs, dtypes, "experiments_config")

In [None]:
directory

In [None]:
directory.to_df(dtypes={"measurements 1": str, "measurements 2": str})

Alternatively, we seamlessly store and retrieve dataframes via csv files.

In [None]:
directory.cells = cell_measurements.to_df()
directory.connections = connections

In [None]:
# we verify the dataframes
directory.cells

In [None]:
directory.connections

In [None]:
# we extend the dataframes
directory.extend("cells", cell_measurements_2.to_df())
directory.extend("connections", cell_measurements_2.connections.to_df())

In [None]:
# we verify the dataframes
directory.cells

In [None]:
# we verify the dataframes
directory.connections

### Example: visualize the graph

In [None]:
def visualize_measurements(cell_measurements):
    """Draw the measured cells and their connections as a networkx graph.

    Nodes are the cell ids, placed at their (x, y) locations and colored with
    ``cell_measurements.colors``; edges are the rows of the connections table.

    If networkx is not installed, the user is asked whether to install it with
    pip into the running interpreter. Any answer other than ``"yes"`` returns
    without drawing.

    Args:
        cell_measurements: object providing ``to_df()``,
            ``connections.to_df()`` and ``colors``.

    Raises:
        subprocess.CalledProcessError: if the pip installation fails.
    """
    try:
        import networkx as nx
    except ModuleNotFoundError as e:
        print(e, ", install networkx to visualize the cell graph structure.")
        _input = input("install now? yes/no")
        if _input != "yes":
            return

        import importlib
        import subprocess
        import sys

        # plain-Python replacement for the IPython-only `!pip install` shell
        # escape, so the function is valid outside of a notebook as well
        subprocess.check_call([sys.executable, "-m", "pip", "install", "networkx"])
        # make the freshly installed package visible to the import system
        importlib.invalidate_caches()
        import networkx as nx

    cells = cell_measurements.to_df()
    connections = cell_measurements.connections.to_df()

    G = nx.Graph()
    G.add_nodes_from(cells.cell_id)
    G.add_edges_from(connections.values)
    # map each cell id to its (x, y) position for the layout
    pos = dict(zip(cells["cell_id"].values, cells[["x", "y"]].values))

    options = {
        "font_size": 4,
        "node_size": 10,
        "node_color": cell_measurements.colors[:],
        "edgecolors": "0.5",
        "linewidths": 0.25,
        "width": 0.25,
    }
    nx.draw_networkx(G, pos, **options)

In [None]:
visualize_measurements(cell_measurements)

In [None]:
visualize_measurements(cell_measurements_2)

## Configuration comparison and diffing

In [None]:
# we compare how the `measurements` differ in their configuration
# (this works with complex nested configurations too)
cell_measurements.meta.diff(cell_measurements_2.meta)

## Directory structure visualization (tree view)

In [None]:
from datamate import Directory, set_verbosity_level

data_dir = Path(".") / "data"

In [None]:
# default: we display 2 levels of the hierarchy and 25 lines
set_verbosity_level(1)
Directory(data_dir)

In [None]:
# we display all subdirectories and files
set_verbosity_level(2)
Directory(data_dir)

In [None]:
# we display referenced folder and last modified date
set_verbosity_level(0)
Directory(data_dir)

In [None]:
set_verbosity_level(2)

## Parallel read/write operations

We start the training loop by running the cells below.

We run the Jupyter notebook `01b_datamate_intro_supplement.ipynb` to see how data is simultaneously written to and read from the `loss.h5` file.

In [None]:
from tqdm.auto import tqdm
from time import sleep
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from datamate import Directory, root

data_dir = Path(".") / "data"


@root(data_dir)
class NetworkDir(Directory):
    """Directory that records a simulated, noisy, decaying training loss."""

    class Config:
        tau: float = 200.0
        sigma: float = 0.1

    def __init__(self, num_iters: int = 100):
        """Run ``num_iters`` simulated training iterations.

        Args:
            num_iters: number of iterations, i.e. of loss values appended.
        """
        # remove the `loss` attribute before appending new values
        # NOTE(review): presumably this clears a previously stored loss file;
        # confirm against datamate
        del self.loss
        for step in tqdm(range(num_iters), desc="Training"):
            self.train_iter(step)

    def train_iter(self, iter):
        """Append one loss value for iteration ``iter``, then pause briefly.

        The value is ``exp(-iter / tau)`` plus uniform noise scaled by
        ``sigma``, with both parameters read from ``self.config``.
        """
        decay = np.exp(-iter / self.config.tau)
        noise = np.random.rand() * self.config.sigma
        self.extend("loss", [decay + noise])
        sleep(0.25)

In [None]:
network_dir = NetworkDir()
network_dir

In [None]:
plt.plot(network_dir.loss[:])
plt.xlabel("iteration")
plt.ylabel("loss")
plt.title("Training loss")

In [None]:
# we verify that the directory exists
"NetworkDir_0000" in Directory(data_dir)

In [None]:
# we delete the directory and its contents
# network_dir.rmtree("y")

In [None]:
# we verify that the directory is deleted
# "NetworkDir_0000" in Directory(data_dir)