# Mouse Whole Cortex and Hippocampus 10x
Source: [Allen Brain Map](https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x)

In [1]:
import os

import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix, bmat
import multiprocessing

from util import download
import hdf5

## Import

We fetch the necessary files to construct the AnnData object:
* Count Matrix (HDF5)
* Cell Metadata (CSV)
* UMAP Coordinates (CSV)

In [2]:
expression_path = download("https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hpf_10x/expression_matrix.hdf5")
metadata_path = download("https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hpf_10x/metadata.csv")
coordinates_path = download("https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hpf_10x/tsne.csv")

expression_matrix.hdf5: 0.00iB [00:00, ?iB/s]

KeyboardInterrupt: 

First, we import the counts data and create the AnnData Object.

In [3]:
expression_file = hdf5.load(expression_path)
hdf5.tree(expression_file)
expression = expression_file["data"]

'data':	<HDF5 group "/data" (4 members)>
 |- 'counts':	<HDF5 dataset "counts": shape (31053, 1169320), type "<i4">
 |- 'gene':	<HDF5 dataset "gene": shape (31053,), type "|S30">
 |- 'samples':	<HDF5 dataset "samples": shape (1169320,), type "|S36">
 |- 'shape':	<HDF5 dataset "shape": shape (2,), type "<i4">


Read large matrix in chunk steps, such that it does not use to much memory.

**WARNING:** This step uses a lot (>60GiB) of RAM.

In [None]:
counts = expression["counts"]
m = hdf5.read_sparse_chunks(counts, sparse_format=csr_matrix, transpose=True)

/data/counts:   0%|          | 0/3744 [00:00<?, ?it/s]

In [4]:
mm = bmat(m)
mm

We construct the AnnData object by attaching the variables.

In [None]:
dataset = ad.AnnData(X=m, dtype=m.dtype)
dataset.obs_names = expression["samples"]
dataset.var_names = expression["gene"]
dataset.write("mouse_brain_raw.sc.h5ad", compression="gzip")

In [None]:
a = csr_matrix([[1,2],[3,4]])

In [None]:
mm.asformat("csr")

In [3]:
import os
os.environ["TMPDIR"]

'/home/scratch/fabian.pottbaecker/226501'

In [4]:
a = np.full((2,2), None)

In [5]:
from scipy.sparse import random
a[0,0] = random(100,100, format="csr")
a[0,1] = random(100,100, format="csr")
a[1,0] = random(100,100, format="csr")
a[1,1] = random(100,100, format="csr")

In [6]:
a[0,0]

<100x100 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>

In [7]:
bmat(a)

<200x200 sparse matrix of type '<class 'numpy.float64'>'
	with 400 stored elements in Compressed Sparse Row format>

In [2]:
import inspect

In [3]:
print(inspect.getsource(bmat))

def bmat(blocks, format=None, dtype=None):
    """
    Build a sparse matrix from sparse sub-blocks

    Parameters
    ----------
    blocks : array_like
        Grid of sparse matrices with compatible shapes.
        An entry of None implies an all-zero matrix.
    format : {'bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'}, optional
        The sparse format of the result (e.g. "csr"). By default an
        appropriate sparse matrix format is returned.
        This choice is subject to change.
    dtype : dtype, optional
        The data-type of the output matrix. If not given, the dtype is
        determined from that of `blocks`.

    Returns
    -------
    bmat : sparse matrix

    See Also
    --------
    block_diag, diags

    Examples
    --------
    >>> from scipy.sparse import coo_matrix, bmat
    >>> A = coo_matrix([[1, 2], [3, 4]])
    >>> B = coo_matrix([[5], [6]])
    >>> C = coo_matrix([[7]])
    >>> bmat([[A, B], [None, C]]).toarray()
    array([[1, 2, 5],
          

In [13]:
import scipy
scipy.__version__

'1.7.3'

In [1]:
sc.__version__

NameError: name 'sc' is not defined