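# Marker genes for serotonergic supertypes in the Whole Mouse Brain taxonomy

What are we doing here? This notebook uses the ABC Atlas cache to load the Whole Mouse Brain (WMB) 10X cell metadata and taxonomy, selects the cells annotated with the `Sero` neurotransmitter, loads their log2 expression data, and then looks for genes that distinguish supertype `0893` from the other serotonergic cells, and supertype `0894` from `0893`.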

In [1]:
import anndata
import h5py
import numpy as np
import os
import pandas as pd
import pathlib
import scipy.sparse

In [2]:
import abc_atlas_access.abc_atlas_cache.abc_project_cache as abc_project_cache

In [3]:
# Directory where ABC Atlas data is (or will be) downloaded.
# Set the ABC_ATLAS_CACHE_DIR environment variable to use your own cache
# location without editing the notebook; when it is unset, the original
# hard-coded default below is used.
cache_dir = os.environ.get(
    "ABC_ATLAS_CACHE_DIR",
    "/Users/scott.daniel/KnowledgeEngineering/cell_type_mapper/examples/data/abc_cache"
)

# Fail fast with an actionable message before handing the path to
# AbcProjectCache.
if not os.path.isdir(cache_dir):
    raise RuntimeError(
        "Set cache_dir above to the path to the directory where you want to download ABC Atlas data"
    )

# Cache object used for every metadata / expression file lookup below.
abc_cache = abc_project_cache.AbcProjectCache.from_cache_dir(cache_dir)

In [4]:
# Ask the cache to load its latest manifest.
# NOTE(review): presumably this selects the most recent data release for all
# later lookups -- confirm against the abc_atlas_access documentation.
abc_cache.load_latest_manifest()

# Get Whole Mouse Brain taxonomy

In [5]:
# Cache directory name passed as `directory=` when loading the taxonomy tables
# (cluster_annotation_term_set, cluster_to_cluster_annotation_membership).
whole_mouse_taxonomy_dir = 'WMB-taxonomy'

In [6]:
# Cache directory name passed as `directory=` when loading the per-cell
# metadata table (cell_metadata).
whole_mouse_metadata_dir = 'WMB-10X'

In [7]:
# Load the per-cell metadata table and keep only the columns the rest of the
# notebook needs: the cell identifier, the expression file it lives in, and
# the cluster alias used as the join key into the taxonomy tables.
whole_mouse_cell_metadata = abc_cache.get_metadata_dataframe(
    directory=whole_mouse_metadata_dir,
    file_name='cell_metadata'
).loc[
    :,
    ['cell_label', 'feature_matrix_label', 'cluster_alias', 'dataset_label']
]



In [8]:
# Load the 'cluster_annotation_term_set' table from the taxonomy directory.
# Its `label` and `name` columns are used below to build a label -> name lookup.
whole_mouse_term_set_df = abc_cache.get_metadata_dataframe(
    directory=whole_mouse_taxonomy_dir,
    file_name='cluster_annotation_term_set'
)


In [9]:
# Lookup from each term set's label to its name.
whole_mouse_level_name_lookup = dict(
    zip(whole_mouse_term_set_df['label'], whole_mouse_term_set_df['name'])
)

In [10]:
# Load the 'cluster_to_cluster_annotation_membership' table from the taxonomy
# directory. The cells below filter it by `cluster_annotation_term_set_label`
# and use its `cluster_alias`, `cluster_annotation_term_name` and
# `cluster_annotation_term_label` columns.
whole_mouse_membership_df = abc_cache.get_metadata_dataframe(
    directory=whole_mouse_taxonomy_dir,
    file_name='cluster_to_cluster_annotation_membership'
)

In [11]:
# Membership rows belonging to the 'CCN20230722_NEUR' (neurotransmitter)
# term set.
neurotransmitter_df = whole_mouse_membership_df.loc[
    whole_mouse_membership_df['cluster_annotation_term_set_label'].eq('CCN20230722_NEUR')
]

In [12]:
# Give the generic annotation columns neurotransmitter-specific names so they
# do not collide with the supertype columns when both are joined onto cells.
neurotransmitter_df = neurotransmitter_df.rename(
    columns={
        'cluster_annotation_term_name': 'neurotransmitter',
        'cluster_annotation_term_label': 'neurotransmitter_label',
    }
)

In [13]:
# Keep only the join key (cluster_alias) and the two renamed columns.
neurotransmitter_df = neurotransmitter_df[['cluster_alias', 'neurotransmitter', 'neurotransmitter_label']]

In [14]:
# Membership rows belonging to the 'CCN20230722_SUPT' (supertype) term set.
supertype_df = whole_mouse_membership_df.loc[
    whole_mouse_membership_df['cluster_annotation_term_set_label'].eq('CCN20230722_SUPT')
]

In [15]:
# Give the generic annotation columns supertype-specific names so they do not
# collide with the neurotransmitter columns when both are joined onto cells.
supertype_df = supertype_df.rename(
    columns={
        'cluster_annotation_term_name': 'supertype',
        'cluster_annotation_term_label': 'supertype_label',
    }
)

In [16]:
# Keep only the join key (cluster_alias) and the two renamed columns.
supertype_df = supertype_df[['cluster_alias', 'supertype', 'supertype_label']]

In [17]:
# Display the cluster_alias -> neurotransmitter table for inspection.
neurotransmitter_df

Unnamed: 0,cluster_alias,neurotransmitter,neurotransmitter_label
21288,128,Glut,CS20230722_NEUR_Glut
21289,129,Glut,CS20230722_NEUR_Glut
21290,130,Glut,CS20230722_NEUR_Glut
21291,143,Glut,CS20230722_NEUR_Glut
21292,131,Glut,CS20230722_NEUR_Glut
...,...,...,...
26605,5279,,CS20230722_NEUR_NA
26606,5275,,CS20230722_NEUR_NA
26607,5272,,CS20230722_NEUR_NA
26608,5274,,CS20230722_NEUR_NA


In [18]:
# Number of distinct cluster_alias values in the neurotransmitter table.
len(set(neurotransmitter_df.cluster_alias))

5322

In [19]:
# Attach neurotransmitter and supertype annotations to every cell by looking
# up the cell's cluster_alias in each annotation table, then keep only the
# columns used downstream (cluster_alias itself is dropped).
#
# NOTE: this overwrites whole_mouse_cell_metadata and removes its
# cluster_alias column, so re-running this cell requires re-running the cell
# that loads whole_mouse_cell_metadata first.
whole_mouse_cell_metadata = (
    whole_mouse_cell_metadata
    .join(neurotransmitter_df.set_index('cluster_alias'), on='cluster_alias')
    .join(supertype_df.set_index('cluster_alias'), on='cluster_alias')
    .loc[
        :,
        [
            'cell_label',
            'dataset_label',
            'feature_matrix_label',
            'neurotransmitter',
            'neurotransmitter_label',
            'supertype',
            'supertype_label',
        ],
    ]
)

In [20]:
# Display the annotated per-cell table for inspection.
whole_mouse_cell_metadata

Unnamed: 0,cell_label,dataset_label,feature_matrix_label,neurotransmitter,neurotransmitter_label,supertype,supertype_label
0,GCGAGAAGTTAAGGGC-410_B05,WMB-10Xv3,WMB-10Xv3-HPF,Glut,CS20230722_NEUR_Glut,0082 L2 IT PPP-APr Glut_3,CS20230722_SUPT_0082
1,AATGGCTCAGCTCCTT-411_B06,WMB-10Xv3,WMB-10Xv3-HPF,Glut,CS20230722_NEUR_Glut,0082 L2 IT PPP-APr Glut_3,CS20230722_SUPT_0082
2,AACACACGTTGCTTGA-410_B05,WMB-10Xv3,WMB-10Xv3-HPF,Glut,CS20230722_NEUR_Glut,0082 L2 IT PPP-APr Glut_3,CS20230722_SUPT_0082
3,CACAGATAGAGGCGGA-410_A05,WMB-10Xv3,WMB-10Xv3-HPF,Glut,CS20230722_NEUR_Glut,0082 L2 IT PPP-APr Glut_3,CS20230722_SUPT_0082
4,AAAGTGAAGCATTTCG-410_B05,WMB-10Xv3,WMB-10Xv3-HPF,Glut,CS20230722_NEUR_Glut,0082 L2 IT PPP-APr Glut_3,CS20230722_SUPT_0082
...,...,...,...,...,...,...,...
4042971,GTGTGAGCAAACGCGA-1350_C05,WMB-10XMulti,WMB-10XMulti,GABA-Glyc,CS20230722_NEUR_GABA-Glyc,1074 NLL Gata3 Gly-Gaba_1,CS20230722_SUPT_1074
4042972,TTAGCAATCCCTGTTA-1350_C05,WMB-10XMulti,WMB-10XMulti,Glut,CS20230722_NEUR_Glut,0682 RN Spp1 Glut_1,CS20230722_SUPT_0682
4042973,TTTGGCTGTCGCGCAA-1350_C05,WMB-10XMulti,WMB-10XMulti,GABA-Glyc,CS20230722_NEUR_GABA-Glyc,1076 NLL Gata3 Gly-Gaba_3,CS20230722_SUPT_1076
4042974,ATCCACCTCACAGACT-1320_B04,WMB-10XMulti,WMB-10XMulti,GABA-Glyc,CS20230722_NEUR_GABA-Glyc,1076 NLL Gata3 Gly-Gaba_3,CS20230722_SUPT_1076


Check out [this visualization](https://knowledge.brain-map.org/abcatlas#AQEBQVA4Sk5ONUxZQUJHVk1HS1kxQgACUTFOQ1dXUEc2RlowRE5JWEpCUQADAQE0TVY3SEE1REcyWEpaM1VEOEc5AAIBU2VybwAABAEBAn5CWhl%2FXfheA4Ju2HOCJBFuAAUABgEBAjE1Qks0N0RDSU9GMVNMTFVXOVAAA34AAAAEAAAIRzRJNEdGSlhKQjlBVFozUFRYMQAJTFZEQkpBVzhCSTVZU1MxUVVCRwAKAAsBbm9uZQACbm9uZQADAQQBAAIjMDAwMDAwAAPIAQAFAQECIzAwMDAwMAADyAEAAAACAQA%3D)

In [21]:
# Restrict to cells whose neurotransmitter annotation is 'Sero'.
sero = whole_mouse_cell_metadata[whole_mouse_cell_metadata.neurotransmitter=='Sero']

In [22]:
# Unique (dataset_label, feature_matrix_label) pairs that contain at least one
# 'Sero' cell; each pair identifies one expression file to fetch below.
files_needed = sero[['dataset_label', 'feature_matrix_label']].drop_duplicates()

In [23]:
# Display the list of expression files that will be fetched.
files_needed

Unnamed: 0,dataset_label,feature_matrix_label
262281,WMB-10Xv3,WMB-10Xv3-P
288492,WMB-10Xv3,WMB-10Xv3-MB
288920,WMB-10Xv3,WMB-10Xv3-MY


In [24]:
# Request the path of every needed log2 expression file from the cache.
# The returned path is discarded here; the call is made so that the files are
# present locally before the expression matrix is assembled.
for dataset_label, feature_matrix_label in zip(files_needed.dataset_label,
                                               files_needed.feature_matrix_label):
    abc_cache.get_file_path(
        directory=dataset_label,
        file_name=f"{feature_matrix_label}/log2"
    )

**Note:** the cell above downloads ~25 GB of data.

In [25]:
# Distinct feature matrices that contain 'Sero' cells.
set(sero.feature_matrix_label)

{'WMB-10Xv3-MB', 'WMB-10Xv3-MY', 'WMB-10Xv3-P'}

In [26]:
# Load the 'gene' metadata table and index it by gene_identifier.
# This index is later compared against each expression file's var index, and
# the frame becomes the `var` of the assembled AnnData object.
gene_metadata = abc_cache.get_metadata_dataframe(
    directory='WMB-10X',
    file_name='gene'
).set_index('gene_identifier')

In [27]:
# Assemble a single in-memory AnnData object holding the expression of every
# 'Sero' cell, pulling rows out of the per-region log2 files listed in
# files_needed.

# Dense (n_cells, n_genes) array; rows follow the order of `sero`, columns
# follow the order of `gene_metadata`.
expression_data = np.zeros(
    (len(sero), len(gene_metadata)),
    dtype=float
)

# Destination row in expression_data for each cell label.
dst_cell_to_idx = {
    cell: ii for ii, cell in enumerate(sero.cell_label.values)
}

for dataset_label, feature_matrix_label in zip(files_needed.dataset_label,
                                               files_needed.feature_matrix_label):
    file_name = f'{feature_matrix_label}/log2'
    print(f'reading {dataset_label}/{file_name}')
    src_path = abc_cache.get_file_path(
        directory=dataset_label,
        file_name=file_name
    )
    # backed='r' opens the file read-only without loading X into memory.
    src = anndata.read_h5ad(src_path, backed='r')
    try:
        # Columns are copied positionally below, so the file's gene order
        # must match gene_metadata exactly.
        np.testing.assert_array_equal(
            src.var.index.values,
            gene_metadata.index.values
        )

        # Row position of each cell label within this source file.
        src_cell_to_idx = {
            cell: ii for ii, cell in enumerate(src.obs.index.values)
        }

        # 'Sero' cells that live in this particular file.
        subset = sero[
            np.logical_and(
                sero.dataset_label==dataset_label,
                sero.feature_matrix_label==feature_matrix_label
            )
        ]

        src_idx = np.array([src_cell_to_idx[cell] for cell in subset.cell_label])
        dst_idx = np.array([dst_cell_to_idx[cell] for cell in subset.cell_label])

        # Read only the needed rows from disk and densify them.
        expression_data[dst_idx, :] = src[src_idx, :].X.toarray()
    finally:
        # Always release the backed file handle, even if the gene-order check
        # or a cell lookup above raises.
        src.file.close()
        del src

sero_adata = anndata.AnnData(
    obs=sero.set_index('cell_label'),
    var=gene_metadata,
    X=expression_data
)

# Drop the extra reference to the dense array; sero_adata holds the data now.
del expression_data


reading WMB-10Xv3/WMB-10Xv3-P/log2
reading WMB-10Xv3/WMB-10Xv3-MB/log2
reading WMB-10Xv3/WMB-10Xv3-MY/log2


In [28]:
# Sorted list of the distinct supertypes present among the 'Sero' cells.
sorted(set(sero_adata.obs.supertype))

['0888 MB-MY Tph2 Glut-Sero_1',
 '0889 MB-MY Tph2 Glut-Sero_2',
 '0890 MB-MY Tph2 Glut-Sero_3',
 '0891 MB-MY Tph2 Glut-Sero_4',
 '0892 MB-MY Tph2 Glut-Sero_5',
 '0893 MB-MY Tph2 Glut-Sero_6',
 '0894 MB-MY Tph2 Glut-Sero_7']

In [29]:
# Preview the per-cell boolean mask selecting supertype '0893 MB-MY Tph2 Glut-Sero_6'.
(sero_adata.obs.supertype.values == '0893 MB-MY Tph2 Glut-Sero_6')

array([False, False, False, ..., False, False, False], shape=(1469,))

In [30]:
# Boolean mask over cells, and the corresponding AnnData subset, for
# supertype '0893 MB-MY Tph2 Glut-Sero_6'.
mask_0893 = (sero_adata.obs.supertype.values == '0893 MB-MY Tph2 Glut-Sero_6')
subset_0893 = sero_adata[mask_0893, :]
    

In [31]:
# Boolean mask over cells, and the corresponding AnnData subset, for
# supertype '0894 MB-MY Tph2 Glut-Sero_7'.
mask_0894 = (sero_adata.obs.supertype.values == '0894 MB-MY Tph2 Glut-Sero_7')
subset_0894 = sero_adata[mask_0894, :]

In [32]:
# Every 'Sero' cell that belongs to neither supertype 0893 nor supertype 0894.
mask_other = ~(mask_0893 | mask_0894)

subset_others = sero_adata[mask_other, :]

In [33]:
# Number of cells in each group, one per line: 0893, 0894, all others.
print(len(subset_0893), len(subset_0894), len(subset_others), sep='\n')

217
44
1208


In [34]:
import scipy.stats

In [35]:
# Per-gene mean expression within each group (mean over cells, axis=0).
mu_0893 = subset_0893.X.mean(axis=0)
mu_0894 = subset_0894.X.mean(axis=0)
mu_others = subset_others.X.mean(axis=0)

In [36]:
# Comparison 1: supertype 0893 versus all other 'Sero' cells (excluding 0894).
# delta is the per-gene difference in mean expression; wilcoxon holds the
# per-gene rank-sum test result (computed along axis=0, i.e. across cells).
delta = mu_0893-mu_others
wilcoxon = scipy.stats.ranksums(subset_0893.X, subset_others.X, axis=0)

In [37]:
# Candidate marker genes: rank-sum p-value below 0.01 AND mean expression
# higher in the first group by more than 5.0.
valid = (wilcoxon.pvalue < 0.01) & (delta > 5.0)

In [38]:
# Mean differences and gene symbols of the genes that passed the filter.
# Gene symbols come from the shared `var` table, so any subset's var would do.
valid_delta = delta[valid]
valid_genes = subset_others.var.gene_symbol.values[valid]

In [39]:
# Gene with the largest mean difference among those that passed the filter.
# NOTE: np.argmax raises ValueError if no gene passed (empty valid_delta).
valid_genes[np.argmax(valid_delta)]

'Zeb2'

In [40]:
# Comparison 2: supertype 0894 versus supertype 0893.
# NOTE: this overwrites `delta` and `wilcoxon` from the previous comparison.
delta = mu_0894-mu_0893
wilcoxon = scipy.stats.ranksums(subset_0894.X, subset_0893.X, axis=0)

In [41]:
# Candidate marker genes: rank-sum p-value below 0.01 AND mean expression
# higher in the first group by more than 5.0.
valid = (wilcoxon.pvalue < 0.01) & (delta > 5.0)

In [42]:
# Mean differences and gene symbols of the genes that passed the filter.
# NOTE: this overwrites `valid_delta` and `valid_genes` from the previous comparison.
valid_delta = delta[valid]
valid_genes = subset_0894.var.gene_symbol.values[valid]

In [43]:
# Gene with the largest mean difference among those that passed the filter.
# NOTE: np.argmax raises ValueError if no gene passed (empty valid_delta).
valid_genes[np.argmax(valid_delta)]

'C1ql3'