In [6]:
import numpy as np
import pandas as pd
import scanpy as sc
import pymn
import anndata as ad
import time
import os
from pyprojroot import here
import resource
from scipy.stats import chi2_contingency

In [7]:
# Root folder of the input data. Defaults to the original server location; set the
# MOUSE_BRAIN_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell. The default keeps its trailing "/" so every path built
# from it by string concatenation is unchanged.
base_data_folder = os.environ.get(
    "MOUSE_BRAIN_DATA_DIR",
    "/vault/lfrench/mouse_brain_cluster_replicability/data/",
)
# Results are written under <project root>/results; the root is located by pyprojroot's here().
base_results_folder = os.path.join(here(), "results")

In [11]:
# Open the Macosko single-nucleus atlas in backed read mode ("r").
macosko_h5ad_path = (
    base_data_folder
    + "/whole_mouse_brain/macosko/from_google_drive/Macosko_Mouse_Atlas_Single_Nuclei.Use_Backed.h5ad"
)
adata_macosko = sc.read_h5ad(macosko_h5ad_path, backed="r")

In [10]:
# Library-level metadata table (tab-separated; read_table uses a tab delimiter by default).
macosko_metadata_path = base_data_folder + "/whole_mouse_brain/macosko/from_google_drive/Library_Metadata.tsv"
macosko_meta_data = pd.read_table(macosko_metadata_path)

In [12]:
# Keep only the library identifier and its two anatomical annotation columns.
metadata_columns = ["library", "region", "brain_struct"]
macosko_meta_data = macosko_meta_data.loc[:, metadata_columns]

In [13]:
# Rename "library" to "derived_cell_libs" — presumably the name of the matching
# column in adata_macosko.obs that the merge in the next cell joins on (confirm).
macosko_meta_data = macosko_meta_data.rename({"library": "derived_cell_libs"}, axis="columns")

In [15]:
# Attach the region / brain_struct annotations to every cell.
# DataFrame.merge on columns ignores the left index and returns a fresh 0..n-1
# index, so the original obs index would be lost on assignment. A left join that
# is validated as many-to-one yields exactly one output row per input row, in the
# original order, which makes it safe to put the original index back.
macosko_obs = adata_macosko.obs
macosko_obs_annotated = macosko_obs.merge(
    macosko_meta_data,
    how="left",  # keep every cell, even if its library is absent from the metadata
    validate="many_to_one",  # raise if a join key is duplicated in the metadata table
)
macosko_obs_annotated.index = macosko_obs.index
adata_macosko.obs = macosko_obs_annotated

In [16]:
# Number of cells per brain_struct value after the metadata merge.
adata_macosko.obs["brain_struct"].value_counts()

brain_struct
Isocortex    879639
MB           696170
CB           600320
MY           465753
TH           427206
HPF          372153
PAL          254759
P            251060
OLF          179563
HY           109517
CTXsp         89065
STR           82091
Name: count, dtype: int64

In [17]:
# Number of cells per region value after the metadata merge.
adata_macosko.obs["region"].value_counts()

region
MB      696170
BS      674619
CB      566290
TH      427206
ENT     192866
OLF     179563
MOp     164819
CTX     159201
HPF     151474
RSP     146733
AUD     129453
ACA     119307
HY      109517
BNST    106826
AMY      89065
PALm     72031
VISP     62088
VIS      49556
S1       48482
STRd     45812
NTS      42194
PALv     41116
LSX      36279
PALd     34786
DCN      34030
SUB      27813
Name: count, dtype: int64

In [18]:
# Work on the per-cell annotation table of the Macosko dataset.
# NOTE(review): no copy is made here, so if `.obs` hands back the underlying frame,
# the "region_remap" column added in the following cells is also written into
# adata_macosko.obs — confirm that is intended.
Macosko_regions = adata_macosko.obs

In [19]:
# Start the harmonised region label as a plain copy of the original "region" column.
Macosko_regions["region_remap"] = Macosko_regions["region"].copy()

# Cells per region, largest first, as a two-column table (region_remap, cell_count).
regions_for_annotation = (
    Macosko_regions.groupby("region_remap")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="cell_count")
)

In [20]:
# Fewer manual mappings are needed for Macosko because its region labels are sparser.
# Collapse the PALd / PALm / PALv labels into a single "PAL" label.
for pal_sublabel in ("PALd", "PALm", "PALv"):
    Macosko_regions["region_remap"] = Macosko_regions["region_remap"].str.replace(pal_sublabel, "PAL")

In [21]:
# Count cells for every (region, cluster) pair.
# observed=False is spelled out because relying on the implicit default is
# deprecated in pandas (it emits a FutureWarning and the default is changing to True).
# Passing it explicitly keeps the zero-count rows for unobserved categorical
# combinations that the resulting table currently contains.
Macosko_regions = Macosko_regions.groupby(['region_remap', 'ClusterNm'], observed=False).size()

  Macosko_regions = Macosko_regions.groupby(['region_remap', 'ClusterNm']).size()


In [22]:
# Turn the (region_remap, ClusterNm) index levels back into ordinary columns; the
# unnamed counts land in a column labelled 0, which the next cell renames.
Macosko_regions = Macosko_regions.reset_index()

In [23]:
# Give the count column (labelled 0 after reset_index) a dataset-specific name.
Macosko_regions = Macosko_regions.rename({0: "cell_count_Macosko"}, axis="columns")

In [24]:
# Preview the Macosko cell-count table (one row per region_remap / ClusterNm pair).
Macosko_regions

Unnamed: 0,region_remap,ClusterNm,cell_count_Macosko
0,ACA,Astro_Agt_Sntg1,6
1,ACA,Astro_Emid1_Cd38,0
2,ACA,Astro_Emid1_Gdf10,0
3,ACA,Astro_Ephb1_Fzd2,12215
4,ACA,Astro_Ephb1_Gfap,402
...,...,...,...
120715,VISP,Ser_Fev_Wfdc12,0
120716,VISP,Ser_Nkx6-1_Trh,0
120717,VISP,Tanycyte_Rax_Ccdc170,0
120718,VISP,Tanycyte_Rax_Fndc3c1_1,0


In [25]:
############
### Zeng
############

In [32]:
# Open the merged Zeng dataset (AIT21.0, with multiome) in backed read mode ('r').
zeng_h5ad_path = (
    base_data_folder
    + "/whole_mouse_brain/processed/zeng/subsets/AIT21.0.merged.with_multiome.h5ad"
)
adata_Zeng = sc.read_h5ad(zeng_h5ad_path, backed='r')

In [33]:
# Work on the per-cell annotation table of the Zeng dataset.
# NOTE(review): no copy is made here, so if `.obs` hands back the underlying frame,
# the "region_remap" column added in the following cells is also written into
# adata_Zeng.obs — confirm that is intended.
Zeng_regions = adata_Zeng.obs

In [34]:
# Start the harmonised region label as a plain copy of the original "roi" column.
Zeng_regions["region_remap"] = Zeng_regions["roi"].copy()

In [35]:
# Manual mapping to line up the Zeng ROI names with the Macosko regions as best as possible.
# Each rule is (regex, replacement); rules are applied in the order listed.
# The PAR-POST-PRE-SUB-ProS rule was previously applied twice; once is sufficient
# because the replacement no longer matches the pattern.
zeng_roi_rules = [
    ("^Mouse Multiome ", ""),
    ("^SSp$", "S1"),
    ("^STR - STRd$", "STRd"),
    ("^PAR-POST-PRE-SUB-ProS$", "SUB"),
    ("^HY LZ$", "HY"),
    ("^CNU - PAL$", "PAL"),
    ("^VISp$", "VISP"),
    ("^VISl$", "VIS"),
    ("^VISa$", "VIS"),
    ("^VISm$", "VIS"),
    ("^VISpos$", "VIS"),
    ("^STR - LSX$", "LSX"),
]
for roi_pattern, region_label in zeng_roi_rules:
    Zeng_regions["region_remap"] = Zeng_regions["region_remap"].str.replace(roi_pattern, region_label, regex=True)


In [36]:
# Split each label on " - " into one column per piece; the first piece (column 0)
# is what the following cells inspect and keep.
split_values = Zeng_regions["region_remap"].str.split(pat=" - ", expand=True)

In [37]:
# Distinct values of the first piece of each split label.
set(split_values[0])

{'ACA',
 'AI-CLA',
 'AId-AIv',
 'AId-AIv-AIp',
 'AUD',
 'AUD-TEa-PERI-ECT',
 'CB',
 'CTXsp',
 'ENT',
 'HB',
 'HIP',
 'HY',
 'LSX',
 'MB',
 'MB-PONS',
 'MO-FRP',
 'MOp',
 'MOs-FRP',
 'MY',
 'OLF',
 'PAL',
 'PL-ILA-ORB',
 'PONS',
 'PTLp',
 'RSP',
 'S1',
 'SS-GU-VISC',
 'SSs-GU-VISC-AIp',
 'STR',
 'STRd',
 'SUB',
 'TEa-PERI-ECT',
 'TH',
 'VIS',
 'VIS-PTLp',
 'VISP'}

In [38]:
# Shorten every label to the part before the first " - " (first column of the split).
Zeng_regions["region_remap"] = split_values.iloc[:, 0]

In [39]:
# Applied after shortening: rename the remaining Zeng labels to the Macosko names.
# Insertion order of the dict is the order in which the rules run.
post_split_rules = {
    "^PONS$": "BS",
    "^HIP$": "HPF",  # excludes subiculum
    "^HB$": "BS",
    "^MY$": "BS",
}
for label_pattern, region_label in post_split_rules.items():
    Zeng_regions["region_remap"] = Zeng_regions["region_remap"].str.replace(label_pattern, region_label, regex=True)


In [40]:
# Zeng region labels that do not occur among the Macosko region labels.
set(Zeng_regions["region_remap"]).difference(Macosko_regions["region_remap"])

{'AI-CLA',
 'AId-AIv',
 'AId-AIv-AIp',
 'AUD-TEa-PERI-ECT',
 'CTXsp',
 'MB-PONS',
 'MO-FRP',
 'MOs-FRP',
 'PL-ILA-ORB',
 'PTLp',
 'SS-GU-VISC',
 'SSs-GU-VISC-AIp',
 'STR',
 'TEa-PERI-ECT',
 'VIS-PTLp'}

In [41]:
# Region labels that occur in both datasets after remapping.
zeng_region_labels = set(Zeng_regions["region_remap"])
macosko_region_labels = set(Macosko_regions["region_remap"])
shared_regions = zeng_region_labels & macosko_region_labels

In [42]:
# Show the region labels common to both datasets.
shared_regions

{'ACA',
 'AUD',
 'BS',
 'CB',
 'ENT',
 'HPF',
 'HY',
 'LSX',
 'MB',
 'MOp',
 'OLF',
 'PAL',
 'RSP',
 'S1',
 'STRd',
 'SUB',
 'TH',
 'VIS',
 'VISP'}

In [43]:
# Number of region labels common to both datasets.
len(shared_regions)

19

In [44]:
# Macosko region labels that do not occur among the Zeng region labels.
set(Macosko_regions["region_remap"]).difference(Zeng_regions["region_remap"])

{'AMY', 'BNST', 'CTX', 'DCN', 'NTS'}

In [45]:
# Zeng region labels that do not occur among the Macosko region labels
# (same expression as the check a few cells earlier).
set(Zeng_regions["region_remap"]).difference(Macosko_regions["region_remap"])

{'AI-CLA',
 'AId-AIv',
 'AId-AIv-AIp',
 'AUD-TEa-PERI-ECT',
 'CTXsp',
 'MB-PONS',
 'MO-FRP',
 'MOs-FRP',
 'PL-ILA-ORB',
 'PTLp',
 'SS-GU-VISC',
 'SSs-GU-VISC-AIp',
 'STR',
 'TEa-PERI-ECT',
 'VIS-PTLp'}

In [46]:
# Sanity check: total number of cells summed over all (region, cluster) groups.
# observed=False is passed explicitly because relying on the implicit default is
# deprecated in pandas and emits a FutureWarning; the sum itself is unaffected.
Zeng_regions.groupby(['region_remap', 'cl'], observed=False).size().sum()

  Zeng_regions.groupby(['region_remap', 'cl']).size().sum()


4044536

In [47]:
# Count cells per region and cluster.
# observed=False is spelled out because relying on the implicit default is
# deprecated in pandas (FutureWarning; the default is changing to True). Passing it
# explicitly preserves the current result, in which every level of a categorical
# grouping column is reported, including combinations with zero cells.
Zeng_regions = Zeng_regions.groupby(['region_remap', 'cl'], observed=False).size()

  Zeng_regions = Zeng_regions.groupby(['region_remap', 'cl']).size()


In [48]:
# Turn the (region_remap, cl) index levels back into ordinary columns; the
# unnamed counts land in a column labelled 0, which the next cell renames.
Zeng_regions = Zeng_regions.reset_index()

In [49]:
# Give the count column (labelled 0 after reset_index) a dataset-specific name.
Zeng_regions = Zeng_regions.rename({0: "cell_count_Zeng"}, axis="columns")

In [50]:
# Keep unfiltered copies of both count tables before restricting them.
Macosko_regions_all = Macosko_regions.copy()
Zeng_regions_all = Zeng_regions.copy()

# Restrict each table to the regions present in both datasets.
macosko_in_shared = Macosko_regions["region_remap"].isin(shared_regions)
zeng_in_shared = Zeng_regions["region_remap"].isin(shared_regions)
Macosko_regions = Macosko_regions.loc[macosko_in_shared]
Zeng_regions = Zeng_regions.loc[zeng_in_shared]



In [51]:
# Make sure the output folder for the region comparison exists (no error if it already does).
region_compare_dir = base_results_folder + "/region_profile_compare/"
os.makedirs(region_compare_dir, exist_ok=True)

In [53]:
# Write both shared-region count tables as CSV files, without the row index.
for counts_table, csv_name in (
    (Zeng_regions, "Zeng_region_counts.csv"),
    (Macosko_regions, "Macosko_region_counts.csv"),
):
    counts_table.to_csv(base_results_folder + "/region_profile_compare/" + csv_name, index=False)

In [54]:
# Show the full path of the Zeng counts file written in the previous cell.
base_results_folder + "/region_profile_compare/Zeng_region_counts.csv"

'/vault/lfrench/mouse_brain_cluster_replicability/results//region_profile_compare/Zeng_region_counts.csv'