In [None]:
import csv
import gzip as gz
import json
import os
import pickle
import re
import time

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

import sys
sys.path.insert(1, '/home/jw3514/Work/CellType_Psy/src/')
from CellType_PSY import *

os.chdir("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/")
print(f"Current working directory: {os.getcwd()}")



In [None]:
CellTypesDF = pd.read_csv("dat/CellTypeHierarchy.csv")

# Group cluster labels under the class and subclass label found on the same row.
Class2Cluster = {}
Subclass2Cluster = {}
for i, row in CellTypesDF.iterrows():
    # Rows are unpacked by position, so the table must have exactly four
    # columns; the order is assumed to be cluster, class, subclass, supertype.
    _cluster, _class, _subclass, _supertype = row
    Class2Cluster.setdefault(_class, []).append(_cluster)
    Subclass2Cluster.setdefault(_subclass, []).append(_cluster)

# Per-cluster annotation sheet, indexed by the "cluster_id_label" column.
ClusterAnn = pd.read_excel(
    "/home/jw3514/Work/data/Allen_Mouse_Brain_Cell_Atlas/SuppTables/41586_2023_6812_MOESM8_ESM.xlsx",
    sheet_name="cluster_annotation",
    index_col="cluster_id_label",
)


## Test Split Cluster

In [None]:
cell_extended = pd.read_csv("dat/cell_metadata_with_cluster_annotation.csv")

In [None]:
# Gene-ID mapping consumed by ExpMatConvertEntrez: each key maps to an iterable
# of replacement IDs (presumably mouse Ensembl ID -> human Entrez IDs, going by
# the file name). Opened with `with` so the file handle is closed after loading.
with open("dat/ENSMUSG2HumanEntrez.json", 'r') as _mapping_file:
    ENSMUSG2HumanEntrez = json.load(_mapping_file)
def ExpMatConvertEntrez(CombineMat, ENSMUSG2HumanEntrez):
    """Re-key the rows of an expression matrix through an ID mapping.

    Parameters
    ----------
    CombineMat : pandas.DataFrame
        Matrix whose index labels are looked up in ``ENSMUSG2HumanEntrez``.
    ENSMUSG2HumanEntrez : mapping
        Maps an index label to an iterable of replacement IDs.

    Returns
    -------
    pandas.DataFrame
        One row per (original row, mapped ID) pair, indexed by the mapped ID
        and carrying the original columns. Rows whose label is not in the
        mapping are dropped; when several rows end up under the same mapped
        ID only the first one (in input order) is kept.
    """
    mapped_ids = []
    mapped_rows = []
    for source_id, expression in CombineMat.iterrows():
        if source_id not in ENSMUSG2HumanEntrez:
            continue
        # A source gene may map to several targets: repeat its row for each.
        for target_id in ENSMUSG2HumanEntrez[source_id]:
            mapped_ids.append(target_id)
            mapped_rows.append(expression)

    converted = pd.DataFrame(data=mapped_rows, index=mapped_ids, columns=CombineMat.columns.values)
    is_repeat = converted.index.duplicated(keep='first')
    return converted[~is_repeat]

In [None]:
# Distinct feature-matrix labels, split by whether the label text contains
# 'v3' or 'v2' (a plain substring test).
feature_matrix_label = cell_extended["feature_matrix_label"].unique()
feature_matrix_label_v3 = list(filter(lambda label: 'v3' in label, feature_matrix_label))
feature_matrix_label_v2 = list(filter(lambda label: 'v2' in label, feature_matrix_label))

In [None]:
print(feature_matrix_label_v2)
print(feature_matrix_label_v3)

In [None]:
# Open every 'v3' feature matrix in backed (read-only, on-disk) mode and keep
# the handles in a dict keyed by feature-matrix label.
#RAW_EXP_DIR = "/home/jw3514/Work/data/Allen_Mouse_Brain_Cell_Atlas/abc_download_root/expression_matrices/WMB-10Xv3/20230630/"
RAW_EXP_DIR = "/mnt/data0/AllenMouseSC/abc_download_root/expression_matrices/WMB-10Xv3/20230630"
feature2Fil_10xV3 = {}
for feature in feature_matrix_label_v3:
    # RAW_EXP_DIR has no trailing slash, so plain string concatenation would
    # glue the file name onto the directory name; os.path.join adds the separator.
    Adat = anndata.read_h5ad(os.path.join(RAW_EXP_DIR, "{}-log2.h5ad".format(feature)), backed='r')
    feature2Fil_10xV3[feature] = Adat

In [None]:
# Open every 'v2' feature matrix in backed (read-only, on-disk) mode and keep
# the handles in a dict keyed by feature-matrix label.
# NOTE: this rebinds RAW_EXP_DIR, which until here pointed at the 10Xv3 directory.
RAW_EXP_DIR = "/mnt/data0/AllenMouseSC/abc_download_root/expression_matrices/WMB-10Xv2/20230630"
feature2Fil_10xV2 = {}
for feature in feature_matrix_label_v2:
    # RAW_EXP_DIR has no trailing slash, so plain string concatenation would
    # glue the file name onto the directory name; os.path.join adds the separator.
    Adat = anndata.read_h5ad(os.path.join(RAW_EXP_DIR, "{}-log2.h5ad".format(feature)), backed='r')
    feature2Fil_10xV2[feature] = Adat

In [None]:
#### Test on one cluster

In [None]:
# Example cluster: select its metadata rows and count them per feature matrix.
cluster = "2691 RE-Xi Nox4 Glut_1"
is_target_cluster = cell_extended["cluster"] == cluster
cluster_meta = cell_extended[is_target_cluster]
tmp_value_counts = cluster_meta["feature_matrix_label"].value_counts()

In [None]:
tmp_value_counts

In [None]:
CT_V3_features = [x for x in tmp_value_counts.index.values if "v3" in x]

In [None]:
CT_V3_features

In [None]:
Cluster_Barcodes = cell_extended[cell_extended["cluster"]==cluster]["cell_barcode"].values
len(Cluster_Barcodes)

In [None]:
# From each 'v3' matrix listed for this cluster, take the cells whose barcode
# is in Cluster_Barcodes, flip the slice so rows come from the matrix's
# variables, and re-key those rows through ExpMatConvertEntrez.
Cluster_GeneXCell_DFs = []
for feature in CT_V3_features:
    print(feature)
    feature_h5d = feature2Fil_10xV3[feature]

    in_cluster = feature_h5d.obs["cell_barcode"].isin(Cluster_Barcodes)
    submat_cluster = ExpMatConvertEntrez(
        feature_h5d[in_cluster].to_df().transpose(),
        ENSMUSG2HumanEntrez,
    )
    Cluster_GeneXCell_DFs.append(submat_cluster)

In [None]:
concatenated_df = pd.concat(Cluster_GeneXCell_DFs, axis=1)

In [None]:
concatenated_df

In [None]:
cluster_clean_name = re.sub(r'\W+', '_', cluster)

In [None]:
cluster_clean_name

In [None]:
SaveDIR = "/home/jw3514/Work/data/Allen_Mouse_Brain_Cell_Atlas/Cluster_GeneXCell/"

In [None]:
concatenated_df.to_csv("{}/{}.GeneXCell.csv".format(SaveDIR, cluster_clean_name))

In [None]:
def ExtractGeneXcellForCluster(cluster, cell_extended=cell_extended):
    """Write the gene x cell table of one cluster to ``SaveDIR``.

    For every feature matrix of the cluster whose label contains "v3", the
    cluster's cells are sliced out of the corresponding handle in
    ``feature2Fil_10xV3``, transposed, re-keyed with ``ExpMatConvertEntrez``
    and concatenated column-wise. The result is saved as
    ``<SaveDIR>/<cluster name with non-word runs replaced by '_'>.GeneXCell.csv``.

    Parameters
    ----------
    cluster : str
        Value to match in the "cluster" column of ``cell_extended``.
    cell_extended : pandas.DataFrame
        Cell metadata with "cluster", "feature_matrix_label" and
        "cell_barcode" columns. The default is bound when the function is
        defined.

    Returns
    -------
    None
        Nothing is written when the cluster has no cells in a "v3" matrix.
    """
    cluster_meta = cell_extended[cell_extended["cluster"] == cluster]
    print("CT: {} Ncells: {}".format(cluster, len(cluster_meta)))

    # value_counts keeps the matrices ordered by how many cells they contribute.
    feature_counts = cluster_meta["feature_matrix_label"].value_counts()
    CT_V3_features = [x for x in feature_counts.index.values if "v3" in x]
    if not CT_V3_features:
        # pd.concat([]) would raise "No objects to concatenate"; skip instead.
        print("CT: {} has no cells in a v3 feature matrix; nothing written".format(cluster))
        return

    Cluster_GeneXCell_DFs = []
    for feature in CT_V3_features:
        feature_h5d = feature2Fil_10xV3[feature]
        # Only match barcodes of cells that the metadata assigns to THIS
        # matrix: the same barcode string can occur in another matrix, and
        # matching on the whole cluster's barcodes could pull in such a cell.
        # NOTE(review): barcodes may also repeat within one matrix; matching
        # on a unique cell id would be stricter if the metadata has one.
        in_this_matrix = cluster_meta["feature_matrix_label"] == feature
        feature_barcodes = cluster_meta.loc[in_this_matrix, "cell_barcode"].values
        submat_cluster = feature_h5d[feature_h5d.obs["cell_barcode"].isin(feature_barcodes)]
        submat_cluster = submat_cluster.to_df().transpose()
        submat_cluster = ExpMatConvertEntrez(submat_cluster, ENSMUSG2HumanEntrez)
        Cluster_GeneXCell_DFs.append(submat_cluster)

    cluster_clean_name = re.sub(r'\W+', '_', cluster)
    concatenated_df = pd.concat(Cluster_GeneXCell_DFs, axis=1)
    concatenated_df.to_csv(os.path.join(SaveDIR, "{}.GeneXCell.csv".format(cluster_clean_name)))

In [None]:
cluster = "2691 RE-Xi Nox4 Glut_1"
ExtractGeneXcellForCluster(cluster, cell_extended=cell_extended, )

In [None]:
for cluster in ClusterAnn.index.values:
    print(cluster)

In [None]:
100 * 5000 / 1000

## Aggregate to CT LogUMI

In [None]:
DatDIR = "/home/jw3514/Work/data/Allen_Mouse_Brain_Cell_Atlas/Cluster_GeneXCell_UMI"

In [None]:
test_genexcell = pd.read_csv("{}/0001_CLA_EPd_CTX_Car3_Glut_1.GeneXCell.csv".format(DatDIR), index_col=0)

In [None]:
test_genexcell_log = np.log2(test_genexcell+1)

In [None]:
Gene_Cluster_Mean = test_genexcell_log.mean(axis=1)

In [None]:
Total_UMI = test_genexcell_log.sum(axis=0)

In [None]:
plt.hist(Total_UMI)

In [None]:
def processCluster(cluster, DatDIR):
    """Summarise one cluster's gene x cell table on the log2(x + 1) scale.

    Reads ``<DatDIR>/<name>.GeneXCell.csv`` (first column used as the index),
    where ``<name>`` is ``cluster`` with every run of non-word characters
    replaced by ``_``.

    Returns
    -------
    tuple of pandas.Series
        ``(row means, row sums, column sums)`` of the log2(x + 1) table.
    """
    file_stem = re.sub(r'\W+', '_', cluster)
    table = pd.read_csv("{}/{}.GeneXCell.csv".format(DatDIR, file_stem), index_col=0)
    log_table = np.log2(table + 1)
    return (
        log_table.mean(axis=1),  # per row (gene)
        log_table.sum(axis=1),   # per row (gene)
        log_table.sum(axis=0),   # per column (cell)
    )

In [None]:
clusters = ClusterAnn.head(2).index.values

In [None]:
results = [processCluster(cluster, DatDIR) for cluster in clusters]

In [None]:
def CollectRes(clusters, results,
               out_dir="/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats"):
    """Combine per-cluster summaries and write three files into ``out_dir``.

    Parameters
    ----------
    clusters : iterable of str
        Cluster names, in the same order as ``results``.
    results : sequence of tuple
        One ``(gene_mean, gene_total, cell_depth)`` triple of pandas Series
        per cluster, as returned by ``processCluster``.
    out_dir : str, optional
        Existing directory that receives the output files.

    Writes
    ------
    cluster_MeanLogUMI.csv
        Gene x cluster matrix of the per-cluster gene means.
    clusters_CellDepth.npy
        Object array holding one per-cell depth array per cluster
        (load it with ``allow_pickle=True``).
    ABC_LogUMI.Match.10xV3.csv
        Per gene: summed totals divided by the total number of cells
        ("PerCellExp"), its ascending rank ("Rank", starting at 1) and
        ``Rank / number of genes`` ("quantile").

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    if len(results) == 0:
        raise ValueError("CollectRes needs at least one cluster result")

    Indv_cluster_means = []
    Indv_cluster_total_UMIs = []
    gene_totals = []
    Total_N_Cells = 0  # counted here rather than read from a notebook global
    for cluster, res in zip(clusters, results):
        gene_mean_logUMI, gene_total_logUMI, cell_depth = res[0], res[1], res[2]
        # rename() returns a copy, so the caller's Series keeps its own name.
        Indv_cluster_means.append(gene_mean_logUMI.rename(cluster))
        gene_totals.append(gene_total_logUMI)
        Indv_cluster_total_UMIs.append(cell_depth.values)
        Total_N_Cells += len(cell_depth)  # one depth entry per cell

    # Make and save cluster Exp Mat
    Cluster_Exp_DF = pd.concat(Indv_cluster_means, axis=1)
    Cluster_Exp_DF.to_csv(os.path.join(out_dir, "cluster_MeanLogUMI.csv"))

    # Save cell depth for each cluster. Clusters differ in cell count, so the
    # arrays are stored in an explicit object array (a ragged list cannot be
    # converted implicitly on recent numpy).
    depth_array = np.empty(len(Indv_cluster_total_UMIs), dtype=object)
    for position, depth in enumerate(Indv_cluster_total_UMIs):
        depth_array[position] = depth
    with open(os.path.join(out_dir, "clusters_CellDepth.npy"), 'wb') as f:
        np.save(f, depth_array)

    # Calculate expression-matching quantiles. Totals are summed as labelled
    # Series so genes are aligned by index rather than by position.
    Gene_Total_Exp = pd.concat(gene_totals, axis=1).sum(axis=1) / Total_N_Cells
    Gene_Total_Exp.name = "PerCellExp"
    Gene_Total_Exp_DF = Gene_Total_Exp.to_frame().sort_values("PerCellExp")
    n_genes = Gene_Total_Exp_DF.shape[0]
    Gene_Total_Exp_DF["Rank"] = np.arange(1, n_genes + 1)  # compute Rank
    Gene_Total_Exp_DF["quantile"] = Gene_Total_Exp_DF["Rank"] / n_genes
    Gene_Total_Exp_DF.to_csv(os.path.join(out_dir, "ABC_LogUMI.Match.10xV3.csv"))

In [None]:
CollectRes(clusters, results)

#### Process some large cell types (CTs)

In [None]:
def processCluster2(cluster, DatDIR):
    """Save the per-row mean of log2(x + 1) for one cluster's table.

    Reads ``<DatDIR>/<name>.GeneXCell.csv`` (first column used as the index)
    and writes the row means to ``<name>.csv`` in the SplitCTs directory,
    where ``<name>`` is ``cluster`` with every run of non-word characters
    replaced by ``_``. Prints a progress line before reading and before
    computing. Returns None.
    """
    file_stem = re.sub(r'\W+', '_', cluster)
    source_csv = "{}/{}.GeneXCell.csv".format(DatDIR, file_stem)
    target_csv = "/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/SplitCTs/{}.csv".format(file_stem)

    print("Reading file")
    table = pd.read_csv(source_csv, index_col=0)
    print("calculating mean")
    row_means = np.log2(table + 1).mean(axis=1)
    row_means.to_csv(target_csv)
    return None

In [None]:
def processClusterV2(cluster, DatDIR,
                     out_dir="/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/SplitCTs"):
    """Streaming variant of ``processCluster2`` for tables too large to load.

    Reads ``<DatDIR>/<name>.GeneXCell.csv`` one row at a time with the
    ``csv`` module, so only a single row is held in memory, and writes the
    per-row mean of log2(x + 1) to ``<out_dir>/<name>.csv``. ``<name>`` is
    ``cluster`` with every run of non-word characters replaced by ``_``.

    Parameters
    ----------
    cluster : str
        Cluster name used to derive the file name.
    DatDIR : str
        Directory containing the input table.
    out_dir : str, optional
        Existing directory that receives the output file.

    Returns
    -------
    None
    """
    cluster_clean_name = re.sub(r'\W+', '_', cluster)
    gene_index = []
    gene_means = []
    print("Reading file")
    with open("{}/{}.GeneXCell.csv".format(DatDIR, cluster_clean_name), newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row holding the column labels
        print("calculating mean")
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            # First field is the row label, the rest are the values.
            gene_index.append(row[0])
            values = np.array(row[1:], dtype=float)
            gene_means.append(np.log2(values + 1).mean())
    Gene_Cluster_Mean = pd.Series(data=gene_means, index=gene_index, dtype=float)
    Gene_Cluster_Mean.to_csv(os.path.join(out_dir, "{}.csv".format(cluster_clean_name)))
    return

In [None]:
cluster = "1082 LSX Nkx2-1 Gaba_2"

In [None]:
cluster_clean_name = re.sub(r'\W+', '_', cluster)
# Stream the table row by row instead of loading it whole: for each row keep
# the integer label from the first field and the mean of log2(x + 1) over the
# remaining fields.
with open("{}/{}.GeneXCell.csv".format(DatDIR, cluster_clean_name)) as csvfile:
    gene_dat, gene_index = [], []
    reader = csv.reader(csvfile)
    head = next(reader)  # header row; kept in `head` but not used below
    for row in reader:
        gene_index.append(int(row[0]))
        log2UMI = np.log2(np.array(list(map(float, row[1:]))) + 1)
        gene_dat.append(log2UMI.mean())
    Gene_Cluster_Mean = pd.Series(data=gene_dat, index=gene_index)
    Gene_Cluster_Mean.to_csv("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/SplitCTs/{}.csv".format(cluster_clean_name))

In [None]:
Gene_Cluster_Mean

In [None]:
Gene_Cluster_Mean.to_csv("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/SplitCTs/test.{}.csv".format(cluster_clean_name))

In [None]:
## Aggregate split files into one matrix

Indv_cluster_means = []
Indv_cluster_total_UMIs = []
Gene_Total_Exp = None  # running per-gene sum of (cluster mean * cluster cell count)
Total_N_Cells = 0
for cluster, row in ClusterAnn.iterrows():
    cluster_v3_cell_counts = row["v3.size"]
    if cluster_v3_cell_counts == 0:
        continue
    cluster_clean_name = re.sub(r'\W+', '_', cluster)
    gene_mean_logUMI = pd.read_csv(("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/SplitCTs/{}.csv".format(cluster_clean_name)),
                                  index_col=0)
    # The per-cluster files carry a single value column named "0".
    gene_mean_logUMI = gene_mean_logUMI.rename(columns={"0": cluster})
    if len(gene_mean_logUMI) != 17938:
        # Report files whose row count differs from the expected 17938.
        print(cluster, cluster_clean_name, len(gene_mean_logUMI))
    Total_N_Cells += cluster_v3_cell_counts
    #print(cluster_v3_cell_counts)
    # Accumulate as a Series: adding single-column DataFrames whose column
    # names differ aligns on the column labels and produces only NaN.
    cluster_total_logUMI = gene_mean_logUMI.iloc[:, 0] * cluster_v3_cell_counts
    if Gene_Total_Exp is None:
        Gene_Total_Exp = cluster_total_logUMI
    else:
        # fill_value=0 keeps a gene that is absent from one file at its
        # running total instead of turning it into NaN.
        Gene_Total_Exp = Gene_Total_Exp.add(cluster_total_logUMI, fill_value=0)
    Indv_cluster_means.append(gene_mean_logUMI)

# Make and save cluster Exp Mat
Cluster_Exp_DF = pd.concat(Indv_cluster_means, axis=1)
Cluster_Exp_DF.to_csv("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/cluster_MeanLogUMI.csv")

In [None]:
gene_mean_logUMI.rename(columns={"0": cluster}, inplace=True)

In [None]:
gene_mean_logUMI

In [None]:
# Calculate expression-matching quantiles.
# NOTE: Gene_Total_Exp is overwritten with the divided values, so running this
# cell a second time divides by Total_N_Cells again.
Gene_Total_Exp = Gene_Total_Exp/Total_N_Cells
Gene_Total_Exp.name="PerCellExp"
Gene_Total_Exp_DF = pd.DataFrame(data=Gene_Total_Exp).sort_values("PerCellExp")
n_ranked_genes = Gene_Total_Exp_DF.shape[0]
Gene_Total_Exp_DF["Rank"] = list(range(1, n_ranked_genes + 1))  # 1 = lowest PerCellExp
Gene_Total_Exp_DF["quantile"] = Gene_Total_Exp_DF["Rank"] / n_ranked_genes
Gene_Total_Exp_DF.to_csv("/home/jw3514/Work/CellType_Psy/AllenBrainCellAtlas/dat/SC_UMI_Mats/ABC_LogUMI.Match.10xV3.csv")