The [Tabula Muris](https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scrna/tabula_muris.html) dataset was downloaded using this bash script:
```bash
sbatch download_tubala_muris.sh
```
Then identify the skin samples by listing the tissues available in each dataset:
```Python
pd.unique(tm_droplet.obs.tissue)
pd.unique(tm_facs.obs.tissue)
```

In [3]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import celltypist
from celltypist import models
import anndata as ad

In [4]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [5]:
# Read the Guimaraes et al. dataset, then keep only the cells annotated with
# one of the two disease ontology terms below. The subsets are stacked in the
# order the terms are listed.
# NOTE(review): the meaning of each MONDO ID is not stated here — confirm
# against the ontology before relying on it.
rdata = sc.read_h5ad("/data/kanferg/Sptial_Omics/projects/NguyenLab/data/czi/Guimaraes_et_al.h5ad")
list_rdata = [
    rdata[rdata.obs['disease_ontology_term_id'] == term_id]
    for term_id in ('MONDO:0005105', 'MONDO:0006486')
]
rdata_concat = ad.concat(list_rdata)

In [6]:
from gseapy import Biomart

# Create the BioMart client against a specific Ensembl host.
# `bm` is required by the homolog query in the next cell, so it is built
# outside the try/except: a failure here should stop the notebook now rather
# than reappear later as a NameError on `bm`.
bm = Biomart(host="useast.ensembl.org")

# Listing the available datasets is only a connectivity sanity check, so it
# stays best-effort: report the problem and carry on.
try:
    datasets = bm.get_datasets()
    print(datasets.head())
except Exception as e:
    print("An error occurred while fetching data from BioMart:", e)

                      Dataset                               Description
0   llaticaudata_gene_ensembl  Blue-ringed sea krait genes (latLat_1.0)
1  tnigroviridis_gene_ensembl           Tetraodon genes (TETRAODON 8.0)
2    nleucogenys_gene_ensembl                   Gibbon genes (Nleu_3.0)
3        bbbison_gene_ensembl       American bison genes (Bison_UMD1.0)
4        vvulpes_gene_ensembl                 Red fox genes (VulVul2.2)


In [7]:
# Relabel the genes of `rdata_concat` with mouse gene symbols, using the
# human -> mouse homolog table from Ensembl BioMart.
# Assumes rdata_concat.var_names are human Ensembl gene IDs — TODO confirm.

# Fetch human genes together with their mouse homolog ID and symbol.
h2m = bm.query(dataset='hsapiens_gene_ensembl',
               attributes=['ensembl_gene_id', 'external_gene_name',
                           'mmusculus_homolog_ensembl_gene',
                           'mmusculus_homolog_associated_gene_name'])
# Keep only rows that actually carry a mouse gene symbol.
h2m = h2m.dropna(subset=['mmusculus_homolog_associated_gene_name'])

# Human Ensembl ID -> mouse symbol. If a human ID occurs on several rows,
# the last row wins (dict semantics).
h2m_dict = dict(zip(h2m['ensembl_gene_id'].values,
                    h2m['mmusculus_homolog_associated_gene_name'].values))

# "0" is the sentinel for "no mouse homolog". dict.get is a constant-time
# lookup, unlike rebuilding and scanning a list of keys for every gene.
var_in = [h2m_dict.get(ens, "0") for ens in rdata_concat.var_names.values]
rdata_concat.var['mouse_id'] = var_in

# Drop genes without a homolog, then release the larger parent object.
# Because of the `del`, this cell cannot be re-run without first re-running
# the cell that builds `rdata_concat`.
rdata_concat_mouse_symb = rdata_concat[:, rdata_concat.var['mouse_id'] != "0"].copy()
del rdata_concat

# Use the mouse symbols as gene names and keep the first occurrence of each.
rdata_concat_mouse_symb.var_names = rdata_concat_mouse_symb.var["mouse_id"].values
rdata_concat_mouse_symb_a = rdata_concat_mouse_symb[:, ~rdata_concat_mouse_symb.var_names.duplicated()].copy()  # For genes
# Likewise keep the first occurrence of each cell barcode.
rdata_concat_mouse_symb_b = rdata_concat_mouse_symb_a[~rdata_concat_mouse_symb_a.obs_names.duplicated(), :].copy()
# Safeguard only: duplicated gene names were already removed above.
rdata_concat_mouse_symb_b.var_names_make_unique()

In [10]:
# Show the distinct assay labels present after gene relabelling.
rdata_concat_mouse_symb_b.obs['assay'].unique()

['Smart-seq2', '10x 5' v2', '10x 3' v2', '10x 3' v3']
Categories (4, object): ['10x 3' v2', '10x 3' v3', '10x 5' v2', 'Smart-seq2']

In [None]:
# Load the two Tabula Muris datasets (droplet-based and FACS-based).
tm_droplet = sc.read_h5ad('/data/kanferg/Sptial_Omics/projects/NguyenLab/data/czi/TM_droplet.h5ad')
tm_facs = sc.read_h5ad('/data/kanferg/Sptial_Omics/projects/NguyenLab/data/czi/TM_facs.h5ad')

# Per-gene length table: the first field becomes the index (gene name) and
# column 1 holds the value used for scaling below.
# NOTE: this is fetched over the network on every run.
gene_len = pd.read_csv(
    "https://raw.githubusercontent.com/chenlingantelope/HarmonizationSCANVI/master/data/gene_len.txt",
    delimiter=" ",
    header=None,
    index_col=0,
)

# Copy each dataset's own label column into a shared 'CellType' column.
tm_droplet.obs['CellType'] = tm_droplet.obs['cell_ontology_class']
tm_facs.obs['CellType'] = tm_facs.obs['cell_ontology_class']
rdata_concat_mouse_symb_b.obs['CellType'] = rdata_concat_mouse_symb_b.obs['author_cell_type']

# Restrict the FACS data to genes that have a length entry, in tm_facs gene order.
gene_len = gene_len.reindex(tm_facs.var.index).dropna()
# .copy() turns the subset into a real AnnData object, so the assignments to
# .X below do not depend on implicit copy-on-write of a view. The warning for
# that is hidden by the UserWarning filter set earlier in this notebook.
tm_facs = tm_facs[:, gene_len.index].copy()
assert (tm_facs.var.index == gene_len.index).all(), \
    "tm_facs genes and gene_len rows are not aligned"

# Divide each gene by its length and rescale by the median length.
gene_len_values = gene_len[1].values
tm_facs.X = tm_facs.X / gene_len_values * np.median(gene_len_values)
# round to integer
tm_facs.X = np.rint(tm_facs.X)