In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import scanpy as sc
import episcanpy as epi

import os
import scipy

import Utils as ut

**Prerequisite:** download the data by running notebook 0 before executing this notebook.

# HumanBrain

In [None]:
# Read the 10x filtered feature-barcode matrix directory for the human brain dataset.
brain_mtx_dir = "Datasets/10XhsBrain3kMO/CTA/filtered_feature_bc_matrix"
adata = sc.read_10x_mtx(brain_mtx_dir)
adata

In [None]:
# Save the loaded matrix as a gzip-compressed .h5ad file.
brain_gex_path = "Datasets/10XhsBrain3kMO/CTA/gex.h5ad"
adata.write(brain_gex_path, compression="gzip")

## Annotation
See the notebook *10X brain CTA* for the procedure used to annotate the cell types.

In [None]:
# Load the annotated gene-expression object stored next to the raw matrix.
brain_annotated_path = "Datasets/10XhsBrain3kMO/CTA/annotated_gex.h5ad"
adata = epi.read_h5ad(brain_annotated_path)
adata

In [None]:
# Distinct annotation labels present in the `celltype` column.
set(adata.obs["celltype"])

In [None]:
# Build a one-column metadata table ("CellType") from the annotation and map
# the free-text labels onto underscore-separated identifiers.
# NOTE(review): after astype(str) a genuinely missing value reads "nan", not
# "NA" -- presumably the annotation stores the literal string "NA"; confirm.
brain_label_map = {
    "neuronal cell - Purkinje cell -FOXP2": "Purkinje_neuron_FOXP2",
    "neuronal cell - Purkinje cell -ITPR1": "Purkinje_neuron_ITPR1",
    "Purkinje layer neuron": "Purkinje_neuron_layer",
    "interneuron - MLI - molecular layer interneurons": "Molec_Layer_Interneur",
    "astrocyte": "Astrocyte",
    "astrocyte (progenitor)": "Astrocyte_progenitor",
    "microglia": "Microglia",
    "oligodendrocyte": "Oligodendrocyte",
    "inhibitory neuron": "Inhibitory_neuron",
    "inhibitory neuron - MAF+": "Inhibitory_neuron_MAF",
    "inhibitory neuron - PVALB+ SST+": "Inhibitory_neuron_PVALB_SST",
}

df = adata.obs[["celltype"]].copy().rename(columns={"celltype": "CellType"})
df["CellType"] = df["CellType"].astype(str)
# "NA" marks unannotated cells; turn it into a real missing value first.
df = df.replace("NA", np.nan)
# Whole-value (non-regex) replacement, so "astrocyte" never touches "astrocyte (progenitor)".
df["CellType"] = df["CellType"].replace(brain_label_map)
set(df.CellType)

In [None]:
# Derive the coarser "Raw_celltype" label by collapsing subtypes onto their
# parent cell type.
# The mapping is applied in a single, non-in-place call: the previous
# `df["Raw_celltype"].replace(..., inplace=True)` form mutates a temporary
# Series and leaves `df` untouched when pandas Copy-on-Write is active.
raw_celltype_map = {
    "Purkinje_neuron_FOXP2": "Purkinje_neuron",
    "Purkinje_neuron_ITPR1": "Purkinje_neuron",
    "Purkinje_neuron_layer": "Purkinje_neuron",
    "Inhibitory_neuron_PVALB_SST": "Inhibitory_neuron",
    "Inhibitory_neuron_MAF": "Inhibitory_neuron",
    "Astrocyte_progenitor": "Astrocyte",
}
df["Raw_celltype"] = df["CellType"].replace(raw_celltype_map)
set(df["Raw_celltype"])

In [None]:
# Export the per-cell label table for the human brain dataset.
brain_metadata_path = "Datasets/10XhsBrain3kMO/10XhsBrain3kMO_metadata.csv"
df.to_csv(brain_metadata_path)

# PBMC

In [None]:
# Read the Matrix Market count matrix of the PBMC dataset.
pbmc_matrix_path = "Datasets/10XhsPBMC10kMO/CTA/filtered_feature_bc_matrix/matrix.mtx.gz"
X = scipy.io.mmread(pbmc_matrix_path)
X

In [None]:
# Load the barcode list; the single column becomes the (unnamed) index and is
# also duplicated into an explicit "barcode" column.
pbmc_barcodes_path = "Datasets/10XhsPBMC10kMO/CTA/filtered_feature_bc_matrix/barcodes.tsv.gz"
obs = pd.read_csv(pbmc_barcodes_path, sep="\t", header=None, index_col=0)
print(obs.shape)
obs.index.name = ""
obs["barcode"] = obs.index.tolist()
obs.head()

In [None]:
# Load the feature table; the first column becomes the (unnamed) index and the
# remaining five columns are given explicit names.
pbmc_features_path = "Datasets/10XhsPBMC10kMO/CTA/filtered_feature_bc_matrix/features.tsv.gz"
var = pd.read_csv(pbmc_features_path, sep="\t", header=None, index_col=0)
print(var.shape)
var.columns = ["Gene", "Experiment", "chr", "start", "stop"]
var.index.name = ""
var.head()

In [None]:
# Wrap the transposed counts (int32, CSR) in an AnnData object.
# The transpose puts barcodes on rows and features on columns, matching the
# obs/var indices assigned in the following cell.
pbmc_counts = scipy.sparse.csr_matrix(X.T, dtype="int32")
adata = sc.AnnData(X=pbmc_counts)
adata

In [None]:
# Attach feature annotations (all stored as strings), keep only the
# "Gene Expression" features, then label the cells with their barcodes.
adata.var.index = var.index.to_numpy().astype(str)
for feature_col in ("Experiment", "chr", "start", "stop"):
    adata.var[feature_col] = var[feature_col].to_numpy().astype(str)

is_gene_expression = adata.var["Experiment"] == "Gene Expression"
adata = adata[:, is_gene_expression].copy()
adata.obs.index = obs.index.to_numpy().astype(str)
adata

In [None]:
# Save the raw gene-expression subset as a gzip-compressed .h5ad file.
pbmc_raw_path = "Datasets/10XhsPBMC10kMO/CTA/10XhsPBMC10kMO_GEX_temp_Raw.h5ad"
adata.write(pbmc_raw_path, compression="gzip")

## CTA

The cell type annotation follows the muon PBMC10k tutorial: https://muon-tutorials.readthedocs.io/en/latest/single-cell-rna-atac/pbmc10k/1-Gene-Expression-Processing.html

In [None]:
# Load the PBMC annotation table and derive three label columns:
#   RNA_celltype - cleaned original labels ("+" removed, spaces -> "_", renamed)
#   Raw_CellType - all monocyte subtypes merged into "Monocytes"
#   CellType     - CD14/intermediate monocytes merged, CD16 monocytes kept apart
#
# Every mapping is applied with a non-in-place `Series.replace` and assigned
# back: the previous `mr["col"].replace(..., inplace=True)` form mutates a
# temporary Series and does not update `mr` under pandas Copy-on-Write.
mr = pd.read_csv("Datasets/10XhsPBMC10kMO/CTA/10XhsPBMC10kMO_CTA_MariaR.csv", index_col=0, usecols=[0, 1])
mr.index.name = ""

# NOTE(review): "T_CD4_naïve" keeps the diaeresis while "T_CD8_naive" and
# "B_naive" do not; the label is left unchanged here -- confirm the intended
# spelling before normalising it, since exported metadata depends on it.
rna_label_map = {
    "CD14_mono": "Mono_CD14",
    "CD16_mono": "Mono_CD16",
    "intermediate_mono": "Mono_intermediate",
    "CD4_memory_T": "T_CD4_memory",
    "CD4_naïve_T": "T_CD4_naïve",
    "MAIT": "T_MAIT",
    "CD8_naïve_T": "T_CD8_naive",
    "CD8_activated_T": "T_CD8_activated",
    "memory_B": "B_memory",
    "naïve_B": "B_naive",
    "mDC": "DCm",
    "pDC": "DCp",
}
mr["RNA_celltype"] = (
    mr["RNA_celltype"]
    .astype(str)
    .str.replace("+", "", regex=False)
    .str.replace(" ", "_", regex=False)
    .replace(rna_label_map)
    # Missing annotations were stringified to "nan" above; restore them.
    .replace("nan", np.nan)
)

mr["Raw_CellType"] = mr["RNA_celltype"].replace(
    {"Mono_CD14": "Monocytes", "Mono_CD16": "Monocytes", "Mono_intermediate": "Monocytes"}
)

mr["CellType"] = mr["RNA_celltype"].replace(
    {"Mono_CD14": "Monocytes", "Mono_CD16": "Monocytes_CD16", "Mono_intermediate": "Monocytes"}
)
mr.head()

In [None]:
# Reload the raw PBMC expression object, keep only cells that also have a
# complete annotation row, attach the labels, then normalise and log-transform.
pbmc_raw_path = "Datasets/10XhsPBMC10kMO/CTA/10XhsPBMC10kMO_GEX_temp_Raw.h5ad"
adata = epi.read_h5ad(pbmc_raw_path)

labelled_cells = mr.dropna().index  # rows without any missing label
inter = ut.intersection([adata.obs.index, labelled_cells])

adata = adata[inter]
adata.obs = mr.loc[inter]

epi.pp.normalize_total(adata)
epi.pp.log1p(adata)

In [None]:
# Genes shown in the dot plots below, in display order.
marker_genes = [
    "IL7R", "TRAC", "ITGB1", "SLC4A10",
    "CD8A", "CD8B", "CCL5",
    "GNLY", "NKG7",
    "CD79A", "MS4A1", "IGHM", "IGHD",
    "IL4R", "TCL1A",
    "KLF4", "LYZ", "S100A8", "ITGAM",
    "CD14", "FCGR3A", "MS4A7",
    "CST3", "CLEC10A", "IRF8", "TCF4",
]

In [None]:
# Dot plot of the marker genes, grouped by the "CellType" labels.
celltype_grouping = "CellType"
sc.pl.dotplot(adata, marker_genes, groupby=celltype_grouping)

In [None]:
# Same dot plot, grouped by the finer "RNA_celltype" labels.
rna_celltype_grouping = "RNA_celltype"
sc.pl.dotplot(adata, marker_genes, groupby=rna_celltype_grouping)

In [None]:
# Export only the cells whose label columns are all present.
pbmc_metadata = mr.dropna(how="any")
pbmc_metadata.to_csv("Datasets/10XhsPBMC10kMO/10XhsPBMC10kMO_metadata.csv")

# HSPC

In [None]:
# Build the HSPC metadata table from the "leiden" column of the processed
# gene-expression object and export it.
# Fixes the previous `gex=epi.read_hgex=epi.read_h5ad(...)` typo, which also
# created a stray `read_hgex` attribute on the `epi` module. Bare mid-cell
# expressions (`.head()`, `set(...)`) were dropped because only the last
# expression of a cell is displayed.
atac = epi.read_h5ad("Datasets/HSPC/CTA/3423-MV-2_adata_atac_postpro.h5ad")
gex = epi.read_h5ad("Datasets/HSPC/CTA/3423-MV-2_adata_postpro.h5ad")

df = gex.obs[["leiden"]].rename(columns={"leiden": "CellType"})
# Literal replacement: spaces in cluster names become underscores.
df["CellType"] = df["CellType"].str.replace(" ", "_", regex=False)
df.to_csv("Datasets/HSPC/HSPC_metadata.csv")

# MouseBrain

In [None]:
# Load the processed mouse-brain ATAC object and preview its cell annotations.
mouse_atac_path = "Datasets/MouseBrain/CTA/adata_atac_postpro.h5ad"
atac = epi.read_h5ad(mouse_atac_path)
atac.obs.head()

In [None]:
# Load the processed mouse-brain gene-expression object and preview its cell annotations.
mouse_gex_path = "Datasets/MouseBrain/CTA/adata_postpro.h5ad"
gex = epi.read_h5ad(mouse_gex_path)
gex.obs.head()

In [None]:
# Start the mouse-brain metadata table from the "leiden" column, renamed to "CellType".
df = gex.obs[["leiden"]].rename(columns={"leiden": "CellType"})
print(set(df["CellType"]))
df.head()

In [None]:
# Normalise the cluster names into identifier-like labels: every run of
# spaces, commas, hyphens and underscores becomes a single "_".
# One explicit regex pass replaces the earlier chain of literal replacements,
# in which the ", " step could never match (spaces had already been turned
# into "_") and the single "__" -> "_" pass left doubled underscores for
# inputs such as "a - b". The cell is idempotent: re-running it is a no-op.
df["CellType"] = df["CellType"].str.replace(r"[ ,_-]+", "_", regex=True)
set(df["CellType"])

In [None]:
# Export the per-cell label table for the mouse brain dataset.
mouse_metadata_path = "Datasets/MouseBrain/MouseBrain_metadata.csv"
df.to_csv(mouse_metadata_path)

# Kidney

In [None]:
# Load the tab-separated cluster-label table of the kidney dataset.
kidney_labels_path = "Datasets/Kidney/GSE172008_human_kidney_snATAC.cluster_labels.txt.gz"
cta = pd.read_csv(kidney_labels_path, sep="\t")
print(cta.shape)
cta.head()

In [None]:
# The text before the first "_" of "#cell_id" is used as the sample name;
# keep only the rows belonging to sample "HK2431".
cta["Sample"] = cta["#cell_id"].map(lambda cell_id: cell_id.split("_")[0])
cta = cta.rename(columns={"cluster_name": "CellType"})
print(set(cta["Sample"]))

is_selected_sample = cta["Sample"] == "HK2431"
cta = cta[is_selected_sample]
print(cta.shape)
cta.head()

In [None]:
# Index the table by the second "_"-separated token of "#cell_id".
# A plain list is assigned so that the index stays unnamed.
cta.index = cta["#cell_id"].map(lambda cell_id: cell_id.split("_")[1]).tolist()
cta.head()

In [None]:
# Export the id, label and sample columns for the selected kidney sample.
kidney_columns = ["#cell_id", "CellType", "Sample"]
cta.loc[:, kidney_columns].to_csv("Datasets/Kidney/Kidney_metadata.csv")

# Datasets infos

In [None]:
# Summary table with one row per (dataset, feature space) pair:
#   DSs   - dataset directory name
#   FsSs  - feature space
#   LBs   - metadata column holding the labels
#   Names - display name of the dataset
datasets = [
    "10XhsBrain3kMO", "10XhsBrain3kMO",
    "GSE117309",
    "10XhsPBMC10kMO", "10XhsPBMC10kMO",
    "HSPC", "HSPC",
    "MouseBrain", "MouseBrain",
    "Kidney",
]
featurespaces = ["Peak", "GEX", "Window", "Peak", "GEX", "Peak", "GEX", "Peak", "GEX", "Peak"]
labels = ["CellType", "CellType", "BCsubtype"] + ["CellType"] * 7
display_names = [
    "Human brain", "Human brain",
    "Breast cancer",
    "PBMC", "PBMC",
    "HSPC", "HSPC",
    "Mouse brain", "Mouse brain",
    "Kidney",
]

ds_infos = pd.DataFrame(
    {"DSs": datasets, "FsSs": featurespaces, "LBs": labels, "Names": display_names}
)
ds_infos.to_csv("Tables/Datasets_infos.tsv", sep="\t")
ds_infos

In [None]:
# Reload the dataset summary table from disk (first column is the index).
ds_infos_path = "Tables/Datasets_infos.tsv"
ds_infos = pd.read_csv(ds_infos_path, sep="\t", index_col=0)

In [None]:
# Number of distinct labels per dataset display name, read from each
# dataset's metadata CSV.
# `nunique()` ignores missing values, so unannotated cells (NaN) are no longer
# counted as an extra label; this matches the `dropna()` used when the label
# lists themselves are collected later in the notebook.
diz = {}
for ds, nm, lb in zip(ds_infos["DSs"], ds_infos["Names"], ds_infos["LBs"]):
    d = pd.read_csv(f"Datasets/{ds}/{ds}_metadata.csv", index_col=0)
    diz[nm] = int(d[lb].nunique())

In [None]:
# Collect the matrix dimensions of every (dataset, feature space) pair.
# Rows are accumulated in a list and turned into a DataFrame once, instead of
# concatenating onto an initially empty frame inside the loop (quadratic,
# relies on deprecated empty-frame concat behaviour, and yields a duplicated
# index).
ds_infos = pd.read_csv("Tables/Datasets_infos.tsv", sep="\t", index_col=0)

dimension_rows = []
for ds, nm, fs in zip(ds_infos["DSs"], ds_infos["Names"], ds_infos["FsSs"]):
    adata = sc.read_h5ad(f"Datasets/{ds}/FeatureSpaces/{fs}/CM/{ds}_{fs}_Def.h5ad")
    n_cells, n_features = adata.shape
    # Every feature space except "GEX" is reported in the plural ("Peaks", "Windows").
    fs_label = fs if fs == "GEX" else f"{fs}s"
    dimension_rows.append(
        {
            "Dataset": nm,
            "Feature space": fs_label,
            "Number of features": n_features,
            "Number of cells": n_cells,
        }
    )

df = pd.DataFrame(
    dimension_rows,
    columns=["Dataset", "Feature space", "Number of features", "Number of cells"],
)
df

In [None]:
# Export the dimensions table as TSV without the row index.
dimensions_path = "Tables/DatasetsDimensions.tsv"
df.to_csv(dimensions_path, sep="\t", index=False)

In [None]:
# Distinct non-missing labels per dataset display name, read from each
# dataset's metadata CSV.
# The labels are sorted so that the exported table is identical between runs;
# `list(set(...))` ordering depends on string hashing and is not reproducible.
diz = {}
for ds, nm, lb in zip(ds_infos["DSs"], ds_infos["Names"], ds_infos["LBs"]):
    d = pd.read_csv(f"Datasets/{ds}/{ds}_metadata.csv", index_col=0)
    diz[nm] = sorted(d[lb].dropna().unique())

In [None]:
# One column per dataset display name, listing its labels (shorter lists are
# padded with missing values), exported as TSV.
# The table is built first and written second: `to_csv` returns None when a
# path is given, so chaining it into the assignment left `cts` bound to None.
cts = pd.DataFrame.from_dict(diz, orient="index").T
cts.to_csv("Tables/DatasetsCellType.tsv", sep="\t")
cts.head()