In [2]:
import scanpy as sc
import pandas as pd
import os
from anndata import AnnData
from random import sample
import warnings
warnings.filterwarnings('ignore')

# 10X Genomics Datasets

In [None]:
# this script is modified from squidpy.
import os
import pandas as pd
import scanpy as sc
from pathlib import Path
from scanpy import _utils
from scanpy._settings import settings
from typing import Literal, NamedTuple
import scipy
import tarfile
# Maps a Space Ranger version string to the list of 10x Genomics Visium sample
# ids to fetch for that version. Both values are interpolated into the download
# URL and into the per-sample directory name by `download`, so a sample id that
# appears under more than one version (e.g. "Visium_FFPE_Human_Prostate_IF"
# under '1.3.0' and '2.0.0') is stored in separate directories.
DICT = {
   '1.1.0' :[
      "V1_Breast_Cancer_Block_A_Section_1",
      "V1_Breast_Cancer_Block_A_Section_2",
      "V1_Human_Heart",
      "V1_Human_Lymph_Node",
      "V1_Mouse_Kidney",
      "V1_Adult_Mouse_Brain",
      "V1_Mouse_Brain_Sagittal_Posterior",
      "V1_Mouse_Brain_Sagittal_Posterior_Section_2",
      "V1_Mouse_Brain_Sagittal_Anterior",
      "V1_Mouse_Brain_Sagittal_Anterior_Section_2",
      "V1_Human_Brain_Section_1",
      "V1_Human_Brain_Section_2",
      "V1_Adult_Mouse_Brain_Coronal_Section_1",
      "V1_Adult_Mouse_Brain_Coronal_Section_2",
   ],
   '1.2.0' :[
      "Targeted_Visium_Human_Cerebellum_Neuroscience",
      "Parent_Visium_Human_Cerebellum",
      "Targeted_Visium_Human_SpinalCord_Neuroscience",
      "Parent_Visium_Human_SpinalCord",
      "Targeted_Visium_Human_Glioblastoma_Pan_Cancer",
      "Parent_Visium_Human_Glioblastoma",
      "Targeted_Visium_Human_BreastCancer_Immunology",
      "Parent_Visium_Human_BreastCancer",
      "Targeted_Visium_Human_OvarianCancer_Pan_Cancer",
      "Targeted_Visium_Human_OvarianCancer_Immunology",
      "Parent_Visium_Human_OvarianCancer",
      "Targeted_Visium_Human_ColorectalCancer_GeneSignature",
      "Parent_Visium_Human_ColorectalCancer",
   ],
   '1.3.0' :[
      "Visium_FFPE_Mouse_Brain",
      "Visium_FFPE_Mouse_Brain_IF",
      "Visium_FFPE_Mouse_Kidney",
      "Visium_FFPE_Human_Breast_Cancer",
      "Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma",
      "Visium_FFPE_Human_Prostate_Cancer",
      "Visium_FFPE_Human_Prostate_IF",
      "Visium_FFPE_Human_Normal_Prostate",
   ],
   '2.0.0' :[
      "CytAssist_FFPE_Human_Lung_Squamous_Cell_Carcinoma",
      "CytAssist_FFPE_Human_Skin_Melanoma",
      "CytAssist_11mm_FFPE_Human_Ovarian_Carcinoma",
      "CytAssist_FFPE_Mouse_Brain_Rep1",
      "CytAssist_FFPE_Mouse_Brain_Rep2",
      "CytAssist_11mm_FFPE_Mouse_Embryo",
      "Visium_FFPE_Human_Prostate_IF",
      "Visium_Fresh_Frozen_Adult_Mouse_Brain",
      "Visium_Mouse_Olfactory_Bulb",
      "V1_Mouse_Brain_Sagittal_Anterior_Section_2",
      "V1_Mouse_Brain_Sagittal_Posterior_Section_2",
      "Targeted_Visium_Human_Cerebellum_Neuroscience",
   ],
}



class VisiumFiles(NamedTuple):
    """Remote file names of the three per-sample artifacts used by ``download``."""
    # "<sample_id>_filtered_feature_bc_matrix.h5" as built in ``download``
    feature_matrix: str
    # "<sample_id>_spatial.tar.gz" as built in ``download``
    spatial_attrs: str
    # "<sample_id>_analysis.tar.gz" as built in ``download``
    cluster_analysis: str


def download(sample_id,spaceranger_version,base_dir=None):
   """Fetch the spatial, count and analysis files of one Visium sample.

   Parameters
   ----------
   sample_id
      Sample identifier used in the remote URL and in the remote file names.
   spaceranger_version
      Version string used as a path component of the remote URL.
   base_dir
      Directory under which the sample directory is created. When ``None``,
      ``settings.datasetdir`` is used.

   Returns
   -------
   pathlib.Path
      ``<base_dir>/<sample_id><spaceranger_version>`` (the two strings are
      concatenated without a separator).

   Notes
   -----
   Each file is requested through ``_utils.check_presence_download`` with the
   10x URL passed as ``backup_url``. The count matrix is stored locally as
   ``filtered_feature_bc_matrix.h5`` (without the sample prefix). Tar members
   whose target path already exists are not extracted again.
   """
   root_dir = Path(settings.datasetdir if base_dir is None else base_dir)
   sample_dir = root_dir / (sample_id+spaceranger_version)
   sample_dir.mkdir(exist_ok=True, parents=True)

   url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/"
   visium_files = VisiumFiles(
      feature_matrix=f"{sample_id}_filtered_feature_bc_matrix.h5",
      spatial_attrs=f"{sample_id}_spatial.tar.gz",
      cluster_analysis=f"{sample_id}_analysis.tar.gz",
   )

   def _fetch_and_unpack(archive_name):
      # Ensure the tarball is present, then extract only the members that are
      # not already on disk.
      archive_path = sample_dir / archive_name
      _utils.check_presence_download(filename=archive_path, backup_url=url_prefix + archive_name)
      with tarfile.open(archive_path) as archive:
         for member in archive:
            if (sample_dir / member.name).exists():
               continue
            archive.extract(member, sample_dir)

   # spatial data
   _fetch_and_unpack(visium_files.spatial_attrs)

   # count data (saved under a sample-independent local name)
   _utils.check_presence_download(
      filename=sample_dir / "filtered_feature_bc_matrix.h5",
      backup_url=url_prefix + visium_files.feature_matrix,
   )

   # cluster data
   _fetch_and_unpack(visium_files.cluster_analysis)

   return sample_dir

import os
def preprocess(sample_dir):
   """Build an ``.h5ad`` file from a downloaded Visium sample directory.

   The directory tree is searched for a file whose name contains
   ``clusters.csv`` and one whose name contains ``tissue_positions``. If
   several files match, the last one visited by ``os.walk`` is used.

   The result is written to ``<sample_dir.parent>/h5ad_dir/<sample_dir.name>.h5ad``
   with a dense ``X``, cluster labels in ``obs['cluster']`` and the last two
   columns of the tissue-positions table in ``obsm['spatial']``.

   Parameters
   ----------
   sample_dir
      Directory returned by ``download`` (``str`` or ``pathlib.Path``).

   Raises
   ------
   FileNotFoundError
      If no cluster file or no tissue-positions file is found.
   KeyError
      If a barcode of the count matrix is missing from the in-tissue rows of
      the tissue-positions table.
   """
   sample_dir = Path(sample_dir)

   clusters_file = None
   tissue_positions_file = None
   for root,_,files in os.walk(sample_dir):
      for file in files:
         if 'clusters.csv' in file:
            clusters_file=os.path.join(root,file)
         if 'tissue_positions' in file:
            tissue_positions_file=os.path.join(root,file)

   # Fail with a clear message instead of an UnboundLocalError further down.
   if clusters_file is None:
      raise FileNotFoundError(f"no 'clusters.csv' file found under {sample_dir}")
   if tissue_positions_file is None:
      raise FileNotFoundError(f"no 'tissue_positions' file found under {sample_dir}")

   label = pd.read_csv(clusters_file,index_col=0)
   adata = sc.read_10x_h5(sample_dir / "filtered_feature_bc_matrix.h5")
   try:
      # File with a header row: the barcode column is named 'barcode'.
      spatial = pd.read_csv(tissue_positions_file)
      spatial = spatial[spatial.iloc[:,1]==1].set_index('barcode').iloc[:,-2:]
   except KeyError:
      # Header-less file: the first column holds the barcodes.
      spatial = pd.read_csv(tissue_positions_file,header=None)
      spatial = spatial[spatial.iloc[:,1]==1].set_index(0).iloc[:,-2:]

   # Assign the single label column as a Series; pandas aligns it on barcode.
   adata.obs['cluster'] = label.iloc[:, 0]
   # Look rows up by the AnnData barcodes so coordinates are in obs order.
   # (`Index & Index` is no longer a set intersection in pandas >= 2.0.)
   adata.obsm['spatial'] = spatial.loc[adata.obs.index].values
   if scipy.sparse.issparse(adata.X):
      adata.X = adata.X.toarray()

   output_dir = sample_dir.parent / "h5ad_dir"
   output_dir.mkdir(exist_ok=True, parents=True)
   adata.var_names_make_unique()
   adata.write(output_dir / f"{sample_dir.name}.h5ad")

if __name__ == "__main__":
   # Download and convert every (version, sample) pair listed in DICT, in
   # dictionary order. The loop variables keep their original names because
   # they remain visible as notebook globals after the cell has run.
   for spaceranger_version, sample_id in (
      (version, name) for version in DICT for name in DICT[version]
   ):
      sample_dir = download(sample_id,spaceranger_version)
      preprocess(sample_dir)

# Spatial Bench

In [1]:
def parse_SpatialBench(file_folder,name,saving_base_path='./data/SpatialBench/'):
    """Convert one SpatialBench dataset folder into an ``.h5ad`` file.

    Reads ``Insitu_count.txt`` (tab-separated) and ``Locations.txt``
    (single-whitespace-separated) from ``file_folder``, stores the counts as an
    integer matrix with the locations in ``obsm['spatial']``, prints the output
    path and shape, and writes ``<saving_base_path>/<name>.h5ad``.

    Parameters
    ----------
    file_folder
        Directory containing ``Insitu_count.txt`` and ``Locations.txt``.
    name
        Base name (without extension) of the output file.
    saving_base_path
        Output directory; created if it does not exist.

    Raises
    ------
    AssertionError
        If the two tables do not have the same number of rows.
    """
    saving_path = os.path.join(saving_base_path,f'{name}.h5ad')

    counts = pd.read_table(os.path.join(file_folder,'Insitu_count.txt'))
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    locations = pd.read_table(os.path.join(file_folder,'Locations.txt'),sep=r'\s')
    assert counts.shape[0] == locations.shape[0]

    # Cast before construction; the `dtype` argument of AnnData is deprecated.
    sp_adata = AnnData(counts.astype(int))
    sp_adata.obsm['spatial'] = locations.values

    # Make sure the target directory exists before writing.
    if saving_base_path:
        os.makedirs(saving_base_path, exist_ok=True)

    print(saving_path,*sp_adata.shape)
    sp_adata.write_h5ad(saving_path)

In [3]:
# Convert every dataset folder found under data_dir; entries whose name starts
# with '.' (hidden files/directories) are skipped.
data_dir = '../SpatialBench_LiB2022.NM/data/DataUpload/'
for data_id in os.listdir(data_dir):
    if not data_id.startswith('.'):
        dataset_folder = os.path.join(data_dir,data_id)
        parse_SpatialBench(dataset_folder, data_id)

./data/SpatialBench/Dataset1.h5ad 8425 347
./data/SpatialBench/Dataset10.h5ad 1549 981
./data/SpatialBench/Dataset11.h5ad 1380 141
./data/SpatialBench/Dataset12.h5ad 6000 119
./data/SpatialBench/Dataset13.h5ad 6000 118
./data/SpatialBench/Dataset14.h5ad 3039 84
./data/SpatialBench/Dataset15.h5ad 3405 33
./data/SpatialBench/Dataset16.h5ad 11426 76
./data/SpatialBench/Dataset17.h5ad 1154 42
./data/SpatialBench/Dataset18.h5ad 982 914
./data/SpatialBench/Dataset19.h5ad 995 946
./data/SpatialBench/Dataset2.h5ad 175 42
./data/SpatialBench/Dataset20.h5ad 4784 1000
./data/SpatialBench/Dataset21.h5ad 4895 1000
./data/SpatialBench/Dataset22.h5ad 2432 1000
./data/SpatialBench/Dataset23.h5ad 1211 1000
./data/SpatialBench/Dataset24.h5ad 1162 1000
./data/SpatialBench/Dataset25.h5ad 1127 1000
./data/SpatialBench/Dataset26.h5ad 2425 1000
./data/SpatialBench/Dataset27.h5ad 198 1000
./data/SpatialBench/Dataset28.h5ad 277 1000
./data/SpatialBench/Dataset29.h5ad 1835 1000
./data/SpatialBench/Dataset3.h5ad

# SpatialLIBD

In [8]:
from anndata import AnnData
import scanpy as sc
import numpy as np
import pandas as pd
import os

# SpatialLIBD sample identifiers to convert; each id is interpolated into the
# input file names read by `read_data` / `ground_truth` and into the output
# .h5ad file name.
sample_ids = ['151508','151509','151510','151670','151671','151672','151674','151675','151676','151507','151669','151673']

def read_data(sample_no = '151507'):
    """Load one SpatialLIBD sample as an AnnData object.

    Reads ``./data/SpatialLIBD/processed/<sample_no>_filtered_feature_bc.csv``
    (rows are genes, columns are spots; it is transposed on load) and
    ``./data/SpatialLIBD/locs/<sample_no>_tissue_positions_list.txt``
    (header-less, first column used as index). Columns 4 and 5 of the
    positions table become ``obsm['spatial']``. Genes present in fewer than 10
    spots and spots with fewer than 10 genes are filtered out, and the labels
    returned by ``ground_truth(sample_no)`` are stored in
    ``obs['ground_truth']``.

    Parameters
    ----------
    sample_no
        Sample identifier used to build the input file names.

    Returns
    -------
    AnnData
        Filtered expression matrix (float32) with spatial coordinates and
        ground-truth labels.
    """
    file_dir = './data/SpatialLIBD/processed/'
    spatial_info_dir = './data/SpatialLIBD/locs/'

    positions_path = os.path.join(spatial_info_dir,f"{sample_no}_tissue_positions_list.txt")
    spatial_info = pd.read_csv(positions_path,index_col=0,header=None)

    counts_path = os.path.join(file_dir,f'{sample_no}_filtered_feature_bc.csv')
    counts = pd.read_csv(counts_path,index_col=0)
    samples = counts.columns
    genes = counts.index

    adata = AnnData(
        X=counts.values.T.astype(np.float32),
        obs=pd.DataFrame(index=samples),
        var=pd.DataFrame(index=genes)
    )
    adata.obsm['spatial'] = spatial_info.loc[samples,[4,5]].values
    sc.pp.filter_genes(adata, min_cells=10)
    sc.pp.filter_cells(adata, min_genes=10)

    # Use the function argument here. The previous code read the global
    # `sample_id`, which only worked when the caller's loop variable happened
    # to carry that name and the same value.
    adata.obs['ground_truth'] = ground_truth(sample_no)

    return adata

def ground_truth(sample_id):
    """Return the ground-truth labels of one SpatialLIBD sample.

    Reads the ``ground_truth`` column of
    ``./data/SpatialLIBD/labels/cluster_labels_<sample_id>.csv`` (first column
    used as index), replaces missing labels with the string ``'UNKOWN'``, and
    re-keys each row by the part of its original index that follows the first
    underscore.
    """
    ground_truth_dir = './data/SpatialLIBD/labels/'
    label_path = os.path.join(ground_truth_dir,f'cluster_labels_{sample_id}.csv')

    labels = pd.read_csv(label_path,index_col=0)['ground_truth'].fillna('UNKOWN')

    # Keep only the second '_'-separated field of every original key.
    stripped_keys = []
    for key in labels.index:
        stripped_keys.append(key.split('_')[1])
    labels.index = stripped_keys

    return labels


In [9]:
# Convert every listed SpatialLIBD sample and write it to
# ./data/SpatialLIBD/<sample_id>.h5ad, printing the id as a progress marker.
for sample_id in sample_ids:
    print(sample_id)
    data = read_data(sample_id)
    data.write_h5ad(f'./data/SpatialLIBD/{sample_id}.h5ad')

151508
151509
151510
151670
151671
151672
151674
151675
151676
151507
151669
151673
