In [38]:
import anndata as ad
import scanpy as sc
import pandas as pd

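Use `tar -xzvf` on the .tar.gz files to extract the `filtered_feature_bc_matrix` directory. It can then be loaded as an AnnData object and saved to H5AD. Note that the lymphoma and PBMC cells below all read from the same `/workspace/data/filtered_feature_bc_matrix/` path, so the matching archive must be extracted before each dataset's cell is run.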

# BMMC dataset

In [21]:
# Load the BMMC multiome dataset from its H5AD file.
bmmc_multi = sc.read("/workspace/data/BMMC_70k_multiome.h5ad")

In [31]:
# List the annotation columns stored in `obs` for the BMMC object.
print(bmmc_multi.obs.columns)

Index(['GEX_pct_counts_mt', 'GEX_n_counts', 'GEX_n_genes', 'GEX_size_factors',
       'GEX_phase', 'ATAC_nCount_peaks', 'ATAC_atac_fragments',
       'ATAC_reads_in_peaks_frac', 'ATAC_blacklist_fraction',
       'ATAC_nucleosome_signal', 'cell_type', 'batch', 'ATAC_pseudotime_order',
       'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality',
       'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType',
       'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker'],
      dtype='object')


# Lymphoma 14k dataset

In [None]:
# `sc.read_10x_h5` defaults to `gex_only=True`, which keeps only the
# "Gene Expression" features. Pass `gex_only=False` so the ATAC peaks in the
# multiome file are loaded as well.
lymphoma14k = sc.read_10x_h5(
    "/workspace/data/lymph_node_lymphoma_14k_filtered_feature_bc_matrix.h5",
    gex_only=False,
)
# The reader warns about duplicated variable names, so de-duplicate them.
lymphoma14k.var_names_make_unique()
# Show how many features of each type were loaded.
print(lymphoma14k.var['feature_types'].value_counts())

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [17]:
# Load the extracted 10x matrix directory for the lymphoma sample, keeping
# both gene-expression and peak features (`gex_only=False`).
# NOTE(review): the PBMC cells later in this notebook read the very same
# directory path, so its contents must be the lymphoma extraction when this
# cell runs. Consider extracting each archive to its own directory.
lymphoma14k_multi = sc.read_10x_mtx(
    "/workspace/data/filtered_feature_bc_matrix/",
    var_names="gene_symbols",
    # Caching is disabled because this directory is reused for other datasets:
    # a cached matrix looked up by source path could belong to a different
    # sample than the files currently on disk.
    cache=False,
    gex_only=False,
)

In [None]:
# Count features per type to confirm both peaks and gene expression were loaded.
print(lymphoma14k_multi.var['feature_types'].value_counts())

feature_types
Peaks              71766
Gene Expression    36601
Name: count, dtype: int64


In [20]:
# Save the lymphoma multiome object to an H5AD file.
lymphoma14k_multi.write("/workspace/data/lymphoma_14k_multiome.h5ad")

In [49]:
# Load the per-barcode metrics table for the lymphoma sample.
meta_lymph = pd.read_csv("/workspace/data/lymph_node_lymphoma_14k_per_barcode_metrics.csv")

In [51]:
# Inspect which metrics are available (left as a bare expression so it is displayed).
meta_lymph.columns

Index(['barcode', 'gex_barcode', 'atac_barcode', 'is_cell', 'excluded_reason',
       'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads',
       'gex_conf_exonic_reads', 'gex_conf_intronic_reads',
       'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads',
       'gex_conf_exonic_dup_reads', 'gex_exonic_umis',
       'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads',
       'gex_conf_intronic_dup_reads', 'gex_intronic_umis',
       'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count',
       'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq',
       'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads',
       'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments',
       'atac_peak_region_cutsites'],
      dtype='object')

# PBMCs 10k male

In [24]:
# Read the extracted 10x matrix directory for the male PBMC sample.
# NOTE(review): this path is identical to the one used by the lymphoma and
# female PBMC cells; confirm the directory holds the male PBMC extraction
# before running this cell.
pbmc_10k_male = sc.read_10x_mtx(
    "/workspace/data/filtered_feature_bc_matrix/",
    gex_only=False,  # keep peak features alongside gene expression
    cache=False,     # always re-read from the source files
    var_names="gene_symbols",
)

In [34]:
# Count features per type, then list the `obs` columns
# (the recorded output shows that `obs` has no columns after loading).
print(pbmc_10k_male.var['feature_types'].value_counts())
print(pbmc_10k_male.obs.columns)

feature_types
Peaks              111743
Gene Expression     36601
Name: count, dtype: int64
Index([], dtype='object')


In [27]:
# Save the male PBMC multiome object to an H5AD file.
pbmc_10k_male.write("/workspace/data/pbmc_10k_male_multiome.h5ad")

In [39]:
# Load the metrics table for the male PBMC sample.
# NOTE(review): `meta` is a generic name compared with `meta_lymph` above;
# consider a dataset-specific name if more metrics tables are loaded.
meta = pd.read_csv("/workspace/data/pbmc_10k_male_metrics.csv")

In [41]:
# Inspect which metrics are available (left as a bare expression so it is displayed).
meta.columns

Index(['barcode', 'gex_barcode', 'atac_barcode', 'is_cell', 'excluded_reason',
       'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads',
       'gex_conf_exonic_reads', 'gex_conf_intronic_reads',
       'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads',
       'gex_conf_exonic_dup_reads', 'gex_exonic_umis',
       'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads',
       'gex_conf_intronic_dup_reads', 'gex_intronic_umis',
       'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count',
       'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq',
       'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads',
       'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments',
       'atac_peak_region_cutsites'],
      dtype='object')

# PBMCs 10k female

In [28]:
# Read the extracted 10x matrix directory for the female PBMC sample.
# NOTE(review): this path is identical to the one used by the lymphoma and
# male PBMC cells; confirm the directory holds the female PBMC extraction
# before running this cell.
pbmc_10k_female = sc.read_10x_mtx(
    "/workspace/data/filtered_feature_bc_matrix/",
    gex_only=False,  # keep peak features alongside gene expression
    cache=False,     # always re-read from the source files
    var_names="gene_symbols",
)

In [35]:
# Count features per type, then list the `obs` columns
# (the recorded output shows that `obs` has no columns after loading).
print(pbmc_10k_female.var['feature_types'].value_counts())
print(pbmc_10k_female.obs.columns)

feature_types
Peaks              143887
Gene Expression     36601
Name: count, dtype: int64
Index([], dtype='object')


In [30]:
# Save the female PBMC multiome object to an H5AD file.
pbmc_10k_female.write("/workspace/data/pbmc_10k_female_multiome.h5ad")