In [None]:
import snapatac2 as snap
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from harmony import harmonize
import math
import os
import csv

### 0. Set environment

In [None]:
# Directory receiving one .h5ad per sample, and directory holding the filtered fragment files.
storing_dir = "/work/DevM_analysis/01.annotation/07.integration_atac/data/sample_h5ad_storing"
arc_path = "/work/DevM_analysis/01.annotation/02.cleandata/data/filtered_frag_files"

# os.listdir returns entries in arbitrary order, so sort to keep the sample order
# (and therefore the names/outputs pairing) reproducible between runs.
# Entries whose name contains "tbi" are skipped.
my_frag_files = [f'{arc_path}/{fl}' for fl in sorted(os.listdir(arc_path)) if "tbi" not in fl]

# Sample name = file name up to the first underscore.
names = [os.path.basename(fl).split("_")[0] for fl in my_frag_files]
outputs = [f'{storing_dir}/{name}.h5ad' for name in names]

# Two fragment files sharing a prefix would map to the same output file; fail early instead.
assert len(set(names)) == len(names), f"Duplicate sample names derived from fragment files: {names}"

# Make sure the output directory exists before anything is written into it.
os.makedirs(storing_dir, exist_ok=True)

### 1. Create sample anndata

In [None]:
# Import every fragment file in `my_frag_files`, passing the per-sample paths in
# `outputs` as `file=` and the hg38 chromosome sizes shipped with SnapATAC2.
# NOTE(review): sorted_by_barcode=False presumably means the fragment files are not
# grouped by barcode — confirm against how the filtered files were produced.
adatas = snap.pp.import_data(my_frag_files, file=outputs, 
                             chrom_sizes=snap.genome.hg38, 
                             sorted_by_barcode = False)

### 2. Preprocess

In [None]:
# Add a tile matrix to every per-sample object, using bin_size=5000.
snap.pp.add_tile_matrix(adatas, bin_size=5000)

In [None]:
# Run feature selection on every per-sample object with n_features=50000.
snap.pp.select_features(adatas, n_features=50000)

### 3. Create annDataset

In [None]:
## -- Create
# Pair each sample name with its per-sample object and combine them into one
# AnnDataSet written to the .h5ads file given as `filename`.
named_adatas = list(zip(names, adatas))
data = snap.AnnDataSet(
    adatas=named_adatas,
    filename="/work/DevM_analysis/01.annotation/07.integration_atac/data/FL_atac_snapatac2-integration.annDataset.h5ads",
)

In [None]:
# Prefix every barcode with its sample label ("<sample>_<barcode>") so that cell
# names are unique across samples, then verify that they are.
unique_cell_ids = [
    '_'.join((sample_label, barcode))
    for sample_label, barcode in zip(data.obs['sample'], data.obs_names)
]
data.obs_names = unique_cell_ids
assert np.unique(data.obs_names).size == data.n_obs

In [None]:
# Report the total number of cells next to the number of distinct cell names.
for summary_line in (
    f'Number of cells: {data.n_obs}',
    f'Number of unique barcodes: {np.unique(data.obs_names).size}',
):
    print(summary_line)

In [None]:
# Close the dataset to save the renamed cell ids, then reopen it: the metadata,
# feature-selection and embedding cells that follow all operate on `data`, so it
# must not stay closed when the notebook is run top to bottom.
data.close()
data = snap.read_dataset("/work/DevM_analysis/01.annotation/07.integration_atac/data/FL_atac_snapatac2-integration.annDataset.h5ads")

In [None]:
## -- Add meta
# Read the per-cell filtering table and keep only rows flagged HighQualityCell == 1.
meta_cells = (
    pd.read_table("/work/DevM_analysis/01.annotation/02.cleandata/data/cell-info_filtering.txt")
    .loc[lambda cells: cells["HighQualityCell"] == 1]
)

In [None]:
# Index the metadata by "integration_barcode" (column kept) so rows can be
# looked up by cell name with .loc.
meta_cells = meta_cells.set_index("integration_barcode", drop=False)

In [None]:
# Copy per-cell metadata from `meta_cells` into `data.obs`, aligned on the cell names.
# Columns are assigned one at a time because a bulk pd.concat into data.obs
# did not work with this (non-builtin anndata) obs object.
meta_columns = [
    "nCount_RNA",
    "nFeature_RNA",
    "percent.mt",
    "percent.rb",
    "nCount_peaks",
    "nFeature_peaks",
    "nucleosome_signal",
    "TSS.enrichment",
    "libraryID",
    "sampleID",
    "donorID",
    "PCW",
]

# Read the cell names once instead of once per column.
obs_names = list(data.obs_names)

# Fail with one explicit message if any cell lacks a metadata row, rather than
# with a KeyError raised from inside the first .loc lookup.
missing_cells = pd.Index(obs_names).difference(meta_cells.index)
if len(missing_cells) > 0:
    raise KeyError(
        f"{len(missing_cells)} cells in `data` have no row in meta_cells, "
        f"e.g. {list(missing_cells[:5])}"
    )

for column in meta_columns:
    data.obs[column] = meta_cells.loc[obs_names, column]

### 4. Preprocess annDataset

In [None]:
# Run feature selection on the merged dataset with n_features=50000.
snap.pp.select_features(data, n_features=50000)

In [None]:
## -- Dim reduction
# Spectral embedding of the merged dataset with default settings
# (defaults noted by the author: n_comps=30, weighted_by_sd=True).
# With weighting, dimensions are scaled by their eigenvalues so the most
# informative ones count more; n_comps should be large enough (default 30).
snap.tl.spectral(data)

In [None]:
## -- Umap
# UMAP computed from the uncorrected "X_spectral" embedding, stored under the
# key "umap_unintegrated".
snap.tl.umap(data, use_rep="X_spectral", key_added="umap_unintegrated")

In [None]:
## -- Save
# Close the dataset to save it, then reopen the same .h5ads file so the
# batch-correction cells can keep working on `data`.
data.close()
data = snap.read_dataset("/work/DevM_analysis/01.annotation/07.integration_atac/data/FL_atac_snapatac2-integration.annDataset.h5ads")

### 5. Batch correction

In [None]:
## -- Convert to adata
# Convert the dataset to a single AnnData object; its obsm["X_spectral"] and
# obs are what the Harmony cell reads.
adata = data.to_adata()

In [None]:
## -- Harmony
# Batch-correct the spectral embedding with Harmony, using libraryID and
# donorID together as batch keys.
harmony_key = "X_spectral_harmony_libraryID+donorID"
harmony_embedding = harmonize(adata.obsm["X_spectral"], adata.obs, batch_key=['libraryID', 'donorID'])

# Cast to float64 in memory and store once, instead of writing the raw result,
# reading it back and overwriting it with the converted copy.
data.obsm[harmony_key] = np.asarray(harmony_embedding, dtype=np.float64)

In [None]:
## -- Mnn
# Second batch-correction variant: snap.pp.mnc_correct with "sampleID" as the
# batch column; the result is stored under "X_spectral_mnn_sampleID".
snap.pp.mnc_correct(data, batch="sampleID", key_added='X_spectral_mnn_sampleID')

In [None]:
## -- Umap
# UMAP computed from the Harmony-corrected embedding, stored under
# "umap_harmony_libraryID+donorID".
snap.tl.umap(data, use_rep="X_spectral_harmony_libraryID+donorID", key_added="umap_harmony_libraryID+donorID")

In [None]:
# UMAP computed from the MNC-corrected embedding, stored under "umap_mnn_sampleID".
snap.tl.umap(data, use_rep="X_spectral_mnn_sampleID", key_added="umap_mnn_sampleID")

### 6. Convert to adata

In [None]:
# If an earlier cell left `data` open, close it first so the corrected embeddings
# and UMAPs are saved and the same .h5ads file is not opened twice.
# The guard keeps this cell runnable on its own in a fresh kernel.
if "data" in globals():
    data.close()
data = snap.read_dataset("/work/DevM_analysis/01.annotation/07.integration_atac/data/FL_atac_snapatac2-integration.annDataset.h5ads")

In [None]:
# Convert the dataset to a single AnnData object for export.
adata = data.to_adata()

In [None]:
# Write the converted object to a standalone .h5ad file.
adata.write_h5ad("/work/DevM_analysis/01.annotation/07.integration_atac/data/FL_atac_snapatac2-integration.annData-format.h5ad")

In [None]:
# Close the dataset now that the export has been written.
data.close()