## **<span style="color:">File: downsample_methods.ipynb</span>**
Description: This file starts the downsampling process in preparation for cNMF. <br>
It is continued in density_downsample.ipynb.

### **Imports**

In [1]:
import scanpy as sc
# some stuff to make this notebook work better with Scanpy
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np

 ## **Immune Exclusion Data** ##
 - Import data
 - Subsample the data for easier NMF factorization

In [2]:
# Load the combined AnnData object from disk and print its summary.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere without editing it.
imm_excl_adata = sc.read_h5ad("/home/james/data/immune_exclusion_data/outer_combined_all4_dat.h5ad")
print(imm_excl_adata)

AnnData object with n_obs × n_vars = 130681 × 15785
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'SampleId', 'percent.mt', 'RNA_snn_res.0.5', 'seurat_clusters', 'celltype_each', 'celltype_annotation', 'exp_num', 'active_ident', 'Cell_Type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'IES', 'CD8T_core9', 'CD8T_cytokine', 'Major_cell_type', 'leiden', 'session_id', 'IES_status', 'scIES_status', 'IES_norm', 'Tumor_Type', 'Vantage_ID'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_counts', 'mean', 'std'
    uns: 'Cell_Type_colors', 'Tumor_Type_colors', 'exp_num_colors', 'leiden_dge', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    layers: 'arcsin'
    obsp: 'connectivities', 'distances'


In [5]:
## Checking out different obs
# Count how many observations (rows of .obs) belong to each SampleId
imm_excl_adata.obs['SampleId'].value_counts()

10096_s1       20871
10180_02_s3    13742
10180_01_s1    13425
10096_s4       12418
10096_s3       10322
10284_s3        8892
10180_02_s1     8038
10180_02_s4     7859
10284_s4        7542
10180_01_s4     6922
9142_s2         5569
10284_s2        4343
10180_02_s2     3248
9142_s1         1921
10180_01_s2     1644
10284_s1        1618
10180_01_s3     1476
10096_s2         831
Name: SampleId, dtype: int64

## **<span style="color:darkred">Immune Exclusion Data: downsampled proportionally using scanpy</span>**
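**Description:** <br>
These cells perform and evaluate the downsampling of the data with `sc.pp.subsample`. <br>
A random 20% of all cells is drawn, so each cell type is reduced roughly in proportion to its original abundance (the draw is not stratified by cell type).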

In [13]:
# Keep a random 20% of the observations; copy = True asks scanpy to return a
# new AnnData, which is bound here to down_sample_adata.
# NOTE(review): no random_state is passed, so scanpy's default seed applies —
# confirm that is the intended way to keep this draw reproducible.
down_sample_adata = sc.pp.subsample(
    data = imm_excl_adata,
    fraction = 0.2,
    copy = True
    )

# Optional export of the downsampled object (currently disabled)
#down_sample_adata.write_h5ad("/home/james/data/immune_exclusion_data/ds_outer_combined_all4_dat.h5ad")

In [14]:
# Print the summary of the downsampled object to check its new size
print(down_sample_adata)

AnnData object with n_obs × n_vars = 26136 × 15785
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'SampleId', 'percent.mt', 'RNA_snn_res.0.5', 'seurat_clusters', 'celltype_each', 'celltype_annotation', 'exp_num', 'active_ident', 'Cell_Type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'IES', 'CD8T_core9', 'CD8T_cytokine', 'Major_cell_type', 'leiden', 'session_id', 'IES_status', 'scIES_status', 'IES_norm', 'Tumor_Type', 'Vantage_ID'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_counts', 'mean', 'std'
    uns: 'Cell_Type_colors', 'Tumor_Type_colors', 'exp_num_colors', 'leiden_dge', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    layers: 'arcsin'
    obsp: 'connectivities', 'distances'


In [15]:
# Observations per Cell_Type that remain after the random subsample
down_sample_adata.obs["Cell_Type"].value_counts()

Cholangiocytes                  4308
Fibroblasts                     4220
Macrophages                     3654
T cells                         2985
Plasma cells                    2806
B cells                         1350
Basal cells                     1315
Endothelial cells               1218
Goblet cells                    1174
Smooth muscle cells              641
Crypt cells                      492
NK cells                         373
Neutrophils                      361
T memory cells                   270
Mast cells                       245
Ductal cells                     159
Enterocytes                      117
B cells memory                   108
Alveolar macrophages              99
Adipocytes                        59
Dendritic cells                   53
Plasmacytoid dendritic cells      40
Schwann cells                     29
Enteric glia cells                23
Epithelial cells                  20
Pericytes                         10
Enteric neurons                    7
N

## **<span style="color:darkred">Proportional Down Sampling and Filter for low obs</span>**
**Description:** <br>
Starting from the downsampled data above, a cutoff is applied to remove noisy cell types: any cell type with fewer than 100 observations is dropped.

In [22]:
# Tally the number of cells per cell type in the downsampled data
cell_type_counts = down_sample_adata.obs['Cell_Type'].value_counts()

# Retain only the cell types observed at least 100 times
is_well_observed = cell_type_counts >= 100
cell_types_to_keep = cell_type_counts[is_well_observed].index

# Build an independent AnnData holding just the cells of the retained types
keep_cell = down_sample_adata.obs['Cell_Type'].isin(cell_types_to_keep)
adata_above_100 = down_sample_adata[keep_cell].copy()

# Optional export of the filtered object (currently disabled)
#adata_above_100.write_h5ad('/home/james/data/immune_exclusion_data/above100_ds_outer_combined_all4_dat.h5ad')

print(adata_above_100.obs['Cell_Type'].value_counts())

Cholangiocytes         4308
Fibroblasts            4220
Macrophages            3654
T cells                2985
Plasma cells           2806
B cells                1350
Basal cells            1315
Endothelial cells      1218
Goblet cells           1174
Smooth muscle cells     641
Crypt cells             492
NK cells                373
Neutrophils             361
T memory cells          270
Mast cells              245
Ductal cells            159
Enterocytes             117
B cells memory          108
Name: Cell_Type, dtype: int64


In [23]:
# Print the summary of the filtered object to check how many observations remain
print(adata_above_100)

AnnData object with n_obs × n_vars = 25796 × 15785
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'SampleId', 'percent.mt', 'RNA_snn_res.0.5', 'seurat_clusters', 'celltype_each', 'celltype_annotation', 'exp_num', 'active_ident', 'Cell_Type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'IES', 'CD8T_core9', 'CD8T_cytokine', 'Major_cell_type', 'leiden', 'session_id', 'IES_status', 'scIES_status', 'IES_norm', 'Tumor_Type', 'Vantage_ID'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_counts', 'mean', 'std'
    uns: 'Cell_Type_colors', 'Tumor_Type_colors', 'exp_num_colors', 'leiden_dge', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    layers: 'arcsin'
    obsp: 'connectivities', 'distances'


## **<span style="color:darkred">Count Based Down Sampling</span>**
**Description:** <br>
Provide a cutoff number: any cell type with more observations than the cutoff is randomly <br>
sampled down to the cutoff, and any cell type with no more observations than the cutoff is kept whole.

In [3]:
def count_subsample(adata, group_key, cutoff = 2000, random_state = None):
    """
    Performs count-based (capped) downsampling of an AnnData object.

    Every group in `adata.obs[group_key]` that has more than `cutoff` cells is
    randomly reduced to exactly `cutoff` cells (sampled without replacement);
    groups with `cutoff` cells or fewer are kept whole.

    Args:
        adata: The AnnData object.
        group_key: The key in `adata.obs` specifying the groups.
        cutoff: Maximum number of cells kept per group. Must be non-negative.
        random_state: Optional seed for a reproducible draw. When None (the
            default) NumPy's global random state is used, so the result
            depends on any earlier `np.random.seed` call.

    Returns:
        A new AnnData object with the subsampled cells, ordered group by group.
        Returns None if the group_key is invalid.

    Raises:
        ValueError: If `cutoff` is negative.
    """
    if group_key not in adata.obs.columns:
        print(f"Error: group_key '{group_key}' not found in adata.obs")
        return None

    if cutoff < 0:
        raise ValueError(f"cutoff must be non-negative, got {cutoff}")

    # The np.random module and a RandomState instance expose the same
    # `choice` API, so one code path serves both the seeded and unseeded case.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Look the label column up once instead of on every loop iteration.
    labels = adata.obs[group_key]
    subsampled_indices = []

    for group in labels.unique():
        # NOTE(review): a missing (NaN) label compares unequal to everything,
        # so cells without a label end up in no group and are not returned.
        group_cells = labels[labels == group].index

        if len(group_cells) > cutoff:
            group_cells = rng.choice(group_cells, size=cutoff, replace=False)

        subsampled_indices.extend(group_cells)

    return adata[subsampled_indices, :].copy()

In [4]:
# Seed NumPy's global generator first: count_subsample draws its cells through
# NumPy's random `choice`, so without a seed every run would select a
# different set of cells (and write a different file if the export is enabled).
np.random.seed(0)

# Cap every cell type at 2000 cells; cell types at or below the cap are kept whole
subsample_adata = count_subsample(
    adata = imm_excl_adata,
    group_key = 'Cell_Type',
    cutoff = 2000)

# Optional export of the count-capped object (currently disabled)
#subsample_adata.write_h5ad('/home/james/data/immune_exclusion_data/ds_2000_outer_combined_all4_dat.h5ad')

In [5]:
# Print the summary of the count-capped object to check its size
print(subsample_adata)

AnnData object with n_obs × n_vars = 31791 × 15785
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'SampleId', 'percent.mt', 'RNA_snn_res.0.5', 'seurat_clusters', 'celltype_each', 'celltype_annotation', 'exp_num', 'active_ident', 'Cell_Type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'IES', 'CD8T_core9', 'CD8T_cytokine', 'Major_cell_type', 'leiden', 'session_id', 'IES_status', 'scIES_status', 'IES_norm', 'Tumor_Type', 'Vantage_ID'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_counts', 'mean', 'std'
    uns: 'Cell_Type_colors', 'Tumor_Type_colors', 'exp_num_colors', 'leiden_dge', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    layers: 'arcsin'
    obsp: 'connectivities', 'distances'


In [6]:
# Observations per Cell_Type after capping; no cell type should exceed the cutoff of 2000
print(subsample_adata.obs['Cell_Type'].value_counts())

Endothelial cells               2000
Plasma cells                    2000
B cells                         2000
Macrophages                     2000
Basal cells                     2000
Cholangiocytes                  2000
Crypt cells                     2000
T cells                         2000
Goblet cells                    2000
Fibroblasts                     2000
Smooth muscle cells             2000
NK cells                        1862
Neutrophils                     1757
T memory cells                  1413
Mast cells                      1324
Ductal cells                     749
Enterocytes                      556
B cells memory                   501
Alveolar macrophages             439
Adipocytes                       298
Dendritic cells                  280
Plasmacytoid dendritic cells     153
Epithelial cells                 132
Schwann cells                    129
Enteric glia cells                97
Pericytes                         56
Enteric neurons                   45
N