# Subset Notebook

Work in progress: this notebook analyzes the clusters derived in the cluster notebook so that they can be used to create RAG vectors.

In [32]:
import warnings

from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning

# Ignore every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Numba ships its own warning categories. The blanket DeprecationWarning filter
# does not cover pending-deprecation categories, so both imported numba
# categories are filtered explicitly.
warnings.simplefilter("ignore", category=NumbaDeprecationWarning)
warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)

In [33]:
import scanpy as sc
import pandas as pd
import numpy as np

# import os
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# import celltypist
from celltypist import models
import scarches as sca

# import urllib.request

# Keep pandas PerformanceWarning messages out of the cell outputs.
warnings.filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)

# Apply scanpy's figure settings with a 5x5 figure size.
sc.set_figure_params(figsize=(5, 5))  # type: ignore

In [34]:
# Load the subset AnnData file from disk and display its summary.
adata = sc.read_h5ad(filename="data/subset.h5ad")
adata

AnnData object with n_obs × n_vars = 9370 × 31208
    obs: 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'DF_score', 'batch', 'size_factors', 'leiden_2'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable'

In [35]:
# List the distinct labels stored in the leiden_2 column of the cell table.
adata.obs.leiden_2.unique()

['15', '9', '6', '7', '0', ..., '3', '4', '18', '16', '17']
Length: 19
Categories (19, object): ['0', '1', '2', '3', ..., '15', '16', '17', '18']

In [36]:
# Keep only the cells whose leiden_2 label is "12" (all genes retained).
adata_1 = adata[adata.obs["leiden_2"].eq("12"), :]  # type: ignore

In [37]:
# NOTE(review): selecting cells does not change the gene table, so this
# presumably shows the stored mean_counts column rather than means recomputed
# for the selected cluster — confirm that this is the intended quantity.
adata_1.var["mean_counts"]

AL627309.1    0.001840
AL627309.5    0.010493
AL627309.4    0.000639
AP006222.2    0.000046
AL669831.2    0.000471
                ...   
AC004556.3    0.003391
AC233755.2    0.000715
AC233755.1    0.001323
AC007325.4    0.001270
AC007325.2    0.000122
Name: mean_counts, Length: 31208, dtype: float32

In [39]:
# Rows of the gene table flagged highly_variable, then the matching genes of adata.
b = adata.var.loc[adata.var["highly_variable"]]
adata[:, b.index]

View of AnnData object with n_obs × n_vars = 9370 × 4000
    obs: 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'DF_score', 'batch', 'size_factors', 'leiden_2'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable'

In [31]:
# Display the gene-table rows where the highly_variable flag is True.
v = adata.var
a = v["highly_variable"]
v.loc[a]

Unnamed: 0,gene_ids,feature_types,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,highly_variable
CCNL2,ENSG00000221978,Gene Expression,15878,0.161236,87.926852,21205.0,True
MRPL20,ENSG00000242485,Gene Expression,12320,0.118709,90.632247,15612.0,True
CDK11A,ENSG00000008128,Gene Expression,12018,0.116329,90.861879,15299.0,True
GNB1,ENSG00000078369,Gene Expression,32797,0.450907,75.062160,59301.0,True
SKI,ENSG00000157933,Gene Expression,12582,0.129521,90.433030,17034.0,True
...,...,...,...,...,...,...,...
MT-ND4L,ENSG00000212907,Gene Expression,11561,0.155077,91.209368,20395.0,True
MT-ND4,ENSG00000198886,Gene Expression,37838,0.838954,71.229137,110335.0,True
MT-ND5,ENSG00000198786,Gene Expression,37477,0.704231,71.503631,92617.0,True
MT-ND6,ENSG00000198695,Gene Expression,12662,0.142463,90.372201,18736.0,True
