# Subset Notebook

Working towards analyzing clusters derived in the cluster notebook so that they can be used to create RAG vectors

In [1]:
import warnings

# import numba
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning

warnings.filterwarnings("ignore", category=DeprecationWarning)

warnings.simplefilter("ignore", category=NumbaDeprecationWarning)

In [20]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad

# import os
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# import celltypist
from celltypist import models
import scarches as sca

# import urllib.request

warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

sc.set_figure_params(figsize=(5, 5))  # type: ignore

In [11]:
adata = sc.read_h5ad("data/subset.h5ad")
adata

AnnData object with n_obs × n_vars = 9370 × 31208
    obs: 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'DF_score', 'batch', 'size_factors', 'leiden_2'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable'

In [28]:
def get_highly_variable_genes(adata: ad.AnnData) -> ad.AnnData:
    b = adata.var[adata.var.highly_variable]
    return adata[:, b.index]

hvar = get_highly_variable_genes(adata)

In [29]:
def get_cluster_names(adata: ad.AnnData, criterion="leiden_2") -> list[str]:
    clusters = [
        str(x) for x in sorted([int(cluster) for cluster in adata.obs[criterion].unique()])
    ]
    return clusters
print(get_cluster_names(hvar))

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']


In [30]:
from typing import Any

def partition_clusters(adata: ad.AnnData, criterion="leiden_2") -> dict[str, ad.AnnData]:
    clusters = get_cluster_names(adata, criterion)
    cluster_table: dict[str, Any] = {}
    for cluster in clusters:
        subset = adata[adata.obs[criterion] == cluster]
        cluster_table[cluster] = subset.copy()
    return cluster_table

cluster_table = partition_clusters(hvar)
print(f"Length of cluster table: {len(cluster_table)}")
assert isinstance(cluster_table[list(cluster_table.keys())[0]], ad.AnnData) # ensure that we are dealing with copies, not slices

Length of cluster table: 19


In [31]:
def calculate_highest_frequency_genes(adata: ad.AnnData, number_of_genes:int = 20, verbose=False) -> None:
    cell_count, gene_count = adata.shape
    if verbose:
        print(f"{cell_count} cells, {gene_count} genes")
    gene_table = {}
    
    for cell_no in range(cell_count):
        b = adata.X[cell_no] > 0.0
        genes = adata.var.index[b]
        for gene in genes:
            if gene in gene_table:
                gene_table[gene] += 1
            else:
                gene_table[gene] = 1
    gene_table = dict(sorted(gene_table.items(), key=lambda x:x[1], reverse=True))

    gene_list = list(gene_table.keys())[0:number_of_genes]
    if verbose:
        for gene in gene_list:
            print(f"{gene}:{gene_table[gene]} ({gene_table[gene]/cell_count*100:4.1f}%)")
    return gene_list
            

In [32]:
def calculate_gene_signature_per_cluster(cluster_table: dict[str, ad.AnnData], 
                                         genes_per_cluster=25,
                                         repeat_limit=5,
                                        )-> dict[str:list[str]]:
    gene_dict = {}
    for cluster in cluster_table:
        cdata = cluster_table[cluster]
        gene_list = calculate_highest_frequency_genes(cdata, genes_per_cluster)
        # print(f"Cluster:{cluster}. Genes: {gene_list}")
        for gene in gene_list:
            if gene in gene_dict:
                gene_dict[gene].append(cluster)
            else:
                gene_dict[gene] = [cluster]
        # eliminate genes that are present "everywhere"
        gene_dict = {k:v for k,v in gene_dict.items() if len(v) < repeat_limit}

    cluster_dict = {k:list() for k in cluster_table}
    for gene in gene_dict:
        clusters = gene_dict[gene]
        for cluster in clusters:
            cluster_dict[cluster].append(gene)
    return cluster_dict

cluster_dict = calculate_gene_signature_per_cluster(cluster_table)
for cluster in cluster_dict:
    print(f"{cluster}:{cluster_dict[cluster]}")


0:['GNLY', 'TXNIP', 'S100A4', 'PRKCH']
1:['GNLY', 'PRKCH', 'SYNE2', 'SYNE1', 'FYN', 'RIPOR2']
2:['PARP8', 'RORA', 'RPL11', 'IL7R', 'CBLB', 'RPL10']
3:['GNLY', 'TXNIP', 'S100A4', 'SYNE2', 'HLA-A']
4:['RIPOR2', 'ZBTB20', 'SMCHD1', 'SKAP1', 'DOCK10', 'PDE3B']
5:['SMCHD1', 'BANK1', 'BACH2', 'RALGPS2', 'FCHSD2', 'ZCCHC7', 'LYN', 'ADK', 'CAMK2D', 'MEF2C']
6:['RIPOR2', 'ZBTB20', 'SMCHD1', 'BANK1', 'BACH2', 'RALGPS2', 'FCHSD2', 'ZCCHC7', 'LYN', 'ADK', 'CAMK2D', 'MEF2C']
7:['PRKCH', 'FYN', 'PARP8', 'CBLB', 'SKAP1', 'HLA-B', 'AOAH', 'ATP8A1']
8:['BACH2', 'ZCCHC7', 'LYN', 'PDE4D', 'PCDH9', 'ARID1B', 'SSBP2', 'TCF4', 'EBF1', 'TMEM131L', 'ACSM3']
9:['AOAH', 'NEAT1', 'NAMPT', 'DPYD', 'VCAN', 'ANXA1', 'ARHGAP26', 'PLXDC2', 'FOS', 'QKI', 'LYST', 'SIPA1L1', 'MED13L', 'ATP2B1', 'JMJD1C', 'HIF1A', 'LRMDA', 'STK17B']
10:['RPL11', 'NFIA', 'TFRC', 'SLC25A21', 'RPLP1', 'USP15', 'FTH1', 'RPL14', 'RPS23', 'RPS8', 'ITGA4', 'CD36', 'RPL5']
11:['TFRC', 'SLC25A21', 'USP15', 'CD36', 'SOX6', 'SPTA1', 'SLC25A37', 'HB