# Subset Notebook

This notebook works towards analyzing the clusters derived in the cluster notebook so that they can be used to create RAG vectors.

In [1]:
import warnings

# import numba
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning

# Ignore every DeprecationWarning for the rest of this session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Mute both Numba warning classes imported above. The pending variant needs its
# own filter: it is a PendingDeprecationWarning, which the DeprecationWarning
# filter does not match.
warnings.simplefilter("ignore", category=NumbaDeprecationWarning)
warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)

In [2]:
from collections import Counter

import scanpy as sc
import pandas as pd
import numpy as np

# import os
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# import celltypist
from celltypist import models
import scarches as sca

# import urllib.request

# Suppress pandas PerformanceWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Pass a 5 x 5 default figure size to scanpy's figure settings.
sc.set_figure_params(figsize=(5, 5))  # type: ignore

In order to use the mouse gastrulation seqFISH datsets, please install squidpy (see https://github.com/scverse/squidpy).
  from .autonotebook import tqdm as notebook_tqdm
In order to use sagenet models, please install pytorch geometric (see https://pytorch-geometric.readthedocs.io) and 
 captum (see https://github.com/pytorch/captum).
mvTCR is not installed. To use mvTCR models, please install it first using "pip install mvtcr"
multigrate is not installed. To use multigrate models, please install it first using "pip install multigrate".


In [3]:
# Read the dataset used throughout this notebook from a path relative to the
# working directory; the bare `adata` below makes the notebook display its summary.
adata = sc.read_h5ad("data/subset.h5ad")
adata

AnnData object with n_obs × n_vars = 9370 × 31208
    obs: 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'DF_score', 'batch', 'size_factors', 'leiden_2'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable'

In [4]:
# Show the distinct labels present in the "leiden_2" column of the obs table.
adata.obs["leiden_2"].unique()

['15', '9', '6', '7', '0', ..., '3', '4', '18', '16', '17']
Length: 19
Categories (19, object): ['0', '1', '2', '3', ..., '15', '16', '17', '18']

In [5]:
# Keep only the observations whose "leiden_2" label is "12".
adata_1 = adata[adata.obs["leiden_2"] == "12"]  # type: ignore

In [6]:
# Display the "mean_counts" column of the var table for the label-"12" selection.
# NOTE(review): selecting observations presumably does not recompute var columns,
# so these may still be the means over the full dataset rather than over this
# selection — confirm before interpreting them per cluster.
adata_1.var.mean_counts

AL627309.1    0.001840
AL627309.5    0.010493
AL627309.4    0.000639
AP006222.2    0.000046
AL669831.2    0.000471
                ...   
AC004556.3    0.003391
AC233755.2    0.000715
AC233755.1    0.001323
AC007325.4    0.001270
AC007325.2    0.000122
Name: mean_counts, Length: 31208, dtype: float32

In [7]:
# Rows of the var table flagged in the "highly_variable" column ...
b = adata.var.loc[adata.var["highly_variable"]]
# ... and the dataset restricted to exactly those variables (all observations kept).
hvar = adata[:, b.index]

In [8]:
# Distinct "leiden_2" labels as strings, ordered numerically rather than
# lexicographically (so "2" sorts before "10").
clusters = [
    str(number)
    for number in sorted(map(int, adata.obs["leiden_2"].unique()))
]

In [9]:
from typing import Any


# Map each cluster label to the slice of `hvar` containing only the
# observations that carry that label.
cluster_table: dict[str, Any] = {}
leiden_labels = hvar.obs["leiden_2"]
for cluster in clusters:
    subset = hvar[leiden_labels == cluster]
    cluster_table[cluster] = subset

print(f"Length of cluster table: {len(cluster_table)}")

Length of cluster table: 19


In [34]:
# Take a copy of the entry stored for cluster "0" so it can be inspected on its own.
c0 = cluster_table["0"].copy()

In [35]:
# Display the obs table of the cluster "0" copy.
c0.obs

Unnamed: 0,n_genes_by_counts,total_counts,pct_counts_in_top_20_genes,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,total_counts_hb,pct_counts_hb,outlier,mt_outlier,DF_score,batch,size_factors,leiden_2
TGACCAAGTAGACAAA,297,347.0,18.443804,12.0,3.458213,20.0,5.763689,1.0,0.288184,False,False,Singlet,12,0.387125,0
CGGCCATAGCGAGCGA,253,298.0,20.805369,3.0,1.006711,10.0,3.355705,1.0,0.335570,False,False,Singlet,12,0.263221,0
AACCCGCAGCATGTTA,485,606.0,15.841584,14.0,2.310231,30.0,4.950495,3.0,0.495050,False,False,Singlet,12,0.587172,0
AATCATCCAGTTTACG,387,457.0,15.098468,10.0,2.188184,16.0,3.501094,2.0,0.437637,False,False,Singlet,12,0.496341,0
TGAAGGATCGTTACTT,252,284.0,16.901408,8.0,2.816901,27.0,9.507042,1.0,0.352113,False,False,Singlet,12,0.306024,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CGGTGAGAGCACTAGT,306,344.0,15.697674,12.0,3.488372,20.0,5.813954,2.0,0.581395,False,False,Singlet,12,0.367999,0
GATCAAGAGGTTAGAG,248,288.0,19.791667,5.0,1.736111,12.0,4.166667,0.0,0.000000,False,False,Singlet,12,0.269336,0
GCATGAGCACAGCCAT,252,308.0,21.428571,16.0,5.194805,30.0,9.740259,1.0,0.324675,False,False,Singlet,12,0.249449,0
TTGACATCAGCAAGTG,249,289.0,19.723183,3.0,1.038062,20.0,6.920415,0.0,0.000000,False,False,Singlet,12,0.314412,0


In [82]:
cell_count, gene_count = c0.shape
print(f"{cell_count} cells, {gene_count} genes")

# gene name -> number of cells of `c0` in which that gene has a value above zero
gene_table = {}
for cell_no in range(cell_count):
    b = c0.X[cell_no] > 0.0
    genes = c0.var.index[b]
    for gene in genes:
        gene_table[gene] = gene_table.get(gene, 0) + 1

# Most frequently detected genes first; the sort is stable, so genes with equal
# counts stay in the order in which they were first encountered.
gene_table = dict(sorted(gene_table.items(), key=lambda entry: entry[1], reverse=True))

# Report the 20 leading genes with their cell count and share of all cells.
for gene in list(gene_table)[:20]:
    print(f"{gene}:{gene_table[gene]} ({gene_table[gene]/cell_count*100:4.1f}%)")
            
    

1232 cells, 4000 genes
MALAT1:1232 (100.0%)
B2M:1159 (94.1%)
MT-CO1:1070 (86.9%)
PTPRC:996 (80.8%)
MT-ATP6:875 (71.0%)
PLCG2:804 (65.3%)
HLA-B:792 (64.3%)
MT-CO3:737 (59.8%)
MBNL1:704 (57.1%)
HBB:646 (52.4%)
MT-CYB:642 (52.1%)
GNLY:636 (51.6%)
ACTB:619 (50.2%)
MT-CO2:613 (49.8%)
MT-ND4:604 (49.0%)
ARHGAP15:594 (48.2%)
ZEB2:571 (46.3%)
TXNIP:569 (46.2%)
EEF1A1:563 (45.7%)
UTRN:563 (45.7%)


In [93]:
import anndata as ad
def calculate_highest_frequency_genes(
    adata: "ad.AnnData", number_of_genes: int = 20, verbose: bool = False
) -> list[str]:
    """Return the genes that are detected in the largest number of cells.

    A gene counts as detected in a cell when its entry in ``adata.X`` is
    strictly greater than zero. Genes are ranked by the number of cells in
    which they are detected, highest first. Genes with equal counts keep the
    order in which they were first encountered while scanning the cells row
    by row.

    Args:
        adata: Object exposing ``shape`` as ``(cells, genes)``, a row-indexable
            matrix ``X`` (dense, or scipy sparse; sparse rows are densified one
            at a time) and the gene names in ``var.index``.
        number_of_genes: Upper bound on the number of gene names returned.
        verbose: When true, print the matrix dimensions and, for every
            returned gene, its cell count and the percentage of cells.

    Returns:
        Up to ``number_of_genes`` gene names, most frequently detected first.
        Genes that are not detected in any cell are never included.
    """
    cell_count, gene_count = adata.shape
    if verbose:
        print(f"{cell_count} cells, {gene_count} genes")

    # Looked up once: the names do not change while the rows are scanned.
    gene_names = adata.var.index
    detections: Counter = Counter()
    for cell_no in range(cell_count):
        row = adata.X[cell_no]
        if hasattr(row, "toarray"):
            # A scipy sparse row cannot be used as a boolean mask on the index.
            row = row.toarray()
        expressed = np.asarray(row).ravel() > 0.0
        detections.update(gene_names[expressed])

    # sorted() is stable, so ties remain in first-encountered order.
    ranked = sorted(detections.items(), key=lambda item: item[1], reverse=True)
    top = ranked[0:number_of_genes]
    if verbose:
        for gene, n_cells in top:
            print(f"{gene}:{n_cells} ({n_cells/cell_count*100:4.1f}%)")
    return [gene for gene, _ in top]
            

In [94]:
# Top genes for the cluster "0" copy, using the default of 20 genes.
calculate_highest_frequency_genes(c0)

['MALAT1',
 'B2M',
 'MT-CO1',
 'PTPRC',
 'MT-ATP6',
 'PLCG2',
 'HLA-B',
 'MT-CO3',
 'MBNL1',
 'HBB',
 'MT-CYB',
 'GNLY',
 'ACTB',
 'MT-CO2',
 'MT-ND4',
 'ARHGAP15',
 'ZEB2',
 'TXNIP',
 'EEF1A1',
 'UTRN']

In [95]:
# Display the numerically ordered cluster labels that the loop below iterates over.
clusters

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18']

In [97]:
# Print the most frequently detected genes for every cluster, in label order.
for cluster in clusters:
    # Work on a copy of the stored cluster entry rather than on the entry itself.
    cdata = cluster_table[cluster].copy()
    gene_list = calculate_highest_frequency_genes(cdata)
    print(f"Cluster:{cluster}. Genes: {gene_list}")

Cluster:0. Genes: ['MALAT1', 'B2M', 'MT-CO1', 'PTPRC', 'MT-ATP6', 'PLCG2', 'HLA-B', 'MT-CO3', 'MBNL1', 'HBB', 'MT-CYB', 'GNLY', 'ACTB', 'MT-CO2', 'MT-ND4', 'ARHGAP15', 'ZEB2', 'TXNIP', 'EEF1A1', 'UTRN']
Cluster:1. Genes: ['MALAT1', 'B2M', 'MT-CO1', 'PTPRC', 'PLCG2', 'MBNL1', 'HLA-B', 'MT-CO3', 'MT-ATP6', 'ARHGAP15', 'UTRN', 'ZEB2', 'ACTB', 'PRKCH', 'DDX5', 'RABGAP1L', 'SYNE2', 'SYNE1', 'FYN', 'HBB']
Cluster:2. Genes: ['MALAT1', 'B2M', 'MT-CO1', 'PLCG2', 'PTPRC', 'ARHGAP15', 'HLA-B', 'MBNL1', 'EEF1A1', 'MT-ATP6', 'MT-CO3', 'MT-CYB', 'HBB', 'MT-ND4', 'PARP8', 'ACTB', 'MT-CO2', 'RORA', 'RPL11', 'IL7R']
Cluster:3. Genes: ['MALAT1', 'B2M', 'MT-CO1', 'PLCG2', 'PTPRC', 'HLA-B', 'MT-ATP6', 'MT-CO3', 'HBB', 'MBNL1', 'GNLY', 'ACTB', 'S100A4', 'MT-CYB', 'MT-ND4', 'ZEB2', 'EEF1A1', 'DDX5', 'RPL41', 'MT-CO2']
Cluster:4. Genes: ['MALAT1', 'MT-CO1', 'B2M', 'PLCG2', 'ARHGAP15', 'PTPRC', 'MBNL1', 'FOXP1', 'MT-CO3', 'HBB', 'MT-ATP6', 'MT-CYB', 'MT-ND4', 'HLA-B', 'EEF1A1', 'ANKRD44', 'ZBTB20', 'MT-CO2', 