In [1]:
import cupy as cp
import cupyx
import scanpy as sc
import spatialleiden as sl
import squidpy as sq
import numpy as np
from cupyx.scipy.sparse import csr_matrix
import os
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import random
import pandas as pd
# from rsc_functions.utility.applyqc import applyqc
# from rsc_functions.reports.plot import plot_spatial,plot_spatial_data, plot_dist
# from rsc_functions.utility.rank_genes_groups import return_markers,rank_genes_groups
# from rsc_functions.reports.plot import plot_expression


In [4]:
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Pancreatic_Cancer_paper_2024"

In [7]:
files = [file for file in os.listdir(path_016) if ".csv" in file]
files_row_counts = [file for file in files if "_raw_counts" in file]

In [20]:
len(files_row_counts)

30

In [17]:
start_file_id = [file.split("-")[0] for file in files_row_counts]
start_file_id_unique = set(start_file_id)
start_file_id_unique_dict = {file:0 for file in start_file_id_unique}
start_file_id_unique_dict

{'S22': 0, 'S18': 0, 'S21': 0, 'S14': 0, 'S20': 0, 'S13': 0, 'S17': 0}

In [18]:
for file in files_row_counts:
    start_file_id_unique_dict[file.split("-")[0]] += 1

In [19]:
start_file_id_unique_dict

{'S22': 1, 'S18': 5, 'S21': 10, 'S14': 2, 'S20': 5, 'S13': 5, 'S17': 2}

# Sample Quality Control

| Patient   | Sample Name   | Origin           | Spot before QC | Spots after QC | % Spots retained post QC |
|-----------|---------------|------------------|----------------|----------------|--------------------------|
| IU_PT1    | IU_PDA_T1     | Pancreas          | 3610           | 3530           | 97.78393352               |
| IU_PT2    | IU_PDA_HM2_2  | Liver             | 963            | 959            | 99.58463136               |
| IU_PT2    | IU_PDA_HM2    | Liver             | 2481           | 2478           | 99.87908102               |
| IU_PT2    | IU_PDA_T2     | Pancreas          | 4121           | 4118           | 99.92720214               |
| IU_PT2    | IU_PDA_NP2    | Normal Pancreas   | 2996           | 2995           | 99.96662216               |
| IU_PT3    | IU_PDA_T3     | Pancreas          | 4409           | 4354           | 98.7525516                |
| IU_PT3    | IU_PDA_HM3    | Liver             | 1185           | 1176           | 99.24050633               |
| IU_PT4    | IU_PDA_T4     | Pancreas          | 3864           | 3621           | 93.71118012               |
| IU_PT4    | IU_PDA_HM4    | Liver             | 1857           | 1841           | 99.13839526               |
| IU_PT5    | IU_PDA_HM5    | Liver             | 3042           | 3038           | 99.86850756               |
| IU_PT6    | IU_PDA_HM6    | Liver             | 3555           | 1666           | 46.86357243               |
| IU_PT6    | IU_PDA_T6     | Pancreas          | 3792           | 3397           | 89.58333333               |
| IU_PT6    | IU_PDA_LNM6   | Lymph node        | 3748           | 3745           | 99.91995731               |
| IU_PT7    | IU_PDA_LNM7   | Lymph node        | 3191           | 3186           | 99.84330931               |
| IU_PT8    | IU_PDA_T8     | Pancreas          | 3891           | 3779           | 97.12156258               |
| IU_PT8    | IU_PDA_LNM8   | Lymph node        | 3435           | 3407           | 99.18486172               |
| IU_PT8    | IU_PDA_HM8    | Liver             | 4044           | 4032           | 99.70326409               |
| IU_PT9    | IU_PDA_HM9    | Liver             | 2365           | 1908           | 80.67653277               |
| IU_PT9    | IU_PDA_T9     | Pancreas          | 3529           | 3526           | 99.91499008               |
| IU_PT10   | IU_PDA_HM10   | Liver             | 2978           | 2348           | 78.84486232               |
| IU_PT10   | IU_PDA_T10    | Pancreas          | 2793           | 2714           | 97.17150018               |
| IU_PT10   | IU_PDA_NP10   | Normal Pancreas   | 3010           | 2966           | 98.53820598               |
| IU_PT10   | IU_PDA_LNM10  | Lymph node        | 4173           | 4147           | 99.37694704               |
| IU_PT11   | IU_PDA_T11    | Pancreas          | 2841           | 2777           | 97.74727209               |
| IU_PT11   | IU_PDA_NP11   | Normal Pancreas   | 3913           | 3859           | 98.61998467               |
| IU_PT11   | IU_PDA_HM11   | Liver             | 3979           | 3931           | 98.79366675               |
| IU_PT12   | IU_PDA_LNM12  | Lymph node        | 3293           | 3213           | 97.57060431               |
| IU_PT12   | IU_PDA_T12    | Pancreas          | 3657           | 3642           | 99.58982773               |
| IU_PT12   | IU_PDA_HM12   | Liver             | 2962           | 2961           | 99.96623903               |
| IU_PT13   | IU_PDA_HM13   | Liver             | 3343           | 2182           | 65.27071493               |
| **Total** |               |                  | **97020**       | **91496**       | **94.30632859**            |


<center><h3>Tina expirment</center>
To analyze Tina's experiment, which investigates molecular changes in the tumor microenvironment (TME) across different treatments and time points in a B16 melanoma mouse model, I recommend a statistical approach that accounts for both fixed and random effects. Specifically, use a linear mixed-effects model with gene expression as the response variable. The fixed effects should include the treatment groups (mock, IL-15, IL-21, IL-15/IL-21 transduced T cells) and time points (Day 4 and Day 8), along with their interaction to assess whether the effect of treatment varies over time. The random effects should account for variability between individual mice by including random intercepts for each mouse, specified as (1 | mouse_id) in the model. This approach adjusts for the hierarchical structure of the data, where observations are nested within mice, and captures individual differences that could influence gene expression levels.

For the high-dimensional gene expression data, use specialized tools like DESeq2 or edgeR that are designed for RNA-seq analysis and can handle the count-based nature of the data. Apply appropriate normalization and multiple testing correction methods to control the false discovery rate. Integrate clinical data such as tumor volume, survival rates, and toxicity levels by performing correlation analyses or incorporating them into the model as covariates to explore their relationship with gene expression patterns.

This statistical strategy allows for robust identification of differentially expressed genes and pathways associated with the treatments while accounting for both the experimental design and biological variability. It provides a framework to uncover molecular insights into how different cytokine treatments affect the TME and correlate with tumor response, aligning with the primary objectives of Tina's research.

<center>Example from Nature Gentics 2024, Khaliq et al

To compare gene expression between two clusters, CC1 and CC3, across primary and secondary tumors, I will refer to the analysis approach described in the paper "Spatial transcriptomic analysis of primary and metastatic pancreatic cancers highlights tumor microenvironmental heterogeneity" (https://doi.org/10.1038/s41588-024-01914-4).

Goal:
The objective is to assess the differential gene expression between the two clusters (CC1 and CC3) in both the primary tumor and the metastatic (secondary) tumor environments. This comparison aims to uncover potential differences in the tumor microenvironment (TME) that contribute to cancer progression or metastasis.

By comparing the gene expression profiles of clusters CC1 and CC3 in primary and secondary tumors, this analysis can provide a detailed understanding of how the tumor microenvironment varies between these clusters and across different stages of cancer.

In [40]:
import re
import scanpy as sc
import squidpy as sq
from anndata import AnnData
import scipy.sparse as sp

In [17]:
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Pancreatic_Cancer_paper_2024"
files = [file for file in os.listdir(path_016) if ".csv" in file]
files_row_counts = [file for file in files if "_raw_counts" in file]
files_row_counts

['IU_PDA_T11_raw_counts.csv',
 'IU_PDA_T2_raw_counts.csv',
 'IU_PDA_LNM8_raw_counts.csv',
 'IU_PDA_LNM6_raw_counts.csv',
 'IU_PDA_HM8_raw_counts.csv',
 'IU_PDA_NP10_raw_counts.csv',
 'IU_PDA_HM10_raw_counts.csv',
 'IU_PDA_HM12_raw_counts.csv',
 'IU_PDA_LNM7_raw_counts.csv',
 'IU_PDA_HM9_raw_counts.csv',
 'IU_PDA_HM2_raw_counts.csv',
 'IU_PDA_NP2_raw_counts.csv',
 'IU_PDA_HM4_raw_counts.csv',
 'IU_PDA_LNM10_raw_counts.csv',
 'IU_PDA_T10_raw_counts.csv',
 'IU_PDA_HM3_raw_counts.csv',
 'IU_PDA_T3_raw_counts.csv',
 'IU_PDA_T6_raw_counts.csv',
 'IU_PDA_T4_raw_counts.csv',
 'IU_PDA_T12_raw_counts.csv',
 'IU_PDA_NP11_raw_counts.csv',
 'IU_PDA_HM2_2_raw_counts.csv',
 'IU_PDA_HM11_raw_counts.csv',
 'IU_PDA_T9_raw_counts.csv',
 'IU_PDA_T8_raw_counts.csv',
 'IU_PDA_HM13_raw_counts.csv',
 'IU_PDA_HM5_raw_counts.csv',
 'IU_PDA_T1_raw_counts.csv',
 'IU_PDA_LNM12_raw_counts.csv',
 'IU_PDA_HM6_raw_counts.csv']

In [4]:
files_with_HM = [file for file in files_row_counts if 'HM' in file]
hm_identifiers = set()
for file in files:
    match = re.search(r'HM\d+(_\d+)?', file)  # Regex to match 'HM' followed by digits (and optionally _digits)
    if match:
        hm_identifiers.add(match.group())  # Add the matched HM identifier to the set
hm_identifiers = [hm for hm in hm_identifiers]
hm_identifiers

['HM4',
 'HM11',
 'HM6',
 'HM10',
 'HM3',
 'HM9',
 'HM5',
 'HM2',
 'HM2_2',
 'HM13',
 'HM8',
 'HM12']

In [41]:
def agrrData(tissue_name = hm_identifiers[0]):
    file_sel = [file for file in files if tissue_name in file]
    def get_file(attrib,file_sel):
        file_ind = [file for file in file_sel if attrib in file]
        return file_ind[0]
    raw_counts_ind =  get_file(attrib = 'raw_counts',file_sel = file_sel)   
    xy_coordinates_ind = get_file(attrib = 'xy_coordinates',file_sel =file_sel)
    var_ind = get_file(attrib = 'var',file_sel =file_sel)
    cell_type_ind = get_file(attrib = 'cell_type',file_sel =file_sel)
    clusters_ind = get_file(attrib = 'clusters',file_sel =file_sel)
    # read data
    counts_df = pd.read_csv(os.path.join(path_016,raw_counts_ind),index_col=None)
    counts = counts_df.iloc[:,1:].to_numpy()
    del counts_df
    xy_coordinates = pd.read_csv(os.path.join(path_016,xy_coordinates_ind),index_col=None).iloc[:,1:].to_numpy()
    var = pd.read_csv(os.path.join(path_016,var_ind),index_col=None).iloc[:,1:].rename(columns = {"x":'id'})
    obs = pd.read_csv(os.path.join(path_016,cell_type_ind),index_col=None).iloc[:,1:].rename(columns = {'seurat_subset$first_type':'cell_type'})
    obs['clusters'] = pd.read_csv(os.path.join(path_016,clusters_ind),index_col=None).iloc[:,1:].rename(columns = {'seurat_subset$CompositionCluster_CC':'clusters'})
    andata = AnnData(counts.T,var = var ,obsm={"spatial": xy_coordinates}, obs = obs)
    andata.obsm['spatial'] = np.array(andata.obsm['spatial'], dtype=np.float64)
    andata.var = andata.var.set_index('id')
    andata.var.index.name = None
    andata.X = sp.csr_matrix(andata.X)
    return andata

In [42]:
andata_h0 = agrrData(tissue_name = hm_identifiers[0])
andata_h0



AnnData object with n_obs × n_vars = 1841 × 17893
    obs: 'cell_type', 'clusters'
    obsm: 'spatial'

In [47]:
import warnings
warnings.filterwarnings('ignore')
andata_h0 = agrrData(tissue_name = hm_identifiers[0])
andata_h1 = agrrData(tissue_name = hm_identifiers[1])
andata_h2 = agrrData(tissue_name = hm_identifiers[2])
andata_h3 = agrrData(tissue_name = hm_identifiers[3])
andata_h4 = agrrData(tissue_name = hm_identifiers[4])
andata_h5 = agrrData(tissue_name = hm_identifiers[5])
andata_h6 = agrrData(tissue_name = hm_identifiers[6])
#andata_h7 = agrrData(tissue_name = hm_identifiers[7])
andata_h8 = agrrData(tissue_name = hm_identifiers[8])
andata_h9 = agrrData(tissue_name = hm_identifiers[9])
andata_h10 = agrrData(tissue_name = hm_identifiers[10])
andata_h11 = agrrData(tissue_name = hm_identifiers[11])

In [48]:
adata_concat = andata_h0.concatenate(andata_h1)

In [49]:
adata_concat

AnnData object with n_obs × n_vars = 5772 × 17893
    obs: 'cell_type', 'clusters', 'batch'
    obsm: 'spatial'