## 06 - Evaluate Phenotypic Volume
Combine data from traced cohorts (IGO15600, IGO15601, IGO15771, IGO16686, IGO17402, IGO17543) and non-traced cohorts (IGO15488_1_2, IGO16318) to evaluate phenotypic diversity.

8/26/2025 - Reran with latest libraries to ensure consistency

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
from joblib import dump, load

import math
import matplotlib
from matplotlib import pyplot as plt

from pathlib import Path
import anndata

In [2]:
# Figure text settings: leave SVG text as text and use Arial everywhere.
plt.rcParams.update({
    'svg.fonttype': 'none',
    "font.family": "Arial",
})
#plt.rcParams['figure.figsize'] = (3,4)

In [3]:
sc.settings.verbosity = 3             # scanpy verbosity scale: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()             # print the scanpy logging header into the cell output
sc.settings.set_figure_params(dpi=80, facecolor='white')
np.random.seed(1573)   # fixed seed for NumPy's global RNG so the np.random draws made later in this notebook are reproducible

In [4]:
# Output folders: ./figures receives generated figures, ./write receives h5ad files.
for out_dir in ("./figures", "./write"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

In [5]:
# Gene-signature tables used for cluster scoring ('Cluster 5' is the HPCS key).
import csv

# Empty placeholders; resetClusters() returns the real values.
clusters, clusterkeys, HPCS = {}, [], False

def resetClusters(hpcs = 'cell2020'):
    """Load the gene-signature clusters for the requested signature set.

    Parameters
    ----------
    hpcs : str
        Name of the signature set. Only ``'cell2020'`` is supported; it is
        read from ``../common_files/clusters_cell2020.csv``, where the first
        cell of each row is the cluster name and the remaining non-empty
        cells are that cluster's genes.

    Returns
    -------
    tuple
        ``(clusters, clusterkeys, HPCS)`` where ``clusters`` maps each
        cluster name found in the CSV to its gene list, ``clusterkeys`` is
        ``['Cluster 1', ..., 'Cluster 12']`` and ``HPCS`` is ``'Cluster 5'``.

    Raises
    ------
    ValueError
        If ``hpcs`` is not a supported signature set. The message is also
        printed, as before.
    """
    # Validate up front rather than raising inside a try block only to
    # catch the same exception again.
    if hpcs != 'cell2020':
        message = "%s is an invalid choice" % hpcs
        print(message)
        raise ValueError(message)

    clusters = {}
    # newline='' is the csv module's recommended way to open its input.
    with open('../common_files/clusters_cell2020.csv', encoding='utf-8-sig', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # A blank line yields an empty row; skip it instead of
                # failing on row[0].
                continue
            clusters[row[0]] = [x for x in row[1:] if x != '']

    clusterkeys = ['Cluster %i' % i for i in range(1, 13)]
    return (clusters, clusterkeys, 'Cluster 5')

# Fill the module-level signature tables from the 'cell2020' set.
clusters, clusterkeys, HPCS = resetClusters('cell2020')

In [6]:
# Repeats the call that ends the previous cell; re-running it reloads the
# same 'cell2020' signature tables.
clusters, clusterkeys, HPCS = resetClusters('cell2020')

In [7]:
def scoreAndPlot(ad, excludeList = None, groupby="Classification",rotation=90,numgenes=25,ctlgenes=25):
    """Score every signature cluster on ``ad`` and plot the scores.

    For each selected key in the module-level ``clusterkeys`` the first
    ``numgenes`` genes of ``clusters[key]`` are scored with
    ``sc.tl.score_genes`` (score stored under the key itself), then the
    scores are shown on a UMAP and as a dot plot grouped by ``groupby``.

    Parameters
    ----------
    ad
        Object handed to ``sc.tl.score_genes``, ``sc.pl.umap`` and
        ``sc.pl.dotplot`` (presumably an AnnData with a UMAP already
        computed -- confirm against callers).
    excludeList
        Optional collection of cluster keys to leave out of scoring and
        plotting. ``None`` (the default) uses every key.
        NOTE(review): previously any non-None value made the function a
        silent no-op; the element type (cluster key strings) is assumed
        from the parameter name -- confirm.
    groupby : str
        ``obs`` column used to group the dot plot.
    rotation
        Unused; kept so existing calls keep working.
    numgenes : int
        Number of leading genes taken from each signature.
    ctlgenes : int
        ``ctrl_size`` for ``sc.tl.score_genes``; raised to ``numgenes``
        when smaller.
    """
    #cmap = 'Reds' #colormap
    cmap = 'jet' #colormap

    if excludeList is None:
        keys = list(clusterkeys)
    else:
        keys = [key for key in clusterkeys if key not in excludeList]
    if not keys:
        # Nothing to score, so there is nothing to plot either.
        return

    # The control set is never smaller than the signature itself; this does
    # not depend on the cluster, so compute it once.
    ctlgenes = max(ctlgenes, numgenes)
    for key in keys:
        sc.tl.score_genes(ad, clusters[key][0:numgenes], score_name=key, ctrl_size=ctlgenes)

    sc.pl.umap(ad, color=keys, color_map=cmap)
    sc.pl.dotplot(ad, keys, groupby=groupby, swap_axes=True)

In [8]:
from scipy.stats import ranksums

def _dense_1d(values):
    """Return ``values`` (sparse or dense, any shape) as a flat NumPy array."""
    if hasattr(values, 'toarray'):
        values = values.toarray()
    return np.asarray(values).ravel()


def HPCSViolinPlot(ad, cluster='0', groupby='leiden', score='Cluster5', save=None, singleGene=False):
    """Violin plot of ``score`` for one cluster versus all other cells.

    Cells whose ``ad.obs[groupby]`` equals ``cluster`` are labelled
    ``'HPCS'`` and all others ``'not HPCS'`` in ``ad.obs['Cl5']`` (this
    column is overwritten on ``ad``). The two groups are compared with a
    Wilcoxon rank-sum test and the p-value is shown as the x-axis label.

    Parameters
    ----------
    ad
        Object with ``.obs`` and ``.X`` that ``sc.pl.violin`` accepts
        (presumably an AnnData -- confirm against callers).
    cluster
        Value of ``ad.obs[groupby]`` that defines the ``'HPCS'`` group.
    groupby : str
        ``obs`` column holding the cluster assignment.
    score : str
        ``obs`` column to compare, or a variable name when ``singleGene``
        is true.
        NOTE(review): the default is ``'Cluster5'`` (no space), whereas the
        cluster keys produced elsewhere in this file are ``'Cluster N'``
        with a space -- confirm the default is intended.
    save
        Forwarded to ``sc.pl.violin``.
    singleGene : bool
        If true, compare the values of variable ``score`` taken from
        ``ad.X``; otherwise compare ``ad.obs[score]``.
    """
    ad.obs['Cl5'] = 'not HPCS'
    ad.obs.loc[ad.obs[groupby].isin([cluster]), 'Cl5'] = 'HPCS'
    is_hpcs = ad.obs['Cl5'].isin(['HPCS'])

    if singleGene:
        # X may be sparse or dense; flatten the single column to 1-D so the
        # test returns a scalar p-value instead of a one-element array.
        inside = _dense_1d(ad[is_hpcs][:, score].X)
        outside = _dense_1d(ad[~is_hpcs][:, score].X)
    else:
        inside = ad.obs.loc[is_hpcs, score]
        outside = ad.obs.loc[~is_hpcs, score]

    pvalue = ranksums(inside, outside)[1]
    sc.pl.violin(ad, score, groupby='Cl5', xlabel = 'p = ' + str(pvalue), save=save)

## Define cells to be only traced cells for PV calculation

In [9]:
# Load the combined dataset from the ./write folder.
adata = anndata.read_h5ad('write/combined_data.h5ad')

In [10]:
# PV must use the same variables for every cell and batch:
# 1) keep only variables whose value in X is non-zero in every cell,
# 2) then keep only those flagged as highly variable.
nonzero_in_every_cell = np.all(adata.X != 0, axis=0)
non_zero_vars = adata.var_names[nonzero_in_every_cell]
adata = adata[:, non_zero_vars]
adata = adata[:, adata.var['highly_variable']]

In [11]:
def calcPV(matrix, tol=0.0):
    """Return the phenotypic volume of an expression matrix.

    The phenotypic volume is the sum of ``log2`` of the eigenvalues of the
    covariance matrix between the columns of ``matrix`` that are greater
    than ``tol``.

    Parameters
    ----------
    matrix
        2-D array with observations in rows and variables in columns.
        Dense arrays and SciPy sparse matrices are both accepted.
    tol : float
        Eigenvalues less than or equal to this are ignored. The default of
        0 keeps the original behaviour. When fewer rows than columns are
        supplied the covariance matrix is rank-deficient and round-off can
        leave tiny positive eigenvalues that pass a ``> 0`` test; a small
        positive ``tol`` excludes them.

    Returns
    -------
    numpy.float64
        Sum of ``log2`` over the retained eigenvalues (0.0 if none).
    """
    if hasattr(matrix, 'toarray'):
        # np.cov cannot consume SciPy sparse matrices directly.
        matrix = matrix.toarray()
    dense = np.asarray(matrix, dtype=float)
    # rowvar=False treats columns as variables, as np.cov(matrix.T) did;
    # atleast_2d keeps a single-column input valid for eigvalsh.
    gene_cov = np.atleast_2d(np.cov(dense, rowvar=False))
    eig_vals = np.linalg.eigvalsh(gene_cov)
    return np.log2(eig_vals[eig_vals > tol]).sum()

In [12]:
# Number of cells per Group.
adata.obs['Group'].value_counts()

Group
Saline                10660
Hopx_GFP+mScarlet+     5660
Hopx_GFP+mScarlet-     4846
KPT                    1903
Hopx_12wk3d            1226
14wk                    991
Hopx_12wk14d            911
12wk_3d                 877
8wk                     742
6wk_3d                  586
Name: count, dtype: int64

## 14 wk PV

In [13]:
from scipy.stats import mannwhitneyu

In [14]:
repeats = 1000  # number of random 100-cell draws made for each PV distribution below

In [15]:
# PV distribution for all cells of Group '12wk_3d': `repeats` draws of
# 100 cells each, sampled with replacement.
in_group = adata.obs['Group'].isin(['12wk_3d'])
matrix = adata[in_group, adata.var['highly_variable']].X.copy()
dist1 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [16]:
# PV distribution for HPCS cells of Group '14wk' (100 cells per draw,
# with replacement).
is_hpcs = adata.obs['cell type'].isin(['HPCS'])
is_14wk = adata.obs['Group'].isin(['14wk'])
matrix = adata[is_hpcs & is_14wk, adata.var['highly_variable']].X.copy()
dist2 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [17]:
# PV distribution for all cells of Group '14wk' (100 cells per draw,
# with replacement).
is_14wk = adata.obs['Group'].isin(['14wk'])
matrix = adata[is_14wk, adata.var['highly_variable']].X.copy()
dist3 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [18]:
# PV distribution for cells of batch 'untraced_14wk' (100 cells per draw,
# with replacement).
in_batch = adata.obs['batch'].isin(['untraced_14wk'])
matrix = adata[in_batch, adata.var['highly_variable']].X.copy()
dist4 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [19]:
# PV distribution for all cells of Group 'Hopx_12wk3d' (100 cells per
# draw, with replacement).
in_group = adata.obs['Group'].isin(['Hopx_12wk3d'])
matrix = adata[in_group, adata.var['highly_variable']].X.copy()
dist5 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [20]:
# PV distribution for all cells of Group 'Hopx_12wk14d' (100 cells per
# draw, with replacement).
in_group = adata.obs['Group'].isin(['Hopx_12wk14d'])
matrix = adata[in_group, adata.var['highly_variable']].X.copy()
dist6 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [21]:
# PV distribution for AT1-like cells of Group 'Hopx_12wk14d' (100 cells
# per draw, with replacement).
is_at1_like = adata.obs['cell type'].isin(['AT1-like'])
is_hopx_14d = adata.obs['Group'].isin(['Hopx_12wk14d'])
matrix = adata[is_at1_like & is_hopx_14d, adata.var['highly_variable']].X.copy()
dist7 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [22]:
# PV distribution for HPCS cells of Group 'Hopx_12wk14d' (100 cells per
# draw, with replacement).
is_hpcs = adata.obs['cell type'].isin(['HPCS'])
is_hopx_14d = adata.obs['Group'].isin(['Hopx_12wk14d'])
matrix = adata[is_hpcs & is_hopx_14d, adata.var['highly_variable']].X.copy()
dist8 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [23]:
# PV distribution for AT1-like cells of Group '14wk' (100 cells per draw,
# with replacement).
is_at1_like = adata.obs['cell type'].isin(['AT1-like'])
is_14wk = adata.obs['Group'].isin(['14wk'])
matrix = adata[is_at1_like & is_14wk, adata.var['highly_variable']].X.copy()
dist9 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [24]:
# One column per PV distribution, written to the Fig. 2e workbook.
pv_columns_14wk = [
    ('All@12wk_3d', dist1),
    ('HPCS@14wk', dist2),
    ('AT1-like@14wk', dist9),
    ('All@14wk', dist3),
    ('Untraced@14wk', dist4),
    ('Hopx_12wk_3d', dist5),
    ('Hopx_12wk_14d', dist6),
    ('AT1-like@Hopx12wk_14d', dist7),
    ('HPCS@Hopx12wk_14d', dist8),
]
pv_table_14wk = pd.DataFrame(
    np.column_stack([draws for _, draws in pv_columns_14wk]),
    columns=[label for label, _ in pv_columns_14wk],
)
pv_table_14wk.to_excel('figures/Fig2e_PV_hpcs_14wk_Hopx_combined.xlsx')

## 8 wk PV

In [25]:
# PV distribution for all cells of Group '6wk_3d' (100 cells per draw,
# with replacement).
in_group = adata.obs['Group'].isin(['6wk_3d'])
matrix = adata[in_group, adata.var['highly_variable']].X.copy()
dist10 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [26]:
# PV distribution for HPCS cells of Group '8wk' (100 cells per draw,
# with replacement).
is_hpcs = adata.obs['cell type'].isin(['HPCS'])
is_8wk = adata.obs['Group'].isin(['8wk'])
matrix = adata[is_hpcs & is_8wk, adata.var['highly_variable']].X.copy()
dist11 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [27]:
# PV distribution for AT1-like cells of Group '8wk' (100 cells per draw,
# with replacement).
is_at1_like = adata.obs['cell type'].isin(['AT1-like'])
is_8wk = adata.obs['Group'].isin(['8wk'])
matrix = adata[is_at1_like & is_8wk, adata.var['highly_variable']].X.copy()
dist12 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [28]:
# PV distribution for all cells of Group '8wk' (100 cells per draw,
# with replacement).
is_8wk = adata.obs['Group'].isin(['8wk'])
matrix = adata[is_8wk, adata.var['highly_variable']].X.copy()
dist13 = [
    calcPV(matrix[np.random.choice(matrix.shape[0], 100)])
    for _ in range(repeats)
]

In [29]:
# One column per PV distribution, written to the 8-week Fig. 2e workbook.
# NOTE(review): the 'AT1-like' column holds the Group '8wk' draws (dist12)
# but, unlike its neighbours, carries no '@8wk' suffix -- confirm intended.
pv_columns_8wk = [
    ('All@6wk_3d', dist10),
    ('HPCS@8wk', dist11),
    ('AT1-like', dist12),
    ('All@8wk', dist13),
]
pv_table_8wk = pd.DataFrame(
    np.column_stack([draws for _, draws in pv_columns_8wk]),
    columns=[label for label, _ in pv_columns_8wk],
)
pv_table_8wk.to_excel('figures/Fig2e_PV_hpcs_8wk_combined.xlsx')

## For Revision

In [30]:
# From here on `adata` holds only the cells of Group 'KPT'.
kpt_cells = adata.obs['Group'].isin(['KPT'])
adata = adata[kpt_cells, :]

In [31]:
# Response to Reviewer 1 Point 1: How frequent is the HPCS in untraced cells
# Fraction of each cell type within each Classification.
# observed=True is passed explicitly: only Classification values actually
# present in the current subset form groups, and the result no longer
# depends on pandas' changing default for categorical groupers.
counts = (
    adata.obs.groupby('Classification', observed=True)['cell type']
    .value_counts(normalize=True)
    .reset_index()
)
# Select the HPCS rows once and reuse them for the table and the summary.
hpcs_rows = counts[counts['cell type'] == 'HPCS']
print(hpcs_rows)
hpcs_fraction = hpcs_rows['proportion']
print(f'HPCS mean: {hpcs_fraction.mean()} +/- {hpcs_fraction.std()}')

   Classification cell type  proportion
1    BB1053_B0314      HPCS    0.193548
8    BD1959_B0303      HPCS    0.167832
16   BF1607_B0303      HPCS    0.167364
HPCS mean: 0.17624819055511456 +/- 0.014984238106996147


  counts = adata.obs.groupby('Classification')['cell type'].value_counts(normalize=True).reset_index()
