In [1]:
'''
author: Yike Xie
data: 11/26/2023
content: compute the statistical significance of correlation analysis shown in the manuscript
'''

'\nauthor: Yike Xie\ndata: 11/26/2023\ncontent: compute the statistical significance of correlation analysis shown in the manuscript\n'

In [2]:
import os
import sys
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import scanpy as sc
import pickle

In [3]:
def sig_dataset(feas_fn, data_fn):
    print('load features')
    with open(feas_fn, 'rb') as f:
        features = pd.read_pickle(f)
    
    print('load dataset')
    adata = sc.read_h5ad(data_fn)
    sc.pp.normalize_total(adata, target_sum=1e6)

    print('filter genes')
    sc.pp.filter_genes(adata, min_cells=10)

    print('Add features to adata')
    for col in features.columns:
        adata.obs[col] = features.loc[adata.obs_names][col]

    return adata

In [4]:
feas_fn = '/home/yike/phd/cancer_cells_img_seq/figures/combine_features.pkl'
data_fn = '/home/yike/phd/cancer_cells_img_seq/data/combine_gene.h5ad'
adata = sig_dataset(feas_fn, data_fn)

load features
load dataset
filter genes
Add features to adata


In [37]:
# add the average fluorescent intensity of each channel to the dataset
df = pd.DataFrame(index=adata.obs_names, columns=adata.obs['wavelengths'].iloc[0])
for i in df.index:
    df.loc[i] = adata.obs['spectra'].loc[i] / adata.obs['area'].loc[i]
    
for col in df.columns:
    adata.obs['{}_{}'.format(col[0], col[1])] = df.loc[adata.obs_names][col]
    
# add the ratio of channel 4 [3] to channel 2 [1] after background subtraction to dataset    
adata.obs['C4_C2'] = df[df.columns[3]] / df[df.columns[1]]

In [41]:
## divide live and dead cells according to the fraction o mitochondrial reads
live_cells = adata.obs[adata.obs['pct_counts_mt'] < 25].index
dead_cells = adata.obs[adata.obs['pct_counts_mt'] > 25].index

# create dataset only having cells with low percentage of mitochondrial reads
adata_live = adata[live_cells, :]

# Average spectra
adata_live.obs['spectra_norm'] = np.array(adata_live.obs['spectra'].tolist()).mean(axis=1) / adata_live.obs['area']

  adata_live.obs['spectra_norm'] = np.array(adata_live.obs['spectra'].tolist()).mean(axis=1) / adata_live.obs['area']


## statistical significance of correlation analysis

In [8]:
from scipy.stats import spearmanr

#### Figure 3 

In [31]:
# size with gene: 'RBM8A', 'HDGFRP3', 'HIGD1A'

fea = 'area'
fea_data = adata_live.obs[fea].tolist()
for gene in ['RBM8A', 'HDGFRP3', 'HIGD1A']:
    res = spearmanr(fea_data, adata_live[:, gene].X[:, 0].tolist())
    rho, P = res[0], res[1]
    print(f'{gene} with {fea}: rho={rho}, p-value={P}')

RBM8A with area: rho=-0.25082078141458203, p-value=0.005934527744146469
HDGFRP3 with area: rho=0.03625958281750584, p-value=0.695428403129469
HIGD1A with area: rho=0.36092087046170857, p-value=5.52077057826243e-05


In [63]:
# size with exon usage: DAAM1, exon25, exon 11, exon 1

save_figures = '/home/yike/phd/cancer_cells_img_seq/figures/correlate_features/exon/'
gene_exp = pd.read_csv(save_figures + 'gene_exp.tsv', sep='\t', index_col='gene_name')

exon_number_fra = pd.read_csv(save_figures + 'exon_number_frac.tsv', sep='\t', index_col=[0, 1])
exon_number_fra_live = exon_number_fra[adata_live.obs_names]

In [83]:
fea = 'area'
gene = 'DAAM1'
g_idx = (gene_exp[live_cells].loc[gene] >= 10)

for exon in [('DAAM1', 25), ('DAAM1', 11), ('DAAM1', 1)]:
    fea_data = adata_live[g_idx].obs[fea].tolist()
    res = spearmanr(fea_data, exon_number_fra_live.loc[exon][g_idx])
    rho, P = res[0], res[1]
    print(f'{exon} with {fea}: rho={rho}, p-value={P}')

('DAAM1', 25) with area: rho=-0.4515959437270447, p-value=4.2275621239977475e-05
('DAAM1', 11) with area: rho=-0.013775599537232135, p-value=0.9059819735729822
('DAAM1', 1) with area: rho=0.4433243409047735, p-value=6.0533314872030635e-05


#### Figure S4

In [32]:
# size with gene: TUBG2
fea = 'area'
fea_data = adata_live.obs[fea].tolist()
for gene in ['TUBG2']:
    res = spearmanr(fea_data, adata_live[:, gene].X[:, 0].tolist())
    rho, P = res[0], res[1]
    print(f'{gene} with {fea}: rho={rho}, p-value={P}')

TUBG2 with area: rho=0.32290569127681995, p-value=0.0003413031650952017


In [33]:
# eccentricity with gene: GNB2L1, RHOA,GRWD1

fea = 'eccentricity'
fea_data = adata_live.obs[fea].tolist()
for gene in ['GNB2L1', 'RHOA', 'GRWD1']:
    res = spearmanr(fea_data, adata_live[:, gene].X[:, 0].tolist())
    rho, P = res[0], res[1]
    print(f'{gene} with {fea}: rho={rho}, p-value={P}')

GNB2L1 with eccentricity: rho=-0.2077556753949942, p-value=0.023376645509655194
RHOA with eccentricity: rho=0.023209027236245683, p-value=0.8021682734750271
GRWD1 with eccentricity: rho=0.2726843980382112, p-value=0.00269703715525957


#### Figure 4

In [43]:
# ave intensity of C4 with gene: LDHA, ATP5I, IDH1, ME2

fea = '343_451'
fea_data = adata_live.obs[fea].tolist()
for gene in ['LDHA', 'ATP5I', 'IDH1', 'ME2']:
    res = spearmanr(fea_data, adata_live[:, gene].X[:, 0].tolist())
    rho, P = res[0], res[1]
    print(f'{gene} with {fea}: rho={rho}, p-value={P}')

LDHA with 343_451: rho=-0.31684645500416087, p-value=0.00044674254312204715
ATP5I with 343_451: rho=0.0006481943306339052, p-value=0.9944177824880066
IDH1 with 343_451: rho=0.2607494247800869, p-value=0.00418146826766667
ME2 with 343_451: rho=0.29344085112309776, p-value=0.0011999834106921124


#### Figure S5

In [42]:
# Ratio of C4 to C2 with gene: INSR, ALDOA, ENO1

fea = 'C4_C2'
fea_data = adata_live.obs[fea].tolist()
for gene in ['INSR', 'ALDOA', 'ENO1']:
    res = spearmanr(fea_data, adata_live[:, gene].X[:, 0].tolist())
    rho, P = res[0], res[1]
    print(f'{gene} with {fea}: rho={rho}, p-value={P}')

INSR with C4_C2: rho=-0.3219113605857897, p-value=0.00035685723830144366
ALDOA with C4_C2: rho=0.0034540416602203395, p-value=0.9702605251229992
ENO1 with C4_C2: rho=0.2139866115937901, p-value=0.01944879572874183
