In [1]:
'''
author Yike Xie
data: 6/10/2023
content: correlate image features and transcriptome
'''

'\nauthor Yike Xie\ndata: 6/10/2023\ncontent: correlate image features and transcriptome\n'

In [2]:
import os
import sys
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import scanpy as sc
import pickle

In [52]:
def sig_dataset(feas_fn, data_fn):
    print('load features')
    with open(feas_fn, 'rb') as f:
        features = pd.read_pickle(f)
    
    print('load dataset')
    adata = sc.read_h5ad(data_fn)
    sc.pp.normalize_total(adata, target_sum=1e6)

    print('filter genes')
    sc.pp.filter_genes(adata, min_cells=10)

    print('Add features to adata')
    for col in features.columns:
        adata.obs[col] = features.loc[adata.obs_names][col]

    return adata

def get_correlation(adata_s, feas):
    #corr = dsim.correlation.correlate_features_phenotypes(feas, fillna=0)##
    exp = adata_s.X.T
    phe = adata_s.obs[feas].fillna(0)

    x = exp #(39466, 10)
    y = phe.values.T #(2, 10)

    from scipy.stats import rankdata

    xw = np.zeros_like(x, float).toarray()
    for ii, xi in enumerate(x):
        xw[ii] = rankdata(xi, method='average')
    yw = np.zeros_like(y, float)
    for ii, yi in enumerate(y):
        yw[ii] = rankdata(yi, method='average')

    xw = ((xw.T - xw.mean(axis=1)) / xw.std(axis=1)).T
    yw = ((yw.T - yw.mean(axis=1)) / yw.std(axis=1)).T
    n = xw.shape[1]
    r = np.dot(xw, yw.T) / n

    corr = pd.DataFrame(
                    data=r,
                    index=adata_s.var_names,
                    columns=phe.columns,
                    dtype=float)

    return corr

In [4]:
feas_fn = '/home/yike/phd/cancer_cells_img_seq/figures/combine_features.pkl'
data_fn = '/home/yike/phd/cancer_cells_img_seq/data/combine_gene.h5ad'
adata = sig_dataset(feas_fn, data_fn)

save_figures = '/home/yike/phd/cancer_cells_img_seq/figures/correlate_features/gene/'

load features
load dataset
filter genes
Add features to adata


  df_sub[k].cat.remove_unused_categories(inplace=True)


In [6]:
# add the average fluorescent intensity of each channel to the dataset
df = pd.DataFrame(index=adata.obs_names, columns=adata.obs['wavelengths'].iloc[0])
for i in df.index:
    df.loc[i] = adata.obs['spectra'].loc[i] / adata.obs['area'].loc[i]
    
for col in df.columns:
    adata.obs['{}_{}'.format(col[0], col[1])] = df.loc[adata.obs_names][col]
    
# add the ratio of channel 4 [3] to channel 2 [1] after background subtraction to dataset    
adata.obs['C4_C2'] = df[df.columns[3]] / df[df.columns[1]]

## focus following analyses on cells with low percentage of mitochondrial reads

In [47]:
## divide live and dead cells according to the fraction of mitochondrial reads
live_cells = adata.obs[adata.obs['pct_counts_mt'] < 25].index
dead_cells = adata.obs[adata.obs['pct_counts_mt'] > 25].index

# create dataset only having cells with low percentage of mitochondrial reads
adata_live = adata[live_cells, :]

  df_sub[k].cat.remove_unused_categories(inplace=True)


#### average intensity of each channel

In [55]:
# correlations between spectras and gene expressions at single cell resolution
feas = ['{}_{}'.format(col[0], col[1]) for col in df.columns]
corr = get_correlation(adata_live, feas)
corr = corr.fillna(0)
corr['Frac'] = 100 * (adata_live[:, corr.index].X > 0).sum(axis=0) / adata_live.obs.shape[0]
corr['Number'] = (adata_live[:, corr.index].X > 0).sum(axis=0)

corr.to_csv(save_figures + 'spectra_gene_correlation_live_cells.tsv', sep='\t')

  xw = ((xw.T - xw.mean(axis=1)) / xw.std(axis=1)).T
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)


#### ratio between channel 4 and channel 2

In [56]:
feas = ['C4_C2']
corr = get_correlation(adata_live, feas)
corr = corr.fillna(0)
corr['Frac'] = 100 * (adata_live[:, corr.index].X > 0).sum(axis=0) / adata_live.obs.shape[0]
corr['Number'] = (adata_live[:, corr.index].X > 0).sum(axis=0)

corr.to_csv(save_figures + 'correlation_ratio_4_2_live_cells.tsv', sep='\t')

  xw = ((xw.T - xw.mean(axis=1)) / xw.std(axis=1)).T
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
