# Part II: Get metadata

- Number of transcripts per cell and per gene
- Relate each original cell ID to its `ndimage` label ID
- Some other data related to cell and overall concentration

In [26]:
import numpy as np
import pandas as pd
import tifffile as tf
from glob import glob
import os
from scipy import ndimage, interpolate, spatial, stats

from importlib import reload
import utils

# Load all general data

- Same setup as in Part I

In [4]:
# Input locations, all relative to the notebook's working directory.
wsrc = '../cell_dams/'      # '<sample>_dams.tif' images (loaded below as `wall`)
nsrc = '../nuclear_mask/'   # '<sample>_EDT.tif' images (loaded below as `nuclei`)
tsrc = '../translocs/'      # per-gene transcript location CSVs
psrc = '../proc/'           # not referenced by the cells visible in this part
osrc = '../data/'           # original '<sample>_data' folders
sample = 'D2'

dst = '../kde/'

# Per-sample output directory. `makedirs` also creates the parent '../kde/' when it is
# missing (plain `mkdir` would raise FileNotFoundError), and `exist_ok=True` makes the
# cell safe to re-run without a separate existence check.
dst += sample + os.sep
os.makedirs(dst, exist_ok=True)

# Boolean image: True wherever the dams image is non-zero.
wall = tf.imread(wsrc + sample + '_dams.tif').astype(bool)
# Boolean image: True where the EDT image is below 2.
# NOTE(review): presumably a Euclidean distance transform of the nuclear mask, so that
# `< 2` selects nuclei plus a thin margin -- confirm against Part I.
nuclei = tf.imread(nsrc + sample + '_EDT.tif') < 2
# Connected components of `wall` using the 2D cross-shaped (4-connected) structuring
# element; `cellnum` is the number of components found.
label, cellnum = ndimage.label(wall, ndimage.generate_binary_structure(2,1))
print('Detected',cellnum,'cells')

- Load all the CSVs with spatial locations

In [29]:
# Collect the per-gene CSVs with transcript coordinates for this sample, in sorted order.
filenames = sorted(glob(tsrc + sample + os.sep + '*.csv'))
if not filenames:
    # Fail early with an explicit message: with no files, `pd.concat` below would raise
    # an opaque "No objects to concatenate" error instead.
    raise FileNotFoundError('No transcript location CSVs found in ' + tsrc + sample + os.sep)

# Gene name = text after the last '_-_' separator of the file path, extension removed.
transcriptomes = [os.path.splitext(fname)[0].split('_-_')[-1] for fname in filenames]

# One coordinate table per gene. Files are read with `header=None`, so the first row is
# treated as data and the columns are named X, Y, Z.
translocs = [pd.read_csv(fname, header=None, names=['X', 'Y', 'Z']) for fname in filenames]

# Number of rows (transcripts) per gene, in the same order as `transcriptomes`.
tsize = np.array([len(locs) for locs in translocs], dtype=int)

# All genes stacked into a single table; each file's original row index is kept.
tlocs = pd.concat(translocs)

# Compute metadata

Get metadata for every cell
-  Lower-left and upper-right corners of the minimal bounding box containing the cell
-  Length and height of that box
-  Total cell area
-  Total area occupied by nuclei
-  Total area occupied by cytosol
-  Cytosol-to-total area ratio
-  Center of mass according to the original data
-  Center of mass recomputed here with `ndimage`
-  Cell ID from the original data
-  Label ID matched from `ndimage` (matching done by comparing centers of mass)

In [20]:
# Re-import the local `utils` module so that edits made to it are picked up
# without restarting the kernel.
reload(utils)
# Per-cell metadata is cached as a CSV in the sample's output directory.
filename = dst + sample + '_cells_metadata.csv'
print(filename)
# Only compute the table when no cached CSV exists; delete the file to force a rebuild.
if not os.path.isfile(filename):
    # Per-cell localization table, read from the folder of the SECOND gene in `transcriptomes`.
    # NOTE(review): index 1 is hard-coded; presumably the per-cell table is the same
    # whichever gene folder it is read from -- TODO confirm. Fails if fewer than two genes.
    celllocs = utils.celllocs_read(osrc + sample + '_data/' + transcriptomes[1] + '/' + transcriptomes[1] + ' - localization results by cell.csv')
    # Match original cells to `ndimage` labels. Judging by their use below:
    #   dcoords  -> original centers of mass (indexed by `cmatches`)
    #   cnuclei  -> centers of mass stored as the `ndimage_com*` columns
    #   cmatches -> index into the original cell table for each label
    # NOTE(review): inferred from usage only; verify against utils.match_original_ndimage.
    dcoords, cnuclei, cmatches = utils.match_original_ndimage(celllocs, wall, label, cellnum)
    # Bounding-box slices of every labeled region (one entry per label).
    objss = ndimage.find_objects(label)
    meta = utils.generate_cell_metadata(label, objss, nuclei)
    # `join` aligns on the index, so the next two lines assume `meta` carries a default
    # 0..cellnum-1 index -- TODO confirm in utils.generate_cell_metadata.
    meta = meta.join(pd.DataFrame(np.round(dcoords[cmatches], 2), columns=['orig_comX', 'orig_comY']))
    # Columns of `cnuclei` are reversed before being stored as (X, Y);
    # presumably they arrive in (row, col) order -- TODO confirm.
    meta = meta.join(pd.DataFrame(np.round(np.flip(cnuclei, axis=1),2), columns=['ndimage_comX', 'ndimage_comY']))
    meta['orig_cellID'] = celllocs['Cell.ID..'].values[cmatches]
    # `ndimage` labels run from 1 to `cellnum`.
    meta['ndimage_cellID'] = np.arange(1,cellnum+1)
    meta.to_csv(filename, index=False)

# Always work from the CSV on disk, whether freshly written or previously cached.
metacell = pd.read_csv(filename)
print(metacell.shape)
# Summary statistics of the bounding-box dimensions, truncated to integers, one row per column.
metacell.loc[:,['length', 'height']].describe().astype(int).T

../kde/D2/D2_cells_metadata.csv
(2937, 16)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
length,2937,309,336,77,209,278,372,17152
height,2937,309,338,81,205,278,379,17152


Get metadata for every gene

- Total number of transcripts throughout the cross section
- Total number of transcripts floating in cytosol
- Total number of transcripts floating in nuclei
- Cytosolic-to-total transcript ratio

In [17]:
# Per-gene transcript totals, cached as a CSV in the sample's output directory.
filename = dst + sample + '_transcripts_metadata.csv'
print(filename)
if not os.path.isfile(filename):
    # Raw results table: tab-separated, no header row; the column labelled 4 is discarded.
    data = pd.read_csv(
        osrc + sample + '_data' + os.sep + '32771-slide1_' + sample + '_results.txt',
        sep='\t',
        header=None,
    ).drop(columns=[4])

    # Occurrences of each distinct value in the last remaining column, ordered by the
    # sorted distinct values.
    # NOTE(review): these counts are paired with `tsize` and `transcriptomes` purely by
    # position, which assumes the sorted gene names here follow the same order as the
    # sorted CSV filenames -- verify.
    _, orig_size = np.unique(data.iloc[:, -1], return_counts=True)

    meta = pd.DataFrame({
        'total_number': orig_size,
        'cyto_number': tsize,
        'nuclei_number': orig_size - tsize,
        'ratio': tsize / orig_size,
        'gene': transcriptomes,
    })
    meta.to_csv(filename, index=False)

# Always read the table back from disk, whether freshly written or previously cached.
metatrans = pd.read_csv(filename)
metatrans.head()

../kde/D2/D2_transcripts_metadata.csv


Unnamed: 0,total_number,cyto_number,nuclei_number,ratio,gene
0,32613,32171,442,0.986447,BAC45727
1,1162,1151,11,0.990534,BAC46169
2,1820064,1795069,24995,0.986267,BAC47034
3,1939,1923,16,0.991748,BAC47856
4,3033,2985,48,0.984174,BAC48395


Get metadata for every cell and gene

- Number of transcripts of a given gene in a given cell

In [18]:
# Transcript counts per gene and per cell, cached as a CSV in the sample's output directory.
filename = ''.join([dst, sample, '_transcells_metadata.csv'])
print(filename)

# Build the table only when no cached copy exists; delete the file to force a rebuild.
if not os.path.isfile(filename):
    meta = utils.generate_transcell_metadata(translocs, transcriptomes, cellnum, label)
    meta.to_csv(filename, index=False)

# Always work from the CSV on disk, whether freshly written or previously cached.
transcell = pd.read_csv(filename)
transcell.head()

../kde/D2/D2_transcells_metadata.csv


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2929,2930,2931,2932,2933,2934,2935,2936,2937,gene
0,31,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BAC45727
1,7,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,BAC46169
2,859,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,2,0,0,0,BAC47034
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BAC47856
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BAC48395
