# Combine all HBM, HUC and HCH data into a single dataframe for analysis

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import os
import sys
import dask

# directories and inputs

In [10]:
# Input folders, one per tissue, holding the PKL files with RNA positions
dotPaths = [
    r"..\results\HBM\detected_dots",
    r"..\results\HUC\detected_dots",
    r"..\results\HCH\detected_dots",
]

# Folders with the cell masks, listed in the same tissue order as dotPaths
maskPaths = [
    r"..\images\HBM\registered",
    r"..\images\HUC\registered",
    r"..\images\HCH\registered",
]

# Working directory at the moment this cell is run
cwd = os.getcwd()

# Folder that receives the single combined dataframe
exportPath = r"..\results\spatial_statistics"

# Identifier columns present on every row of the combined dataframe
idCols = ['Y', 'X', 'FOV', 'CellLabel', 'TissueSource']

In [11]:
# Empty accumulator that will hold every row; starts with only the id columns
dfAll = pd.DataFrame(columns=idCols)

# Read all pickle files for cells from the different tissue types and combine them into a single dataframe

In [12]:
# Loop over all tissue types and append every detected dot to dfAll.
#
# Each folder in dotPaths holds one .pkl file per cell. A file unpickles to a
# dictionary whose keys are marker names and whose values are lists of dot
# positions in (row, column) format. Every dot becomes one row with columns
# Y, X, <marker> (= 1), FOV, CellLabel and TissueSource.
#
# maskPaths is not read here; only the dot folders are needed.
for dotPath in dotPaths:

    # The tissue is the folder that contains "detected_dots", e.g.
    # r"..\results\HBM\detected_dots" -> "HBM". (Indexing the path parts with
    # [-3] selected "results" for every configured path, which merged all
    # tissues under one label.)
    tissueDir = os.path.basename(os.path.dirname(os.path.normpath(dotPath)))
    tissueSource = tissueDir.split('_')[-1].replace('Culture', '').upper()

    # List by path instead of os.chdir so a failure part-way through cannot
    # leave the notebook in a different working directory. os.listdir returns
    # entries in arbitrary order, so sort to make the running cell labels
    # reproducible between runs.
    pklFiles = sorted(f for f in os.listdir(dotPath) if f.endswith('.pkl'))

    # Preserve the cell count across FOVs: continue from the highest label
    # already stored for this tissue, or start at 0 if there is none yet.
    previousRows = dfAll.loc[dfAll['TissueSource'] == tissueSource]
    cellCount = 0 if previousRows.empty else previousRows['CellLabel'].max()

    # Collect the small per-marker frames and concatenate once per tissue;
    # concatenating inside the inner loop re-copies the whole table each time.
    frames = []

    for fileName in tqdm(pklFiles):  # each pkl file is one cell

        data = pd.read_pickle(os.path.join(dotPath, fileName))

        # File names end in "..._<fov>_<cellLabel>.pkl"; only the FOV is kept,
        # the cell gets a new running label instead.
        fov = int(fileName.replace('.pkl', '').split('_')[-2])
        cellCount += 1

        for markerName, coords in data.items():

            if len(coords) == 0:  # marker has no dots in this cell
                continue

            coords = np.array(coords)
            nDots = coords.shape[0]
            frames.append(pd.DataFrame({
                'Y': coords[:, 0],
                'X': coords[:, 1],
                markerName: np.ones(nDots),  # 1 signal for each spot
                'FOV': fov,
                'CellLabel': cellCount,
                'TissueSource': tissueSource,
            }))

    if frames:
        dfAll = pd.concat([dfAll] + frames)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '..\\results\\HBM\\detected_dots'

# Group rows that describe the same spot for different markers

In [5]:
# Missing marker values mean "no signal" (background), so they become 0.
# Rows sharing all id columns are then collapsed by summing their marker columns.
summed = dfAll.fillna(0).groupby(idCols).sum()

# Turn the id columns back from index levels into ordinary columns, keeping
# the marker columns first and the id columns last, on a fresh 0..n-1 index.
markerCols = list(summed.columns)
dfAll = summed.reset_index()[markerCols + idCols]

In [6]:
# Show the combined dataframe as this cell's output
dfAll

Unnamed: 0,EEF2,ACTB,SOX9,GAPDH,SPP1,IL8,IL6,CCL11,COL5A2,COL1A1,...,MALAT1,RUNX1,CXCR4,MKI67,NANOG,Y,X,FOV,CellLabel,TissueSource
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,307,88,119,UC
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,357,5,47,UC
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,376,5,47,UC
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,412,5,47,UC
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,417,5,47,UC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033039,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2224,717,2,3,HCH
3033040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2225,210,26,204,HCH
3033041,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2226,676,2,3,HCH
3033042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2226,685,2,3,HCH


In [7]:
# Column-wise maximum within each tissue source, shown as the cell's output
dfAll.groupby('TissueSource').max()

Unnamed: 0_level_0,EEF2,ACTB,SOX9,GAPDH,SPP1,IL8,IL6,CCL11,COL5A2,COL1A1,PDL1,MALAT1,RUNX1,CXCR4,MKI67,NANOG,Y,X,FOV,CellLabel
TissueSource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
BM,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,2198,2133,95,237
HCH,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2227,2196,34,247
UC,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,2182,2135,91,121


# Export combined dataframe

In [9]:
# Write the combined dataframe to <exportPath>/01_all_RNA_positions.pkl.
# The file is addressed by its path instead of via os.chdir, so the working
# directory stays where it was and the cell can be re-run without the relative
# exportPath being resolved from a different folder. The export folder is
# created if it does not exist yet.
os.makedirs(exportPath, exist_ok=True)
dfAll.to_pickle(os.path.join(exportPath, '01_all_RNA_positions.pkl'))