In [1]:
import os
import re 
import numpy as np 
import pandas as pd 
import anndata as ad

In [2]:
#
# Read in TMA cell data.
#
data_dir = './TMA1_004/TMA1_004_h5ad'

# Load the data.
adatas = {}

# iterate through the file names in the input folder 
for fh in os.listdir(data_dir):

    # ignore any hidden or other non data files 
    if fh.endswith('h5ad'):

        # isolate the core name from the file name using a series of regular expressions
        # the expressions might have to change depending on which slide you use 
        initial_split = re.split('\.', fh)[0]
        core_ID = re.split('-', initial_split)[-1]

        # read the file as anndata object, save in dictionary, {core_ID : adata}
        adatas[core_ID] = ad.read_h5ad(os.path.join(data_dir, fh))



In [3]:
adatas

{'C2': AnnData object with n_obs × n_vars = 9285 × 40
     obs: 'X_centroid', 'Y_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 'Orientation', 'imageid'
     uns: 'all_markers',
 'F7': AnnData object with n_obs × n_vars = 6368 × 40
     obs: 'X_centroid', 'Y_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 'Orientation', 'imageid'
     uns: 'all_markers',
 'C5': AnnData object with n_obs × n_vars = 9666 × 40
     obs: 'X_centroid', 'Y_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 'Orientation', 'imageid'
     uns: 'all_markers',
 'G5': AnnData object with n_obs × n_vars = 18567 × 40
     obs: 'X_centroid', 'Y_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 'Orientation', 'imageid'
     uns: 'all_markers',
 'B7': AnnData object with n_obs × n_vars = 7336 × 40
     obs: 'X_centroid', 'Y_centroid', 'Ar

In [4]:
#
# Read in TMA metadata.
#
metadata_df = pd.read_csv('./metadata.tsv', sep='\t')
metadata_df

Unnamed: 0,Core,Subtype
0,G3,TNBC
1,G7,TNBC
2,F9,TNBC
3,F1,TNBC
4,E6,TNBC
5,A8,TNBC
6,A1,Luminal A
7,G5,Luminal A
8,E4,Luminal A
9,H4,Luminal B Her2Pos


In [5]:
#
# Create single data frame with cell features and metadata.
#

# Select cores that are in the metadata.
selected_adatas = {key: value for key, value in adatas.items() if key in metadata_df['Core'].values}
selected_adatas.keys()

# Combine the selected cores into one anndata object.
combined = ad.concat(selected_adatas, label = 'Core')

  utils.warn_names_duplicates("obs")


In [6]:
# Create final dataframe.
final_df = combined.to_df()

# Filter out Control/DNA/AF columns.
marker_cols = final_df.filter(regex="^(?!(Control|DNA|AF))").columns
final_df = final_df[marker_cols]

# Add Core column.
final_df['Core'] = combined.obs['Core']

# Join dataframe with metadata.
final_df = final_df.merge(metadata_df, left_on='Core', right_on='Core')
final_df

Unnamed: 0,CD3,pERK,Rad51,CCND1,Vimentin,aSMA,Ecad,ER,PR,EGFR,...,CK19,CK17,LaminABC,AR,H2Ax,PCNA,PanCK,CD31,Core,Subtype
0,1.108049,2.367280,1.205983,0.809805,0.643100,0.756149,0.904068,1.873106,0.628908,0.905082,...,1.842349,0.957701,0.734693,1.192826,1.014207,1.136202,1.540347,1.503884,G5,Luminal A
1,1.311849,2.748798,1.975927,0.784924,0.616813,1.014435,1.037699,2.009408,0.450378,0.720003,...,1.765434,0.923039,0.716357,0.900161,0.728166,1.100546,1.165204,1.135478,G5,Luminal A
2,1.032452,2.477824,2.406576,0.858549,0.979219,0.883893,1.461693,2.592998,0.725899,0.818180,...,4.319531,0.956670,1.415923,1.038165,0.871665,1.171252,4.410381,1.246464,G5,Luminal A
3,1.355602,2.269463,2.032605,0.832309,1.029909,1.421840,0.975202,2.062229,0.641186,0.830057,...,2.202574,0.986778,1.890009,1.013757,0.990621,1.184966,2.222565,1.360663,G5,Luminal A
4,1.179248,2.718190,2.304946,0.890247,0.758033,1.413838,1.545422,3.391289,0.808776,0.840453,...,3.262141,0.965366,1.823325,1.305266,0.948487,1.754226,3.625247,1.270104,G5,Luminal A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250675,1.160113,2.319591,2.874407,0.757991,1.001444,2.052812,0.665222,1.729627,0.462439,0.635879,...,1.543982,0.801783,0.790103,0.680631,0.977413,0.647092,0.560885,1.197684,E4,Luminal A
250676,0.984769,2.487343,2.914809,0.651790,0.523562,0.317693,0.631203,1.288297,0.230164,0.623026,...,1.679608,0.432103,0.764876,0.607900,0.565372,0.688264,0.930430,0.722552,E4,Luminal A
250677,1.866579,2.141912,2.754014,1.008417,1.665936,3.014499,0.756899,1.953017,0.469935,0.750915,...,2.000212,0.896921,1.071702,0.903956,1.000271,0.991021,1.363368,1.348060,E4,Luminal A
250678,1.634488,2.699897,2.714368,0.995328,1.882192,2.115073,0.735628,1.905791,0.473499,0.783893,...,2.055048,0.918055,1.057348,1.034916,1.025777,1.059736,1.173739,1.310694,E4,Luminal A


In [7]:
# Write dataframe to file.
final_df.to_csv('./tma_single_cell.tsv', index=False, sep='\t')