In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import rpy2 as rp
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import os 
import rpy2.robjects as ro
import pickle
from collections import Counter


# import R's "flowCore" package
utils = importr('flowCore')
os.chdir('../')

In [3]:
##### list fcs files #####
cytof_files = pd.read_csv("Bioheart_combined_metadata.csv")
print(cytof_files)
fn = [os.path.join(os.getcwd(),f) for f in cytof_files.file_name]
fn

     sample_id  Gensini_bin                file_name   data
0           87            1   Bioheart_sample_87.fcs  train
1           88            1   Bioheart_sample_88.fcs  train
2           89            1   Bioheart_sample_89.fcs  train
3           92            1   Bioheart_sample_92.fcs  train
4           96            1   Bioheart_sample_96.fcs  train
..         ...          ...                      ...    ...
164        605            0  Bioheart_sample_605.fcs   test
165        460            0  Bioheart_sample_460.fcs   test
166        802            1  Bioheart_sample_802.fcs   test
167        668            0  Bioheart_sample_668.fcs   test
168        556            0  Bioheart_sample_556.fcs   test

[169 rows x 4 columns]


['/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_87.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_88.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_89.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_92.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_96.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_97.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_103.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_152.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_154.fcs',
 '/Users/elijahwillie/Documents/PhD/DeepLearning_CyTOF/Bioheart_combined/Bioheart_sample_167.fcs',
 '/Users/elijahw

In [5]:
##### read fcs file using the flowCore R package #####
# flowCore is a very well maintained R package for reading and analyzing fcs files
# Many of the fcs file related packages in python are a little buggy to use
# Therefore, it is worth the trouble to read the fcs files using R 

r = rp.robjects.r
expr_list = []
for i in range(0,len(fn)):
    fn_i = fn[i]
    r_code = ("library(flowCore);"+
          "library(MetaCyto);"+
          "fn = '"+ fn_i+ "'; "+
          "fcs = read.FCS(fn,truncate_max_range = FALSE);"+
          "expr = fcs@exprs;"+
          "markers = markerFinder(fcs);"+ 
          "colnames(expr) = markers;"+
          "expr = as.data.frame(expr);"
         # subsample 10,000 cells

         "expr = expr[sample(1:nrow(expr),10000,replace = TRUE), ];" +
             "return(as.data.frame(expr));")
    expr =  r(r_code)
    expr_list.append(expr)

In [6]:
expr_list[2]

HLA_DR,CD3,CD4,...,CD304,CD141,CD1C_PE
0.000000,80.998055,44.891235,...,0.129048,0.577319,0.925188
1.077275,0.000000,0.468571,,0.000000,0.000000,2.517735
1.858683,328.272766,63.528400,,0.000000,2.107844,0.000000
1.504390,383.252319,151.527740,,0.000000,6.196556,0.000000
...,...,...,,...,...,...
2.660940,1.363354,4.308849,,0.950135,0.000000,0.000000
64.054016,20.610851,15.391182,,0.000000,11.569795,3.288527
1.581026,175.718628,69.520859,,0.000000,0.000000,0.537952
0.000000,159.044159,69.141266,,0.000000,4.394193,2.552474


In [7]:
markers = []
for i in range(len(expr_list)):
#     expr_list[i].colna = expr_list[i].columns.str.replace(' ', '-')
    markers.extend(expr_list[i].colnames)

markers = Counter(markers)
markers = [k for k, c in markers.items() if c == 169]
print(markers)

for i in range(0,len(expr_list)):
    t1 = expr_list[i] 
    with localconverter(ro.default_converter + pandas2ri.converter):
        t1 = ro.conversion.rpy2py(t1)
    expr_list[i] = t1.loc[:,markers]

['HLA_DR', 'CD3', 'CD4', 'CD8A', 'CD25', 'CD127', 'FOXP3', 'CD27', 'KLRG1', 'CD56', 'CD45RO', 'CD45RA', 'CD192_CCR2', 'CD194_CCR4', 'CD196_CCR6', 'CD39', 'CD38', 'KI67', 'CD183_CXCR3', 'CCR7', 'CD19', 'CD20', 'IGD', 'CD14', 'CD304', 'CD141', 'CD1C_PE']


In [8]:
len(markers)

27

In [9]:
##### transform and format into numpy array
def arcsinh(x):
    return(np.arcsinh(x/5))

coln = expr_list[0].columns
for i in range(len(expr_list)):
    t1 = expr_list[i]
    t1 = t1.apply(arcsinh)
    t1 = t1.values
    shape1 = list(t1.shape)+[1]
    t1 = t1.reshape(shape1)
    expr_list[i] = t1
    
expr_list = np.stack(expr_list)
print("The dimenstion of the data is: ", expr_list.shape)

The dimenstion of the data is:  (169, 10000, 27, 1)


In [10]:
coln

Index(['HLA_DR', 'CD3', 'CD4', 'CD8A', 'CD25', 'CD127', 'FOXP3', 'CD27',
       'KLRG1', 'CD56', 'CD45RO', 'CD45RA', 'CD192_CCR2', 'CD194_CCR4',
       'CD196_CCR6', 'CD39', 'CD38', 'KI67', 'CD183_CXCR3', 'CCR7', 'CD19',
       'CD20', 'IGD', 'CD14', 'CD304', 'CD141', 'CD1C_PE'],
      dtype='object')

In [11]:
allData = {"cytof_files":cytof_files, 
            "expr_list" : expr_list,
            "marker_names" : coln}

with open("DL_CyTOF/Bioheart_combined_allData.obj", "wb") as f:
    pickle.dump(allData, f)