In this example, we use CellCnn to analyze a mass cytometry dataset acquired to characterize human natural killer (NK) cell diversity and associate NK cell subsets with genetic and environmental factors, namely prior Cytomegalovirus (CMV) infection [1]. This dataset comprises mass cytometry measurements of 36 markers, including 28 NK cell receptors, for PBMC samples of 20 donors with varying serology for CMV. 

We will train CellCnn to identify CMV seropositivity-associated cell populations from the **ungated data** (after removal of dead cells and doublets). To run this example, please download the [NK cell dataset](http://www.imsb.ethz.ch/research/claassen/Software/cellcnn.html) and place the decompressed folder in the `cellCnn/examples` directory, so that the FCS files are located in `cellCnn/examples/NK_cell_dataset/gated_alive`.

[1] Horowitz, A. et al. Genetic and environmental determinants of human NK cell diversity revealed by mass cytometry. Sci. Transl. Med. 5 (2013).

In [1]:
import os, sys, errno, glob, fcm
import numpy as np

import cellCnn
from cellCnn.utils import ftrans, mkdir_p, get_items
from cellCnn.model import CellCnn
from cellCnn.plotting import plot_results_2class
from sklearn.metrics import roc_auc_score

%pylab inline


Using Theano backend.


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Working directory: the `examples` folder inside the installed cellCnn package.
WDIR = os.path.join(cellCnn.__path__[0], 'examples')
# Input directory: the FCS files of the downloaded NK cell dataset are read from here.
FCS_DATA_PATH = os.path.join(WDIR, 'NK_cell_dataset', 'gated_alive')

# Output directory: the results of this analysis are written below this folder.
OUTDIR = os.path.join(WDIR, 'output_NK_ungated')
# mkdir_p is imported from cellCnn.utils; presumably it creates the directory
# without failing when it already exists -- TODO confirm against cellCnn.utils.
mkdir_p(OUTDIR)

In [3]:
# look at the measured markers
data_fcs = fcm.loadFCS(glob.glob(FCS_DATA_PATH + '/*.fcs')[0], transform=None, auto_comp=False)
print data_fcs.channels


  warn("text in segment does not start and end with delimiter")


['Time', 'Cell_length', 'CD3', 'Dead', '(La139)Dd', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25', 'DNA1', 'DNA2']


In [4]:
# markers kept for further analysis (a subset of the channel names of the FCS file)
markers = [
    'CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2',
    'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C',
    '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94',
    'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25',
]
# position of every selected marker in the channel list of the FCS file
marker_idx = list(map(data_fcs.channels.index, markers))
nmark = len(markers)

In [2]:
# the following function randomly splits the FCS files into a training and a test set
# the argument `nrep` defines how many random splits to create

def create_symlinks(nrep=1, ntrain_per_class=7):
    """Create `nrep` random train/test splits of the FCS files as symbolic links.

    For repetition `irep`, the directories ``CV_run_<irep>/train`` and
    ``CV_run_<irep>/test`` are created below ``FCS_DATA_PATH`` and filled with
    symbolic links to the original FCS files. The class of a sample is encoded
    in the link name through the suffix ``_group1`` (label 0) or ``_group2``
    (label 1).

    Parameters
    ----------
    nrep : int
        Number of random splits to create.
    ntrain_per_class : int
        Number of samples per class drawn (without replacement) for training;
        the remaining samples of each class form the test set.

    Raises
    ------
    ValueError
        If the number of FCS files found in ``FCS_DATA_PATH`` differs from the
        number of hard-coded labels.

    Notes
    -----
    The split is drawn from NumPy's global random state, so seed it before the
    call for reproducible results. Symbolic links left in the train/test
    directories by a previous call are removed first; the function can
    therefore be re-run without failing on already existing links and without
    mixing two different splits.
    """
    # prior CMV infection status obtained from the original study (Horowitz et al. 2013);
    # label i belongs to the i-th entry of the sorted sample ids
    fcs_files = glob.glob(FCS_DATA_PATH + '/*fcs')
    sample_ids = np.sort([f.split('_')[-2] for f in fcs_files])
    y_label = np.asarray([1,1,0,0,1,0, 1,0,0,0,1, 0,0,0,0,0, 1,1,1,1])
    if len(sample_ids) != len(y_label):
        raise ValueError('Expected %d FCS files in %s but found %d.'
                         % (len(y_label), FCS_DATA_PATH, len(sample_ids)))

    # split samples into groups
    group1 = np.where(y_label == 0)[0]
    group2 = np.where(y_label == 1)[0]

    def _link_samples(indices, suffix, target_dir):
        # link the original FCS file of every sample in `indices` into `target_dir`
        for i in indices:
            fname = 'a_%s_alive' % sample_ids[i]
            os.symlink(os.path.join(FCS_DATA_PATH, fname + '.fcs'),
                       os.path.join(target_dir, fname + '_%s.fcs' % suffix))

    for irep in range(nrep):
        # get the sample indices (group1 is drawn before group2, keep this order
        # so that a given seed always yields the same split)
        train_idx1 = list(np.random.choice(group1, size=ntrain_per_class, replace=False))
        test_idx1 = [i for i in group1 if i not in train_idx1]
        train_idx2 = list(np.random.choice(group2, size=ntrain_per_class, replace=False))
        test_idx2 = [i for i in group2 if i not in train_idx2]

        # create directories
        basepath = os.path.join(FCS_DATA_PATH, 'CV_run_%d' % irep)
        train_path = os.path.join(basepath, 'train')
        test_path = os.path.join(basepath, 'test')
        mkdir_p(basepath)
        mkdir_p(train_path)
        mkdir_p(test_path)

        # drop links of an earlier split; only symbolic links are removed,
        # regular files are never touched
        for target_dir in (train_path, test_path):
            for stale in glob.glob(os.path.join(target_dir, '*.fcs')):
                if os.path.islink(stale):
                    os.remove(stale)

        # store symbolic links to training and test FCS files
        _link_samples(train_idx1, 'group1', train_path)
        _link_samples(train_idx2, 'group2', train_path)
        _link_samples(test_idx1, 'group1', test_path)
        _link_samples(test_idx2, 'group2', test_path)

In [5]:
# set random seed for reproducible results:
# create_symlinks() draws the train/test split with np.random.choice,
# so NumPy's global random state is seeded right before the call
np.random.seed(12345)

# run this only once to create the training and test directories
create_symlinks()

In [6]:

# cofactor handed to ftrans (presumably the cofactor of an arcsinh transform --
# confirm in cellCnn.utils)
cofactor = 5
i_run = 0
curr_data_dir = os.path.join(FCS_DATA_PATH, 'CV_run_%s' % i_run)
curr_out_dir = os.path.join(OUTDIR, 'CV_run_%s' % i_run)
mkdir_p(curr_out_dir)


def load_fcs_group(pattern):
    """Load and transform all FCS files matching the glob `pattern`.

    Returns a tuple ``(fnames, samples)``: the matching file names in sorted
    order and, for each file, its data restricted to the columns in
    `marker_idx` and passed through ``ftrans`` with `cofactor`.

    The file names are sorted because ``glob.glob`` returns them in arbitrary
    order; without sorting, the train/validation cut below would depend on
    the file system.
    """
    fnames = sorted(glob.glob(pattern))
    samples = []
    for fname in fnames:
        x_full = np.asarray(fcm.loadFCS(fname, transform=None, auto_comp=False))
        samples.append(ftrans(x_full[:, marker_idx], cofactor))
    return fnames, samples


# read the training sample names and load the training samples
group1, group1_list = load_fcs_group(curr_data_dir + '/train/*_group1.fcs')
group2, group2_list = load_fcs_group(curr_data_dir + '/train/*_group2.fcs')

# read the test sample names and load the test samples
test_group1, t_group1_list = load_fcs_group(curr_data_dir + '/test/*_group1.fcs')
test_group2, t_group2_list = load_fcs_group(curr_data_dir + '/test/*_group2.fcs')
test_phenotypes = [0] * len(t_group1_list) + [1] * len(t_group2_list)

# fail early with a clear message when the split directories are missing or empty
assert group1_list and group2_list, 'no training FCS files found in %s' % curr_data_dir
assert t_group1_list and t_group2_list, 'no test FCS files found in %s' % curr_data_dir

# finally prepare training and validation data:
# the first 80% of the training samples of each class are used for training,
# the remainder for validation
cut = int(.8 * len(group1_list))
train_samples = group1_list[:cut] + group2_list[:cut]
train_phenotypes = [0] * len(group1_list[:cut]) + [1] * len(group2_list[:cut])
valid_samples = group1_list[cut:] + group2_list[cut:]
valid_phenotypes = [0] * len(group1_list[cut:]) + [1] * len(group2_list[cut:])
test_samples = t_group1_list + t_group2_list


In [15]:
# run a CellCnn analysis

# settings of the CellCnn run, collected in one place
cellcnn_params = dict(
    ncell=3000,
    nsubset=1000,
    max_epochs=10,
    nrun=20,
    coeff_l2=0,
    ncell_pooled=range(10, 30),
    nfilter_choice=[5,10],
    dropout=True,
    learning_rate=0.001,
)
model = CellCnn(**cellcnn_params)

# train on the training samples, monitor performance on the validation samples
# and store the output in the directory of the current run
model.fit(train_samples=train_samples,
          train_phenotypes=train_phenotypes,
          valid_samples=valid_samples,
          valid_phenotypes=valid_phenotypes,
          outdir=curr_out_dir)


Generating multi-cell inputs...
Done.
training network: 1
Number of filters: 10
Cells pooled: 14
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best validation accuracy: 1.00
training network: 2
Number of filters: 10
Cells pooled: 11
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Best validation accuracy: 0.81
training network: 3
Number of filters: 5
Cells pooled: 11
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Best validation accuracy: 0.50
training network: 4
Number of filters: 10
Cells pooled: 18
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Best validation accuracy: 0.34
training network: 5
Number of filter

An exception was raised during training the network.
Cannot reshape input of shape (5,) to shape [ 1 10  1  1]
Apply node that caused the error: Reshape{4}(conv1_b, TensorConstant{[ 1 10  1  1]})
Toposort index: 8
Inputs types: [TensorType(float32, vector), TensorType(int64, vector)]
Inputs shapes: [(5,), (4,)]
Inputs strides: [(4,), (8,)]
Inputs values: [array([-0.02081279, -0.0463671 , -0.05543111, -0.04294106, -0.0474725 ], dtype=float32), array([ 1, 10,  1,  1])]
Outputs clients: [[Elemwise{Add}[(0, 0)](CorrMM{valid, (1, 1)}.0, Reshape{4}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 167, in fit
    accur_thres=self.accur_thres, verbose=self.verbose)
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 430, in train_model
    dropout, dropout_p, regression, n_classes, lr)
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 534, in bu

training network: 7
Number of filters: 10
Cells pooled: 19
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Best validation accuracy: 1.00
training network: 8
Number of filters: 5
Cells pooled: 16
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best validation accuracy: 1.00
training network: 9
Number of filters: 5
Cells pooled: 19
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


An exception was raised during training the network.
Cannot reshape input of shape (10,) to shape [1 5 1 1]
Apply node that caused the error: Reshape{4}(conv1_b, TensorConstant{[1 5 1 1]})
Toposort index: 8
Inputs types: [TensorType(float32, vector), TensorType(int64, vector)]
Inputs shapes: [(10,), (4,)]
Inputs strides: [(4,), (8,)]
Inputs values: ['not shown', array([1, 5, 1, 1])]
Outputs clients: [[Elemwise{Add}[(0, 0)](CorrMM{valid, (1, 1)}.0, Reshape{4}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 167, in fit
    accur_thres=self.accur_thres, verbose=self.verbose)
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 430, in train_model
    dropout, dropout_p, regression, n_classes, lr)
  File "/Users/eiriniar/public_repos/CellCnn/cellCnn/model.py", line 534, in build_model
    name='conv1')(data_input)
  File "/Users/eiriniar/virtual_env/venv_CellC

training network: 10
Number of filters: 5
Cells pooled: 27
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Best validation accuracy: 1.00
training network: 11
Number of filters: 5
Cells pooled: 15
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best validation accuracy: 0.76
training network: 12
Number of filters: 10
Cells pooled: 11
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Best validation accuracy: 0.68
training network: 13
Number of filters: 10
Cells pooled: 27
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best validation accuracy: 1.00
training network: 14
Number of filters: 5
Cells pooled: 25


An exception was raised during training the network.
Unable to open file (Unable to open file: name = '/users/eiriniar/public_repos/cellcnn/cellcnn/examples/output_nk_ungated/cv_run_0/nnet_run_15.hdf5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)


Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Best validation accuracy: 0.50
training network: 18
Number of filters: 5
Cells pooled: 22
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Best validation accuracy: 1.00
training network: 19
Number of filters: 5
Cells pooled: 19
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Best validation accuracy: 0.50
training network: 20
Number of filters: 5
Cells pooled: 13
Train on 2000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best validation accuracy: 0.50


<cellCnn.model.CellCnn at 0x26ea563d0>

In [16]:
# now make predictions using the trained model
train_pred = model.predict(train_samples)
valid_pred = model.predict(valid_samples)
test_pred = model.predict(test_samples)
print train_pred, train_phenotypes
print valid_pred, valid_phenotypes
print test_pred, test_phenotypes

# calculate area under the ROC curve
train_auc = roc_auc_score(train_phenotypes, train_pred[:,1])
valid_auc = roc_auc_score(valid_phenotypes, valid_pred[:,1])
test_auc = roc_auc_score(test_phenotypes, test_pred[:,1])
print train_auc, valid_auc, test_auc

Predictions based on multi-cell inputs containing 82324 cells.
Predictions based on multi-cell inputs containing 180167 cells.
Predictions based on multi-cell inputs containing 90075 cells.
[[ 0.53379648  0.46620354]
 [ 0.62872454  0.37127548]
 [ 0.60943627  0.39056371]
 [ 0.59409626  0.40590374]
 [ 0.63164697  0.36835304]
 [ 0.48501236  0.51498765]
 [ 0.310597    0.689403  ]
 [ 0.35741645  0.64258357]
 [ 0.41167112  0.58832888]
 [ 0.46106153  0.53893845]] [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
[[ 0.61455639  0.38544363]
 [ 0.58399214  0.41600785]
 [ 0.3871421   0.61285788]
 [ 0.3738252   0.62617481]] [0, 0, 1, 1]
[[ 0.5216577   0.47834232]
 [ 0.64074167  0.35925832]
 [ 0.54948739  0.4505126 ]
 [ 0.45362244  0.54637756]
 [ 0.38780395  0.61219605]
 [ 0.5135174   0.48648259]] [0, 0, 0, 0, 1, 1]
1.0 1.0 0.875


In [11]:
# plot the results of the CellCnn analysis in the output directory
plot_results_2class(
    model.results,
    test_samples,
    test_phenotypes,
    markers,
    curr_out_dir,
    filter_response_thres=.4,
)

Loading the weights of consensus filters.
Found 1 discriminative filter(s):  [0]


<matplotlib.figure.Figure at 0x272523710>

<matplotlib.figure.Figure at 0x272528310>

<matplotlib.figure.Figure at 0x264597390>