In [1]:
import sys
sys.path.insert(0, '/tf/pollock')

In [2]:
import logging
import os
import random
from collections import Counter
from importlib import reload
import time

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

import pollock
from pollock import PollockDataset, PollockModel, load_from_directory
# import pollock.models.analysis as pollock_analysis

  from pandas.core.index import RangeIndex


In [3]:
import tensorflow as tf
from tensorflow.keras import layers

tf.keras.backend.clear_session()  # For easy reset of notebook state.

In [4]:
DATA_DIR = '/data/single_cell_classification'
MODEL_DIR = '/models'

In [5]:
run_name = 'br'

expression_fp = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca',
                            'breast_counts_matrix.tsv')
label_fp = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca',
                            'breast_metadata.tsv')

training_image_dir = os.path.join(MODEL_DIR, 'scratch', run_name)
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep='\t'
cell_type_key = 'cell_type'

n_per_cell_type = 5000
epochs = 5
batch_size = 128

In [6]:
expression_df = pd.read_hdf(expression_fp.replace('.tsv', '.h5'), 'df')
expression_df

Genes,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C
HT062B1_S1PA_AAACCCACACAAATGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCATCGGAATTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT062B1_S1PA_AAACGAACAGCTAACT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTCCTCGTGTTACTG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT110B1_XB3_TTTGACTCAGGGTCTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
label_df = pd.read_csv(
    label_fp,
    sep=sep
    )
label_df = label_df.set_index('cell_id')
label_df = label_df.loc[expression_df.index]
label_df

Unnamed: 0,sample_id,cancer_type,tissue_type,organ_type,cell_type,species,method,facs
HT062B1_S1PA_AAACCCACACAAATGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD8+ T-cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Endothelial cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCATCGGAATTC-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAACAGCTAACT-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD4+ T-cells,Homo sapiens,sc,yes
...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT110B1_XB3_TTTCCTCGTGTTACTG-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGACTCAGGGTCTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Macrophages,Homo sapiens,sc,yes


In [8]:
adata = anndata.AnnData(X=expression_df.values, obs=label_df)
adata.obs.index = expression_df.index
adata.var.index = expression_df.columns
adata

AnnData object with n_obs × n_vars = 49088 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [9]:
counts = Counter(adata.obs[cell_type_key])
counts.most_common()

[('BR_Malignant', 10137),
 ('Fibroblasts', 8305),
 ('CD4+ T-cells', 8001),
 ('CD8+ T-cells', 7080),
 ('Macrophages', 3585),
 ('B-cells', 3001),
 ('Endothelial cells', 2883),
 ('NK cells', 2053),
 ('Tregs', 1523),
 ('Plasma cells', 1445),
 ('Unknown', 808),
 ('Mast cells', 161),
 ('DC', 106)]

In [10]:
## get rid of unknowns
adata = adata[adata.obs[cell_type_key]!='Unknown']
adata

View of AnnData object with n_obs × n_vars = 48280 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [11]:
# def __init__(self, adata, cell_type_key='ClusterName', n_per_cell_type=500,
# ¦   ¦   batch_size=64, dataset_type='training', min_genes=200, min_cells=3, mito_threshold=.2,
# ¦   ¦   max_n_genes=None, log=True, cpm=True, min_disp=.2, standard_scaler=None,
# ¦   ¦   range_scaler=None, cell_type_encoder=None, genes=None, cell_types=None):


In [12]:
pds = PollockDataset(adata, cell_type_key=cell_type_key, n_per_cell_type=500, batch_size=128,
                    dataset_type='training', min_genes=200, min_cells=3, mito_threshold=.1,
                    max_n_genes=None, log=True, cpm=False, min_disp=.2)

2020-03-20 17:18:49,924 normalizing counts for model training
Trying to set attribute `.obs` of view, copying.
  view_to_actual(data)
2020-03-20 17:19:54,958 remaining after min disp: {remaining}
2020-03-20 17:20:10,421 creating datasets


In [13]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], )

In [14]:
pm.fit(pds, epochs=20)

2020-03-20 17:20:25,739 epoch: 1, val loss: 29.210050582885742
2020-03-20 17:20:28,489 epoch: 2, val loss: 24.722400665283203
2020-03-20 17:20:31,349 epoch: 3, val loss: 24.055740356445312
2020-03-20 17:20:34,139 epoch: 4, val loss: 23.554485321044922
2020-03-20 17:20:36,894 epoch: 5, val loss: 23.189311981201172
2020-03-20 17:20:39,729 epoch: 6, val loss: 22.760957717895508
2020-03-20 17:20:42,569 epoch: 7, val loss: 22.21238899230957
2020-03-20 17:20:45,364 epoch: 8, val loss: 21.49134635925293
2020-03-20 17:20:48,024 epoch: 9, val loss: 20.955577850341797
2020-03-20 17:20:50,714 epoch: 10, val loss: 20.567888259887695
2020-03-20 17:20:53,491 epoch: 11, val loss: 20.20861053466797
2020-03-20 17:20:56,466 epoch: 12, val loss: 19.983163833618164
2020-03-20 17:20:59,131 epoch: 13, val loss: 19.819347381591797
2020-03-20 17:21:01,908 epoch: 14, val loss: 19.744384765625
2020-03-20 17:21:04,714 epoch: 15, val loss: 19.670251846313477
2020-03-20 17:21:07,548 epoch: 16, val loss: 19.5777606

1.0
0.09612432847275518


In [15]:
pm.save(pds, os.path.join(MODEL_DIR, 'testing'), )

[ 2.  3.  7. 10.  2.]


  _warn_prf(average, modifier, msg_start, len(result))


[11.  3. 11.  1.  0.]


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
pm.summary['training']

{'metrics': {'B-cells': {'precision': 0.09198813056379822,
   'recall': 0.124,
   'f1-score': 0.10562180579216353,
   'support': 500},
  'BR_Malignant': {'precision': 0.10679611650485436,
   'recall': 0.132,
   'f1-score': 0.1180679785330948,
   'support': 500},
  'CD4+ T-cells': {'precision': 0.08631921824104234,
   'recall': 0.106,
   'f1-score': 0.09515260323159784,
   'support': 500},
  'CD8+ T-cells': {'precision': 0.0997920997920998,
   'recall': 0.096,
   'f1-score': 0.09785932721712538,
   'support': 500},
  'DC': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 84},
  'Endothelial cells': {'precision': 0.09926470588235294,
   'recall': 0.108,
   'f1-score': 0.10344827586206896,
   'support': 500},
  'Fibroblasts': {'precision': 0.10766045548654245,
   'recall': 0.104,
   'f1-score': 0.10579857578840285,
   'support': 500},
  'Macrophages': {'precision': 0.10329670329670329,
   'recall': 0.094,
   'f1-score': 0.09842931937172775,
   'support': 500},
  'Mast cells':