In [1]:
import sys
sys.path.insert(0, '/tf/pollock')

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import logging
import os
import random
from collections import Counter
from importlib import reload
import time

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

import pollock
from pollock import PollockDataset, PollockModel, load_from_directory
# import pollock.models.analysis as pollock_analysis

  from pandas.core.index import RangeIndex


In [5]:
import tensorflow as tf
from tensorflow.keras import layers

tf.keras.backend.clear_session()  # For easy reset of notebook state.

In [6]:
DATA_DIR = '/data/single_cell_classification'
MODEL_DIR = '/models'

In [7]:
run_name = 'br'

expression_fp = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca',
                            'breast_counts_matrix.tsv')
label_fp = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca',
                            'breast_metadata.tsv')

training_image_dir = os.path.join(MODEL_DIR, 'scratch', run_name)
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep='\t'
cell_type_key = 'cell_type'

n_per_cell_type = 5000
epochs = 5
batch_size = 128

In [8]:
expression_df = pd.read_hdf(expression_fp.replace('.tsv', '.h5'), 'df')
expression_df

Genes,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C
HT062B1_S1PA_AAACCCACACAAATGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCATCGGAATTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT062B1_S1PA_AAACGAACAGCTAACT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTCCTCGTGTTACTG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT110B1_XB3_TTTGACTCAGGGTCTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
label_df = pd.read_csv(
    label_fp,
    sep=sep
    )
label_df = label_df.set_index('cell_id')
label_df = label_df.loc[expression_df.index]
label_df

Unnamed: 0,sample_id,cancer_type,tissue_type,organ_type,cell_type,species,method,facs
HT062B1_S1PA_AAACCCACACAAATGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD8+ T-cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Endothelial cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCATCGGAATTC-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAACAGCTAACT-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD4+ T-cells,Homo sapiens,sc,yes
...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT110B1_XB3_TTTCCTCGTGTTACTG-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGACTCAGGGTCTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Macrophages,Homo sapiens,sc,yes


In [10]:
adata = anndata.AnnData(X=expression_df.values, obs=label_df)
adata.obs.index = expression_df.index
adata.var.index = expression_df.columns
adata

AnnData object with n_obs × n_vars = 49088 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [11]:
counts = Counter(adata.obs[cell_type_key])
counts.most_common()

[('BR_Malignant', 10137),
 ('Fibroblasts', 8305),
 ('CD4+ T-cells', 8001),
 ('CD8+ T-cells', 7080),
 ('Macrophages', 3585),
 ('B-cells', 3001),
 ('Endothelial cells', 2883),
 ('NK cells', 2053),
 ('Tregs', 1523),
 ('Plasma cells', 1445),
 ('Unknown', 808),
 ('Mast cells', 161),
 ('DC', 106)]

In [12]:
## get rid of unknowns
adata = adata[adata.obs[cell_type_key]!='Unknown']
adata

View of AnnData object with n_obs × n_vars = 48280 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [13]:
# def __init__(self, adata, cell_type_key='ClusterName', n_per_cell_type=500,
# ¦   ¦   batch_size=64, dataset_type='training', min_genes=200, min_cells=3, mito_threshold=.2,
# ¦   ¦   max_n_genes=None, log=True, cpm=True, min_disp=.2, standard_scaler=None,
# ¦   ¦   range_scaler=None, cell_type_encoder=None, genes=None, cell_types=None):


In [14]:
pds = PollockDataset(adata.copy(), cell_type_key=cell_type_key, n_per_cell_type=1000, batch_size=128,
                    dataset_type='training', min_genes=200, min_cells=3, mito_threshold=None,
                    max_n_genes=None, log=True, cpm=False, min_disp=.2)

2020-03-20 22:27:34,523 normalizing counts for model training
2020-03-20 22:27:34,525 filtering by min genes: 200
2020-03-20 22:27:40,945 filtering by min cells: 3
2020-03-20 22:27:54,710 loging data
2020-03-20 22:28:02,727 filtering with dispersion 0.2
2020-03-20 22:28:14,415 remaining after min disp: 5908
2020-03-20 22:28:14,419 scaling data
2020-03-20 22:28:23,870 scaling to between 0, 1
2020-03-20 22:28:30,064 creating datasets


In [15]:
Counter(pds.val_adata.obs[cell_type_key]).most_common()

[('BR_Malignant', 9137),
 ('Fibroblasts', 7305),
 ('CD4+ T-cells', 7001),
 ('CD8+ T-cells', 6080),
 ('Macrophages', 2585),
 ('B-cells', 2001),
 ('Endothelial cells', 1883),
 ('NK cells', 1053),
 ('Tregs', 523),
 ('Plasma cells', 445),
 ('Mast cells', 33),
 ('DC', 22)]

In [16]:
# class BVAE(tf.keras.Model):
#   def __init__(self, latent_dim, input_size):
#     super(BVAE, self).__init__()
#     self.latent_dim = latent_dim
#     self.input_size = input_size
#     self.inference_net = tf.keras.Sequential(
#       [
#           tf.keras.layers.InputLayer(input_shape=(input_size,)),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(latent_dim + latent_dim),
#       ]
#     )

#     self.generative_net = tf.keras.Sequential(
#         [
#           tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(input_size),
#         ]
#     )

#   @tf.function
#   def sample(self, eps=None):
#     if eps is None:
#       eps = tf.random.normal(shape=(100, self.latent_dim))
#     return self.decode(eps, apply_sigmoid=True)

#   def encode(self, x):
#     mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)
#     return mean, logvar

#   def reparameterize(self, mean, logvar):
#     eps = tf.random.normal(shape=mean.shape)
#     return eps * tf.exp(logvar * .5) + mean

#   def decode(self, z, apply_sigmoid=False):
#     logits = self.generative_net(z)
#     if apply_sigmoid:
#       probs = tf.sigmoid(logits)
#       return probs

#     return logits

In [17]:
# optimizer = tf.keras.optimizers.Adam(1e-4)

# def log_normal_pdf(sample, mean, logvar, raxis=1):
#   log2pi = tf.math.log(2. * np.pi)
#   return tf.reduce_sum(
#       -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
#       axis=raxis)

# @tf.function
# def compute_loss(model, x, alpha=0.00005):
#   mean, logvar = model.encode(x)
#   z = model.reparameterize(mean, logvar)
#   x_logit = model.decode(z)

#   kl_loss = .5 * tf.reduce_sum(tf.exp(logvar) + tf.square(mean) - 1. - logvar, axis=1)
#   reconstruction_loss = .5 * tf.reduce_sum(tf.square((x - x_logit)), axis=1)

#   overall_loss = tf.reduce_mean(reconstruction_loss + alpha * kl_loss)
#   return overall_loss

# @tf.function
# def compute_apply_gradients(model, x, optimizer, alpha=.00005):
#   with tf.GradientTape() as tape:
#     loss = compute_loss(model, x, alpha=alpha)
#   gradients = tape.gradient(loss, model.trainable_variables)
#   optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [18]:
# epochs = 50
# latent_dim = 100
# alpha = 0.1
# # num_examples_to_generate = 16

# # keeping the random vector constant for generation (prediction) so
# # it will be easier to see the improvement.
# # random_vector_for_generation = tf.random.normal(
# #     shape=[num_examples_to_generate, latent_dim])
# model = BVAE(latent_dim, pds.val_adata.shape[1])

In [19]:
# # generate_and_save_images(model, 0, random_vector_for_generation)

# for epoch in range(1, epochs + 1):
#   start_time = time.time()
#   for train_x in pds.train_ds:
#     compute_apply_gradients(model, train_x, optimizer, alpha=alpha)
#   end_time = time.time()

#   if epoch % 1 == 0:
#     loss = tf.keras.metrics.Mean()
#     for test_x in pds.val_ds:
#       loss(compute_loss(model, test_x, alpha=alpha))
#     print(f'epoch: {epoch}, val loss: {loss.result()}')


In [20]:
# mean, logvar = model.encode(pds.train_adata.X)
# train_embeddings = model.reparameterize(mean, logvar).numpy()

# mean, logvar = model.encode(pds.val_adata.X[:10000])
# val_embeddings = model.reparameterize(mean, logvar).numpy()

In [21]:
# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.ensemble import RandomForestClassifier
# encoder = OrdinalEncoder()
# y_train = encoder.fit_transform(np.asarray(pds.train_adata.obs[cell_type_key]).reshape(-1, 1)).flatten()
# y_val = encoder.transform(np.asarray(pds.val_adata.obs[cell_type_key][:10000]).reshape(-1, 1)).flatten()

In [22]:
# clf = RandomForestClassifier()

In [23]:
# %%time
# clf.fit(train_embeddings, y_train)

In [24]:
# clf.score(train_embeddings, y_train)

In [25]:
# clf.score(val_embeddings, y_val)

In [26]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.001)

In [27]:
pm.fit(pds, epochs=100)

2020-03-20 22:28:48,690 epoch: 1, val loss: 22.403322219848633
2020-03-20 22:28:51,600 epoch: 2, val loss: 20.046213150024414
2020-03-20 22:28:54,413 epoch: 3, val loss: 18.71497344970703
2020-03-20 22:28:57,222 epoch: 4, val loss: 18.29530143737793
2020-03-20 22:29:00,009 epoch: 5, val loss: 17.95246124267578
2020-03-20 22:29:02,958 epoch: 6, val loss: 17.627763748168945
2020-03-20 22:29:06,013 epoch: 7, val loss: 17.353614807128906
2020-03-20 22:29:08,929 epoch: 8, val loss: 17.141666412353516
2020-03-20 22:29:11,736 epoch: 9, val loss: 16.95850944519043
2020-03-20 22:29:14,615 epoch: 10, val loss: 16.806135177612305
2020-03-20 22:29:17,508 epoch: 11, val loss: 16.666513442993164
2020-03-20 22:29:20,254 epoch: 12, val loss: 16.553842544555664
2020-03-20 22:29:23,003 epoch: 13, val loss: 16.45140838623047
2020-03-20 22:29:25,870 epoch: 14, val loss: 16.36064910888672
2020-03-20 22:29:28,817 epoch: 15, val loss: 16.28092384338379
2020-03-20 22:29:31,470 epoch: 16, val loss: 16.21195793

1.0
0.8674477251234632


In [28]:
pm.save(pds, os.path.join(MODEL_DIR, 'testing'), )

In [29]:
pm.summary['training']

{'metrics': {'B-cells': {'precision': 0.9820717131474104,
   'recall': 0.986,
   'f1-score': 0.9840319361277445,
   'support': 1000},
  'BR_Malignant': {'precision': 0.9671120246659815,
   'recall': 0.941,
   'f1-score': 0.9538773441459706,
   'support': 1000},
  'CD4+ T-cells': {'precision': 0.7452914798206278,
   'recall': 0.831,
   'f1-score': 0.7858156028368795,
   'support': 1000},
  'CD8+ T-cells': {'precision': 0.7870646766169154,
   'recall': 0.791,
   'f1-score': 0.7890274314214465,
   'support': 1000},
  'DC': {'precision': 0.9534883720930233,
   'recall': 0.9761904761904762,
   'f1-score': 0.9647058823529412,
   'support': 84},
  'Endothelial cells': {'precision': 0.95489443378119,
   'recall': 0.995,
   'f1-score': 0.9745347698334965,
   'support': 1000},
  'Fibroblasts': {'precision': 0.9899699097291875,
   'recall': 0.987,
   'f1-score': 0.9884827240861291,
   'support': 1000},
  'Macrophages': {'precision': 0.9898477157360406,
   'recall': 0.975,
   'f1-score': 0.9823677

In [30]:
pm.summary['validation']['metrics']['accuracy']

0.8676053378165388

In [31]:
pm.summary['training']['metrics']['accuracy']

0.927144535840188

In [32]:
l_pds, l_pm = load_from_directory(adata, os.path.join(MODEL_DIR, 'testing'))

2020-03-20 22:35:29,162 normalizing counts for model training
2020-03-20 22:35:37,624 loging data
  view_to_actual(data)
2020-03-20 22:35:42,598 scaling data


In [33]:
labels = l_pm.predict_pollock_dataset(l_pds, labels=True, )
labels

(('CD8+ T-cells',
  'Endothelial cells',
  'Fibroblasts',
  'BR_Malignant',
  'CD8+ T-cells',
  'NK cells',
  'BR_Malignant',
  'BR_Malignant',
  'BR_Malignant',
  'CD8+ T-cells',
  'NK cells',
  'BR_Malignant',
  'Endothelial cells',
  'NK cells',
  'NK cells',
  'BR_Malignant',
  'Macrophages',
  'Endothelial cells',
  'Fibroblasts',
  'Fibroblasts',
  'BR_Malignant',
  'NK cells',
  'Fibroblasts',
  'NK cells',
  'Tregs',
  'NK cells',
  'Endothelial cells',
  'CD8+ T-cells',
  'Endothelial cells',
  'Tregs',
  'Tregs',
  'BR_Malignant',
  'BR_Malignant',
  'CD8+ T-cells',
  'BR_Malignant',
  'CD8+ T-cells',
  'NK cells',
  'Fibroblasts',
  'BR_Malignant',
  'BR_Malignant',
  'Tregs',
  'Endothelial cells',
  'Endothelial cells',
  'BR_Malignant',
  'Endothelial cells',
  'NK cells',
  'BR_Malignant',
  'Macrophages',
  'BR_Malignant',
  'Endothelial cells',
  'CD8+ T-cells',
  'BR_Malignant',
  'Tregs',
  'Endothelial cells',
  'NK cells',
  'CD8+ T-cells',
  'CD8+ T-cells',
  'CD8