In [1]:
import sys

# Put /tf/pollock at the front of the module search path so that the later
# `import pollock` resolves from there first (presumably a local checkout of
# the package -- confirm).
# Any earlier occurrence is dropped first, so re-running this cell keeps a
# single entry at index 0 instead of stacking duplicates.
POLLOCK_SRC = '/tf/pollock'
sys.path[:] = [entry for entry in sys.path if entry != POLLOCK_SRC]
sys.path.insert(0, POLLOCK_SRC)

In [2]:
# Load IPython's autoreload extension so edited modules can be picked up
# without restarting the kernel.
%load_ext autoreload

In [3]:
# Mode 2: reload all modules (except those excluded with %aimport) before
# each cell is executed.
%autoreload 2

In [4]:
import logging
import os
import random
from collections import Counter
from importlib import reload
import time

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

import pollock
from pollock import PollockDataset, PollockModel, load_from_directory
# import pollock.models.analysis as pollock_analysis

# (stray cell output, not code -- looks like the tail of a warning emitted by the imports above)
#   from pandas.core.index import RangeIndex


In [5]:
import tensorflow as tf
from tensorflow.keras import layers

tf.keras.backend.clear_session()  # For easy reset of notebook state.

# Seed the global RNGs of the stdlib, NumPy and TensorFlow so that repeated
# "Restart & Run All" executions are repeatable as far as these generators are
# concerned. Code that keeps its own private RNG is not affected.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
# Root directories for the input data and for saved models.
# The defaults are the previously hard-coded paths; set POLLOCK_DATA_DIR /
# POLLOCK_MODEL_DIR in the environment to run the notebook on another machine
# without editing it.
DATA_DIR = os.environ.get('POLLOCK_DATA_DIR', '/data/single_cell_classification')
MODEL_DIR = os.environ.get('POLLOCK_MODEL_DIR', '/models')

In [7]:
# ---- Run configuration ----
run_name = 'br'

# Both input files live in the same raw-data folder.
brca_raw_dir = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca')
expression_fp = os.path.join(brca_raw_dir, 'breast_counts_matrix.tsv')
label_fp = os.path.join(brca_raw_dir, 'breast_metadata.tsv')

# Output locations derived from the run name.
# NOTE(review): neither directory is referenced by the cells visible in this
# notebook (the model is saved under MODEL_DIR/'testing') -- confirm.
model_save_dir = os.path.join(MODEL_DIR, run_name)
training_image_dir = os.path.join(MODEL_DIR, 'scratch', run_name)

# File parsing and column names.
sep = '\t'
sample_column = 'Genes'
cell_type_key = 'cell_type'

# Training hyper-parameters.
# NOTE(review): the visible downstream cells pass literals
# (n_per_cell_type=1000, batch_size=128, epochs=100) instead of reading these
# variables -- confirm which values are intended.
n_per_cell_type, epochs, batch_size = 5000, 5, 128

In [None]:
# The expression table is read from an HDF5 file whose path is the .tsv path
# with '.tsv' swapped for '.h5'; the frame is stored under the key 'df'.
expression_h5_fp = expression_fp.replace('.tsv', '.h5')
expression_df = pd.read_hdf(expression_h5_fp, key='df')
expression_df

In [None]:
# Read the metadata table, index it by 'cell_id', then select and order its
# rows by the index of the expression table so the two frames line up row for
# row.
label_df = (
    pd.read_csv(label_fp, sep=sep)
    .set_index('cell_id')
    .loc[expression_df.index]
)
label_df

In [None]:
# Bundle the expression values and the per-row metadata into one AnnData
# object; observation and variable labels are taken from expression_df's
# index and columns respectively.
adata = anndata.AnnData(obs=label_df, X=expression_df.values)
adata.var.index = expression_df.columns
adata.obs.index = expression_df.index
adata

In [None]:
# Tally how many observations carry each cell-type label, most frequent first.
cell_type_labels = adata.obs[cell_type_key]
counts = Counter(cell_type_labels)
counts.most_common()

In [None]:
# Keep only observations whose cell-type label is not the literal 'Unknown'.
known_mask = adata.obs[cell_type_key] != 'Unknown'
adata = adata[known_mask]
adata

In [None]:
# def __init__(self, adata, cell_type_key='ClusterName', n_per_cell_type=500,
#         batch_size=64, dataset_type='training', min_genes=200, min_cells=3, mito_threshold=.2,
#         max_n_genes=None, log=True, cpm=True, min_disp=.2, standard_scaler=None,
#         range_scaler=None, cell_type_encoder=None, genes=None, cell_types=None):


In [None]:
# Settings for the training dataset, gathered in one mapping so they are easy
# to scan and tweak.
# NOTE(review): n_per_cell_type and batch_size are literals here; the config
# cell's n_per_cell_type (5000) is not used -- confirm which is intended.
dataset_kwargs = dict(
    cell_type_key=cell_type_key,
    n_per_cell_type=1000,
    batch_size=128,
    dataset_type='training',
    min_genes=200,
    min_cells=3,
    mito_threshold=None,
    max_n_genes=None,
    log=True,
    cpm=False,
    min_disp=.2,
)

# A copy is handed over, presumably so that whatever PollockDataset does to
# its input leaves `adata` itself untouched -- verify against the class.
pds = PollockDataset(adata.copy(), **dataset_kwargs)

In [None]:
# Cell-type label frequencies in the dataset's validation split, most common first.
val_cell_type_labels = pds.val_adata.obs[cell_type_key]
Counter(val_cell_type_labels).most_common()

In [None]:
# class BVAE(tf.keras.Model):
#   def __init__(self, latent_dim, input_size):
#     super(BVAE, self).__init__()
#     self.latent_dim = latent_dim
#     self.input_size = input_size
#     self.inference_net = tf.keras.Sequential(
#       [
#           tf.keras.layers.InputLayer(input_shape=(input_size,)),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(latent_dim + latent_dim),
#       ]
#     )

#     self.generative_net = tf.keras.Sequential(
#         [
#           tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(800, activation='relu'),
#           tf.keras.layers.Dropout(.2),
#           tf.keras.layers.Dense(input_size),
#         ]
#     )

#   @tf.function
#   def sample(self, eps=None):
#     if eps is None:
#       eps = tf.random.normal(shape=(100, self.latent_dim))
#     return self.decode(eps, apply_sigmoid=True)

#   def encode(self, x):
#     mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)
#     return mean, logvar

#   def reparameterize(self, mean, logvar):
#     eps = tf.random.normal(shape=mean.shape)
#     return eps * tf.exp(logvar * .5) + mean

#   def decode(self, z, apply_sigmoid=False):
#     logits = self.generative_net(z)
#     if apply_sigmoid:
#       probs = tf.sigmoid(logits)
#       return probs

#     return logits

In [None]:
# optimizer = tf.keras.optimizers.Adam(1e-4)

# def log_normal_pdf(sample, mean, logvar, raxis=1):
#   log2pi = tf.math.log(2. * np.pi)
#   return tf.reduce_sum(
#       -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
#       axis=raxis)

# @tf.function
# def compute_loss(model, x, alpha=0.00005):
#   mean, logvar = model.encode(x)
#   z = model.reparameterize(mean, logvar)
#   x_logit = model.decode(z)

#   kl_loss = .5 * tf.reduce_sum(tf.exp(logvar) + tf.square(mean) - 1. - logvar, axis=1)
#   reconstruction_loss = .5 * tf.reduce_sum(tf.square((x - x_logit)), axis=1)

#   overall_loss = tf.reduce_mean(reconstruction_loss + alpha * kl_loss)
#   return overall_loss

# @tf.function
# def compute_apply_gradients(model, x, optimizer, alpha=.00005):
#   with tf.GradientTape() as tape:
#     loss = compute_loss(model, x, alpha=alpha)
#   gradients = tape.gradient(loss, model.trainable_variables)
#   optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
# epochs = 50
# latent_dim = 100
# alpha = 0.1
# # num_examples_to_generate = 16

# # keeping the random vector constant for generation (prediction) so
# # it will be easier to see the improvement.
# # random_vector_for_generation = tf.random.normal(
# #     shape=[num_examples_to_generate, latent_dim])
# model = BVAE(latent_dim, pds.val_adata.shape[1])

In [None]:
# # generate_and_save_images(model, 0, random_vector_for_generation)

# for epoch in range(1, epochs + 1):
#   start_time = time.time()
#   for train_x in pds.train_ds:
#     compute_apply_gradients(model, train_x, optimizer, alpha=alpha)
#   end_time = time.time()

#   if epoch % 1 == 0:
#     loss = tf.keras.metrics.Mean()
#     for test_x in pds.val_ds:
#       loss(compute_loss(model, test_x, alpha=alpha))
#     print(f'epoch: {epoch}, val loss: {loss.result()}')


In [None]:
# mean, logvar = model.encode(pds.train_adata.X)
# train_embeddings = model.reparameterize(mean, logvar).numpy()

# mean, logvar = model.encode(pds.val_adata.X[:10000])
# val_embeddings = model.reparameterize(mean, logvar).numpy()

In [None]:
# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.ensemble import RandomForestClassifier
# encoder = OrdinalEncoder()
# y_train = encoder.fit_transform(np.asarray(pds.train_adata.obs[cell_type_key]).reshape(-1, 1)).flatten()
# y_val = encoder.transform(np.asarray(pds.val_adata.obs[cell_type_key][:10000]).reshape(-1, 1)).flatten()

In [None]:
# clf = RandomForestClassifier()

In [None]:
# %%time
# clf.fit(train_embeddings, y_train)

In [None]:
# clf.score(train_embeddings, y_train)

In [None]:
# clf.score(val_embeddings, y_val)

In [None]:
# Build the model from the dataset's cell types and the number of columns
# (second axis) of the training split.
n_input_features = pds.train_adata.shape[1]
pm = PollockModel(pds.cell_types, n_input_features, alpha=.01)

In [None]:
# Train the model on the dataset for 100 epochs.
# NOTE(review): the config cell defines `epochs = 5`, which is not used here --
# confirm which value is intended.
pm.fit(pds, epochs=100)

In [None]:
# Save the trained model together with its dataset under MODEL_DIR/testing.
# NOTE(review): `model_save_dir` from the config cell is not used here -- confirm.
testing_model_dir = os.path.join(MODEL_DIR, 'testing')
pm.save(pds, testing_model_dir)

In [None]:
# Display the 'training' entry of the model's summary.
pm.summary['training']

In [None]:
# Display the accuracy metric stored under the summary's 'validation' entry.
pm.summary['validation']['metrics']['accuracy']

In [None]:
# Display the accuracy metric stored under the summary's 'training' entry,
# for comparison with the validation figure.
pm.summary['training']['metrics']['accuracy']

In [None]:
# Round-trip check: load a dataset/model pair back from MODEL_DIR/testing (the
# directory the save cell writes to), passing in the filtered `adata`.
saved_model_dir = os.path.join(MODEL_DIR, 'testing')
l_pds, l_pm = load_from_directory(adata, saved_model_dir)

In [None]:
# Run the reloaded model over the reloaded dataset; labels=True presumably
# requests cell-type names rather than raw scores -- confirm against pollock.
labels = l_pm.predict_pollock_dataset(l_pds, labels=True)
labels