In [1]:
import sys
# Prepend /tf/pollock to the module search path so it is searched first
# (presumably the location of the pollock package imported later — confirm).
sys.path.insert(0, '/tf/pollock')

In [2]:
# Load IPython's autoreload extension; its mode is selected in the next cell.
%load_ext autoreload

In [3]:
# Mode 2: reload all imported modules before each cell executes, so source
# edits are picked up without restarting the kernel.
%autoreload 2

In [4]:
import logging
import os
import random
from collections import Counter
from importlib import reload
import time

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import anndata2ri


import pollock
from pollock import PollockDataset, PollockModel, load_from_directory
# import pollock.models.analysis as pollock_analysis

  from pandas.core.index import RangeIndex


In [5]:
import tensorflow as tf
# NOTE(review): `layers` is not referenced by any cell visible here.
from tensorflow.keras import layers

# Drop Keras' global state so re-running cells starts from a clean session.
tf.keras.backend.clear_session()  # For easy reset of notebook state.

In [6]:
# Root directories for the input data and for the saved models.
DATA_DIR = '/data/single_cell_classification'
MODEL_DIR = '/models'

# Seed every random number generator the notebook has in scope so repeated
# runs of the sampling/training cells are comparable.
# NOTE(review): which generators PollockDataset / PollockModel actually draw
# from is not visible here — confirm they honour these seeds.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

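## Expression tables

Each dataset configuration cell in this notebook assigns the same variables (`run_name`, `model_save_dir`, `sample_column`, `sep`, `cell_type_key`, plus the input paths), so run only the cell for the dataset you want to train on.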

In [7]:
# Breast cancer (BR) dataset configuration.
# Assigning these names overrides any other dataset configuration cell.
run_name = 'sc_brca'

# Both input tables sit in the same raw-data directory.
raw_dir = os.path.join(DATA_DIR, 'tumor', 'BR', 'raw', 'houxiang_brca')
expression_fp = os.path.join(raw_dir, 'breast_counts_matrix.tsv')
label_fp = os.path.join(raw_dir, 'breast_metadata.tsv')

# The trained model is saved under MODEL_DIR/<run_name>.
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep = '\t'
cell_type_key = 'cell_type'

In [None]:
# HNSC dataset configuration.
# Assigning these names overrides any other dataset configuration cell.
run_name = 'sc_hnsc'

# Both input tables sit in the same raw-data directory.
raw_dir = os.path.join(DATA_DIR, 'tumor', 'HNSC', 'raw', 'hnsc_yize')
expression_fp = os.path.join(
    raw_dir,
    'Assigned_WUHN_15_processed_cluster_review_gene_expression_format_2.tsv',
)
label_fp = os.path.join(
    raw_dir,
    'Assigned_WUHN_15_processed_cluster_review_cell_metadata_format_2.tsv',
)

# The trained model is saved under MODEL_DIR/<run_name>.
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep = '\t'
cell_type_key = 'cell_type'

In [None]:
# CESC dataset configuration.
# Assigning these names overrides any other dataset configuration cell.
run_name = 'sc_cesc'

# Both input tables sit in the same raw-data directory.
raw_dir = os.path.join(DATA_DIR, 'tumor', 'CESC', 'raw', 'cesc_yize_v2')
expression_fp = os.path.join(
    raw_dir,
    'Assigned_CESC_9_processed_cluster_review_final_gene_expression_format.tsv',
)
label_fp = os.path.join(
    raw_dir,
    'Assigned_CESC_9_processed_cluster_review_final_cell_metadata_format.tsv',
)

# The trained model is saved under MODEL_DIR/<run_name>.
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep = '\t'
cell_type_key = 'cell_type'

## RDS object (Seurat object loaded through R)

In [None]:
# PDAC dataset configuration; the input is a single RDS file rather than a
# pair of expression/label tables. Assigning these names overrides any other
# dataset configuration cell.
run_name = 'sc_pdac'

# Path handed to the R cell that calls readRDS.
rds_fp = os.path.join(DATA_DIR, 'tumor', 'PDAC', 'All_merged.rds')

# Directory the trained model is saved to: MODEL_DIR/<run_name>.
model_save_dir = os.path.join(MODEL_DIR, run_name)

# NOTE(review): sample_column and sep are set as in the table-based configs;
# sample_column is not referenced by any cell visible here.
sample_column = 'Genes'
sep='\t'
cell_type_key = 'cell_type'

In [None]:

# Activate the anndata2ri conversion between SingleCellExperiment and AnnData,
# so objects exchanged with the R cells are translated between the two
# (presumably what turns the exported `adata` into an AnnData — confirm).
anndata2ri.activate()

# Loading the rpy2 extension enables the %%R cell magic, which runs R code
# in Jupyter notebook cells.
%load_ext rpy2.ipython

In [None]:
%%R -i rds_fp
# `-i rds_fp` passes the Python variable rds_fp into the R session.
suppressPackageStartupMessages(library(Seurat))

# Read the serialized R object from the RDS file.
# NOTE(review): the name `exp` is also the name of R's exponential function.
exp <- readRDS(file = rds_fp)

# Print the loaded object as the cell output.
exp

In [None]:
%%R -o adata
# `-o adata` exports the R variable `adata` back to Python when the cell ends.
# Convert the Seurat object to a SingleCellExperiment object.
adata <- as.SingleCellExperiment(exp)

# Print the converted object as the cell output.
adata

## H5 (expression matrix stored as HDF5)

In [None]:
# CCRCC dataset configuration; the expression matrix is already an .h5 file.
# Assigning these names overrides any other dataset configuration cell.
run_name = 'sn_ccrcc'

# Expression matrix and metadata table sit in the same directory.
raw_dir = os.path.join(DATA_DIR, 'tumor', 'CCRCC', 'yige')
expression_fp = os.path.join(raw_dir, 'adata.h5')
label_fp = os.path.join(raw_dir, 'metadata.tsv')

# The trained model is saved under MODEL_DIR/<run_name>.
model_save_dir = os.path.join(MODEL_DIR, run_name)

sample_column = 'Genes'
sep = '\t'
cell_type_key = 'cell_type'

In [8]:
# The matrix is read from an HDF5 file whose path is expression_fp with every
# '.tsv' replaced by '.h5' (a path already ending in .h5 is left unchanged);
# the frame is stored under the key 'df'.
expression_h5_fp = expression_fp.replace('.tsv', '.h5')
expression_df = pd.read_hdf(expression_h5_fp, 'df')
expression_df

Genes,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C
HT062B1_S1PA_AAACCCACACAAATGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACCCATCGGAATTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT062B1_S1PA_AAACGAACAGCTAACT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTCCTCGTGTTACTG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
HT110B1_XB3_TTTGACTCAGGGTCTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Read the per-cell metadata, key it by cell id, and reorder its rows to match
# the expression matrix row for row.
label_df = (
    pd.read_csv(label_fp, sep=sep)
    .set_index('cell_id')
    .loc[expression_df.index]
)
label_df

Unnamed: 0,sample_id,cancer_type,tissue_type,organ_type,cell_type,species,method,facs
HT062B1_S1PA_AAACCCACACAAATGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD8+ T-cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCAGTGCTCCGA-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Endothelial cells,Homo sapiens,sc,yes
HT062B1_S1PA_AAACCCATCGGAATTC-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAACAGCTAACT-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT062B1_S1PA_AAACGAAGTAGGGAGG-1,TWCE-HT062B1-S1PAA1A1Z1B1,Breast Cancer,Epithelial,Breast,CD4+ T-cells,Homo sapiens,sc,yes
...,...,...,...,...,...,...,...,...
HT110B1_XB3_TTTCATGTCGGCAGTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,BR_Malignant,Homo sapiens,sc,yes
HT110B1_XB3_TTTCCTCGTGTTACTG-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGACTCAGGGTCTC-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Fibroblasts,Homo sapiens,sc,yes
HT110B1_XB3_TTTGGAGCAAGAGGCT-1,TWCE-HT110B1-XB3,Breast Cancer,Epithelial,Breast,Macrophages,Homo sapiens,sc,yes


In [10]:
# Wrap the expression matrix (cells x genes) and the aligned per-cell metadata
# in an AnnData object.
adata = anndata.AnnData(X=expression_df.values, obs=label_df)
# Name the observations after the expression frame's row labels (cell ids)...
adata.obs.index = expression_df.index
# ...and the variables after its column labels (gene names).
adata.var.index = expression_df.columns
adata

AnnData object with n_obs × n_vars = 49088 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [11]:
# Tally how many cells carry each annotated cell type, most frequent first.
cell_type_labels = adata.obs[cell_type_key]
counts = Counter(cell_type_labels)
counts.most_common()

[('BR_Malignant', 10137),
 ('Fibroblasts', 8305),
 ('CD4+ T-cells', 8001),
 ('CD8+ T-cells', 7080),
 ('Macrophages', 3585),
 ('B-cells', 3001),
 ('Endothelial cells', 2883),
 ('NK cells', 2053),
 ('Tregs', 1523),
 ('Plasma cells', 1445),
 ('Unknown', 808),
 ('Mast cells', 161),
 ('DC', 106)]

In [12]:
# Keep only cells whose annotation is something other than 'Unknown'.
# Boolean indexing leaves `adata` as a view of the original object.
is_annotated = adata.obs[cell_type_key] != 'Unknown'
adata = adata[is_annotated]
adata

View of AnnData object with n_obs × n_vars = 48280 × 33538 
    obs: 'sample_id', 'cancer_type', 'tissue_type', 'organ_type', 'cell_type', 'species', 'method', 'facs'

In [13]:
# Build the training dataset from an independent copy of `adata`, so the
# dataset is never constructed on the view itself.
pds = PollockDataset(
    adata.copy(),
    cell_type_key=cell_type_key,
    n_per_cell_type=1000,
    batch_size=128,
    dataset_type='training',
    min_genes=200,
    min_cells=3,
    mito_threshold=None,
    max_n_genes=None,
    log=True,
    cpm=True,
    min_disp=0.2,
)

2020-03-23 15:13:06,471 normalizing counts for model training
2020-03-23 15:13:06,472 filtering by min genes: 200
2020-03-23 15:13:12,908 filtering by min cells: 3
2020-03-23 15:13:26,645 converting to cpm
2020-03-23 15:13:27,639 loging data
2020-03-23 15:13:35,809 filtering with dispersion 0.2
2020-03-23 15:13:47,616 remaining after min disp: 7283
2020-03-23 15:13:47,620 scaling data
2020-03-23 15:13:58,332 scaling to between 0, 1
2020-03-23 15:14:05,539 creating datasets


In [14]:
# Cell-type composition of the validation split, most frequent first.
val_cell_types = pds.val_adata.obs[cell_type_key]
Counter(val_cell_types).most_common()

[('BR_Malignant', 9137),
 ('Fibroblasts', 7305),
 ('CD4+ T-cells', 7001),
 ('CD8+ T-cells', 6080),
 ('Macrophages', 2585),
 ('B-cells', 2001),
 ('Endothelial cells', 1883),
 ('NK cells', 1053),
 ('Tregs', 523),
 ('Plasma cells', 445),
 ('Mast cells', 33),
 ('DC', 22)]

In [15]:
# Second positional argument: the number of columns (genes) in the training
# matrix — presumably the model's input width; confirm against PollockModel.
n_training_genes = pds.train_adata.shape[1]
pm = PollockModel(pds.cell_types, n_training_genes, alpha=0.001)

In [16]:
# Number of epochs requested from the trainer.
n_epochs = 100
pm.fit(pds, epochs=n_epochs)

2020-03-23 15:14:25,707 epoch: 1, val loss: 12.445099830627441
2020-03-23 15:14:29,209 epoch: 2, val loss: 11.999298095703125
2020-03-23 15:14:32,466 epoch: 3, val loss: 11.588274955749512
2020-03-23 15:14:35,649 epoch: 4, val loss: 11.518356323242188
2020-03-23 15:14:38,871 epoch: 5, val loss: 11.49637508392334
2020-03-23 15:14:42,093 epoch: 6, val loss: 11.480427742004395
2020-03-23 15:14:45,091 epoch: 7, val loss: 11.475638389587402
2020-03-23 15:14:47,972 epoch: 8, val loss: 11.467459678649902
2020-03-23 15:14:51,286 epoch: 9, val loss: 11.454673767089844
2020-03-23 15:14:54,523 epoch: 10, val loss: 11.410367012023926
2020-03-23 15:14:57,855 epoch: 11, val loss: 11.29349136352539
2020-03-23 15:15:01,117 epoch: 12, val loss: 11.209061622619629
2020-03-23 15:15:04,400 epoch: 13, val loss: 11.177154541015625
2020-03-23 15:15:07,534 epoch: 14, val loss: 11.162842750549316
2020-03-23 15:15:11,030 epoch: 15, val loss: 11.151219367980957
2020-03-23 15:15:14,266 epoch: 16, val loss: 11.114

1.0
0.6002679415782284


In [17]:
# Persist the trained model together with its dataset to model_save_dir;
# the same directory is read back later by load_from_directory.
# NOTE(review): the recorded `_warn_prf` warnings suggest classification
# metrics are computed during this call — confirm.
pm.save(pds, model_save_dir)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Display the summary recorded for the training split (per-class precision,
# recall, f1-score and support in the recorded output).
pm.summary['training']

{'metrics': {'B-cells': {'precision': 0.4859675036927622,
   'recall': 0.658,
   'f1-score': 0.5590484282073067,
   'support': 1000},
  'BR_Malignant': {'precision': 0.7468619246861925,
   'recall': 0.714,
   'f1-score': 0.7300613496932516,
   'support': 1000},
  'CD4+ T-cells': {'precision': 0.25154457193292146,
   'recall': 0.285,
   'f1-score': 0.26722925457102675,
   'support': 1000},
  'CD8+ T-cells': {'precision': 0.22592592592592592,
   'recall': 0.183,
   'f1-score': 0.2022099447513812,
   'support': 1000},
  'DC': {'precision': 0.8571428571428571,
   'recall': 0.07142857142857142,
   'f1-score': 0.13186813186813187,
   'support': 84},
  'Endothelial cells': {'precision': 0.9282982791586998,
   'recall': 0.971,
   'f1-score': 0.9491691104594331,
   'support': 1000},
  'Fibroblasts': {'precision': 0.9348249027237354,
   'recall': 0.961,
   'f1-score': 0.9477317554240631,
   'support': 1000},
  'Macrophages': {'precision': 0.8974854932301741,
   'recall': 0.928,
   'f1-score': 0.

In [19]:
# Overall accuracy on the validation split.
pm.summary['validation']['metrics']['accuracy']

0.5989282336870863

In [20]:
# Overall accuracy on the training split.
# NOTE(review): in the recorded run this is lower than the validation
# accuracy — worth a sentence of interpretation.
pm.summary['training']['metrics']['accuracy']

0.5611045828437132

In [21]:
# Reload the saved model and rebuild a dataset from `adata` for prediction.
# `adata` is a view after the 'Unknown' filter, so pass an independent copy
# (as the PollockDataset cell does). This avoids the view-to-actual conversion
# warning and keeps the preprocessing done while loading away from `adata`.
l_pds, l_pm = load_from_directory(adata.copy(), model_save_dir)

2020-03-23 15:21:51,002 normalizing counts for model training
2020-03-23 15:21:51,019 converting to cpm




2020-03-23 15:22:01,485 loging data
  view_to_actual(data)
2020-03-23 15:22:07,113 scaling data


In [22]:
# Predict a cell-type label for every cell in the reloaded dataset.
# In the recorded output the result is a tuple whose first element is the
# tuple of predicted label strings.
labels = l_pm.predict_pollock_dataset(l_pds, labels=True)
labels

(('B-cells',
  'Endothelial cells',
  'Fibroblasts',
  'BR_Malignant',
  'CD8+ T-cells',
  'CD4+ T-cells',
  'BR_Malignant',
  'BR_Malignant',
  'BR_Malignant',
  'BR_Malignant',
  'Tregs',
  'BR_Malignant',
  'Endothelial cells',
  'Tregs',
  'Tregs',
  'BR_Malignant',
  'Macrophages',
  'Endothelial cells',
  'Fibroblasts',
  'Fibroblasts',
  'BR_Malignant',
  'BR_Malignant',
  'Fibroblasts',
  'NK cells',
  'Tregs',
  'Tregs',
  'Endothelial cells',
  'Tregs',
  'Endothelial cells',
  'NK cells',
  'Tregs',
  'BR_Malignant',
  'BR_Malignant',
  'CD4+ T-cells',
  'BR_Malignant',
  'CD4+ T-cells',
  'CD8+ T-cells',
  'Fibroblasts',
  'BR_Malignant',
  'Endothelial cells',
  'Tregs',
  'Endothelial cells',
  'Endothelial cells',
  'BR_Malignant',
  'Endothelial cells',
  'Plasma cells',
  'BR_Malignant',
  'Macrophages',
  'BR_Malignant',
  'Endothelial cells',
  'B-cells',
  'BR_Malignant',
  'CD8+ T-cells',
  'Endothelial cells',
  'NK cells',
  'BR_Malignant',
  'CD8+ T-cells',
  'T

In [23]:
# Annotated (ground-truth) cell types, for comparison with the predictions.
adata.obs[cell_type_key]

HT062B1_S1PA_AAACCCACACAAATGA-1         CD8+ T-cells
HT062B1_S1PA_AAACCCAGTGCTCCGA-1    Endothelial cells
HT062B1_S1PA_AAACCCATCGGAATTC-1          Fibroblasts
HT062B1_S1PA_AAACGAACAGCTAACT-1         BR_Malignant
HT062B1_S1PA_AAACGAAGTAGGGAGG-1         CD4+ T-cells
                                         ...        
HT110B1_XB3_TTTCATGTCGGCAGTC-1          BR_Malignant
HT110B1_XB3_TTTCCTCGTGTTACTG-1           Fibroblasts
HT110B1_XB3_TTTGACTCAGGGTCTC-1           Fibroblasts
HT110B1_XB3_TTTGGAGCAAGAGGCT-1           Macrophages
HT110B1_XB3_TTTGGAGGTAACATGA-1          BR_Malignant
Name: cell_type, Length: 48280, dtype: object