In [183]:
import os

import altair as alt
import pandas as pd
import numpy as np
import scipy
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

import mgitools.os_helpers as os_helpers

In [184]:
# Root directory of the benchmark result tables; the next cell searches it for .tsv prediction files.
DATA_DIR = './results/10272020_teir1/'

In [186]:
# Collect every prediction table under DATA_DIR, sorted for a stable order.
# The dot is escaped so only a literal ".tsv" suffix matches (the unescaped
# '.tsv$' would also accept any other character in front of "tsv").
# NOTE(review): assumes os_helpers.listfiles treats `regex` as a Python
# regular expression searched against each path — confirm.
fps = sorted(os_helpers.listfiles(DATA_DIR, regex=r'\.tsv$'))
# Optional filter, disabled: drop the Seurat results.
# fps = [x for x in fps if 'seurat' not in x]
fps

['./results/10272020_teir1/scRNAseq/brca/SingleCellNet.tsv',
 './results/10272020_teir1/scRNAseq/brca/actinn.tsv',
 './results/10272020_teir1/scRNAseq/brca/pollock.tsv',
 './results/10272020_teir1/scRNAseq/brca/scanpy_ingest.tsv',
 './results/10272020_teir1/scRNAseq/brca/seurat_transfer.tsv',
 './results/10272020_teir1/scRNAseq/cesc/SingleCellNet.tsv',
 './results/10272020_teir1/scRNAseq/cesc/actinn.tsv',
 './results/10272020_teir1/scRNAseq/cesc/pollock.tsv',
 './results/10272020_teir1/scRNAseq/cesc/scanpy_ingest.tsv',
 './results/10272020_teir1/scRNAseq/cesc/seurat_transfer.tsv',
 './results/10272020_teir1/scRNAseq/hnscc/SingleCellNet.tsv',
 './results/10272020_teir1/scRNAseq/hnscc/actinn.tsv',
 './results/10272020_teir1/scRNAseq/hnscc/pollock.tsv',
 './results/10272020_teir1/scRNAseq/hnscc/scanpy_ingest.tsv',
 './results/10272020_teir1/scRNAseq/hnscc/seurat_transfer.tsv',
 './results/10272020_teir1/scRNAseq/melanoma/SingleCellNet.tsv',
 './results/10272020_teir1/scRNAseq/melanoma/act

In [188]:
# Load every prediction table into a nested lookup:
#     pred_map[data type][DISEASE][method] -> DataFrame
# The three keys are taken from the last three components of each path
# (.../<data type>/<disease>/<method>.tsv).
pred_map = {}
for fp in fps:
    dtype, disease, fname = fp.split('/')[-3:]
    disease = disease.upper()
    method = fname.replace('.tsv', '')
    # Normalise method names for display; names with no rule below
    # (e.g. SingleCellNet, seurat_transfer) keep their on-disk spelling.
    if 'actinn' in method: method = method.upper()
    if 'scanpy' in method: method = 'Scanpy Ingest'
    if 'pollock' in method: method = 'Pollock'
    try:
        df = pd.read_csv(fp, sep='\t')
        # Columns are renamed by position; a table that does not have exactly
        # four columns raises ValueError here and is reported as failed.
        df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
    except (OSError, ValueError) as err:
        # Best effort: report unreadable / empty / malformed tables and move on.
        # pandas' EmptyDataError and ParserError are both ValueError subclasses.
        # Anything else (including KeyboardInterrupt) is no longer swallowed.
        print('failed: ', fp, f'({type(err).__name__}: {err})')
        continue
    pred_map.setdefault(dtype, {}).setdefault(disease, {})[method] = df


failed:  ./results/10272020_teir1/scRNAseq/melanoma/seurat_transfer.tsv
failed:  ./results/10272020_teir1/snATACseq/brca/seurat_transfer.tsv
failed:  ./results/10272020_teir1/snATACseq/gbm/seurat_transfer.tsv


In [157]:
# Inspect the loaded predictions (nested dict: data type -> disease -> method -> DataFrame).
pred_map

{'scRNAseq': {'BRCA': {'SingleCellNet':                                 cell_id  groundtruth    predicted  probability
   0      _HT062B1_S1PA_AAACGAATCGTGGCGT-1   CD8 T cell   CD8 T cell          NaN
   1      _HT062B1_S1PA_AAAGAACAGCTATCTG-1           NK           NK          NaN
   2      _HT062B1_S1PA_AAAGGATTCTGCCTGT-1  Endothelial  Endothelial          NaN
   3      _HT062B1_S1PA_AAAGGTAGTGGAAGTC-1         Treg         Treg          NaN
   4      _HT062B1_S1PA_AACAACCTCACTCACC-1  Endothelial  Endothelial          NaN
   ...                                 ...          ...          ...          ...
   11248   _HT171B1_BC2_TTTCGATGTGTAGGAC-1   CD4 T cell   CD4 T cell          NaN
   11249   _HT171B1_BC2_TTTGACTGTCGATTTG-1   CD8 T cell   CD8 T cell          NaN
   11250   _HT171B1_BC2_TTTGACTGTCGGTGAA-1   CD4 T cell   CD4 T cell          NaN
   11251   _HT171B1_BC2_TTTGGTTCATAGACTC-1   CD4 T cell   CD4 T cell          NaN
   11252   _HT171B1_BC2_TTTGGTTGTTGCTGAT-1   CD8 T cell   CD8

In [192]:
def get_performance_df(d):
    """Build a long-format table of weighted F1 scores, one row per (disease, method).

    Parameters
    ----------
    d : dict
        Nested mapping ``{disease: {method: DataFrame}}``. Each DataFrame must
        provide the columns 'groundtruth' and 'predicted'.

    Returns
    -------
    pandas.DataFrame
        Columns 'disease', 'method', 'f1-score'. Rows are ordered by sorted
        disease, then sorted method. Disease/method combinations that are not
        present in ``d`` are omitted. An empty ``d`` yields an empty frame with
        the same three columns.
    """
    diseases = sorted(d)
    # Use the union of methods over all diseases. Taking the methods of the
    # first disease only would silently drop any method that is missing there
    # but present for another disease.
    methods = sorted({method for disease in diseases for method in d[disease]})

    rows = []
    for disease in diseases:
        for method in methods:
            if method not in d[disease]:
                continue
            preds = d[disease][method]
            score = f1_score(preds['groundtruth'], preds['predicted'], average='weighted')
            rows.append((disease, method, score))

    # Explicit column names keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['disease', 'method', 'f1-score'])

In [190]:
# Order in which the data types are iterated when the combined charts are assembled.
dtype_order = ['scRNAseq', 'snRNAseq', 'snATACseq']

In [193]:
def make_tick_chart(df, title=''):
    """Return a tick-mark chart of F1 score per disease, coloured by method.

    ``df`` needs the columns 'disease', 'method' and 'f1-score'; ``title`` is
    used as the chart title.
    """
    x_enc = alt.X('disease:N', axis=alt.Axis(title=None))
    # zero=False: the y scale is not forced to include zero.
    y_enc = alt.Y('f1-score:Q', scale=alt.Scale(zero=False), axis=alt.Axis(grid=True))
    # 'Pollock' is the only entry given in the colour sort order.
    color_enc = alt.Color('method:N', sort=['Pollock'])

    ticks = alt.Chart(df, title=title).mark_tick(thickness=2)
    return ticks.encode(x=x_enc, y=y_enc, color=color_enc)

def make_point_chart(df):
    """Return a point-mark chart of F1 score per disease, coloured by method.

    ``df`` needs the columns 'disease', 'method' and 'f1-score'.
    """
    encodings = dict(
        x=alt.X('disease:N', axis=alt.Axis(title=None)),
        # zero=False: the y scale is not forced to include zero.
        y=alt.Y('f1-score:Q', scale=alt.Scale(zero=False), axis=alt.Axis(grid=True)),
        color=alt.Color('method:N', sort=['Pollock']),
    )
    return alt.Chart(df).mark_point().encode(**encodings)

# One layered panel (points + ticks) per data type, concatenated side by side.
combined = None
for dtype in dtype_order:
    d = pred_map[dtype]
    df = get_performance_df(d)
    df['disease'] = df['disease'].str.upper()
    print(df.shape)
    chart = make_point_chart(df) + make_tick_chart(df, title=dtype)
    combined = chart if combined is None else combined | chart
# Share the y axis across panels so F1 scores are comparable between data types.
combined.resolve_scale(y='shared')

(29, 3)
(15, 3)
(8, 3)


In [162]:
def build_f1_matrix(d):
    """Return a (method x cell type) table of per-class F1 scores.

    ``d`` maps a method label to a DataFrame with 'groundtruth' and 'predicted'
    columns. The columns of the result are the sorted cell types seen in any
    'groundtruth' column; labels that only occur in 'predicted' are not added.
    Rows follow the iteration order of ``d``.
    """
    cell_types = sorted({label for preds in d.values() for label in preds['groundtruth']})

    methods = list(d)
    reports = [
        classification_report(
            d[method]['groundtruth'],
            d[method]['predicted'],
            labels=cell_types,
            output_dict=True,
        )
        for method in methods
    ]
    # Keep only the per-class F1 entry of each report, in column order.
    scores = [[report[cell_type]['f1-score'] for cell_type in cell_types] for report in reports]
    return pd.DataFrame(data=scores, columns=cell_types, index=methods)
    

In [164]:
# Per-cell-type F1 for scRNAseq PBMC, reshaped to long form (one row per method / cell type).
df = build_f1_matrix(pred_map['scRNAseq']['PBMC'])
df = df.assign(method=df.index.to_list()).melt(id_vars=['method'])
df.columns = ['method', 'cell type', 'f1-score']
df

Unnamed: 0,method,cell type,f1-score
0,SingleCellNet,B cell,0.986014
1,ACTINN,B cell,0.982578
2,Pollock,B cell,0.928058
3,Scanpy Ingest,B cell,0.729223
4,SingleCellNet,CD4 T cell,0.971367
5,ACTINN,CD4 T cell,0.949807
6,Pollock,CD4 T cell,0.963945
7,Scanpy Ingest,CD4 T cell,0.81118
8,SingleCellNet,CD8 T cell,0.767347
9,ACTINN,CD8 T cell,0.659722


In [167]:
def get_cell_f1_heatmap(df, title='', keep_x=True, keep_y=True, y_title=''):
    """Return a heatmap of F1 score with cell types on x and methods on y.

    ``df`` is a long-form table with the columns 'cell type', 'method' and
    'f1-score'. ``keep_x`` / ``keep_y`` set to False pass ``axis=None`` for
    that encoding; ``y_title`` is the y-axis title and ``title`` the chart title.
    """
    x_kwargs = {} if keep_x else {'axis': None}
    y_kwargs = {} if keep_y else {'axis': None}

    heatmap = alt.Chart(df, title=title).mark_rect()
    return heatmap.encode(
        x=alt.X('cell type:N', title='', **x_kwargs),
        y=alt.Y('method:N', title=y_title, sort=['Pollock'], **y_kwargs),
        color=alt.Color('f1-score:Q'),
    )

In [182]:
# Grid of per-cell-type F1 heatmaps: one row per data type, one panel per disease.
# Removed from the earlier version: a dead `d = pred_map['scRNAseq']` assignment
# and a `keep_x` flag that was computed but never passed to the heatmap builder.
# Every panel therefore keeps its x axis, exactly as before.
combined = None
for dtype in dtype_order:
    d = pred_map[dtype]
    row = None
    for disease, m in d.items():
        df = build_f1_matrix(m)
        df['method'] = df.index.to_list()
        df = df.melt(id_vars=['method'])
        df.columns = ['method', 'cell type', 'f1-score']
        if row is None:
            # The first panel of a row keeps the y axis and is labelled with the data type.
            row = get_cell_f1_heatmap(df, y_title=dtype, title=disease)
        else:
            row = alt.hconcat(row, get_cell_f1_heatmap(df, keep_y=False, title=disease), spacing=2)

    if row is None:
        # No diseases for this data type: nothing to stack.
        continue
    if combined is None:
        combined = row
    else:
        # NOTE(review): the negative spacing presumably tightens the gap between rows — confirm visually.
        combined = alt.vconcat(combined, row, spacing=-35)
combined

In [103]:
# Render a single heatmap with default options.
# NOTE(review): uses whatever `df` is currently bound in the kernel; it must be a
# long-form table with 'cell type', 'method' and 'f1-score' columns.
get_cell_f1_heatmap(df)

Data availability dot plot: number of cells per data type and disease

In [148]:
import scanpy as sc
# Count cells (rows) in every harmonized .h5ad file: size_map[data type][dataset name] -> n cells.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
# NOTE(review): this rebinds `fps`, which earlier cells use for the .tsv result paths.
fps = sorted(os_helpers.listfiles('/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/', regex=r'\.h5ad$'))
size_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    # NOTE(review): this only skips files literally named `_train.h5ad` / `_val.h5ad`.
    # Split files such as `brca_train.h5ad` pass through (they show up in the output)
    # and are filtered out by name in the next cell.
    if '/_train.h5ad' not in fp and '/_val.h5ad' not in fp:
        dtype = fp.split('/')[-2]
        disease = fp.split('/')[-1].replace('.h5ad', '')
        # Only the row count is needed, so open the file in backed (on-disk) mode
        # instead of loading the whole matrix into memory.
        adata = sc.read_h5ad(fp, backed='r')
        try:
            size_map[dtype][disease] = adata.shape[0]
        finally:
            adata.file.close()
size_map

{'scRNAseq': {'brca': 98564,
  'brca_train': 2600,
  'brca_val': 11253,
  'cesc': 31745,
  'cesc_train': 1941,
  'cesc_val': 8449,
  'hnscc': 227195,
  'hnscc_train': 2200,
  'hnscc_val': 10288,
  'melanoma': 52367,
  'melanoma_train': 2000,
  'melanoma_val': 6735,
  'pbmc': 2638,
  'pbmc_train': 940,
  'pbmc_val': 1698,
  'pdac': 177089,
  'pdac_train': 3296,
  'pdac_val': 15435},
 'snATACseq': {'brca': 70679,
  'brca_train': 2064,
  'brca_val': 9028,
  'ccrcc': 79851,
  'ccrcc_train': 1208,
  'ccrcc_val': 6003,
  'gbm': 65782,
  'gbm_train': 1316,
  'gbm_val': 5650},
 'snRNAseq': {'brca': 109002,
  'brca_train': 2455,
  'brca_val': 9490,
  'ccrcc': 123250,
  'ccrcc_train': 2113,
  'ccrcc_val': 8605,
  'gbm': 192699,
  'gbm_train': 1689,
  'gbm_val': 6810}}

In [170]:
# Keep only the full datasets (names containing neither 'train' nor 'val') and upper-case the names.
sm = {
    dtype: {
        name.upper(): n_cells
        for name, n_cells in sizes.items()
        if not ('train' in name or 'val' in name)
    }
    for dtype, sizes in size_map.items()
}
sm

{'scRNAseq': {'BRCA': 98564,
  'CESC': 31745,
  'HNSCC': 227195,
  'MELANOMA': 52367,
  'PBMC': 2638,
  'PDAC': 177089},
 'snATACseq': {'BRCA': 70679, 'CCRCC': 79851, 'GBM': 65782},
 'snRNAseq': {'BRCA': 109002, 'CCRCC': 123250, 'GBM': 192699}}

In [171]:
# One row per (data type, disease) that has predictions, with the size of the full dataset.
# Rows follow the iteration order of pred_map; sm[dtype][disease] must exist for every pair.
data = [
    [dtype, disease, sm[dtype][disease]]
    for dtype, by_disease in pred_map.items()
    for disease in by_disease
]
df = pd.DataFrame(data, columns=['data type', 'disease', 'num cells'])
df

Unnamed: 0,data type,disease,num cells
0,scRNAseq,BRCA,98564
1,scRNAseq,CESC,31745
2,scRNAseq,HNSCC,227195
3,scRNAseq,MELANOMA,52367
4,scRNAseq,PBMC,2638
5,scRNAseq,PDAC,177089
6,snATACseq,BRCA,70679
7,snATACseq,GBM,65782
8,snRNAseq,BRCA,109002
9,snRNAseq,CCRCC,123250


In [172]:
# Dot plot: one circle per (disease, data type); both colour and size encode the number of cells.
chart = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('disease:O'),
        y=alt.Y('data type:O'),
        color=alt.Color('num cells:Q'),
        size=alt.Size('num cells:Q'),
    )
)
chart

In [22]:
# Scratch check of the dict layout returned by classification_report(output_dict=True).
# NOTE(review): y_true / y_pred are not defined in any cell shown here; this relies on kernel state.
classification_report(y_true, y_pred, labels=['a', 'b', 'c'], output_dict=True)

{'a': {'precision': 1.0,
  'recall': 0.5,
  'f1-score': 0.6666666666666666,
  'support': 2},
 'b': {'precision': 0.5, 'recall': 0.5, 'f1-score': 0.5, 'support': 2},
 'c': {'precision': 0.5,
  'recall': 1.0,
  'f1-score': 0.6666666666666666,
  'support': 1},
 'accuracy': 0.6,
 'macro avg': {'precision': 0.6666666666666666,
  'recall': 0.6666666666666666,
  'f1-score': 0.611111111111111,
  'support': 5},
 'weighted avg': {'precision': 0.7,
  'recall': 0.6,
  'f1-score': 0.5999999999999999,
  'support': 5}}

In [25]:
# Scratch check: overall accuracy for the same toy labels.
# NOTE(review): y_true / y_pred are not defined in any cell shown here; this relies on kernel state.
accuracy_score(y_true, y_pred)

0.6

In [13]:
# Peek at the raw SingleCellNet prediction table for snATACseq GBM as stored on disk.
pd.read_csv('./results/10272020_teir1/snATACseq/gbm/SingleCellNet.tsv', sep='\t')

Unnamed: 0,groundtruth,predictions,probability
0,Malignant,Malignant,
1,Microglia,Microglia,
2,T cells,T cells,
3,Fibroblast,Fibroblast,
4,T cells,T cells,
...,...,...,...
5645,Microglia,Microglia,
5646,Malignant,Malignant,
5647,Microglia,Malignant,
5648,Neuron,Neuron,


In [14]:
# Reads the raw SingleCellNet prediction table for snATACseq GBM.
# NOTE(review): same path and arguments as the preceding cell — one of the two can likely be dropped.
pd.read_csv('./results/10272020_teir1/snATACseq/gbm/SingleCellNet.tsv', sep='\t')

Unnamed: 0,groundtruth,predictions,probability
0,Malignant,Malignant,
1,Microglia,Microglia,
2,T cells,T cells,
3,Fibroblast,Fibroblast,
4,T cells,T cells,
...,...,...,...
5645,Microglia,Microglia,
5646,Malignant,Malignant,
5647,Microglia,Malignant,
5648,Neuron,Neuron,


In [None]:
sklearn.metrics.f1_score(