In [1]:
import numpy as nmp
import pandas as pnd
import matplotlib.pyplot as plt

import pymc3 as pmc

import clonosGP as cln

In [2]:
# Development convenience: with autoreload mode 2, imported modules are
# re-imported before each cell executes, so edits to local packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# DATA is the table handed to cln.infer below; METRICS holds previously saved
# results for the same dataset and feeds the model-comparison panel of the figure.
DATA, METRICS = (
    pnd.read_csv(csv_path)
    for csv_path in ('data/melanoma_Cutts_2017.csv', 'results/melanoma_Cutts_2017.csv')
)

In [4]:
# Seed both NumPy and PyMC3's package-level Theano random stream so the fit is
# repeatable. Note: `pmc.tt_rng(42)` merely *returns* a new seeded stream and
# leaves the package-level generator untouched; `pmc.set_tt_rng(42)` is the
# call that actually installs the seeded generator.
nmp.random.seed(42)
pmc.set_tt_rng(42)

# Model 1: 'Flat' prior over cluster centres, fitted with ADVI for 10000
# iterations. 'K' is presumably the maximum number of clusters and 'BBin' a
# beta-binomial likelihood -- confirm against the clonosGP documentation.
RES1 = cln.infer(DATA,
                 model_args={'K': 20, 'prior': 'Flat', 'cov': 'Mat32', 'lik': 'BBin', 'threshold': 0.0},
                 pymc3_args={'niters': 10000, 'method': 'advi', 'flow': 'scale-loc', 'learning_rate': 1e-2, 'random_seed': 42})

INFO:clonosGP:No CNm column in the data. Multiplicity values will be approximated.
Average Loss = 2,370.8: 100%|██████████| 10000/10000 [00:39<00:00, 255.60it/s]
Finished [100%]: Average Loss = 2,370.7
INFO:pymc3.variational.inference:Finished [100%]: Average Loss = 2,370.7
INFO:clonosGP:Calculating posterior cluster weights and centres.
INFO:clonosGP:Calculating posterior CCF values.
INFO:clonosGP:Calculating posterior predictive distribution.
INFO:clonosGP:Calculating dispersion(s).
INFO:clonosGP:Finished.


In [5]:
# Re-seed before the second fit so it starts from the same random state as the
# first one. `pmc.tt_rng(42)` only returns a fresh seeded stream without
# replacing the package-level generator; `pmc.set_tt_rng(42)` installs it.
nmp.random.seed(42)
pmc.set_tt_rng(42)

# Model 2: same settings as the first fit except for the prior ('GP0') and the
# covariance function ('Exp'). The result additionally exposes 'centres_gp'
# (read in a later cell). 'GP0' is presumably a Gaussian-process prior variant
# -- confirm against the clonosGP documentation.
RES2 = cln.infer(DATA,
                 model_args={'K': 20, 'prior': 'GP0', 'cov': 'Exp', 'lik': 'BBin', 'threshold': 0.0},
                 pymc3_args={'niters': 10000, 'method': 'advi', 'flow': 'scale-loc', 'learning_rate': 1e-2, 'random_seed': 42})

INFO:clonosGP:No CNm column in the data. Multiplicity values will be approximated.
  result[diagonal_slice] = x
Average Loss = 2,366.3: 100%|██████████| 10000/10000 [00:42<00:00, 235.81it/s]
Finished [100%]: Average Loss = 2,366.3
INFO:pymc3.variational.inference:Finished [100%]: Average Loss = 2,366.3
INFO:clonosGP:Calculating posterior cluster weights and centres.
INFO:clonosGP:Calculating posterior CCF values.
INFO:clonosGP:Calculating posterior predictive distribution.
INFO:clonosGP:Calculating GP-related quantities.
INFO:clonosGP:Calculating dispersion(s).
INFO:clonosGP:Finished.


In [6]:
# Tables from the first fit (suffix 1): annotated data and cluster centres.
data1 = RES1['data']
centres1 = RES1['centres']

# Tables from the second fit (suffix 2), plus its extra 'centres_gp' table.
data2 = RES2['data']
centres2 = RES2['centres']
centres_gp = RES2['centres_gp']

In [7]:
# Load the rpy2 IPython extension, which provides the %R / %%R magics used for
# plotting below.
%load_ext rpy2.ipython
# tidyverse: dplyr verbs, ggplot2 and stringr helpers used in the figure cell.
%R library(tidyverse)
# patchwork: supplies the `/` plot-stacking operator and plot_annotation().
%R library(patchwork)


[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



array(['patchwork', 'forcats', 'stringr', 'dplyr', 'purrr', 'readr',
       'tidyr', 'tibble', 'ggplot2', 'tidyverse', 'tools', 'stats',
       'graphics', 'grDevices', 'utils', 'datasets', 'methods', 'base'],
      dtype='<U9')

In [12]:
%%R -i data1,data2,centres1,centres2,centres_gp,METRICS -w 10 -h 10 --units in

# Four stacked panels, tagged A-D by patchwork:
#   A  VAF trajectory of each mutation, coloured by its cluster (second fit)
#   B  cluster centres of the second fit: points from centres2, curves from centres_gp
#   C  cluster centres of the first fit
#   D  negated LOSS (labelled as the evidence lower bound) for each prior/covariance

# ---- Panel A ---------------------------------------------------------------
# Mutations labelled 'uncertain' are left out; one colour per remaining cluster.
df2 = filter(data2, CLUSTERID != 'uncertain')
cids2 = unique(df2$CLUSTERID)
colors = colorRampPalette(RColorBrewer::brewer.pal(8, 'Set2'))(length(cids2))

gg1 =
    ggplot(df2, aes(x = TIME2, y = VAF, group = MUTID, color = CLUSTERID)) +
    geom_line() +
    scale_x_continuous(breaks = unique(df2$TIME2), labels = unique(df2$SAMPLEID)) +
    scale_color_manual(values = colors) +
    labs(x = NULL, y = 'variant allele fraction') +
    theme_bw() +
    theme(legend.position = 'none', axis.text.x = element_blank())

# ---- Panel B ---------------------------------------------------------------
# Restrict both centre tables to the clusters shown in panel A and make the
# cluster id a string so it is treated as a discrete colour.
ctrs2 = mutate(filter(centres2, CLUSTERID %in% cids2), CLUSTERID = as.character(CLUSTERID))
ctrs_gp = mutate(filter(centres_gp, CLUSTERID %in% cids2), CLUSTERID = as.character(CLUSTERID))

gg2 =
    ggplot() +
    # disabled layer (interval band):
    # geom_ribbon(aes(x = TIME, ymin = PHI_LO, ymax = PHI_HI, fill = CLUSTERID), data = ctrs_gp, alpha = 0.5) +
    geom_line(data = ctrs_gp, mapping = aes(x = TIME, y = PHI, color = CLUSTERID)) +
    geom_point(data = ctrs2, mapping = aes(x = TIME2, y = PHI, color = CLUSTERID)) +
    scale_x_continuous(breaks = unique(df2$TIME2), labels = unique(df2$SAMPLEID)) +
    scale_color_manual(values = colors) +
    # the fill scale only matters if the ribbon layer above is re-enabled
    scale_fill_manual(values = colors) +
    labs(x = NULL, y = 'cancer cell fraction') +
    theme_bw() +
    theme(legend.position = 'none', axis.text.x = element_blank())

# ---- Panel C ---------------------------------------------------------------
df1 = filter(data1, CLUSTERID != 'uncertain')
cids1 = unique(df1$CLUSTERID)
ctrs1 = filter(mutate(centres1, CLUSTERID = as.character(CLUSTERID)), CLUSTERID %in% cids1)

gg3 =
    ggplot(ctrs1, aes(x = TIME2, y = PHI, group = CLUSTERID)) +
    geom_line(linetype = 'dashed') +
    # disabled layer (interval bars):
    # geom_linerange(aes(x = TIME2, ymin = PHI_LO, ymax = PHI_HI, group = CLUSTERID)) +
    geom_point() +
    scale_x_continuous(breaks = unique(df1$TIME2), labels = unique(df1$SAMPLEID)) +
    labs(x = 'sample', y = 'cancer cell fraction') +
    theme_bw() +
    theme(legend.position = 'none',
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# ---- Panel D ---------------------------------------------------------------
# Keep the BBin / LOSS rows (GP2 excluded); label each row by prior and
# covariance, with factor levels in row order so the axis follows the table.
metrics = filter(METRICS, LIK == 'BBin', METRIC == 'LOSS', PRIOR != 'GP2')
metrics = mutate(metrics, LABEL = if_else(PRIOR == 'Flat', 'Flat', str_c(PRIOR, COV, sep = '-')))
metrics = mutate(metrics, LABEL = factor(LABEL, levels = LABEL))

# Median loss of the Flat model, drawn as the dashed reference line.
med = pull(filter(metrics, LABEL == 'Flat'), MEDIAN)

gg4 =
    ggplot(metrics, aes(x = LABEL)) +
    geom_hline(yintercept = -med, linetype = 'dashed') +
    geom_linerange(aes(ymin = -HIGH, ymax = -LOW)) +
    geom_point(aes(y = -MEDIAN)) +
    labs(x = NULL, y = 'evidence lower bound') +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Stack the panels vertically; this must stay the last expression so the
# figure is rendered as the cell output.
(gg1 / gg2 / gg3 / gg4) +
    plot_annotation(tag_levels = 'A')

# To write the figure to disk: ggsave('tmp.pdf')




In [9]:
# Distinct cluster labels assigned by each of the two fits.
tuple(frame['CLUSTERID'].unique() for frame in (data1, data2))

(array(['1', '5', '2', '3', '6'], dtype=object),
 array(['2', '3', '5', '1', '4'], dtype=object))

In [10]:
import sklearn.metrics as mtr

In [11]:
# One (MUTID, CLUSTERID) row per mutation for each fit.
labels1 = RES1['data'][['MUTID', 'CLUSTERID']].drop_duplicates()
labels2 = RES2['data'][['MUTID', 'CLUSTERID']].drop_duplicates()

# Pair the two labelings by MUTID instead of relying on both tables listing
# mutations in the same row order: adjusted_rand_score compares its inputs
# position by position. validate='one_to_one' raises if a mutation carries
# more than one cluster label in either table.
paired = labels1.merge(labels2, on='MUTID', suffixes=('_1', '_2'), validate='one_to_one')
z1 = paired['CLUSTERID_1'].values
z2 = paired['CLUSTERID_2'].values

# Adjusted Rand index between the two clusterings.
mtr.adjusted_rand_score(z1, z2)

0.26998678280700206