In [2]:
import numpy as nmp
import pandas as pnd
import matplotlib.pyplot as plt
import pickle as pkl

import pymc3 as pmc

import clonosGP as cln

In [3]:
# Reload imported modules automatically before each cell is executed, so
# edits to source files on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Read the four input CSV files (three from the Schuh 2012 set, one from the
# Rincon 2019 set, going by the file names) into DATA1..DATA4, in that order.
DATA1, DATA2, DATA3, DATA4 = (
    pnd.read_csv(csv_path)
    for csv_path in (
        'data/cll_Schuh_2012_CLL003.csv',
        'data/cll_Schuh_2012_CLL006.csv',
        'data/cll_Schuh_2012_CLL077.csv',
        'data/cll_Rincon_2019_patient2.csv',
    )
)

In [8]:
def run_model(prior, data, seed=42):
    """Fit a clonosGP model to one dataset with ADVI under a fixed random seed.

    Parameters
    ----------
    prior : str
        Forwarded as ``model_args['prior']``; this notebook uses ``'Flat'``
        and ``'GP0'``.
    data : pandas.DataFrame
        Input table, passed to ``cln.infer`` unchanged.
    seed : int, optional
        Seed applied to NumPy's global RNG, to PyMC3's package-level Theano
        RNG, and forwarded as ``pymc3_args['random_seed']``. Defaults to 42.

    Returns
    -------
    The object returned by ``cln.infer``. Later cells index it with the keys
    ``'data'``, ``'centres'`` and ``'centres_gp'``.
    """
    nmp.random.seed(seed)
    # `pmc.tt_rng(seed)` merely constructs and returns a *new* seeded generator
    # without touching the package-level one, so calling it for its side effect
    # does nothing. `set_tt_rng` is the call that installs the seeded generator.
    pmc.set_tt_rng(seed)

    return cln.infer(data,
                     model_args={'K': 20, 'prior': prior, 'cov': 'Mat32', 'lik': 'BBin', 'threshold': 0.0},
                     pymc3_args={'niters': 10000, 'method': 'advi', 'flow': 'scale-loc', 'learning_rate': 1e-2, 'random_seed': seed})

In [9]:
# Fit every dataset twice: once with the 'Flat' prior (RES1) and once with the
# 'GP0' prior (RES2). Both lists follow the DATA1..DATA4 order.
RES1 = [run_model('Flat', dataset) for dataset in (DATA1, DATA2, DATA3, DATA4)]
RES2 = [run_model('GP0', dataset) for dataset in (DATA1, DATA2, DATA3, DATA4)]

INFO:clonosGP:No PURITY column in the data. Assuming all samples have purity 100%.
INFO:clonosGP:No CNn column in the data. Assuming germline is diploid over all provided loci.
INFO:clonosGP:No CNt column in the data. Assuming all tumour samples are diploid over all provided loci.
INFO:clonosGP:No CNm column in the data. Multiplicity values will be approximated.
Average Loss = 413.23: 100%|██████████| 10000/10000 [00:11<00:00, 883.43it/s]
Finished [100%]: Average Loss = 413.24
INFO:pymc3.variational.inference:Finished [100%]: Average Loss = 413.24
INFO:clonosGP:Calculating posterior cluster weights and centres.
INFO:clonosGP:Calculating posterior CCF values.
INFO:clonosGP:Calculating posterior predictive distribution.
INFO:clonosGP:Calculating dispersion(s).
INFO:clonosGP:Finished.
INFO:clonosGP:No PURITY column in the data. Assuming all samples have purity 100%.
INFO:clonosGP:No CNn column in the data. Assuming germline is diploid over all provided loci.
INFO:clonosGP:No CNt column in

In [None]:
DATASETS = [
    'CLL003 (Schuh et al. 2012)',
    'CLL006 (Schuh et al. 2012)',
    'CLL077 (Schuh et al. 2012)',
    'Pt.2 (Rincon et al. 2019)',
]
LABELS = ['Flat', 'GP0-Mat32']


def _stack_tagged(results, key, label):
    """Concatenate ``res[key]`` over `results`, tagging rows for plotting.

    Each table receives a DATASET column holding the entry of ``DATASETS`` at
    the same position; the concatenated frame receives a constant LABEL column.
    """
    tagged = [
        res[key].assign(DATASET=name)
        for res, name in zip(results, DATASETS)
    ]
    return pnd.concat(tagged).assign(LABEL=label)


# Per-dataset 'centres_gp' tables of the GP0 runs, kept as a plain list.
centres_gp2 = [res['centres_gp'] for res in RES2]

# Mutation-level tables of both models, stacked into one long frame.
data1 = _stack_tagged(RES1, 'data', LABELS[0])
data2 = _stack_tagged(RES2, 'data', LABELS[1])
data = pnd.concat([data1, data2])

# Cluster-centre tables of both models, stacked the same way.
centres1 = _stack_tagged(RES1, 'centres', LABELS[0])
centres2 = _stack_tagged(RES2, 'centres', LABELS[1])
centres = pnd.concat([centres1, centres2])

# 'centres_gp' tables exist only for the GP0 runs.
centres_gp = _stack_tagged(RES2, 'centres_gp', LABELS[1])

In [None]:
# Enable the %R / %%R magics provided by rpy2, then attach the R packages
# (tidyverse, patchwork) that the R plotting cell relies on.
%load_ext rpy2.ipython
%R library(tidyverse)
%R library(patchwork)

In [None]:
%%R -i data,centres,centres_gp -w 10 -h 10 --units in

# Build the three-panel figure row for one dataset:
#   1. per-mutation VAF trajectories (rows labelled 'GP0-Mat32'), coloured by cluster;
#   2. 'GP0-Mat32' cluster centres (points) over the curves and bands in `ctrs_gp`;
#   3. 'Flat' cluster centres with PHI_LO..PHI_HI ranges.
# Arguments: `df`, `ctrs` and `ctrs_gp` are the stacked tables carrying DATASET
# (and, for `df`/`ctrs`, LABEL) columns; `dataset` is the DATASET value to plot.
# Returns the patchwork composition of the three panels.
auxfcn <- function(df, ctrs, ctrs_gp, dataset) {
    # Restrict every table to the requested dataset, split by model label.
    flat_muts <- filter(df, DATASET == dataset, LABEL == 'Flat')
    gp_muts <- filter(df, DATASET == dataset, LABEL == 'GP0-Mat32')
    flat_ctrs <- filter(ctrs, DATASET == dataset, LABEL == 'Flat')
    gp_ctrs <- filter(ctrs, DATASET == dataset, LABEL == 'GP0-Mat32')
    gp_curves <- filter(ctrs_gp, DATASET == dataset)

    # Panel 1: one line per mutation.
    vaf_panel <-
        ggplot(gp_muts) +
        geom_line(aes(x = TIME2, y = VAF, group = MUTID, color = CLUSTERID)) +
        scale_x_continuous(breaks = unique(gp_muts$TIME2), labels = unique(gp_muts$SAMPLEID)) +
        scale_color_brewer(palette = 'Set2') +
        labs(x = 'sample', y = 'variant allele fraction', title = dataset)

    # Keep only clusters that have at least one mutation not marked 'uncertain'.
    gp_ids <- pull(filter(gp_muts, CLUSTERID != 'uncertain'), CLUSTERID)
    gp_ctrs <- mutate(filter(gp_ctrs, CLUSTERID %in% gp_ids), CLUSTERID = as.factor(CLUSTERID))
    gp_curves <- mutate(filter(gp_curves, CLUSTERID %in% gp_ids), CLUSTERID = as.factor(CLUSTERID))

    # Panel 2: band and curve from `ctrs_gp`, centres from `ctrs` drawn on top.
    gp_panel <-
        ggplot() +
        geom_ribbon(data = gp_curves, mapping = aes(x = TIME, ymin = PHI_LO, ymax = PHI_HI, fill = CLUSTERID), alpha = 0.5) +
        geom_line(data = gp_curves, mapping = aes(x = TIME, y = PHI, color = CLUSTERID)) +
        geom_point(data = gp_ctrs, mapping = aes(x = TIME2, y = PHI, color = CLUSTERID)) +
        scale_x_continuous(breaks = unique(gp_muts$TIME2), labels = unique(gp_muts$SAMPLEID)) +
        scale_fill_brewer(palette = 'Set2') +
        scale_color_brewer(palette = 'Set2') +
        labs(x = 'sample', y = 'cancer cell fraction', title = str_c(unique(gp_ctrs$LABEL), ' model'))

    # Same cluster filtering for the Flat model.
    flat_ids <- pull(filter(flat_muts, CLUSTERID != 'uncertain'), CLUSTERID)
    flat_ctrs <- mutate(filter(flat_ctrs, CLUSTERID %in% flat_ids), CLUSTERID = as.factor(CLUSTERID))

    # Panel 3: clusters are dodged slightly so overlapping ranges stay visible.
    dodge <- position_dodge(width = 0.05)
    flat_panel <-
        ggplot(flat_ctrs) +
        geom_line(aes(x = TIME2, y = PHI, group = CLUSTERID), linetype = 'dashed', position = dodge) +
        geom_linerange(aes(x = TIME2, ymin = PHI_LO, ymax = PHI_HI, group = CLUSTERID), position = dodge) +
        geom_point(aes(x = TIME2, y = PHI, group = CLUSTERID), position = dodge) +
        scale_x_continuous(breaks = unique(flat_muts$TIME2), labels = unique(flat_muts$SAMPLEID)) +
        labs(x = 'sample', y = 'cancer cell fraction', title = str_c(unique(flat_ctrs$LABEL), ' model'))

    # tag_level = 'new' opens a nested tag level for the panels of this row.
    vaf_panel + plot_layout(tag_level='new') + gp_panel + flat_panel
}

# Theme applied to every panel. `+` binds tighter than `&`, so the two theme
# pieces are combined first and then applied as a single object.
figure_theme <- theme_bw() +
    theme(legend.position = 'none',
          plot.title = element_text(hjust = 0.5))

# One three-panel row per dataset.
row1 <- auxfcn(data, centres, centres_gp, 'CLL003 (Schuh et al. 2012)')
row2 <- auxfcn(data, centres, centres_gp, 'CLL006 (Schuh et al. 2012)')
row3 <- auxfcn(data, centres, centres_gp, 'CLL077 (Schuh et al. 2012)')
row4 <- auxfcn(data, centres, centres_gp, 'Pt.2 (Rincon et al. 2019)')

# Stack the rows vertically; rows are tagged A, B, ... and panels i, ii, ...
stacked_rows <- row1 / row2 / row3 / row4
stacked_rows & plot_annotation(tag_levels = c('A', 'i')) & figure_theme

# ggsave('tmp.pdf')

In [6]:
import sklearn.metrics as mtr

In [10]:
# Agreement between the Flat (RES1) and GP0 (RES2) cluster assignments of each
# dataset, measured with the adjusted Rand index.
for res_flat, res_gp in zip(RES1, RES2):
    # One (MUTID, CLUSTERID) row per mutation for each model.
    z1 = res_flat['data'][['MUTID', 'CLUSTERID']].drop_duplicates()
    z2 = res_gp['data'][['MUTID', 'CLUSTERID']].drop_duplicates()

    # Pair the labels on MUTID rather than by row position: the index is only
    # meaningful when both vectors describe the same mutations in the same
    # order. `validate` fails loudly if a mutation carries more than one label.
    paired = z1.merge(z2, on='MUTID', suffixes=('_1', '_2'), validate='one_to_one')
    if not (len(paired) == len(z1) == len(z2)):
        raise ValueError('The two results do not cover the same set of mutations.')

    print(mtr.adjusted_rand_score(paired['CLUSTERID_1'], paired['CLUSTERID_2']))

0.5404814004376368
0.7860513896171998
0.5838052095130237
0.6310053061809838
