analysis/clone_prevalences.Rmd

---
title: "Clone prevalence analysis"
author: "Davis J. McCarthy"
site: workflowr::wflow_site
---

## Load libraries and data

```{r setup, include=TRUE, results='hide', message=FALSE, warning=FALSE}
# Echo the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
# tidyverse supplies the dplyr verbs, tibbles and ggplot2 used below.
library(tidyverse)
# viridis supplies scale_fill_viridis(), used for the point fill scale.
library(viridis)
# NOTE(review): no cowplot function is called by name in this document;
# presumably it is loaded for its ggplot2 theme -- confirm before removing.
library(cowplot)
```

Load the Canopy clone inference results and the cell assignment results from
cardelino for 32 donor fibroblast cell lines.

```{r}
# Call set to analyse. Its name selects both the SCE files used to discover
# the donors and the matching cardelino result file for each donor.
params <- list()
params$callset <- "filt_lenient.cell_coverage_sites"

# Discover donors from the SCE file names. The call-set name contains literal
# dots, so it is matched as a fixed string rather than as a regular
# expression (in which "." would match any character).
fls <- list.files("data/sces")
fls <- fls[grepl(params$callset, fls, fixed = TRUE)]
if (length(fls) == 0L) {
  # Fail here with a clear message instead of much later in the model fits.
  stop("no files matching call set '", params$callset, "' in data/sces",
       call. = FALSE)
}
# The donor ID is the run of lower-case letters captured between "ce_" and
# the following "_" in each file name.
donors <- gsub(".*ce_([a-z]+)_.*", "\\1", fls)

# Read the cardelino results for every donor into a list named by donor ID,
# printing a progress line for each file.
cell_assign_list <- list()
for (don in donors) {
  cell_assign_list[[don]] <- readRDS(file.path(
    "data/cell_assignment",
    paste0("cardelino_results.", don, ".", params$callset, ".rds")
  ))
  cat(paste("reading", don, "\n"))
}
# NOTE(review): canopy_list is not referenced anywhere else in the visible
# document; it is kept in case other code relies on it -- confirm and remove.
canopy_list <- list()

# Build one row per (donor, clone) with:
#   prev_canopy    - first column of the donor's tree$P matrix
#   prev_cardelino - cells assigned to the clone / cells assigned to any clone
#   n_cells        - number of entries in clone_assigned
#   n_assigned     - number of cells not labelled "unassigned"
#   prop_assigned  - n_assigned / n_cells
# If a donor has no assigned cells, prev_cardelino is NaN (0 / 0).
prev_list <- list()
for (don in donors) {
  assigned <- cell_assign_list[[don]]$clone_assigned
  prev_mat <- cell_assign_list[[don]]$tree$P
  clones <- rownames(prev_mat)
  n_assigned <- sum(assigned != "unassigned")
  # Count the cells assigned to each clone in one vectorised pass instead of
  # filling an NA placeholder column row by row.
  n_per_clone <- vapply(
    clones,
    function(cl) sum(assigned == cl),
    integer(1),
    USE.NAMES = FALSE
  )
  # tibble() replaces the deprecated data_frame() constructor.
  prev_list[[don]] <- tibble(
    donor = don,
    clone = clones,
    prev_canopy = prev_mat[, 1],
    prev_cardelino = n_per_clone / n_assigned,
    n_cells = length(assigned),
    n_assigned = n_assigned,
    prop_assigned = n_assigned / n_cells
  )
}
# Stack the per-donor tables into a single data frame.
df_prev <- dplyr::bind_rows(prev_list)

# Build a plotmath label reporting the r-squared of the regression of
# prev_cardelino on prev_canopy.
#
# df: data frame with numeric columns prev_cardelino and prev_canopy.
# Returns a length-one character vector holding a deparsed plotmath
# expression (italic r, superscript 2, "=", r-squared formatted to three
# significant digits), suitable for geom_text(..., parse = TRUE).
#
# NOTE(review): this fit is unweighted, whereas the regression lines drawn
# in the plots are weighted by prop_assigned -- confirm that is intended.
lm_eqn <- function(df) {
  m <- lm(prev_cardelino ~ prev_canopy, data = df)
  r2 <- format(summary(m)$r.squared, digits = 3)
  # Only r2 appears in the label, so it is the only value substituted in.
  eq <- substitute(italic(r)^2 ~ "=" ~ r2, list(r2 = r2))
  as.character(as.expression(eq))
}

## Fit weighted regressions
# One linear model of prev_cardelino on prev_canopy per clone, each
# observation weighted by the donor's fraction of assigned cells. The fitted
# models are stored in the list-column `fit`, one row per clone.
fits <- do(
  group_by(df_prev, clone),
  fit = lm(prev_cardelino ~ prev_canopy, weights = prop_assigned, data = .)
)

# The same weighted model fitted once to all clones pooled together.
fits_1grp <- do(
  df_prev,
  fit = lm(prev_cardelino ~ prev_canopy, weights = prop_assigned, data = .)
)

# Fit the regression of prev_cardelino on prev_canopy, weighted by
# prop_assigned, and return its coefficients.
#
# dat: data frame with columns prev_cardelino, prev_canopy and prop_assigned.
# Returns a one-row data frame with columns x0 (intercept) and x1 (slope).
le_lin_fit <- function(dat) {
  weighted_fit <- lm(
    formula = prev_cardelino ~ prev_canopy,
    data = dat,
    weights = prop_assigned
  )
  coefs <- coef(weighted_fit)
  data.frame(x0 = coefs[[1]], x1 = coefs[[2]])
}

# Intercept (x0) and slope (x1) of the weighted fit for each clone; used as
# the data for the per-clone regression lines in the faceted plots.
fits_me <- do(group_by(df_prev, clone), le_lin_fit(.))

# Intercept and slope of the weighted fit to all clones pooled together.
fits_me_1grp <- do(df_prev, le_lin_fit(.))

```

## Plot clone prevalences

Plot the estimated clone fractions from the cells assigned to a clone by 
cardelino against the estimated clone fractions from Canopy. 

```{r plot-prev, fig.height = 5, fig.width=7}
# Pooled weighted fit: broom::augment() supplies the fitted value and its
# standard error for every observation; joining back to df_prev on the shared
# columns recovers clone and prop_assigned for plotting.
fits_1grp %>%
  broom::augment(fit) %>%
  inner_join(., df_prev) %>%
  ggplot(aes(x = prev_canopy, y = prev_cardelino, shape = clone,
             fill = prop_assigned)) +
  # Dashed identity line: perfect agreement between the two estimates.
  geom_abline(slope = 1, intercept = 0, colour = "gray40", linetype = 2) +
  # Band of +/- 1.645 standard errors around the fitted line. group = 1 keeps
  # it a single ribbon: without it the inherited discrete `shape = clone`
  # aesthetic splits the band into one overlapping ribbon per clone.
  geom_ribbon(aes(ymin = .fitted - 1.645 * .se.fit,
                  ymax = .fitted + 1.645 * .se.fit,
                  group = 1),
              fill = "gray70", alpha = 0.7) +
  # Weighted regression line from the pooled fit.
  geom_abline(aes(intercept = x0, slope = x1),
              data = fits_me_1grp,
              colour = "firebrick", size = 2) +
  geom_point(size = 3) +
  xlim(0, 1) + ylim(0, 1) +
  # r-squared label; a one-row data frame so the text is drawn only once.
  geom_text(x = 0.9, y = 0, colour = "black", label = lm_eqn(df_prev),
            size = 5, parse = TRUE, data = df_prev[1, ]) +
  scale_fill_viridis(name = "fraction of\ncells assigned", limits = c(0, 1)) +
  # Shapes 21-25 are the fillable point shapes, so `fill` is visible.
  scale_shape_manual(values = 21:25) +
  xlab("Estimated clone prevalence (Canopy)") +
  ylab("Assigned clone fraction (cardelino)")

# Save the plot just drawn in both raster and vector formats.
ggsave("figures/clone_prevalences/clone_prev_scatter.png",
       height = 5, width = 7)
ggsave("figures/clone_prevalences/clone_prev_scatter.pdf",
       height = 5, width = 7)

```

We can also look at the same plot as above, but now faceted by the different 
clones.

```{r plot-prev-facet-clone, fig.height = 7, fig.width=9}
# Per-clone weighted fits: fitted values and standard errors from
# broom::augment(), joined back to df_prev on the shared columns, then drawn
# with one panel per clone.
ggplot(
  data = inner_join(broom::augment(fits, fit), df_prev),
  mapping = aes(x = prev_canopy, y = prev_cardelino)
) +
  # Dashed identity line: perfect agreement between the two estimates.
  geom_abline(slope = 1, intercept = 0, colour = "gray40", linetype = 2) +
  # Band of +/- 1.645 standard errors around each clone's fitted line.
  geom_ribbon(
    aes(ymin = .fitted - 1.645 * .se.fit, ymax = .fitted + 1.645 * .se.fit),
    fill = "gray70", alpha = 0.7
  ) +
  # Weighted regression line for each clone.
  geom_abline(
    aes(intercept = x0, slope = x1),
    data = fits_me, colour = "firebrick", size = 2
  ) +
  geom_point(aes(fill = prop_assigned), size = 3, shape = 21) +
  lims(x = c(0, 1), y = c(0, 1)) +
  facet_wrap(~clone) +
  scale_fill_viridis(name = "fraction of\ncells assigned", limits = c(0, 1)) +
  scale_shape_manual(values = 21:25) +
  labs(
    x = "Estimated clone prevalence (Canopy)",
    y = "Assigned clone fraction (cardelino)"
  )

# Save the plot just drawn in both raster and vector formats.
for (out_file in c(
  "figures/clone_prevalences/clone_prev_scatter_facet_clone.png",
  "figures/clone_prevalences/clone_prev_scatter_facet_clone.pdf"
)) {
  ggsave(out_file, height = 7, width = 9)
}

```

Since there are so few donors with four clones, we can also make a version of 
the figure above with just clone1, clone2 and clone3. Each panel shows a fitted 
weighted regression line, with points weighted by the fraction of cells 
assigned for the donor.

```{r plot-prev-facet-clone-3clones, fig.height = 4.5, fig.width=8.5}
# Same per-clone display as the faceted plot, restricted to clones other than
# clone4 and laid out in a single row of panels.
ggplot(
  data = dplyr::filter(
    inner_join(broom::augment(fits, fit), df_prev),
    clone != "clone4"
  ),
  mapping = aes(x = prev_canopy, y = prev_cardelino, fill = prop_assigned)
) +
  # Dashed identity line: perfect agreement between the two estimates.
  geom_abline(slope = 1, intercept = 0, colour = "gray40", linetype = 2) +
  # Band of +/- 1.645 standard errors around each clone's fitted line.
  geom_ribbon(
    aes(ymin = .fitted - 1.645 * .se.fit, ymax = .fitted + 1.645 * .se.fit),
    fill = "gray70", alpha = 0.7
  ) +
  # Weighted regression line for each remaining clone.
  geom_abline(
    aes(intercept = x0, slope = x1),
    data = dplyr::filter(fits_me, clone != "clone4"),
    colour = "firebrick", size = 2
  ) +
  geom_point(size = 3, shape = 21) +
  lims(x = c(0, 1), y = c(0, 1)) +
  facet_wrap(~clone, nrow = 1) +
  scale_fill_viridis(name = "fraction of\ncells assigned", limits = c(0, 1)) +
  scale_shape_manual(values = 21:25) +
  labs(
    x = "Estimated clone prevalence (Canopy)",
    y = "Assigned clone fraction (cardelino)"
  ) +
  theme(axis.text = element_text(size = 9))

# Save the plot just drawn in both raster and vector formats.
ggsave(
  "figures/clone_prevalences/clone_prev_scatter_facet_clone_no_clone4.png",
  height = 4.5, width = 8.5
)
ggsave(
  "figures/clone_prevalences/clone_prev_scatter_facet_clone_no_clone4.pdf",
  height = 4.5, width = 8.5
)

```

We also look at what happens if we keep only donors with more than 75% of 
cells assigned (`r length(unique(dplyr::filter(df_prev, prop_assigned > 0.75)$donor))` donors), again excluding clone4.

```{r plot-prev-facet-clone-3clones-donorfilt, fig.height = 4.5, fig.width=10.5}
# Keep clones other than clone4 from donors with more than 75% of cells
# assigned, and plot one panel per clone.
ggplot(
  data = dplyr::filter(df_prev, clone != "clone4", prop_assigned > 0.75),
  mapping = aes(x = prev_canopy, y = prev_cardelino, shape = clone,
                fill = prop_assigned)
) +
  # Dashed identity line: perfect agreement between the two estimates.
  geom_abline(slope = 1, intercept = 0, colour = "gray40", linetype = 2) +
  # Linear fit computed by geom_smooth() within each panel; no weight
  # aesthetic is mapped, so unlike the earlier plots this fit is unweighted.
  geom_smooth(aes(group = 1), method = "lm", colour = "firebrick") +
  geom_point(size = 3) +
  lims(x = c(0, 1), y = c(0, 1)) +
  facet_wrap(~clone, nrow = 1) +
  scale_fill_viridis(name = "fraction of\ncells assigned", limits = c(0, 1)) +
  scale_shape_manual(values = 21:25) +
  labs(
    x = "Estimated clone prevalence (Canopy)",
    y = "Assigned clone fraction (cardelino)"
  )

# Save the plot just drawn in both raster and vector formats.
ggsave(
  "figures/clone_prevalences/clone_prev_scatter_facet_clone_no_clone4_75pctassigned.png",
  height = 4.5, width = 10.5
)
ggsave(
  "figures/clone_prevalences/clone_prev_scatter_facet_clone_no_clone4_75pctassigned.pdf",
  height = 4.5, width = 10.5
)

```