In [1]:
library(tidyverse)
library(cytominer)
library(magrittr)
library(RCurl)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
“package ‘forcats’ was built under R version 3.4.3”── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
“package ‘cytominer’ was built under R version 3.4.3”
Attaching package: ‘cytominer’

The following object is masked from ‘package:stats’:

    aggregate

The following object is masked from ‘package:base’:

    transform


Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

“package ‘RCurl’ was built under R version 3.4.3”Loading required package: bitops

Attaching package: ‘RCurl’

The following object is masked from ‘package:tidyr’:

    complete

In [2]:
#' Load one dataset listed in the `../datasets.csv` index.
#'
#' Finds the row of the index whose `Partition`, `Dataset` and `Features`
#' columns equal the arguments, reads the CSV located at that row's `Link`,
#' and tags every row of the result with the values used for the lookup.
#'
#' @param partition Value matched against the `Partition` column
#'   (e.g. "Train" or "Test").
#' @param dataset Value matched against the `Dataset` column (e.g. "BBBC037").
#' @param feature Value matched against the `Features` column
#'   (e.g. "CellProfiler").
#' @return The table read from the matching link, with the added columns
#'   `Metadata_dataset`, `Metadata_partition` and `Metadata_features`.
#'   Stops with an error unless exactly one distinct link matches.
load_dataset <- function(partition, dataset, feature) {
    dataset_index <- read_csv("../datasets.csv")

    link <- dataset_index %>%
        filter(
            Partition == partition,
            Dataset == dataset,
            Features == feature
        ) %>%
        extract2("Link")

    # Ignore missing links and collapse exact duplicates before validating.
    link <- unique(link[!is.na(link)])

    # Fail early with a clear message instead of handing read_csv() an empty
    # or ambiguous path vector.
    if (length(link) != 1) {
        stop(
            "Expected exactly one link in ../datasets.csv for partition '",
            partition, "', dataset '", dataset, "', features '", feature,
            "' but found ", length(link), ".",
            call. = FALSE
        )
    }

    read_csv(link) %>%
        mutate(
            Metadata_dataset = dataset,
            Metadata_partition = partition,
            Metadata_features = feature
        )
}

# Load data
We load the training and test partitions of both genetic perturbation datasets (BBBC037 and BBBC043).

In [3]:
# BBBC037 data: load each partition and set its mutation-status column to the
# constant "none".
bbbc037_train <- mutate(
    load_dataset("Train", "BBBC037", "CellProfiler"),
    Metadata_x_mutation_status = "none"
)

bbbc037_test <- mutate(
    load_dataset("Test", "BBBC037", "CellProfiler"),
    Metadata_x_mutation_status = "none"
)

# Stack the two partitions row-wise, training rows first.
bbbc037 <- rbind(bbbc037_train, bbbc037_test)

Parsed with column specification:
cols(
  Dataset = col_character(),
  Partition = col_character(),
  Features = col_character(),
  Link = col_character()
)
“package ‘bindrcpp’ was built under R version 3.4.3”Parsed with column specification:
cols(
  .default = col_double(),
  Metadata_Well = col_character(),
  Metadata_Plate_Map_Name = col_character(),
  Metadata_well_position = col_character(),
  Metadata_gene_name = col_character(),
  Metadata_pert_name = col_character(),
  Metadata_broad_sample = col_character(),
  Metadata_cell_line = col_character(),
  Metadata_ASSAY_WELL_ROLE = col_character(),
  Metadata_pert_id = col_character(),
  Metadata_pert_mfc_id = col_character(),
  Metadata_pert_well = col_character(),
  Metadata_pert_id_vendor = col_character(),
  Metadata_cell_id = col_character(),
  Metadata_broad_sample_type = col_character(),
  Metadata_pert_type = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  Dataset = c

In [4]:
# BBBC043 data: load each partition.
bbbc043_train <- load_dataset(
    partition = "Train",
    dataset = "BBBC043",
    feature = "CellProfiler"
)

bbbc043_test <- load_dataset(
    partition = "Test",
    dataset = "BBBC043",
    feature = "CellProfiler"
)

# Stack the two partitions row-wise, training rows first.
bbbc043 <- rbind(bbbc043_train, bbbc043_test)

Parsed with column specification:
cols(
  Dataset = col_character(),
  Partition = col_character(),
  Features = col_character(),
  Link = col_character()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Metadata_Well = col_character(),
  Metadata_Plate_Map_Name = col_character(),
  Metadata_well_position = col_character(),
  Metadata_pert_type = col_character(),
  Metadata_PublicID = col_character(),
  Metadata_Transcript = col_character(),
  Metadata_VirusPlateName = col_character(),
  Metadata_x_mutation_status = col_character(),
  Metadata_broad_sample = col_character(),
  Metadata_pert_name = col_character(),
  Metadata_pert_id = col_character(),
  Metadata_pert_mfc_id = col_character(),
  Metadata_pert_well = col_character(),
  Metadata_pert_id_vendor = col_character(),
  Metadata_cell_id = col_character(),
  Metadata_broad_sample_type = col_character(),
  Metadata_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column

## Check dimensionality

In [5]:
# Rows x columns of each combined dataset.
bbbc043 %>% dim
bbbc037 %>% dim

## Extract common features 

In [6]:
# Column names of each dataset.
colnames_bbbc037 <- colnames(bbbc037)
colnames_bbbc043 <- colnames(bbbc043)

# Metadata columns are those whose name starts with "Meta".
Metadata_names_bbbc037 <- colnames_bbbc037 %>%
    stringr::str_subset("^Meta")

Metadata_names_bbbc043 <- colnames_bbbc043 %>%
    stringr::str_subset("^Meta")

# Metadata columns present in both datasets.
common_metadata <- intersect(Metadata_names_bbbc037, Metadata_names_bbbc043)

# Every other column shared by both datasets is treated as a feature.
common_features <- setdiff(
    intersect(colnames_bbbc037, colnames_bbbc043),
    common_metadata
)

# Number of shared features.
length(common_features)

In [7]:
# Run cytominer's NA-column check over the shared features of each dataset,
# using a cutoff of 0.
# NOTE(review): the results are only inspected; common_features is not
# filtered by them afterwards.
bbbc037_na_feature <- cytominer::drop_na_columns(
    bbbc037,
    common_features,
    cutoff = 0
)

bbbc043_na_feature <- cytominer::drop_na_columns(
    bbbc043,
    common_features,
    cutoff = 0
)

In [8]:
# Show the result of the NA-column check for BBBC043.
print(bbbc043_na_feature)

character(0)


# Concatenate data sets

In [9]:
# Keep only the shared columns of each dataset (metadata first, then
# features), stack BBBC037 on top of BBBC043, and tag every row as a genetic
# perturbation.
population <- rbind(
    select(bbbc037, c(common_metadata, common_features)),
    select(bbbc043, c(common_metadata, common_features))
) %>%
    mutate(Metadata_perturbation = "genetic")

In [10]:
# Rows x columns of the concatenated data.
dim(population)

# Normalize data
We use cytominer to normalize both datasets with respect to the controls, i.e., the rows of the training partition whose gene name is `EMPTY`.

In [11]:
# Standardize the shared features with cytominer. The reference `sample` is
# the subset of rows from the training partition whose gene name is EMPTY;
# the only stratum is the (constant) perturbation type.
population_normalized <- population %>%
    cytominer::normalize(
        variables = common_features,
        strata = "Metadata_perturbation",
        sample = filter(
            population,
            Metadata_gene_name == "EMPTY",
            Metadata_partition == "Train"
        ),
        operation = "standardize"
    )


In [12]:
# Check the dimensions of the normalized data (the object produced by the
# previous step), rather than re-inspecting the un-normalized input.
population_normalized %>% dim

# Aggregate data 

In [13]:
# Average the normalized features within each combination of gene name,
# dataset and mutation status.
population_aggregated <- population_normalized %>%
    cytominer::aggregate(
        variables = common_features,
        strata = c(
            "Metadata_gene_name",
            "Metadata_dataset",
            "Metadata_x_mutation_status"
        ),
        operation = "mean"
    )

# Correlation matrix 

In [14]:
# Correlate every aggregated BBBC037 profile with every aggregated BBBC043
# profile. Each profile matrix is transposed so that profiles are columns and
# features are rows; rows of the result correspond to BBBC037 and columns to
# BBBC043.
cor_matrix <- local({
    # Feature matrix (features x profiles) of one dataset.
    profiles_of <- function(dataset_name) {
        population_aggregated %>%
            filter(Metadata_dataset == dataset_name) %>%
            select(common_features) %>%
            as.matrix() %>%
            t()
    }

    cor(
        x = profiles_of("BBBC037"),
        y = profiles_of("BBBC043"),
        use = "complete.obs"
    )
})


# Submission file

In [15]:
# Label the columns with the mutation status of each BBBC043 profile.
colnames(cor_matrix) <- population_aggregated %>%
    filter(Metadata_dataset == "BBBC043") %>%
    pull(Metadata_x_mutation_status)

# Label the rows with the gene name of each BBBC037 profile.
rownames(cor_matrix) <- population_aggregated %>%
    filter(Metadata_dataset == "BBBC037") %>%
    pull(Metadata_gene_name)

# Convert to a data frame and carry the row labels as an explicit first
# column named Metadata_gene_name.
df <- cor_matrix %>%
    as_data_frame() %>%
    mutate(Metadata_gene_name = rownames(cor_matrix)) %>%
    select(Metadata_gene_name, everything())

# Write the submission file, without a row-name column.
write.csv(df, file = "../cytodata-baseline_R.csv", row.names = FALSE)