In [2]:
library(tidyverse)
library(cytominer)
library(magrittr)

In [7]:
# Load one profile table referenced by the dataset index and tag it with the
# arguments it was selected by.
#
# Arguments:
#   partition   value to match in the index's `Partition` column
#               (e.g. "Train")
#   dataset     value to match in the index's `Dataset` column
#               (e.g. "BBBC036")
#   feature     value to match in the index's `Features` column
#               (e.g. "CellProfiler")
#   index_path  path of the CSV index with columns Dataset, Partition,
#               Features and Link; defaults to the path this notebook has
#               always read
#
# Returns the table read from the matching `Link`, with three added columns
# (Metadata_dataset, Metadata_partition, Metadata_features) holding the
# `dataset`, `partition` and `feature` arguments.
#
# Stops with an error unless exactly one index row matches, so that an empty
# or ambiguous lookup is reported here instead of failing inside read_csv().
load_dataset <- function(partition, dataset, feature,
                         index_path = "../datasets.csv") {
    index <- read_csv(index_path)

    link <- index %>%
        filter(
            Partition == partition,
            Dataset == dataset,
            Features == feature
        ) %>%
        extract2("Link")

    if (length(link) != 1) {
        stop(
            "Expected exactly one entry in '", index_path, "' for partition '",
            partition, "', dataset '", dataset, "', features '", feature,
            "' but found ", length(link), ".",
            call. = FALSE
        )
    }

    read_csv(link) %>%
        mutate(Metadata_dataset = dataset) %>%
        mutate(Metadata_partition = partition) %>%
        mutate(Metadata_features = feature)
}

# Load data 
We load the training and test datasets for both chemical perturbation experiments (BBBC036 and BBBC022).

In [11]:
# BBBC036 data: load the train and test partitions, add a constant
# Metadata_x_mutation_status of "none" to each, then stack them row-wise.
bbbc036_train <- mutate(
    load_dataset("Train", "BBBC036", "CellProfiler"),
    Metadata_x_mutation_status = "none"
)

bbbc036_test <- mutate(
    load_dataset("Test", "BBBC036", "CellProfiler"),
    Metadata_x_mutation_status = "none"
)

bbbc036 <- rbind(bbbc036_train, bbbc036_test)

Parsed with column specification:
cols(
  Dataset = col_character(),
  Partition = col_character(),
  Features = col_character(),
  Link = col_character()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Metadata_Plate = col_integer(),
  Metadata_Well = col_character(),
  Metadata_Assay_Plate_Barcode = col_integer(),
  Metadata_Plate_Map_Name = col_character(),
  Metadata_well_position = col_character(),
  Metadata_ASSAY_WELL_ROLE = col_character(),
  Metadata_broad_sample = col_character(),
  Metadata_solvent = col_character(),
  Metadata_pert_id = col_character(),
  Metadata_pert_mfc_id = col_character(),
  Metadata_pert_well = col_character(),
  Metadata_pert_id_vendor = col_character(),
  Metadata_cell_id = col_character(),
  Metadata_broad_sample_type = col_character(),
  Metadata_pert_vehicle = col_character(),
  Metadata_pert_type = col_character(),
  Cells_AreaShape_EulerNumber = col_integer(),
  Cells_Children_Cytoplasm_Count = col_integer(),
  Cells_Neig

In [12]:
# Rows x columns of each BBBC036 partition.
dim(bbbc036_train)
dim(bbbc036_test)

In [13]:
# BBBC022 data: load the train and test partitions and stack them row-wise.
bbbc022_train <- load_dataset(
    partition = "Train",
    dataset = "BBBC022",
    feature = "CellProfiler"
)

bbbc022_test <- load_dataset(
    partition = "Test",
    dataset = "BBBC022",
    feature = "CellProfiler"
)

bbbc022 <- rbind(bbbc022_train, bbbc022_test)

Parsed with column specification:
cols(
  Dataset = col_character(),
  Partition = col_character(),
  Features = col_character(),
  Link = col_character()
)
Parsed with column specification:
cols(
  .default = col_double(),
  Metadata_Plate = col_integer(),
  Metadata_Well = col_character(),
  Metadata_Assay_Plate_Barcode = col_integer(),
  Metadata_Plate_Map_Name = col_character(),
  Metadata_well_position = col_character(),
  Metadata_broad_sample = col_character(),
  Metadata_source_name = col_character(),
  Metadata_compound_name = col_character(),
  Metadata_smiles = col_character(),
  Metadata_solvent = col_character(),
  Metadata_pert_id = col_character(),
  Metadata_pert_mfc_id = col_character(),
  Metadata_pert_well = col_character(),
  Metadata_pert_id_vendor = col_character(),
  Metadata_cell_id = col_character(),
  Metadata_broad_sample_type = col_character(),
  Metadata_pert_vehicle = col_character(),
  Metadata_pert_type = col_character(),
  Metadata_exp = col_character()

## Check dimensionality

In [15]:
# Rows x columns of each combined (train + test) data set.
bbbc022 %>% dim()
bbbc036 %>% dim()

## Extract common features 

In [16]:
colnames_bbbc022 <- colnames(bbbc022)
colnames_bbbc036 <- colnames(bbbc036)

# Metadata columns are recognised by their "Meta" name prefix.
Metadata_names_bbbc022 <- stringr::str_subset(colnames_bbbc022, "^Meta")
Metadata_names_bbbc036 <- stringr::str_subset(colnames_bbbc036, "^Meta")

# Columns present in both data sets, split into metadata and features.
common_metadata <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036)
common_features <- setdiff(
    intersect(colnames_bbbc022, colnames_bbbc036),
    common_metadata
)

# NA screening of the shared features in each data set.
# The inputs used to be `bbbc037` / `bbbc043`, which this notebook never
# defines; the data sets loaded here are `bbbc036` and `bbbc022`.
# The result names are kept unchanged so that anything referring to them
# still resolves.
# NOTE(review): presumably drop_na_columns() returns the names of the
# features to exclude - confirm against the cytominer documentation. The
# results are not used by any later cell visible in this notebook.
bbbc037_na_feature <- cytominer::drop_na_columns(
    population = bbbc036,
    variables = common_features,
    cutoff = 0
)

bbbc043_na_feature <- cytominer::drop_na_columns(
    population = bbbc022,
    variables = common_features,
    cutoff = 0
)

# Concatenate data sets

In [17]:
# Stack both data sets, restricted to the columns they share, and label every
# row as a chemical perturbation. The label column is moved to the front.
#
# all_of() makes it explicit that `common_metadata` / `common_features` are
# character vectors of column names from the environment, not columns of the
# data; passing such vectors bare to select() is ambiguous and deprecated.
population <- rbind(
    bbbc022 %>%
        select(all_of(c(common_metadata, common_features))),
    bbbc036 %>%
        select(all_of(c(common_metadata, common_features)))
) %>%
    mutate(Metadata_perturbation = "chemical") %>%
    select(Metadata_perturbation, everything())

## Important: update column names! 

In [18]:
# Re-derive the metadata / feature split from the combined table itself.
colnames_combined <- names(population)

# Metadata columns carry the "Meta" name prefix; every other column is
# treated as a feature.
common_metadata <- grep("^Meta", colnames_combined, value = TRUE)
common_features <- setdiff(colnames_combined, common_metadata)

Cytominer has problems handling purely numeric column names such as '1', '2', so we prefix every feature column with 'Feature_' (e.g. 'Feature_1').

In [19]:
# Prefix every feature column with "Feature_".
# The columns are renamed by name rather than by overwriting all column names
# positionally, so the result stays correct even if metadata and feature
# columns are not laid out as "all metadata first, then all features".
colnames(population)[match(common_features, colnames(population))] <-
    paste0("Feature_", common_features)
common_features <- paste0("Feature_", common_features)

# Normalize data
We use cytominer to normalize both datasets with respect to the controls, i.e. the DMSO wells.

In [21]:
# Normalize all feature columns with cytominer's "standardize" operation.
# The reference sample is the set of rows whose Metadata_broad_sample is
# "DMSO"; Metadata_perturbation is the only stratum.
population_normalized <- cytominer::normalize(
    population = population,
    variables = common_features,
    strata = "Metadata_perturbation",
    sample = filter(population, Metadata_broad_sample == "DMSO"),
    operation = "standardize"
)

In [22]:
# Rows x columns of the normalized table.
print(dim(population_normalized))

[1] 28776  1804


# Aggregate data 

In [29]:
# Average every feature within each (sample, data set) combination.
population_aggregated <- population_normalized %>%
    cytominer::aggregate(
        variables = common_features,
        strata = c("Metadata_broad_sample", "Metadata_dataset"),
        operation = "mean"
    )

In [30]:
# Show the sample identifier of every row of the normalized table.
print(population_normalized[["Metadata_broad_sample"]])

    [1] "BRD-K98763141-001-06-8" "BRD-A41941932-001-03-0"
    [3] "BRD-A26711594-001-02-7" "BRD-K65667145-001-05-8"
    [5] "BRD-K61250553-001-02-9" "BRD-K78815826-001-05-4"
    [7] "BRD-K52662033-001-02-6" "BRD-K97688263-003-04-1"
    [9] "BRD-K85383046-001-02-5" "BRD-K81709173-001-03-8"
   [11] "BRD-K36927236-001-06-0" "BRD-A62421304-004-05-6"
   [13] "DMSO"                   "DMSO"                  
   [15] "BRD-K12219985-001-04-8" "BRD-A37817666-001-02-2"
   [17] "BRD-K29582677-001-02-7" "BRD-K34776109-001-03-4"
   [19] "BRD-K44067360-001-06-3" "BRD-K67556876-001-03-1"
   [21] "BRD-A09472452-001-02-8" "BRD-K00824317-001-03-0"
   [23] "BRD-K41713976-001-02-0" "BRD-A22081593-001-04-6"
   [25] "BRD-K47886988-323-03-0" "BRD-K68402494-001-04-7"
   [27] "BRD-K01244426-236-05-5" "BRD-K93201660-001-04-0"
   [29] "BRD-K62607865-001-03-0" "BRD-K59670716-001-02-6"
   [31] "BRD-A31521121-001-05-8" "BRD-A39268308-001-02-0"
   [33] "BRD-K83597974-003-05-7" "BRD-K78599730-001-03-4"
   [35] "BRD-K

In [25]:
# Peek at the first two aggregated rows.
print(slice(population_aggregated, 1:2))

# A tibble: 2 x 1,784
  Metadata_broad_… Feature_Cells_A… Feature_Cells_A… Feature_Cells_A…
  <chr>                       <dbl>            <dbl>            <dbl>
1 BRD-A00100033-0…          -0.0284            0.410           -0.215
2 BRD-A00267231-0…           0.183            -0.341            0.412
# ... with 1,780 more variables: Feature_Cells_AreaShape_Compactness <dbl>,
#   Feature_Cells_AreaShape_Eccentricity <dbl>,
#   Feature_Cells_AreaShape_EulerNumber <dbl>,
#   Feature_Cells_AreaShape_Extent <dbl>,
#   Feature_Cells_AreaShape_FormFactor <dbl>,
#   Feature_Cells_AreaShape_MajorAxisLength <dbl>,
#   Feature_Cells_AreaShape_MaxFeretDiameter <dbl>,
#   Feature_Cells_AreaShape_MaximumRadius <dbl>,
#   Feature_Cells_AreaShape_MeanRadius <dbl>,
#   Feature_Cells_AreaShape_MedianRadius <dbl>,
#   Feature_Cells_AreaShape_MinFeretDiameter <dbl>,
#   Feature_Cells_AreaShape_MinorAxisLength <dbl>,
#   Feature_Cells_AreaShape_Orientation <dbl>,
#   Feature_Cells_AreaShape_Perimeter <dbl>

# Correlation matrix 

In [31]:
# Correlate every aggregated BBBC022 profile with every aggregated BBBC036
# profile. Each feature table is transposed so that samples become columns;
# the result therefore has one row per BBBC022 sample (`x`) and one column
# per BBBC036 sample (`y`).
#
# all_of() makes it explicit that `common_features` is a character vector of
# column names from the environment; passing it bare to select() is ambiguous
# and deprecated.
cor_matrix <- cor(
    x = population_aggregated %>%
        filter(Metadata_dataset == "BBBC022") %>%
        select(all_of(common_features)) %>%
        as.matrix() %>%
        t(),
    y = population_aggregated %>%
        filter(Metadata_dataset == "BBBC036") %>%
        select(all_of(common_features)) %>%
        as.matrix() %>%
        t(),
    use = "complete.obs"
)


# Submission file 

In [41]:
# set column names 
colnames(cor_matrix)  <- population_aggregated %>% 
                            filter(Metadata_dataset == 'BBBC036') %>%
                            extract2("Metadata_broad_sample")

# set row names 
#rownames(cor_matrix)  <- population_aggregated %>% 
#                            filter(Metadata_dataset == 'BBBC036') %>%
#                            extract2("Metadata_broad_sample")#


In [42]:
# Build the submission table: one row per BBBC022 sample, one column per
# BBBC036 sample, with the BBBC022 sample id as the first column.
# as_tibble() replaces the deprecated as_data_frame().
df <- cor_matrix %>%
    as_tibble() %>%
    mutate(
        Metadata_broad_sample = population_aggregated %>%
            filter(Metadata_dataset == "BBBC022") %>%
            extract2("Metadata_broad_sample")
    ) %>%
    select(Metadata_broad_sample, everything())

# Write the submission file (without row names).
write.csv(df, "../cytodata-baseline_R_day_2.csv", row.names = FALSE)

In [43]:
# Show the submission table.
print(df)

# A tibble: 1,601 x 2,241
   Metadata_broad_… `BRD-A00100033-… `BRD-A00267231-… `BRD-A00327403-…
   <chr>                       <dbl>            <dbl>            <dbl>
 1 BRD-A00587958-0…           -0.576           -0.669           -0.745
 2 BRD-A00821662-0…           -0.505           -0.568           -0.586
 3 BRD-A00827783-0…           -0.594           -0.703           -0.780
 4 BRD-A00993607-0…           -0.569           -0.668           -0.739
 5 BRD-A01078468-0…           -0.587           -0.668           -0.741
 6 BRD-A01295252-0…           -0.455           -0.547           -0.596
 7 BRD-A01493904-0…           -0.573           -0.646           -0.728
 8 BRD-A01636364-0…           -0.510           -0.658           -0.742
 9 BRD-A01643550-0…           -0.580           -0.671           -0.743
10 BRD-A01787639-3…           -0.633           -0.735           -0.799
# ... with 1,591 more rows, and 2,237 more variables:
#   `BRD-A00520476-001-03-3` <dbl>, `BRD-A00827783-001-04-8` <dbl>,
