In [4]:
library(tidyverse)
library(cytominer)
library(magrittr)

In [5]:
#' Load one profile table listed in the dataset index.
#'
#' Reads the index CSV, selects the row whose `Partition`, `Dataset` and
#' `Features` columns match the arguments, reads the table its `Link` column
#' points to, and tags every row with the three selection keys.
#'
#' @param partition Value to match against the index column `Partition`.
#' @param dataset Value to match against the index column `Dataset`.
#' @param feature Value to match against the index column `Features`.
#' @param index_file Location of the index CSV; defaults to the path that
#'   was previously hard-coded.
#' @return The table read from the matching `Link`, with the added columns
#'   `Metadata_dataset`, `Metadata_partition` and `Metadata_features`.
load_dataset <- function(partition, dataset, feature,
                         index_file = "../datasets.csv") {
    dataset_index <- read_csv(index_file)

    link <- dataset_index %>%
        filter(
            Partition == partition,
            Dataset == dataset,
            Features == feature
        ) %>%
        extract2("Link")

    # Fail early with a readable message instead of handing read_csv() an
    # empty or multi-element vector of locations.
    if (length(link) != 1) {
        stop(
            "Expected exactly one entry in '", index_file, "' for ",
            "Partition = '", partition, "', Dataset = '", dataset,
            "', Features = '", feature, "', but found ", length(link), ".",
            call. = FALSE
        )
    }

    read_csv(link) %>%
        mutate(Metadata_dataset = dataset) %>%
        mutate(Metadata_partition = partition) %>%
        mutate(Metadata_features = feature)
}

# Load data 
We load the training and test partitions of both chemical perturbation datasets (BBBC036 and BBBC022).

In [None]:
# BBBC036 profiles ("DeepLearning" features): read the Train and Test
# partitions, add a constant Metadata_x_mutation_status column to each,
# and stack them row-wise.
bbbc036_train <- mutate(
    load_dataset("Train", "BBBC036", "DeepLearning"),
    Metadata_x_mutation_status = "none"
)

bbbc036_test <- mutate(
    load_dataset("Test", "BBBC036", "DeepLearning"),
    Metadata_x_mutation_status = "none"
)

bbbc036 <- rbind(bbbc036_train, bbbc036_test)

Parsed with column specification:
cols(
  Dataset = col_character(),
  Partition = col_character(),
  Features = col_character(),
  Link = col_character()
)


In [None]:
# Show (rows, columns) of each BBBC036 partition.
dim(bbbc036_train)
dim(bbbc036_test)

In [None]:
# BBBC022 profiles ("DeepLearning" features): read the Train and Test
# partitions and stack them row-wise.
bbbc022_train <- load_dataset("Train", "BBBC022", "DeepLearning")
bbbc022_test <- load_dataset("Test", "BBBC022", "DeepLearning")

bbbc022 <- rbind(bbbc022_train, bbbc022_test)

## Check dimensionality

In [None]:
# Show (rows, columns) of each combined dataset.
bbbc022 %>% dim()
bbbc036 %>% dim()

## Extract common features 

In [None]:
# Column names of each dataset.
colnames_bbbc022 <- bbbc022 %>% colnames()
colnames_bbbc036 <- bbbc036 %>% colnames()

# Metadata columns are the ones whose name starts with "Meta".
Metadata_names_bbbc022 <- colnames_bbbc022 %>% stringr::str_subset("^Meta")
Metadata_names_bbbc036 <- colnames_bbbc036 %>% stringr::str_subset("^Meta")

# Metadata shared by both datasets, and every other shared column
# (treated as features from here on).
common_metadata <- Metadata_names_bbbc022 %>%
    intersect(Metadata_names_bbbc036)
common_features <- colnames_bbbc022 %>%
    intersect(colnames_bbbc036) %>%
    setdiff(common_metadata)

# Total number of BBBC036 columns, for reference.
length(colnames_bbbc036)

In [None]:
# Ask cytominer which common features have NA values among the first 100
# DMSO rows of BBBC022 (cutoff = 0).
# NOTE(review): the result is presumably the names of the features to drop,
# as described in the cytominer docs; confirm against the installed version.
bbbc022_na_feature <- bbbc022 %>%
    filter(Metadata_broad_sample == "DMSO") %>%
    slice(1:100) %>%
    cytominer::drop_na_columns(
        variables = common_features,
        cutoff = 0
    )

#bbbc036_na_feature  <- cytominer::drop_na_columns(
#    population = bbbc036,
#    variables = common_features,
#    cutoff = 0
#    )

In [None]:
# Show the result of the NA screen.
print(bbbc022_na_feature)

In [None]:
# Run cytominer's variance screen on the common features, using the first
# 100 DMSO rows of BBBC022 as the sample.
features_to_remove <- cytominer::variance_threshold(
    sample = slice(
        filter(bbbc022, Metadata_broad_sample == "DMSO"),
        1:100
    ),
    variables = common_features
)

# Concatenate data sets

In [None]:
# Restrict both datasets to their shared columns, stack them row-wise, and
# add a constant Metadata_perturbation column as the first column.
population <- rbind(
    select(bbbc022, c(common_metadata, common_features)),
    select(bbbc036, c(common_metadata, common_features))
) %>%
    mutate(Metadata_perturbation = "chemical") %>%
    select(Metadata_perturbation, everything())

## Important: update column names! 

In [None]:
# Recompute the metadata/feature split from the combined table, so that the
# newly added Metadata_perturbation column is included in the metadata.
colnames_combined <- population %>% colnames()

common_metadata <- colnames_combined %>% stringr::str_subset("^Meta")

common_features <- colnames_combined %>% setdiff(common_metadata)


cytominer has problems handling column names such as '1' and '2', so we rename the feature columns to 'Feature_1', 'Feature_2', ...

In [None]:
# Prefix every feature column with "Feature_".
# The columns are located by name rather than by position, so the renaming
# stays correct even if a metadata column does not precede every feature
# column in `population`.
feature_positions <- match(common_features, colnames(population))

# Every feature must currently exist as a column (this also catches running
# the cell a second time, after the columns were already renamed).
stopifnot(!anyNA(feature_positions))

common_features <- paste0("Feature_", common_features)
colnames(population)[feature_positions] <- common_features

# Normalize data
We use cytominer to normalize both datasets with respect to the controls, i.e. the DMSO-treated samples.

In [None]:
# Standardize all feature columns with cytominer, using the first 100 DMSO
# rows of the combined table as the reference sample and
# Metadata_perturbation as the only stratum.
population_normalized <- cytominer::normalize(
    population = population,
    variables = common_features,
    strata = "Metadata_perturbation",
    sample = slice(
        filter(population, Metadata_broad_sample == "DMSO"),
        1:100
    ),
    operation = "standardize"
)

In [None]:
# Show (rows, columns) of the normalized table.
print(dim(population_normalized))

# Aggregate data 

In [None]:
# Average the normalized features per sample and dataset.
# cytominer::aggregate() keeps only the strata and the aggregated variables,
# so Metadata_pert_id is listed as a stratum: the submission cells read it
# from `population_aggregated` to label the correlation matrix.
# NOTE(review): assumes Metadata_pert_id is present in the combined table and
# is determined by Metadata_broad_sample, so the grouping is unchanged;
# confirm against the data.
population_aggregated <- cytominer::aggregate(
    population = population_normalized,
    variables = common_features,
    strata = c(
        "Metadata_broad_sample",
        "Metadata_dataset",
        "Metadata_pert_id"
    ),
    operation = "mean"
)

In [None]:
# Show the sample identifier of every normalized row.
print(population_normalized[["Metadata_broad_sample"]])

In [None]:
# Show the first two aggregated rows.
print(slice(population_aggregated, 1:2))

# Correlation matrix 

In [None]:
# Correlate every aggregated BBBC022 row with every aggregated BBBC036 row.
# Each feature table is transposed so that the aggregated rows become the
# columns that cor() compares; features with a missing value are dropped
# through use = "complete.obs".
cor_matrix <- cor(
    x = population_aggregated %>%
        filter(Metadata_dataset == "BBBC022") %>%
        select(common_features) %>%
        as.matrix() %>%
        t(),
    y = population_aggregated %>%
        filter(Metadata_dataset == "BBBC036") %>%
        select(common_features) %>%
        as.matrix() %>%
        t(),
    use = "complete.obs"
)


# Submission file 

In [None]:
# Set column names: one BBBC036 perturbation id per column.
# Without this check a missing Metadata_pert_id column would make extract2()
# return NULL and silently clear the column names instead of failing.
if (!("Metadata_pert_id" %in% colnames(population_aggregated))) {
    stop(
        "population_aggregated has no column 'Metadata_pert_id'; ",
        "it must be kept when aggregating.",
        call. = FALSE
    )
}

colnames(cor_matrix) <- population_aggregated %>%
    filter(Metadata_dataset == "BBBC036") %>%
    extract2("Metadata_pert_id")

# set row names 
#rownames(cor_matrix)  <- population_aggregated %>% 
#                            filter(Metadata_dataset == 'BBBC036') %>%
#                            extract2("Metadata_broad_sample")#


In [None]:
# Turn the correlation matrix into a table with one row per BBBC022
# perturbation; the perturbation id is the first column.
# as_tibble() replaces the deprecated as_data_frame().
df <- cor_matrix %>%
    as_tibble() %>%
    mutate(
        Metadata_pert_id = population_aggregated %>%
            filter(Metadata_dataset == "BBBC022") %>%
            extract2("Metadata_pert_id")
    ) %>%
    select(Metadata_pert_id, everything())

# write submission file
write.csv(df, "../cytodata-baseline_R_day_2.csv", row.names = FALSE)

In [None]:
# Show the submission table.
print(df)