In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = file.path("..", "input"))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install only the packages that are not yet available in this environment.
# Several of them (e.g. tidyverse) are already attached earlier in the notebook,
# so reinstalling everything on each run is slow and unnecessary.
required_pkgs <- c(
  "pastecs", "plyr", "dplyr", "ggplot2", "tidyverse", "data.table",
  "plotly", "heatmaply", "ggcorrplot", "factoextra", "corrplot"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

In [None]:
library(pastecs)
library(plyr)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(data.table)
library(plotly)
library(heatmaply)
library(ggcorrplot)
library(factoextra)
library(corrplot)

In [None]:
# Load the scored training targets.
# Empty strings and "NA" are read as missing values; check.names = FALSE keeps
# the column names exactly as they appear in the CSV header.
train_targets_scored <- read.csv(
  "../input/lish-moa/train_targets_scored.csv",
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE,
  header = TRUE,
  check.names = FALSE
)

In [None]:
# Shape of the targets table, followed by a preview of its first rows
dim(train_targets_scored)
head(train_targets_scored, n = 6L)

In [None]:
# Compact overview of every column and its type
utils::str(train_targets_scored)

In [None]:
# Total count of missing cells across the whole table
train_targets_scored %>%
  is.na() %>%
  sum()

In [None]:
# Keep only the numeric columns; non-numeric columns are dropped
ICL <- select_if(train_targets_scored, is.numeric)

In [None]:
# Column names of the numeric attributes
colnames(ICL)

In [None]:
# How many numeric attributes there are
ncol(ICL)

In [None]:
# Descriptive statistics for every numeric attribute, rounded to 2 decimals
ICLS <- stat.desc(ICL) %>%
  round(digits = 2)
ICLS

In [None]:
# Move the statistic names from the row names into a leading "Statistics"
# column. Built directly with data.frame() instead of a data.table round trip,
# so ICLS is not modified by reference; row.names = NULL resets the row names to
# 1..n and check.names = FALSE keeps the original column names untouched.
ICLS <- data.frame(
  Statistics = rownames(ICLS),
  ICLS,
  row.names = NULL,
  check.names = FALSE
)
ICLS

In [None]:
# Frequency table of activations per MoA.
# The per-column totals are taken from the "sum" row of the statistics table.
# The row is looked up by its label rather than by a fixed position, and the
# values are read as numbers directly instead of going through a transposed
# character matrix.
sum_row <- ICLS[
  ICLS$Statistics == "sum",
  names(ICLS) != "Statistics",
  drop = FALSE
]
FRQUENCYcoldf <- data.table(
  MoAs = names(sum_row),
  Frequency_of_1 = as.numeric(unlist(sum_row, use.names = FALSE))
)
# Most frequently activated MoAs first
FRQUENCYcoldf <- FRQUENCYcoldf[
  order(FRQUENCYcoldf$Frequency_of_1, decreasing = TRUE),
]
dim(FRQUENCYcoldf)
head(FRQUENCYcoldf)

In [None]:
# The 20 MoAs with the most activations (table is already sorted descending)
FRQUENCYtopcoldf <- FRQUENCYcoldf[seq_len(20), ]
dim(FRQUENCYtopcoldf)
head(FRQUENCYtopcoldf, n = 6L)

In [None]:
# Horizontal bar chart of the 20 most frequently activated MoAs; bars are
# ordered by frequency and shaded by the same value.
topcol <- ggplot(
  data = FRQUENCYtopcoldf,
  mapping = aes(x = reorder(MoAs, Frequency_of_1), y = Frequency_of_1)
) +
  geom_bar(mapping = aes(fill = Frequency_of_1), stat = "identity") +
  coord_flip() +
  labs(x = "MoAs") +
  scale_fill_gradient2(low = 'orange', mid = 'snow', high = 'blue')
topcol

In [None]:
# MoAs that have no positive occurrence in any sample.
# An empty result means every MoA has at least one positive occurrence.
FRQUENCYcoltotalzerodf <- FRQUENCYcoldf[
  with(FRQUENCYcoldf, Frequency_of_1 == 0),
]
FRQUENCYcoltotalzerodf

In [None]:
# Number of positive MoAs per sample.
# The label columns are selected by name (everything except the sample id and
# any RowTotal left over from a previous run) instead of a fixed column range,
# so the sum stays correct if the column count changes or the cell is re-run.
train_targets_scored$RowTotal <- rowSums(
  train_targets_scored[
    ,
    setdiff(names(train_targets_scored), c("sig_id", "RowTotal")),
    drop = FALSE
  ]
)

In [None]:
# Keep only the sample id and its activation total, selected by column name
# rather than by position, then sort samples by descending total.
FRQUENCYrowdf <- train_targets_scored[, c("sig_id", "RowTotal")]
FRQUENCYrowdf <- FRQUENCYrowdf[
  order(FRQUENCYrowdf$RowTotal, decreasing = TRUE),
]


In [None]:
# Samples that have at least one active MoA (row total above zero)
FRQUENCYnonzerorowdf <- train_targets_scored[
  with(train_targets_scored, RowTotal > 0),
]
dim(FRQUENCYnonzerorowdf)
head(FRQUENCYnonzerorowdf, n = 6L)

In [None]:
# Remove the helper RowTotal column from train_targets_scored again.
# Dropping by name avoids depending on the column sitting at a fixed position.
train_targets_scored$RowTotal <- NULL
dim(FRQUENCYrowdf)
head(FRQUENCYrowdf)

In [None]:
# The 20 samples with the most active MoAs (table is already sorted descending)
FRQUENCYtoprowdf <- FRQUENCYrowdf[seq_len(20), ]
dim(FRQUENCYtoprowdf)
head(FRQUENCYtoprowdf, n = 6L)

In [None]:
# Horizontal bar chart of the 20 samples with the most active MoAs; bars are
# ordered by their total and shaded by the same value.
toprow <- ggplot(
  data = FRQUENCYtoprowdf,
  mapping = aes(x = reorder(sig_id, RowTotal), y = RowTotal)
) +
  geom_bar(mapping = aes(fill = RowTotal), stat = "identity") +
  coord_flip() +
  labs(x = "sig_id") +
  scale_fill_gradient2(low = 'orange', mid = 'snow', high = 'blue')
toprow

In [None]:
# Samples without a positive occurrence of any MoA.
# The original analysis reported 9367 such samples; they are candidates for
# removal if the data needs to be reduced.
FRQUENCYrowtotalzerodf <- FRQUENCYrowdf[
  with(FRQUENCYrowdf, RowTotal == 0),
]
nrow(FRQUENCYrowtotalzerodf)

In [None]:
# Sample ids that have no positive occurrence of any MoA: size and preview
dim(FRQUENCYrowtotalzerodf)
head(FRQUENCYrowtotalzerodf, n = 6L)

In [None]:
# Which MoAs seem to be correlated?
# Pairwise correlation matrix over all numeric attributes
ICL_corr <- cor(x = ICL)
dim(ICL_corr)
head(ICL_corr, n = 6L)

In [None]:
# Smallest and largest entries of the correlation matrix
min(ICL_corr)
max(ICL_corr)

In [None]:
# Matrix of p-values for the pairwise correlation tests
# (the stray empty argument after ICL has been removed).
p.mat <- cor_pmat(ICL)
# Keep only the upper half: the lower triangle and the diagonal are set to 0 so
# that each unordered pair is represented once.
p.mat[lower.tri(p.mat, diag = TRUE)] <- 0
dim(p.mat)
head(p.mat)

In [None]:
# Collect every pair of MoAs whose correlation is significant at the 5% level.
#
# Only the strict upper triangle of p.mat is scanned, so each unordered pair is
# reported once and self-pairs are skipped. The triangle is selected with
# upper.tri() instead of testing `p != 0`: a p-value of exactly 0 (which can
# occur for very strong correlations) is a valid, highly significant result and
# must not be mistaken for a masked lower-triangle cell.
#
# Outputs (parallel vectors, one element per significant pair):
#   MoA1, MoA2               - row / column names of the pair
#   p_value                  - p-value taken from p.mat
#   correlation_coefficient  - matching entry of ICL_corr
sig_idx <- which(upper.tri(p.mat) & p.mat <= 0.05, arr.ind = TRUE)
# which() reports positions column by column; re-sort by row then column so the
# pairs are listed in the same order as a nested row/column scan.
sig_idx <- sig_idx[order(sig_idx[, "row"], sig_idx[, "col"]), , drop = FALSE]

MoA1 <- rownames(p.mat)[sig_idx[, "row"]]
MoA2 <- colnames(p.mat)[sig_idx[, "col"]]
p_value <- p.mat[sig_idx]
correlation_coefficient <- ICL_corr[sig_idx]

In [None]:
# Table of significantly correlated MoA pairs, one row per pair
sig_cor_MoAs_df <- data.frame(
  MoA1 = MoA1,
  MoA2 = MoA2,
  correlation_coefficient = correlation_coefficient,
  p_value = p_value
)
dim(sig_cor_MoAs_df)
head(sig_cor_MoAs_df, n = 6L)

In [None]:
# Smallest and largest absolute correlation coefficient among the
# significantly correlated MoA pairs
sig_cor_MoAs_df$correlation_coefficient %>%
  abs() %>%
  min()
sig_cor_MoAs_df$correlation_coefficient %>%
  abs() %>%
  max()

In [None]:
# How often each MoA appears as the first member of a significant pair.
# count() is called with an explicit dplyr:: prefix because plyr is attached as
# well and exports its own, incompatible count(); which one wins otherwise
# depends on the order in which the packages were attached.
FRQUENCYMoA1df <- dplyr::count(sig_cor_MoAs_df, MoA1, name = "count1")
# Most frequent MoAs first
FRQUENCYMoA1df <- FRQUENCYMoA1df[
  order(FRQUENCYMoA1df$count1, decreasing = TRUE),
]
dim(FRQUENCYMoA1df)
head(FRQUENCYMoA1df)

In [None]:
# How often each MoA appears as the second member of a significant pair.
# count() is called with an explicit dplyr:: prefix because plyr is attached as
# well and exports its own, incompatible count(); which one wins otherwise
# depends on the order in which the packages were attached.
FRQUENCYMoA2df <- dplyr::count(sig_cor_MoAs_df, MoA2, name = "count2")
# Most frequent MoAs first
FRQUENCYMoA2df <- FRQUENCYMoA2df[
  order(FRQUENCYMoA2df$count2, decreasing = TRUE),
]
dim(FRQUENCYMoA2df)
head(FRQUENCYMoA2df)

In [None]:
# Gather the (up to) 20 most frequent MoA1 and MoA2 names for the correlation
# plot. head() is used so that fewer than 20 available rows do not produce NA
# entries.
# Top 20 of MoA1
F1 <- as.character(head(FRQUENCYMoA1df[[1]], 20))
F1
# Top 20 of MoA2
F2 <- as.character(head(FRQUENCYMoA2df[[1]], 20))
F2
# A MoA can be in both lists; unique() keeps each name once so the same column
# is not selected (and plotted) twice.
# NOTE(review): `F` shadows the built-in shorthand for FALSE; the name is kept
# because a later cell indexes with it.
F <- unique(c(F1, F2))
F

In [None]:
# Subset of the numeric attributes restricted to the MoA names collected in F
ICLF <- ICL[, F]
dim(ICLF)
head(ICLF, n = 6L)

In [None]:
# Colour-coded correlation plot of the selected MoAs
ICLF %>%
  cor() %>%
  corrplot(method = "color")

In [None]:
# Attach the MoA1 and MoA2 frequency counts to the table of significant pairs
sig_cor_MoAs_fre_df <- sig_cor_MoAs_df %>%
  merge(FRQUENCYMoA1df, by = "MoA1") %>%
  merge(FRQUENCYMoA2df, by = "MoA2")

In [None]:
# Combined frequency of both members of each pair, highest totals first
sig_cor_MoAs_fre_df[["total"]] <- with(sig_cor_MoAs_fre_df, count1 + count2)
sig_cor_MoAs_fre_df <- sig_cor_MoAs_fre_df[
  order(sig_cor_MoAs_fre_df[["total"]], decreasing = TRUE),
]
dim(sig_cor_MoAs_fre_df)
head(sig_cor_MoAs_fre_df, n = 6L)

In [None]:
# Take the (up to) 100 pairs with the highest combined frequency.
# The two name columns are selected by name rather than by position, and head()
# avoids NA rows when fewer than 100 pairs exist.
SEM <- head(sig_cor_MoAs_fre_df[, c("MoA2", "MoA1")], 100)
# Distinct MoA names on each side of the pair
x1 <- levels(factor(SEM$MoA2))
x2 <- levels(factor(SEM$MoA1))
length(x1)
length(x2)
# A MoA can occur on both sides; unique() keeps each name once so the same
# column is not selected twice later on.
X <- sort(unique(c(as.character(x1), as.character(x2))))

In [None]:
# Subset of the numeric attributes restricted to the MoA names collected in X
ICLX <- ICL[, X]
dim(ICLX)
head(ICLX, n = 6L)

In [None]:
# Colour-coded correlation plot of the MoAs from the top pairs
ICLX %>%
  cor() %>%
  corrplot(method = "color")

In [None]:
# Check whether samples share the same label profile.
# Samples without any activation were identified earlier (9367 according to the
# original analysis), so only FRQUENCYnonzerorowdf is examined here.
#
# The label columns are selected by name (everything except the sample id and
# the RowTotal helper column) instead of a fixed column range.
label_cols <- setdiff(names(FRQUENCYnonzerorowdf), c("sig_id", "RowTotal"))
ind <- duplicated(FRQUENCYnonzerorowdf[, label_cols, drop = FALSE])
FRQUENCYnonzerorowdf[ind, ]
# The original analysis reported 14120 samples with a duplicated label profile.
# This suggests that label-wise clustering is possible; clustering of MoAs will
# be considered for prediction.