# HW _ 3 : Emperical Analysis
# Turn in for: Rahul Deshpande (deshpr)

### Step 1. Ensure the required packages are installed.

In [74]:
# Display the directory from where we are getting the packages.
.libPaths()
# One or more of the commands below will fail if the library is not installed.
library(e1071)
library(caret)
library(class)
library(rpart)
#install.packages('e1071', dependencies=TRUE)
require('e1071')
#install.packages('caret', dependencies = TRUE)
require(caret)

### Step 2. Load the Gene Expression Data

In [75]:
# To measure time (for debugging)
start.time <- Sys.time()
geneExpressionFileName  <- 'gene_expression_n438x978.txt' 
myData <- read.table(geneExpressionFileName, sep="\t", header=T)
dimnames(myData)[[1]] <- myData[,1]
myData <- myData[,-1]
myData[1:5, 1:5]
end.time <- Sys.time()
cat("took time: ",end.time - start.time)
dim(myData)

Unnamed: 0,PSME1,ATF1,RHEB,FOXO3,RHOA
ACETAZOLAMIDE,-0.015159099,-0.031470528,-0.004733488,0.02591061,0.0056296773
IRBESARTAN,-0.026811981,0.012151979,-0.025550148,-0.02401181,-0.0106717396
IPRATROPIUM BROMIDE,0.001017958,-0.008650622,-0.018128698,-0.02079971,-0.0002722781
EFAVIRENZ,-0.004398264,0.055387992,0.00465852,0.01380732,-0.0340697348
THIAMINE,0.001838965,-0.018079188,-0.011855532,-0.03705033,-0.0133954959


took time:  0.779546

#### We see that we have 978 genes, and 438 examples. 

### Step 3. Load the Adverse Drug Side Effect Data

In [76]:
sideEffectFileName <- 'ADRs_HLGT_n438x232.txt'
sideEffectData <- read.table(sideEffectFileName, sep="\t", header=T)
dimnames(sideEffectData)[[1]] <- sideEffectData[,1]
sideEffectData <- sideEffectData[,-1]
sideEffectData[1:5, 1:5]
dim(sideEffectData)
names <- names(sideEffectData)

Unnamed: 0,Abdominal.hernias.and.other.abdominal.wall.conditions,Abortions.and.stillbirth,Acid.base.disorders,Administration.site.reactions,Adrenal.gland.disorders
ACETAZOLAMIDE,0,0,1,1,0
IRBESARTAN,0,0,0,0,1
IPRATROPIUM BROMIDE,0,0,0,0,1
EFAVIRENZ,0,0,0,0,1
THIAMINE,0,0,0,0,0


#### We see we have 232 side effects and 438 examples. So if we combine this with the gene data, it is like saying we have 438 examples,
####  and 232 different  outcome variables.

### Step 4. Some more data refining

#### Make sure there are no NAs in the data, otherwise imputing might be required.

In [77]:
anyNA(myData)
anyNA(sideEffectData)

#### We see there are no NAs in the data. Therefore, we can begin with performing our experiments.

# Step 5. Experiments

### We now begin performing experiments on our data. Each cell block below performs a different experiment, 
### and outputs the results to a .csv file - this helps with recording results  for the final .csv file.

### Step 5.1 Experiment  1: t = 0.05,  k = 10

#### In this experiment, we calculate the accuracy  for all the side effects,  using k = 10 and t  = 0.05. The results of  the experiment
#### are written to  the output csv file, and the variable fileName has the path of this file.

In [69]:
start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "t_0_01_kNN_k = 10"
experimentResults <- data.frame(a = character(), b= numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################
fileName <- paste(experimentName, '.csv')
################################################################################################################################

experimentFile <- paste("C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output", fileName)

for(i in 1:length(sideEffectNames)){
    sideEffectName <- sideEffectNames[i]
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[,sideEffectName]
    presentIndices <- which(currentSideEffect == 1)
    absentIndices <- which(currentSideEffect == 0)
   
    outerCount <- 3
    accuracyOverFoldsTotal <- 0
    
    for(j in 1:outerCount){
        
        
################################################  DATA  #######################################################################
        
        foldCount <- 10
        folds <- createFolds(currentSideEffect, foldCount, returnTrain = FALSE)
        # Get the values for every list (indices)

        foldIndices <- list()
        
        for(k in 1:length(folds)){
            foldIndices[[length(foldIndices)+1]] <- flds[[k]]
        }

        accuracyOverFolds <- 0
        accuracyOverFoldsSum <- 0
       
        
        for(k in 1:length(folds)){
            # Data of this fold.
            cat("Running fold number: ", k, " Count number: ", j, "\n")

            # We need to create a vector for indexing
            testingDataX <- myData[foldIndices[[k]],]


            testingDataY <- sideEffectData[foldIndices[[k]],sideEffectName]


            remainingIndices <- (Reduce(c,foldIndices[-k]))

            trainingDataX <- myData[c(remainingIndices), ]

            trainingDataY <- sideEffectData[c(remainingIndices), sideEffectName]

##################################################### FEATURE SELECTION : t- statistic ###########################################################            

            allpValues <- sapply(geneNames, function(geneName){
                geneData <- trainingDataX[,geneName]
                sampleOne <-  geneData[presentIndices]
                sampleOneLength <- length(sampleOne)
                sampleTwo <-  geneData[absentIndices]
                sampleTwoLength <- length(sampleTwo)
                if(sampleOneLength == 0 | sampleTwoLength == 0 | (sampleOneLength == 1 && sampleTwoLength != 1) | (sampleOneLength != 1 && sampleTwoLength == 1)){
                    pValue  <- 0 # we want this feature.
                }
                else{
                    pValue <- t.test(sampleOne, sampleTwo)$p.value
                }
                pValue            
            })

            ######## THRESHOLD FOR THE STATISTIC #############
            
            threshold <- 0.05
            sigpIndices <- which(allpValues < threshold)

            sigpValues <- allpValues[sigpIndices]
            featuresOfInterest <- names(trainingDataX)[sigpIndices]

            refinedTrainingDataX <- trainingDataX[, featuresOfInterest]

            
########################################## KNN, K = 10 #######################################################################################

            # Now filter the test set also!
            refinedTestingDataX <- testingDataX[,featuresOfInterest]
            predictionResults <- knn(refinedTrainingDataX, refinedTestingDataX, factor(trainingDataY), k = 10)

            accuracy <- (sum(predictionResults==testingDataY) + sum(is.na(predictionResults) & is.na(testingDataY))) / length(predictionResults)
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy
     
#######################################################################################################################        

        }
        
        accuracyOverFolds <- (accuracyOverFoldsSum)/length(folds)     
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }
    
    finalAccuracy  <- accuracyOverFoldsTotal/outerCount
    cat("Final Accuracy is: ", finalAccuracy, "\n")
    
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)    
}

end.time <- Sys.time()
cat("took time: ",end.time - start.time)

Working on the side effect:  Abdominal.hernias.and.other.abdominal.wall.conditions 
gene data:  438 47  Count of side effect:  438 


ERROR: Error in eval(expr, envir, enclos): object 'flds' not found


### You can find the results of the experiment in the output file.

### Step 5.1 Experiment  2: t = 0.01,  SVM

#### In this experiment, we calculate the accuracy  for all the side effects,  using SVM and t  = 0.05. The results of  the experiment
#### are written to  the output csv file, and the variable fileName has the path of this file.

In [None]:
start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "t_0_01_SVM"
experimentResults <- data.frame(a = character(), b= numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################

fileName <- paste(experimentName, '.csv')

################################################################################################################################

experimentFile <- paste("C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output", fileName)

# For every side effect (the output/outcome),  perform 10 fold cross validation. Also perform feature selection and testing at 
# every fold.

for(i in 1:length(sideEffectNames)){
    sideEffectName <- sideEffectNames[i]
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[,sideEffectName]
    presentIndices <- which(currentSideEffect == 1)
    absentIndices <- which(currentSideEffect == 0)
   
    outerCount <- 3
    accuracyOverFoldsTotal <- 0
    
    
    for(j in 1:outerCount){
        
        
################################################  DATA  #######################################################################
        
        foldCount <- 10
        folds <- createFolds(currentSideEffect, foldCount, returnTrain = FALSE)
        # Get the values for every list (indices)

        foldIndices <- list()
        
        for(k in 1:length(folds)){
            foldIndices[[length(foldIndices)+1]] <- folds[[k]]
        }

        accuracyOverFolds <- 0
        accuracyOverFoldsSum <- 0
       
        
        for(k in 1:length(folds)){

            # Data of this fold.
#            cat("Running fold number: ", k, " Count number: ", j, "\n")
            
            # Create the frames for the training data and the testing data.
            
            # We need to create a vector for indexing
            testingDataX <- myData[foldIndices[[k]],]


            testingDataY <- sideEffectData[foldIndices[[k]], sideEffectName]


            remainingIndices <- (Reduce(c,foldIndices[-k]))

            trainingDataX <- myData[c(remainingIndices), ]

            trainingDataY <- sideEffectData[c(remainingIndices), sideEffectName]

##################################################### FEATURE SELECTION : t- statistic ###########################################################            

            allpValues <- sapply(geneNames, function(geneName){
                geneData <- trainingDataX[,geneName]
                sampleOne <-  geneData[presentIndices]
                sampleOneLength <- length(sampleOne)
                sampleTwo <-  geneData[absentIndices]
                sampleTwoLength <- length(sampleTwo)

                if(sampleOneLength == 0 | sampleTwoLength == 0 | (sampleOneLength == 1 && sampleTwoLength != 1) | (sampleOneLength != 1 && sampleTwoLength == 1)){
                    # this is a great feature!
                    pValue  <- 0 # we want this feature.
                }
                else{

                    pValue <- t.test(sampleOne, sampleTwo)$p.value

                } 
                pValue            
            })


            threshold <- 0.01
            sigpIndices <- which(allpValues < threshold)

            sigpValues <- allpValues[sigpIndices]
            featuresOfInterest <- names(trainingDataX)[sigpIndices]

            refinedTrainingDataX <- trainingDataX[, featuresOfInterest]

            
########################################## SVM #######################################################################################

#            cat("Features of interest: ", length(featuresOfInterest), "\n")
            # Now filter the test set also!
            refinedTestingDataX <- testingDataX[,featuresOfInterest]
            svmModel <- svm(refinedTrainingDataX, trainingDataY)
            predictionResults <- predict(svmModel, refinedTestingDataX)
#            cat("Count of prediction results: ", length(predictionResults))

            probabilityThreshold <- 0.5
            predictionResults <- round(predictionResults) # default is 0.5
            accuracy <- (sum(predictionResults==testingDataY) + sum(is.na(predictionResults) & is.na(testingDataY))) / length(predictionResults)
            #cat("Accuracy for fold:  ", i, " is ", accuracy, "\n")
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy
            
#######################################################################################################################
           
        }
        
        accuracyOverFolds <- (accuracyOverFoldsSum)/length(folds)     
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }
    
    finalAccuracy  <- accuracyOverFoldsTotal/outerCount
#    cat("Final Accuracy is: ", finalAccuracy, "\n")
    
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)    
}

end.time <- Sys.time()
cat("took time: ",end.time - start.time)

In [None]:
testingDataX <- myData[foldIndices[[k]],]

#            cat("Dimension of testying data x = ", dim(testingDataX), "\n")

testingDataY <- sideEffectData[foldIndices[[k]], sideEffectName]


In [None]:
start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "t_0_01_kNN_k = 10"
experimentResults <- data.frame(a = character(), b= numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################
fileName <- paste(experimentName, '.csv')
################################################################################################################################

experimentFile <- paste("C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output", fileName)

for(i in 1:1){
    sideEffectName <- sideEffectNames[i]
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[,sideEffectName]
    presentIndices <- which(currentSideEffect == 1)
    absentIndices <- which(currentSideEffect == 0)

    
    subSetData <- myData[, featuresOfInterest]
    
    cat("gene data: ", dim(subSetData), " Count of side effect: ", length(currentSideEffect), "\n")
   
    outerCount <- 1
    accuracyOverFoldsTotal <- 0
    
    
    for(j in 1:outerCount){
        
        
################################################  DATA  #######################################################################
        
        foldCount <- 10
        folds <- createFolds(currentSideEffect, foldCount, returnTrain = FALSE)
        # Get the values for every list (indices)

        foldIndices <- list()
        
        for(k in 1:length(folds)){
            foldIndices[[length(foldIndices)+1]] <- flds[[k]]
        }

        accuracyOverFolds <- 0
        accuracyOverFoldsSum <- 0
       
        
        for(k in 1:length(folds)){

            # Data of this fold.
            cat("Running fold number: ", k, " Count number: ", j, "\n")

            # We need to create a vector for indexing
            testingDataX <- myData[foldIndices[[k]],]

#            cat("Dimension of testying data x = ", dim(testingDataX), "\n")

            testingDataY <- sideEffectData[foldIndices[[k]], sideEffectName]

#            cat("Length of testing data = ", length(testingDataY), "\n")


            remainingIndices <- (Reduce(c,foldIndices[-k]))

            trainingDataX <- myData[c(remainingIndices), ]

            trainingDataY <- sideEffectData[c(remainingIndices), sideEffectName]

##################################################### FEATURE SELECTION : t- statistic ###########################################################            

            allpValues <- sapply(geneNames, function(geneName){
                geneData <- trainingDataX[,geneName]
                sampleOne <-  geneData[presentIndices]
                sampleOneLength <- length(sampleOne)
                sampleTwo <-  geneData[absentIndices]
                sampleTwoLength <- length(sampleTwo)
        #        cat("sample one size = ", length(sampleOne), " and sample two size = ", length(sampleTwo), "\n")
                if(sampleOneLength == 0 | sampleTwoLength == 0 | (sampleOneLength == 1 && sampleTwoLength != 1) | (sampleOneLength != 1 && sampleTwoLength == 1)){
                    # this is a great feature!
                    pValue  <- 0 # we want this feature.
                }
                else{
        #           cat("sample one size = ", length(sampleOne), " and sample two size = ", length(sampleTwo), "\n")

                    pValue <- t.test(sampleOne, sampleTwo)$p.value
        #            cat(" gene: ", geneName, " side effect: ", sideEffectName, "  p Value : ", pValue, "\n")
                }
                pValue            
            })


            threshold <- 0.05
            sigpIndices <- which(allpValues < threshold)

            sigpValues <- allpValues[sigpIndices]
            featuresOfInterest <- names(trainingDataX)[sigpIndices]

            refinedTrainingDataX <- trainingDataX[, featuresOfInterest]

            
########################################## Decision Trees #######################################################################################

#            cat("Features of interest: ", length(featuresOfInterest), "\n")
            # Now filter the test set also!
            refinedTestingDataX <- testingDataX[,featuresOfInterest]
            svmModel <- svm(refinedTrainingDataX, trainingDataY)
            predictionResults <- predict(svmModel, refinedTestingDataX)
#            cat("Count of prediction results: ", length(predictionResults))

            probabilityThreshold <- 0.5
            predictionResults <- round(predictionResults) # default is 0.5
            accuracy <- (sum(predictionResults==testingDataY) + sum(is.na(predictionResults) & is.na(testingDataY))) / length(predictionResults)
            #cat("Accuracy for fold:  ", i, " is ", accuracy, "\n")
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy
            
#######################################################################################################################
           
        }
        
        accuracyOverFolds <- (accuracyOverFoldsSum)/length(folds)     
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }
    
    finalAccuracy  <- accuracyOverFoldsTotal/outerCount
    cat("Final Accuracy is: ", finalAccuracy, "\n")
    
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)    
}

end.time <- Sys.time()
cat("took time: ",end.time - start.time)

### Use Correlation to determine the best genes. Also use threshold of 0.3 (-0.3, +0.3)

### Experiment : Correlation (-0.10 to 0.10), kNN
### Evaluation Metrics: Accuracy

In [9]:
start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "correlation_0.10_kNN_k = 10_Accuracy_f-1"
experimentResults <- data.frame(a = character(), b= numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################
fileName <- paste(experimentName, '.csv')
################################################################################################################################

experimentFile <- paste("C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output", fileName)

for(i in 1:1){
    sideEffectName <- sideEffectNames[i]
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[,sideEffectName]
    presentIndices <- which(currentSideEffect == 1)
    absentIndices <- which(currentSideEffect == 0)

   
    outerCount <- 1
    accuracyOverFoldsTotal <- 0
    
    for(j in 1:outerCount){
        
################################################  DATA  #######################################################################
        
        foldCount <- 10
        folds <- createFolds(currentSideEffect, foldCount, returnTrain = FALSE)
        # Get the values for every list (indices)

        foldIndices <- list()
        
        for(k in 1:length(folds)){
            foldIndices[[length(foldIndices)+1]] <- folds[[k]]
        }

        accuracyOverFolds <- 0
        accuracyOverFoldsSum <- 0
       
        
        for(k in 1:length(folds)){

            # Data of this fold.
            cat("Running fold number: ", k, " Count number: ", j, "\n")

            # We need to create a vector for indexing
            testingDataX <- myData[foldIndices[[k]],]

            testingDataY <- sideEffectData[foldIndices[[k]], sideEffectName]

            remainingIndices <- (Reduce(c,foldIndices[-k]))

            trainingDataX <- myData[c(remainingIndices), ]

            trainingDataY <- sideEffectData[c(remainingIndices), sideEffectName]

##################################################### FEATURE SELECTION : CORRELATION ###########################################################            

            correlationResults <- apply(trainingDataX, 2, function(x){ cor(x, trainingDataY)})
            boundary <- 0.10
            geneIndices  <- c(which(correlationResults  > boundary), which(correlationResults < -boundary))

            featuresOfInterest <- names(trainingDataX)[geneIndices]

            refinedTrainingDataX <- trainingDataX[, featuresOfInterest]

            
########################################## Decision Trees #######################################################################################

#            cat("Features of interest: ", length(featuresOfInterest), "\n")
            # Now filter the test set also!
            refinedTestingDataX <- testingDataX[,featuresOfInterest]
            svmModel <- svm(refinedTrainingDataX, trainingDataY)
            predictionResults <- predict(svmModel, refinedTestingDataX)
#            cat("Count of prediction results: ", length(predictionResults))

            probabilityThreshold <- 0.5
            predictionResults <- round(predictionResults) # default is 0.5
            accuracy <- (sum(predictionResults==testingDataY) + sum(is.na(predictionResults) & is.na(testingDataY))) / length(predictionResults)
            #cat("Accuracy for fold:  ", i, " is ", accuracy, "\n")
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy
            
#######################################################################################################################
           
        }
        
        accuracyOverFolds <- (accuracyOverFoldsSum)/length(folds)     
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }
    
    finalAccuracy  <- accuracyOverFoldsTotal/outerCount
    cat("Final Accuracy is: ", finalAccuracy, "\n")
    
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)    
}

end.time <- Sys.time()
cat("took time: ",end.time - start.time)

ERROR: Error in parse(text = x, srcfile = src): <text>:4:19: unexpected input
3: geneNames <- names(myData)
4: experimentName <- _
                     ^


## Decision Tree

In [51]:
a <- sideEffectData[,1]
b <- myData[1:3,1:3]
#names(b)
b[["Y"]] <- a
fittedTree <- rpart(Y ~ ., data = b, method = "class")
fittedTree <- prune(fittedTree, cp= fittedTree$cptable[which.min(fittedTree$cptable[,"xerror"]),"CP"])

#predictionResults <- predict(fittedTree, myData[], type="class")

ERROR: Error in cbind(yval2, yprob, nodeprob): number of rows of matrices must match (see arg 2)


In [65]:
start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "correlation_0.10_decision_tree_f-1"
experimentResults <- data.frame(a = character(), b= numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################
fileName <- paste(experimentName, '.csv')
################################################################################################################################

experimentFile <- paste("C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output", fileName)

for(i in 1:10){
    sideEffectName <- sideEffectNames[i]
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[,sideEffectName]
    presentIndices <- which(currentSideEffect == 1)
    absentIndices <- which(currentSideEffect == 0)

   
    outerCount <- 1
    accuracyOverFoldsTotal <- 0
    
    for(j in 1:outerCount){
        
################################################  DATA  #######################################################################
        
        foldCount <- 10
        folds <- createFolds(currentSideEffect, foldCount, returnTrain = FALSE)
        # Get the values for every list (indices)

        foldIndices <- list()
        
        for(k in 1:length(folds)){
            foldIndices[[length(foldIndices)+1]] <- folds[[k]]
        }

        accuracyOverFolds <- 0
        accuracyOverFoldsSum <- 0
       
        
        for(k in 1:length(folds)){

            # Data of this fold.
#            cat("Running fold number: ", k, " Count number: ", j, "\n")

            # We need to create a vector for indexing
            testingDataX <- myData[foldIndices[[k]],]

            testingDataY <- sideEffectData[foldIndices[[k]], sideEffectName]

            remainingIndices <- (Reduce(c,foldIndices[-k]))

            trainingDataX <- myData[c(remainingIndices), ]

            trainingDataY <- sideEffectData[c(remainingIndices), sideEffectName]

##################################################### FEATURE SELECTION : CORRELATION ###########################################################            

            correlationResults <- apply(trainingDataX, 2, function(x){ cor(x, trainingDataY)})
            boundary <- 0.10
            geneIndices  <- c(which(correlationResults  > boundary), which(correlationResults < -boundary))

            featuresOfInterest <- names(trainingDataX)[geneIndices]

            refinedTrainingDataX <- trainingDataX[, featuresOfInterest]

            
########################################## Decision Trees #######################################################################################

            refinedTrainingDataX[["Y"]] <- trainingDataY
            fittedTree <- rpart(Y ~ ., data = refinedTrainingDataX, method = "class")
            fittedTreeFinal <- prune(fittedTree, cp= fittedTree$cptable[which.min(fittedTree$cptable[,"xerror"]),"CP"])
            
            refinedTestingDataX <- testingDataX[,featuresOfInterest]
            
            predictionResults <- predict(fittedTreeFinal, refinedTestingDataX, method = "prob")
            
            probabilityThreshold <- 0.5
            predictionResults <- round(predictionResults) # default is 0.5
            print(length(predictionResults))
            cat("Actual:\n")
            cat(testingDataY, "\n")
            cat("Predicted:\n")
            cat(predictionResults, "\n")
            cat("Where  equyal: ", sum(predictionResults==testingDataY), "\n")
            accuracy <- (sum(predictionResults==testingDataY, na.rm=T)) / length(predictionResults)
            cat("Accuracy for fold:  ", i, " is ", accuracy, "\n")
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy
            
            break

#######################################################################################################################
           
        }
        
        accuracyOverFolds <- (accuracyOverFoldsSum)/length(folds)     
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }
    
    finalAccuracy  <- accuracyOverFoldsTotal/outerCount
    cat("Final Accuracy is: ", finalAccuracy, "\n")
    
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)    
}

end.time <- Sys.time()
cat("took time: ",end.time - start.time)

Working on the side effect:  Abdominal.hernias.and.other.abdominal.wall.conditions 
[1] 88
Actual:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
Predicted:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
Where  equyal:  44 
Accuracy for fold:   1  is  0.5 
Final Accuracy is:  0.05 
Working on the side effect:  Abortions.and.stillbirth 
[1] 88
Actual:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
Predicted:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
Where  equyal:  44 
Accuracy for fold:   2  is  0.5 
Final Accuracy is:  0.05 
Working on the side effect:  Acid.base.disorders 
[1] 88
Actual:
1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 