In [1]:
# Display the directory from where we are getting the packages.
.libPaths()
# One or more of the commands below will fail if the library is not installed.
library(e1071)
library(caret)
library(class)
library(rpart)
#install.packages('e1071', dependencies=TRUE)
require('e1071')
#install.packages('caret', dependencies = TRUE)
require(caret)

Loading required package: lattice
Loading required package: ggplot2


In [2]:
# Load the gene-expression table and report how long the load took
# (timing is for debugging only).
start.time <- Sys.time()
geneExpressionFileName <- 'gene_expression_n438x978.txt'
# header = TRUE (not T): T is an ordinary variable that can be reassigned.
myData <- read.table(geneExpressionFileName, sep = "\t", header = TRUE)
# The first column holds the row identifiers; promote it to row names and
# drop it so that only the expression columns remain.
rownames(myData) <- myData[, 1]
myData <- myData[, -1]
# Preview the top-left corner of the table.
myData[1:5, 1:5]
end.time <- Sys.time()
# format() keeps the unit chosen by difftime (secs, mins, ...); printing the
# raw difference through cat() would show a bare number with no unit.
cat("took time: ", format(end.time - start.time), "\n")
dim(myData)

Unnamed: 0,PSME1,ATF1,RHEB,FOXO3,RHOA
ACETAZOLAMIDE,-0.015159099,-0.031470528,-0.004733488,0.02591061,0.0056296773
IRBESARTAN,-0.026811981,0.012151979,-0.025550148,-0.02401181,-0.0106717396
IPRATROPIUM BROMIDE,0.001017958,-0.008650622,-0.018128698,-0.02079971,-0.0002722781
EFAVIRENZ,-0.004398264,0.055387992,0.00465852,0.01380732,-0.0340697348
THIAMINE,0.001838965,-0.018079188,-0.011855532,-0.03705033,-0.0133954959


took time:  0.7955532

In [3]:
# Load the side-effect table: one row per entry of the first column, one
# column per side-effect category.
sideEffectFileName <- 'ADRs_HLGT_n438x232.txt'
# header = TRUE (not T): T is an ordinary variable that can be reassigned.
sideEffectData <- read.table(sideEffectFileName, sep = "\t", header = TRUE)
# The first column holds the row identifiers; promote it to row names and
# drop it so that only the side-effect columns remain.
rownames(sideEffectData) <- sideEffectData[, 1]
sideEffectData <- sideEffectData[, -1]
# Preview the top-left corner of the table and its dimensions.
sideEffectData[1:5, 1:5]
dim(sideEffectData)
# NOTE(review): this creates a variable called `names`, which masks the name
# of the base function for readers (function calls still resolve to
# base::names). Kept because later code may rely on it - consider renaming.
names <- names(sideEffectData)

Unnamed: 0,Abdominal.hernias.and.other.abdominal.wall.conditions,Abortions.and.stillbirth,Acid.base.disorders,Administration.site.reactions,Adrenal.gland.disorders
ACETAZOLAMIDE,0,0,1,1,0
IRBESARTAN,0,0,0,0,1
IPRATROPIUM BROMIDE,0,0,0,0,1
EFAVIRENZ,0,0,0,0,1
THIAMINE,0,0,0,0,0


In [4]:
# Sanity check: report whether either table contains any missing value.
# Each call yields a single TRUE/FALSE for the whole data frame.
anyNA(myData)
anyNA(sideEffectData)

In [5]:
# Experiment: for every side effect, estimate the accuracy of an SVM that
# predicts the side effect from gene expression.
#
# For each side effect the script runs `outerCount` repetitions of
# `foldCount`-fold cross validation. Inside every fold, genes are selected
# with a two-sample t-test computed on the TRAINING rows only, an SVM is fit
# on the selected genes, and accuracy is measured on the held-out rows.
# The accuracy averaged over folds and repetitions is appended to the
# experiment output file, one line per side effect.
#
# Reads:  myData, sideEffectData (loaded earlier in this file).
# Writes: one tab-separated line per side effect to `experimentFile`.

start.time <- Sys.time()
sideEffectNames <- names(sideEffectData)
geneNames <- names(myData)
experimentName <- "t_0_05_SVM"
# NOTE(review): experimentResults is initialised here but never filled in by
# the loop below; kept in case later code refers to it.
experimentResults <- data.frame(a = character(), b = numeric())
experimentResults <- rbind(experimentResults, data.frame(a = "side effect name", b = 0))

################################### EXPERIMENT OUTPUT FILE NAME ################################################################

# paste0() instead of paste(): paste() inserts a space and produced
# "t_0_05_SVM .csv".
fileName <- paste0(experimentName, ".csv")

################################################################################################################################

# file.path() instead of paste(): paste() joined the directory and the file
# name with a space instead of a path separator.
# NOTE(review): this assumes "Experiment_output" is meant to be a directory -
# confirm. It is created if it does not exist yet.
outputDirectory <- "C:\\Users\\radeshpa\\Desktop\\BioInformatics\\Assignment3\\Experiment_output"
dir.create(outputDirectory, showWarnings = FALSE, recursive = TRUE)
experimentFile <- file.path(outputDirectory, fileName)

# Experiment settings (loop invariants).
outerCount <- 3     # repetitions of the whole cross validation
foldCount <- 10     # folds per repetition
threshold <- 0.05   # genes with p-value below this are kept

# Per-gene t-test p-values on the training rows.
#
# trainingX: data frame of training rows (one column per gene).
# trainingY: outcome for the same rows, in the same order (1 = present,
#            0 = absent).
# Returns a numeric vector of p-values named by gene.
#
# The two groups are derived from trainingY, so the row positions always
# refer to trainingX. (Positions computed on the full data set do not line up
# with a training subset.)
# If either group has fewer than two rows, t.test() cannot be computed; every
# gene then gets p-value 0, i.e. all genes are kept, as the original code
# intended for its degenerate cases.
computeGenePValues <- function(trainingX, trainingY) {
    presentRows <- which(trainingY == 1)
    absentRows <- which(trainingY == 0)

    if (length(presentRows) < 2 || length(absentRows) < 2) {
        return(setNames(rep(0, ncol(trainingX)), names(trainingX)))
    }

    vapply(names(trainingX), function(geneName) {
        geneData <- trainingX[[geneName]]
        t.test(geneData[presentRows], geneData[absentRows])$p.value
    }, numeric(1))
}

# For every side effect (the output/outcome), perform 10 fold cross validation.
# Also perform feature selection and testing at every fold.

cat("Count of side effects is: ", length(sideEffectNames), "\n")

for (sideEffectName in sideEffectNames) {
    cat("Working on the side effect: ", sideEffectName, "\n")
    currentSideEffect <- sideEffectData[[sideEffectName]]

    accuracyOverFoldsTotal <- 0

    for (j in seq_len(outerCount)) {

################################################  DATA  #######################################################################

        # Pass the outcome as a factor so that createFolds stratifies by
        # class. Given a numeric vector it bins the values by quantiles
        # instead, which for a 0/1 outcome can put both classes in one bin.
        folds <- createFolds(factor(currentSideEffect), k = foldCount, returnTrain = FALSE)

        accuracyOverFoldsSum <- 0

        for (k in seq_along(folds)) {

            # Held-out rows for this fold; all other folds form the training set.
            testIndices <- folds[[k]]
            trainIndices <- unlist(folds[-k], use.names = FALSE)

            # drop = FALSE keeps these as data frames whatever their size.
            testingDataX <- myData[testIndices, , drop = FALSE]
            testingDataY <- currentSideEffect[testIndices]
            trainingDataX <- myData[trainIndices, , drop = FALSE]
            trainingDataY <- currentSideEffect[trainIndices]

##################################################### FEATURE SELECTION : t- statistic ###########################################################

            allpValues <- computeGenePValues(trainingDataX, trainingDataY)
            # which() drops NA/NaN p-values, so such genes are not selected.
            featuresOfInterest <- names(allpValues)[which(allpValues < threshold)]

########################################## SVM #######################################################################################

            if (length(featuresOfInterest) == 0) {
                # No gene passed the threshold, so there is nothing to fit an
                # SVM on. Predict the majority class of the training rows
                # instead of failing in the middle of a long run.
                majorityClass <- as.numeric(mean(trainingDataY) >= 0.5)
                predictionResults <- rep(majorityClass, length(testingDataY))
            } else {
                # The same selected genes are used for training and testing.
                # drop = FALSE keeps a single selected gene as a one-column
                # data frame rather than a bare vector.
                refinedTrainingDataX <- trainingDataX[, featuresOfInterest, drop = FALSE]
                refinedTestingDataX <- testingDataX[, featuresOfInterest, drop = FALSE]

                # trainingDataY is numeric, so the model output is a number;
                # rounding thresholds it at 0.5 to obtain a 0/1 prediction.
                svmModel <- svm(refinedTrainingDataX, trainingDataY)
                predictionResults <- round(predict(svmModel, refinedTestingDataX))
            }

            # A prediction counts as correct when it equals the label, or when
            # both are NA. na.rm = TRUE stops a single NA comparison from
            # turning the whole sum into NA.
            matchCount <- sum(predictionResults == testingDataY, na.rm = TRUE)
            bothMissingCount <- sum(is.na(predictionResults) & is.na(testingDataY))
            accuracy <- (matchCount + bothMissingCount) / length(predictionResults)
            accuracyOverFoldsSum <- accuracyOverFoldsSum + accuracy

#######################################################################################################################

        }

        # Mean accuracy over the folds of this repetition.
        accuracyOverFolds <- accuracyOverFoldsSum / length(folds)
        accuracyOverFoldsTotal <- accuracyOverFoldsTotal + accuracyOverFolds
    }

    # Mean accuracy over all repetitions.
    finalAccuracy <- accuracyOverFoldsTotal / outerCount

    # Append immediately so that finished side effects are saved even if the
    # run is interrupted. Because of append = TRUE, re-running the experiment
    # adds new lines to an existing file rather than replacing it.
    write.table(data.frame(a = sideEffectName, b = finalAccuracy), experimentFile, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE)
}

end.time <- Sys.time()
# format() keeps the unit chosen by difftime (secs, mins, ...); printing the
# raw difference through cat() would show a bare number with no unit.
cat("took time: ", format(end.time - start.time), "\n")

Count of side effects is:  232 
Working on the side effect:  Abdominal.hernias.and.other.abdominal.wall.conditions 
Working on the side effect:  Abortions.and.stillbirth 
Working on the side effect:  Acid.base.disorders 
Working on the side effect:  Administration.site.reactions 
Working on the side effect:  Adrenal.gland.disorders 
Working on the side effect:  Age.related.factors 
Working on the side effect:  Allergic.conditions 
Working on the side effect:  Anaemias.nonhaemolytic.and.marrow.depression 
Working on the side effect:  Anal.and.rectal.conditions.NEC 
Working on the side effect:  Ancillary.infectious.topics 
Working on the side effect:  Aneurysms.and.artery.dissections 
Working on the side effect:  Angioedema.and.urticaria 
Working on the side effect:  Anterior.eye.structural.change..deposit.and.degeneration 
Working on the side effect:  Anxiety.disorders.and.symptoms 
Working on the side effect:  Appetite.and.general.nutritional.disorders 
Working on the side effect:  Art

took time:  35.71233