version 0.1.0

cran · Jan 31, 2020 · 50dc0a6 · 50dc0a6
commit 50dc0a6
Show file tree

Hide file tree

Showing 13 changed files with 576 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,16 @@
+Package: IGST
+Type: Package
+Title: Informative Gene Selection Tool
+Version: 0.1.0
+Author: Nitesh Kumar Sharma, Dwijesh Chandra Mishra, Neeraj Budhlakoti and Md. Samir Farooqi
+Maintainer: Nitesh Kumar Sharma <sharmanitesh.iasri@gmail.com>
+Description: Mining informative genes with certain biological meanings is important for clinical diagnosis of disease and discovery of disease mechanisms in plants and animals. This process involves identification of relevant genes and removal of redundant genes as much as possible from a whole gene set. This package selects the informative genes related to a specific trait using a gene expression dataset. These trait-specific genes are considered as informative genes. This package returns the informative gene set from high-dimensional gene expression data using a combination of the SVM and MRMR methods (for feature selection) with a bootstrapping procedure.
+Depends: R (>= 3.5)
+Imports: e1071, BootMRMR
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+NeedsCompilation: no
+Packaged: 2020-01-21 10:19:01 UTC; Nitesh-PC
+Repository: CRAN
+Date/Publication: 2020-01-31 16:00:06 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,12 @@
+d26c3f949d3f5ccac0b0fd254c6d364c *DESCRIPTION
+015891f2d765b557316761f08fa3a009 *NAMESPACE
+aa99b1857cd75fb3ef05c5bc4210fed9 *R/IGST.bootmrmrsvm.pval.cutoff.R
+cb009fa34bb2b7f9e0e2b7049e4bd567 *R/IGST.bootmrmrsvm.weight.cutoff.R
+0fc6cc84658cee09c4a6f47777b737c7 *R/IGST.pval.bootmrmrsvm.R
+4b6a2b8c81b83a5ba3a2f97aa1e0b4c4 *R/IGST.weight.bootmrmrsvm.R
+0e85bb93ea49f3120097f34d2140f192 *data/rice_cold.RData
+ca71f92cb5014e547d6301a96ad23db1 *man/IGST.bootmrmrsvm.pval.cutoff.Rd
+70dc9368346ed32f9b6a3a508372d4ab *man/IGST.bootmrmrsvm.weight.cutoff.Rd
+f2e3022332700f01bd428559940fc646 *man/IGST.pval.bootmrmrsvm.Rd
+890e3a3568777ce85dd7697529da377f *man/IGST.weight.bootmrmrsvm.Rd
+39ed6bf4ce380848969cf6e1e6fcbd75 *man/rice_cold.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+exportPattern("^[[:alpha:]]+")
+importFrom("stats", "cor", "pnorm", "var")
+import(e1071)
+import(BootMRMR)
+
+
+
+
diff --git a/R/IGST.bootmrmrsvm.pval.cutoff.R b/R/IGST.bootmrmrsvm.pval.cutoff.R
@@ -0,0 +1,101 @@
+###########################################
+requireNamespace("e1071")
+requireNamespace("BootMRMR")
+
+# Select the `s` most informative genes by bootstrap MRMR-SVM p-values.
+#
+# For each of `re` bootstrap resamples of the columns of `x`, every gene
+# (row of `x`) receives the score
+#     v * |linear SVM weight| + (1 - v) * (relevance / redundancy)
+# where relevance is an F-type between-class statistic and redundancy is the
+# mean absolute Pearson correlation of the gene with all other genes. Scores
+# are converted to rank scores in (0, 1] (best gene = 1). Across resamples a
+# one-sided Wilcoxon signed-rank statistic (normal approximation) is computed
+# per gene for "rank score > Q", and the `s` genes with the smallest p-values
+# are returned.
+#
+# Arguments:
+#   x   data frame of expression values; genes in rows (row names are the
+#       gene names), samples in columns.
+#   y   numeric vector of class labels coded 1 / -1, one per column of `x`.
+#   s   number of genes to return; must not exceed nrow(x).
+#   Q   reference value in [0, 1] the rank scores are tested against;
+#       0.5 is used when the argument is not supplied.
+#   v   trade-off in [0, 1] between the SVM weight (v) and the MRMR
+#       ratio (1 - v).
+#   re  number of bootstrap resamples (at least 1; a warning is issued for
+#       values of 50 or less).
+#
+# Returns: character vector of the selected gene names, ordered by increasing
+# p-value, with class "Informative geneset".
+#
+# Invalid types/ranges only raise warnings (as before); `s > nrow(x)` and a
+# non-positive `re` are errors.
+IGST.bootmrmrsvm.pval.cutoff <- function(x, y, s, Q, v, re) {
+  stopifnot(!is.null(x), !is.null(y))
+
+  # inherits() always yields a single logical; `class(x) == "data.frame"`
+  # has length two for a matrix and then fails inside if() on recent R.
+  if (!inherits(x, "data.frame")) {
+    warning("x must be a data frame and rows as gene names")
+  }
+  if (!is.numeric(y)) {
+    warning("y must be a vector of 1/-1's for two class problems")
+  }
+  if (length(y) != ncol(x)) {
+    warning("Number of samples in x must have same number of sample labels in y")
+  }
+  # Resolve the default before Q is read; reading a missing Q is an error.
+  if (missing(Q)) {
+    Q <- 0.5
+  }
+  # `||` instead of `&`: a value cannot be both below 0 and above 1, so the
+  # original guards could never fire.
+  if (Q < 0 || Q > 1) {
+    warning("Q is the quartile value of rank scores and must be within 0 and 1")
+  }
+  if (v < 0 || v > 1) {
+    warning("v is the tradeoff value between svm and mrmr and must be within 0 and 1 ")
+  }
+  if (!is.numeric(re) || length(re) != 1 || is.na(re) || re < 1) {
+    stop("re must be a single number of bootstrap samples that is at least 1")
+  }
+  if (re <= 50) {
+    warning("re must be numeric and sufficiently large")
+  }
+  if (s > nrow(x)) {
+    stop("Number of informative genes to be selected must be less than total number of genes")
+  }
+
+  labels <- as.numeric(y)
+  genes <- rownames(x)
+  expr <- as.matrix(x)
+  n_genes <- nrow(expr)
+  n_samples <- ncol(expr)
+
+  # One column of rank scores per bootstrap resample.
+  rank_scores <- matrix(0, n_genes, re)
+  for (j in seq_len(re)) {
+    samp <- sample(n_samples, n_samples, replace = TRUE)
+    xb <- expr[, samp, drop = FALSE]
+    yb <- labels[samp]
+
+    # Redundancy: mean absolute correlation with the other genes (the
+    # diagonal of ones is removed before summing).
+    gene_cor <- cor(t(xb), method = "pearson", use = "pairwise.complete.obs")
+    redundancy <- as.vector(rowSums(abs(gene_cor - diag(n_genes)))) /
+      (n_genes - 1)
+
+    # Relevance: squared distance of each class mean from the overall mean,
+    # divided by the sum of the within-class variances.
+    pos <- which(yb == 1)
+    neg <- which(yb == -1)
+    relevance <- vapply(seq_len(n_genes), function(i) {
+      gi <- xb[i, ]
+      overall <- mean(gi)
+      ((mean(gi[pos]) - overall)^2 + (mean(gi[neg]) - overall)^2) /
+        (var(gi[pos]) + var(gi[neg]))
+    }, numeric(1))
+
+    fit <- svm(t(xb), as.matrix(yb), cost = 10,
+               cachesize = 500, scale = FALSE, type = "C-classification",
+               kernel = "linear")
+    # Weight vector of the linear SVM, one entry per gene.
+    svm_weight <- abs(as.vector(t(fit$coefs) %*% fit$SV))
+
+    criteria <- v * svm_weight + (1 - v) * (abs(relevance) / redundancy)
+
+    # Negate so the highest criterion is sorted first and gets rank 1 (rank
+    # score 1), as the other IGST functions do. The original sorted
+    # ascending here, which handed the best rank scores to the worst genes.
+    gene_order <- sort(-criteria, index.return = TRUE)$ix
+    # Inverting the ordering permutation gives the rank of every gene.
+    gene_rank <- sort(gene_order, index.return = TRUE)$ix
+    rank_scores[, j] <- as.vector((n_genes + 1 - gene_rank) / n_genes)
+  }
+
+  # Wilcoxon signed-rank statistic per gene on (rank score - Q), with the
+  # usual normal approximation; zero differences are dropped.
+  deviations <- rank_scores - Q
+  pvalue <- vapply(seq_len(n_genes), function(i) {
+    z <- deviations[i, ]
+    z <- z[z != 0]
+    n <- length(z)
+    t_plus <- sum(rank(abs(z))[z > 0])
+    t_mean <- n * (n + 1) / 4
+    t_var <- n * (n + 1) * (2 * n + 1) / 24
+    pnorm(t_plus, t_mean, sqrt(t_var), lower.tail = FALSE)
+  }, numeric(1))
+
+  # Smallest p-values first; seq_len() keeps s = 0 from selecting a gene.
+  top <- sort(pvalue, index.return = TRUE)$ix[seq_len(s)]
+  select.gene <- genes[top]
+  class(select.gene) <- "Informative geneset"
+  select.gene
+}
diff --git a/R/IGST.bootmrmrsvm.weight.cutoff.R b/R/IGST.bootmrmrsvm.weight.cutoff.R
@@ -0,0 +1,79 @@
+################################################
+requireNamespace("e1071")
+requireNamespace("BootMRMR")
+
+# Select the `s` most informative genes by summed bootstrap rank score.
+#
+# For each of `re` bootstrap resamples of the columns of `x`, every gene
+# (row of `x`) receives the score
+#     v * |linear SVM weight| + (1 - v) * (relevance / redundancy)
+# where relevance is an F-type between-class statistic and redundancy is the
+# mean absolute Pearson correlation of the gene with all other genes. Scores
+# are converted to rank scores in (0, 1] (best gene = 1); the weight of a
+# gene is the sum of its rank scores over all resamples, and the `s` genes
+# with the largest weights are returned.
+#
+# Arguments:
+#   x   data frame of expression values; genes in rows (row names are the
+#       gene names), samples in columns.
+#   y   numeric vector of class labels coded 1 / -1, one per column of `x`.
+#   s   number of genes to return; must not exceed nrow(x).
+#   v   trade-off in [0, 1] between the SVM weight (v) and the MRMR
+#       ratio (1 - v).
+#   re  number of bootstrap resamples (at least 1; a warning is issued for
+#       values of 50 or less).
+#
+# Returns: character vector of the selected gene names, ordered by decreasing
+# weight, with class "Informative geneset".
+#
+# Invalid types/ranges only raise warnings (as before); `s > nrow(x)` and a
+# non-positive `re` are errors.
+IGST.bootmrmrsvm.weight.cutoff <- function(x, y, s, v, re) {
+  stopifnot(!is.null(x), !is.null(y))
+
+  # inherits() always yields a single logical; `class(x) == "data.frame"`
+  # has length two for a matrix and then fails inside if() on recent R.
+  if (!inherits(x, "data.frame")) {
+    warning("x must be a data frame and rows as gene names")
+  }
+  if (!is.numeric(y)) {
+    warning("y must be a vector of 1/-1's for two class problems")
+  }
+  if (length(y) != ncol(x)) {
+    warning("Number of samples in x must have same number of sample labels in y")
+  }
+  # `||` instead of `&`: a value cannot be both below 0 and above 1, so the
+  # original guard could never fire.
+  if (v < 0 || v > 1) {
+    warning("v is the tradeoff value between svm and mrmr and must be within 0 and 1 ")
+  }
+  if (!is.numeric(re) || length(re) != 1 || is.na(re) || re < 1) {
+    stop("re must be a single number of bootstrap samples that is at least 1")
+  }
+  if (re <= 50) {
+    warning("re must be numeric and sufficiently large")
+  }
+  if (s > nrow(x)) {
+    stop("Number of informative genes to be selected must be less than total number of genes")
+  }
+
+  labels <- as.numeric(y)
+  genes <- rownames(x)
+  expr <- as.matrix(x)
+  n_genes <- nrow(expr)
+  n_samples <- ncol(expr)
+
+  # One column of rank scores per bootstrap resample.
+  rank_scores <- matrix(0, n_genes, re)
+  for (j in seq_len(re)) {
+    samp <- sample(n_samples, n_samples, replace = TRUE)
+    xb <- expr[, samp, drop = FALSE]
+    yb <- labels[samp]
+
+    # Redundancy: mean absolute correlation with the other genes (the
+    # diagonal of ones is removed before summing).
+    gene_cor <- cor(t(xb), method = "pearson", use = "pairwise.complete.obs")
+    redundancy <- as.vector(rowSums(abs(gene_cor - diag(n_genes)))) /
+      (n_genes - 1)
+
+    # Relevance: squared distance of each class mean from the overall mean,
+    # divided by the sum of the within-class variances.
+    pos <- which(yb == 1)
+    neg <- which(yb == -1)
+    relevance <- vapply(seq_len(n_genes), function(i) {
+      gi <- xb[i, ]
+      overall <- mean(gi)
+      ((mean(gi[pos]) - overall)^2 + (mean(gi[neg]) - overall)^2) /
+        (var(gi[pos]) + var(gi[neg]))
+    }, numeric(1))
+
+    fit <- svm(t(xb), as.matrix(yb), cost = 10,
+               cachesize = 500, scale = FALSE, type = "C-classification",
+               kernel = "linear")
+    # Weight vector of the linear SVM, one entry per gene.
+    svm_weight <- abs(as.vector(t(fit$coefs) %*% fit$SV))
+
+    criteria <- v * svm_weight + (1 - v) * (abs(relevance) / redundancy)
+
+    # Highest criterion first; inverting the ordering permutation then gives
+    # the rank of every gene (1 = best).
+    gene_order <- sort(-criteria, index.return = TRUE)$ix
+    gene_rank <- sort(gene_order, index.return = TRUE)$ix
+    rank_scores[, j] <- as.vector((n_genes + 1 - gene_rank) / n_genes)
+  }
+
+  # Aggregate weight of a gene: sum of its rank scores over all resamples.
+  weight <- as.vector(rowSums(rank_scores))
+
+  # Largest weights first; seq_len() keeps s = 0 from selecting a gene.
+  top <- sort(-weight, index.return = TRUE)$ix[seq_len(s)]
+  select.gene <- genes[top]
+  class(select.gene) <- "Informative geneset"
+  select.gene
+}
diff --git a/R/IGST.pval.bootmrmrsvm.R b/R/IGST.pval.bootmrmrsvm.R
@@ -0,0 +1,85 @@
+###########################################################
+requireNamespace("e1071")
+requireNamespace("BootMRMR")
+
+# Compute a bootstrap MRMR-SVM p-value for every gene.
+#
+# For each of `re` bootstrap resamples of the columns of `x`, every gene
+# (row of `x`) receives the score
+#     v * |linear SVM weight| + (1 - v) * (relevance / redundancy)
+# where relevance is an F-type between-class statistic and redundancy is the
+# mean absolute Pearson correlation of the gene with all other genes. Scores
+# are converted to rank scores in (0, 1] (best gene = 1). Across resamples a
+# one-sided Wilcoxon signed-rank statistic (normal approximation) is computed
+# per gene for "rank score > Q".
+#
+# Arguments:
+#   x   data frame of expression values; genes in rows, samples in columns.
+#   y   numeric vector of class labels coded 1 / -1, one per column of `x`.
+#   re  number of bootstrap resamples (at least 1; a warning is issued for
+#       values of 50 or less).
+#   Q   reference value in [0, 1] the rank scores are tested against;
+#       0.5 is used when the argument is not supplied.
+#   v   trade-off in [0, 1] between the SVM weight (v) and the MRMR
+#       ratio (1 - v).
+#
+# Returns: numeric vector of p-values, one per row of `x` in row order, with
+# class "p values".
+#
+# Invalid types/ranges only raise warnings (as before); a non-positive `re`
+# is an error.
+IGST.pval.bootmrmrsvm <- function(x, y, re, Q, v) {
+  stopifnot(!is.null(x), !is.null(y))
+
+  # inherits() always yields a single logical; `class(x) == "data.frame"`
+  # has length two for a matrix and then fails inside if() on recent R.
+  if (!inherits(x, "data.frame")) {
+    warning("x must be a data frame and rows as gene names")
+  }
+  if (!is.numeric(y)) {
+    warning("y must be a vector of 1/-1's for two class problems")
+  }
+  if (length(y) != ncol(x)) {
+    warning("Number of samples in x must have same number of sample labels in y")
+  }
+  # Resolve the default before Q is read; reading a missing Q is an error.
+  if (missing(Q)) {
+    Q <- 0.5
+  }
+  # `||` instead of `&`: a value cannot be both below 0 and above 1, so the
+  # original guards could never fire.
+  if (Q < 0 || Q > 1) {
+    warning("Q is the quartile value of rank scores and must be within 0 and 1")
+  }
+  if (v < 0 || v > 1) {
+    warning("v is the tradeoff value between svm and mrmr and must be within 0 and 1 ")
+  }
+  if (!is.numeric(re) || length(re) != 1 || is.na(re) || re < 1) {
+    stop("re must be a single number of bootstrap samples that is at least 1")
+  }
+  if (re <= 50) {
+    warning("re must be numeric and sufficiently large")
+  }
+
+  labels <- as.numeric(y)
+  expr <- as.matrix(x)
+  n_genes <- nrow(expr)
+  n_samples <- ncol(expr)
+
+  # One column of rank scores per bootstrap resample.
+  rank_scores <- matrix(0, n_genes, re)
+  for (j in seq_len(re)) {
+    samp <- sample(n_samples, n_samples, replace = TRUE)
+    xb <- expr[, samp, drop = FALSE]
+    yb <- labels[samp]
+
+    # Redundancy: mean absolute correlation with the other genes (the
+    # diagonal of ones is removed before summing).
+    gene_cor <- cor(t(xb), method = "pearson", use = "pairwise.complete.obs")
+    redundancy <- as.vector(rowSums(abs(gene_cor - diag(n_genes)))) /
+      (n_genes - 1)
+
+    # Relevance: squared distance of each class mean from the overall mean,
+    # divided by the sum of the within-class variances.
+    pos <- which(yb == 1)
+    neg <- which(yb == -1)
+    relevance <- vapply(seq_len(n_genes), function(i) {
+      gi <- xb[i, ]
+      overall <- mean(gi)
+      ((mean(gi[pos]) - overall)^2 + (mean(gi[neg]) - overall)^2) /
+        (var(gi[pos]) + var(gi[neg]))
+    }, numeric(1))
+
+    fit <- svm(t(xb), as.matrix(yb), cost = 10,
+               cachesize = 500, scale = FALSE, type = "C-classification",
+               kernel = "linear")
+    # Weight vector of the linear SVM, one entry per gene.
+    svm_weight <- abs(as.vector(t(fit$coefs) %*% fit$SV))
+
+    criteria <- v * svm_weight + (1 - v) * (abs(relevance) / redundancy)
+
+    # Highest criterion first; inverting the ordering permutation then gives
+    # the rank of every gene (1 = best).
+    gene_order <- sort(-criteria, index.return = TRUE)$ix
+    gene_rank <- sort(gene_order, index.return = TRUE)$ix
+    rank_scores[, j] <- as.vector((n_genes + 1 - gene_rank) / n_genes)
+  }
+
+  # Wilcoxon signed-rank statistic per gene on (rank score - Q), with the
+  # usual normal approximation; zero differences are dropped.
+  deviations <- rank_scores - Q
+  pval.vec <- vapply(seq_len(n_genes), function(i) {
+    z <- deviations[i, ]
+    z <- z[z != 0]
+    n <- length(z)
+    t_plus <- sum(rank(abs(z))[z > 0])
+    t_mean <- n * (n + 1) / 4
+    t_var <- n * (n + 1) * (2 * n + 1) / 24
+    pnorm(t_plus, t_mean, sqrt(t_var), lower.tail = FALSE)
+  }, numeric(1))
+
+  class(pval.vec) <- "p values"
+  pval.vec
+}
diff --git a/R/IGST.weight.bootmrmrsvm.R b/R/IGST.weight.bootmrmrsvm.R
@@ -0,0 +1,66 @@
+#################################################################
+requireNamespace("e1071")
+requireNamespace("BootMRMR")
+
+# Compute a bootstrap MRMR-SVM weight for every gene.
+#
+# For each of `re` bootstrap resamples of the columns of `x`, every gene
+# (row of `x`) receives the score
+#     v * |linear SVM weight| + (1 - v) * (relevance / redundancy)
+# where relevance is an F-type between-class statistic and redundancy is the
+# mean absolute Pearson correlation of the gene with all other genes. Scores
+# are converted to rank scores in (0, 1] (best gene = 1); the weight of a
+# gene is the sum of its rank scores over all resamples.
+#
+# Arguments:
+#   x   data frame of expression values; genes in rows, samples in columns.
+#   y   numeric vector of class labels coded 1 / -1, one per column of `x`.
+#   re  number of bootstrap resamples (at least 1; a warning is issued for
+#       values of 50 or less).
+#   v   trade-off in [0, 1] between the SVM weight (v) and the MRMR
+#       ratio (1 - v).
+#
+# Returns: numeric vector of weights, one per row of `x` in row order, with
+# class "Weight values". Larger means more informative.
+#
+# NOTE(review): the previous implementation filled the rank-score matrix but
+# never used it, and returned the raw score of the LAST resample only. The
+# aggregate below matches the weight IGST.bootmrmrsvm.weight.cutoff ranks
+# genes by; confirm this is the intended definition.
+#
+# Invalid types/ranges only raise warnings (as before); a non-positive `re`
+# is an error.
+IGST.weight.bootmrmrsvm <- function(x, y, re, v) {
+  stopifnot(!is.null(x), !is.null(y))
+
+  # inherits() always yields a single logical; `class(x) == "data.frame"`
+  # has length two for a matrix and then fails inside if() on recent R.
+  if (!inherits(x, "data.frame")) {
+    warning("x must be a data frame and rows as gene names")
+  }
+  if (!is.numeric(y)) {
+    warning("y must be a vector of 1/-1's for two class problems")
+  }
+  if (length(y) != ncol(x)) {
+    warning("Number of samples in x must have same number of sample labels in y")
+  }
+  # `||` instead of `&`: a value cannot be both below 0 and above 1, so the
+  # original guard could never fire.
+  if (v < 0 || v > 1) {
+    warning("v is the tradeoff value between svm and mrmr and must be within 0 and 1 ")
+  }
+  if (!is.numeric(re) || length(re) != 1 || is.na(re) || re < 1) {
+    stop("re must be a single number of bootstrap samples that is at least 1")
+  }
+  if (re <= 50) {
+    warning("re must be numeric and sufficiently large")
+  }
+
+  labels <- as.numeric(y)
+  expr <- as.matrix(x)
+  n_genes <- nrow(expr)
+  n_samples <- ncol(expr)
+
+  # One column of rank scores per bootstrap resample.
+  rank_scores <- matrix(0, n_genes, re)
+  for (j in seq_len(re)) {
+    samp <- sample(n_samples, n_samples, replace = TRUE)
+    xb <- expr[, samp, drop = FALSE]
+    yb <- labels[samp]
+
+    # Redundancy: mean absolute correlation with the other genes (the
+    # diagonal of ones is removed before summing).
+    gene_cor <- cor(t(xb), method = "pearson", use = "pairwise.complete.obs")
+    redundancy <- as.vector(rowSums(abs(gene_cor - diag(n_genes)))) /
+      (n_genes - 1)
+
+    # Relevance: squared distance of each class mean from the overall mean,
+    # divided by the sum of the within-class variances.
+    pos <- which(yb == 1)
+    neg <- which(yb == -1)
+    relevance <- vapply(seq_len(n_genes), function(i) {
+      gi <- xb[i, ]
+      overall <- mean(gi)
+      ((mean(gi[pos]) - overall)^2 + (mean(gi[neg]) - overall)^2) /
+        (var(gi[pos]) + var(gi[neg]))
+    }, numeric(1))
+
+    fit <- svm(t(xb), as.matrix(yb), cost = 10,
+               cachesize = 500, scale = FALSE, type = "C-classification",
+               kernel = "linear")
+    # Weight vector of the linear SVM, one entry per gene.
+    svm_weight <- abs(as.vector(t(fit$coefs) %*% fit$SV))
+
+    criteria <- v * svm_weight + (1 - v) * (abs(relevance) / redundancy)
+
+    # Highest criterion first; inverting the ordering permutation then gives
+    # the rank of every gene (1 = best).
+    gene_order <- sort(-criteria, index.return = TRUE)$ix
+    gene_rank <- sort(gene_order, index.return = TRUE)$ix
+    rank_scores[, j] <- as.vector((n_genes + 1 - gene_rank) / n_genes)
+  }
+
+  # Aggregate over all resamples instead of keeping only the last one.
+  Weight <- as.vector(rowSums(rank_scores))
+  class(Weight) <- "Weight values"
+  Weight
+}
diff --git a/data/rice_cold.RData b/data/rice_cold.RData