version 0.1.0

cran · Dec 15, 2023 · af77bc5 · af77bc5
commit af77bc5
Show file tree

Hide file tree

Showing 62 changed files with 5,233 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,33 @@
+Package: gwid
+Type: Package
+Title: Genome-Wide Identity-by-Descent
+Version: 0.1.0
+Authors@R: c(
+    person("Soroush", "Mahmoudiandehkordi", email = "soroushmdg@gmail.com", role = c("aut", "cre")),
+    person("Mehdi", "Maadooliat", email = "mehdi.maadooliat@mu.edu", role = "aut"),
+    person("Steven", "Schrodi", email = "schrodi@wisc.edu", role = "aut")
+    )
+Maintainer: Soroush Mahmoudiandehkordi <soroushmdg@gmail.com>
+Description: Methods and tools for the analysis of Genome Wide 
+    Identity-by-Descent ('gwid') mapping data, focusing on testing whether there 
+    is a higher occurrence of Identity-By-Descent (IBD) segments around potential causal variants 
+    in cases compared to controls, which is crucial for identifying rare 
+    variants. To enhance its analytical power, 'gwid' incorporates a Sliding 
+    Window Approach, allowing for the detection and analysis of signals from 
+    multiple Single Nucleotide Polymorphisms (SNPs).
+License: MIT + file LICENSE
+Encoding: UTF-8
+Imports: data.table, gdsfmt, SNPRelate, Matrix, ggplot2, plotly, utils,
+        stats, RcppRoll, methods, piggyback
+RoxygenNote: 7.2.3
+Suggests: knitr, magrittr, rmarkdown, testthat (>= 3.0.0)
+Config/testthat/edition: 3
+URL: https://github.com/soroushmdg/gwid
+BugReports: https://github.com/soroushmdg/gwid/issues
+NeedsCompilation: no
+Packaged: 2023-12-14 14:00:32 UTC; soroush
+Author: Soroush Mahmoudiandehkordi [aut, cre],
+  Mehdi Maadooliat [aut],
+  Steven Schrodi [aut]
+Repository: CRAN
+Date/Publication: 2023-12-14 16:40:08 UTC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2023
+COPYRIGHT HOLDER: gwid authors
diff --git a/MD5 b/MD5
@@ -0,0 +1,61 @@
+2339c05f9e3c435f07321a4e762e8b52 *DESCRIPTION
+a4f8c4ebcedf71886abfb0f38d00582f *LICENSE
+7b30899cf199107d3f1fc47561c08705 *NAMESPACE
+2c3b2daa370d785e4f6dbde49476479b *NEWS.md
+6332b6ecfc4e7cc19c5da8fb2b1341d0 *R/build.R
+64c942e25f6d91d81669aafd0061facb *R/case_control.R
+740150a99e1b2efd66111d00740f5af6 *R/extract.R
+9da2728f4963e16ec0c11d243f9e1816 *R/haplotype_structure.R
+db170888f9cc8c1297063a10ceea3fa7 *R/hypothesis_test.R
+c60fdb80ce7bf582b26759d7f4f307f2 *R/plot.R
+f3b775e7d4b6e1c14ee69b5bf119447f *R/roh.R
+7bafd82be04ae30ae5799949be0622f7 *R/utils.R
+2240665d0af2cff75a432b3e9bc3a456 *README.md
+d4bbb420e24462d9319a33b5531c5c4c *man/build_gwas.Rd
+ee5e433c3343b0b232be4c386ca465fd *man/build_gwid.Rd
+5dc565005a8dd793ec3b21804d1e428b *man/build_phase.Rd
+c73a0350e14bec0bcc607e44b1b4d6bf *man/case_control.Rd
+8cd64096bdf69a922f395522623aa7c3 *man/extract.Rd
+746795c025f2ef6bc9621b35f57dec8c *man/extract.gwas.Rd
+628e46a65fb21880f88e68ee5d25ac28 *man/extract.gwid.Rd
+8c86fbe77bf43f0edd9232a55233a69b *man/extract_window.Rd
+73fd0f0127b04dab671e23d7be1e3e41 *man/extract_window.gwid.Rd
+6c0af988b8bea1fd105fc12cec041a5f *man/figures/README-example-1.png
+732e993b474ae19fd0b7ce2ef1d90ef0 *man/figures/README-example-2.png
+32cb276b6dcaf4f2afe9e06868a42cfa *man/figures/README-pressure-1.png
+195b676278ac65fd468b45b0b52c0f98 *man/figures/README-unnamed-chunk-4-1.png
+508a19cb0d595555d3c618c4601ce586 *man/figures/README-unnamed-chunk-4-2.png
+802018f087159e0bba40cf7a3f7aa562 *man/figures/README-unnamed-chunk-5-1.png
+3f5983f80814997f046474978b4ef880 *man/figures/final-copy-arrow.png
+a4b18d3f03e016f5d59a57ac951d62f2 *man/figures/hap_version1.png
+80ac1f8a4539bba99891dc76bad27be6 *man/fisher_test.Rd
+2170249144341d373f5973a793c09a35 *man/fisher_test.gwas.Rd
+1f687f8710bb3a7874dedb77da38f9ff *man/fisher_test.gwid.Rd
+0cb0bd11b4219d9b93e9c7d3bf759836 *man/fisher_test.result_snps.Rd
+e1a9c01cb8ae9f13ba1b79889aa9eb0e *man/gtest.Rd
+48aaa27ffa2d3a1e584d506434ada59f *man/gtest.haplotype_structure.Rd
+bf58970bbf0957697a48eccdd128bace *man/haplotype_frequency.Rd
+ad7ceda200a69d0a9a4a80bfcab648ac *man/haplotype_frequency.haplotype_structure.Rd
+5c3dea5a88221fccc41b522535bac879 *man/haplotype_structure.Rd
+6712c553404cc28c0994c1f9d79277b0 *man/haplotype_structure.gwas.Rd
+08ae515453217c6ead1ac9fb54934eed *man/haplotype_structure.gwid.Rd
+ed4175daae00ef492c3dd2b3b0deee91 *man/mcnemar_test.Rd
+d1d21173d35a9462d5348f41dd5117b5 *man/mcnemar_test.result_snps.Rd
+3c6d51be7b3122bbbce39b41d73b27f6 *man/mcnemar_test_permut.Rd
+7895d66099567bf491a0fa33051d047b *man/mcnemar_test_permut.result_snps.Rd
+6111173128a04fc7897e991267425de7 *man/permutation_test.Rd
+5d4652c85313b816dbfcc857dbe41e7a *man/permutation_test.gwas.Rd
+6c481fb77f8c389ecf2a18577f133828 *man/permutation_test.gwid.Rd
+9bb38a92a315d39c6c8be63c5f1f6955 *man/permutation_test.haplotype_structure.Rd
+133b060317998c060c07c3c285942670 *man/plot.gwas.Rd
+81ab70e931c0c38d2b4e50d406c1722c *man/plot.gwid.Rd
+1f4f27db65a8adbee93d4517ed3bfe64 *man/plot.haplotype_frequency.Rd
+129fbbfce42dc62d88638aa6dbab9209 *man/plot.haplotype_structure_frequency.Rd
+7d1cf05ccfed954b4bfe07c94d4f0dd4 *man/plot.result_snps.Rd
+d6c8dfb4a82c4bb88e90acbd49454229 *man/plot.test_snps.Rd
+415498f77efa020ce443c0a18f86db30 *man/print.Rd
+220c303504c370aaaa55efd2469aea31 *man/print.gwas.Rd
+c39de9a46e35312c0bb7c2f19ef2e487 *man/roh.Rd
+57ac3a33d8c7c9984cf155bd6a303742 *man/roh.phase.Rd
+5ca6eeeded4ff58448c72bb37d092b89 *man/subset.Rd
+a688861e104fd57dd6e6d4edaff1f11d *man/subset.gwid.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,48 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(extract,gwas)
+S3method(extract,gwid)
+S3method(extract_window,gwid)
+S3method(fisher_test,gwas)
+S3method(fisher_test,gwid)
+S3method(fisher_test,result_snps)
+S3method(gtest,haplotype_structure)
+S3method(haplotype_frequency,haplotype_structure)
+S3method(haplotype_structure,gwas)
+S3method(haplotype_structure,gwid)
+S3method(mcnemar_test,result_snps)
+S3method(mcnemar_test_permut,result_snps)
+S3method(permutation_test,gwas)
+S3method(permutation_test,gwid)
+S3method(permutation_test,haplotype_structure)
+S3method(plot,gwas)
+S3method(plot,gwid)
+S3method(plot,haplotype_frequency)
+S3method(plot,haplotype_structure_frequency)
+S3method(plot,result_snps)
+S3method(plot,test_snps)
+S3method(print,gwas)
+S3method(roh,phase)
+S3method(subset,gwid)
+export(build_gwas)
+export(build_gwid)
+export(build_phase)
+export(case_control)
+export(extract)
+export(extract_window)
+export(fisher_test)
+export(gtest)
+export(haplotype_frequency)
+export(haplotype_structure)
+export(mcnemar_test)
+export(mcnemar_test_permut)
+export(permutation_test)
+export(print)
+export(roh)
+export(subset)
+import(Matrix)
+import(data.table)
+importFrom(piggyback,pb_download)
+importFrom(stats,fisher.test)
+importFrom(stats,quantile)
+importFrom(stats,xtabs)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,5 @@
+# gwid 0.1.0
+
+# gwid 0.0.2
+
+* Initial CRAN submission.
diff --git a/R/build.R b/R/build.R
@@ -0,0 +1,129 @@
+#' Open a SNP GDS file and extract information.
+#'
+#' @param gds_data File name
+#'
+#' @param caco An object of class caco. Output of \code{case_control} function.
+#' @param gwas_generator logical; if \code{TRUE} an object of class result_snps
+#' will be saved inside output list.
+#'
+#' @return a list of seven objects; including smp.id, snp.id, snp.pos, smp.indx,
+#' smp.snp (a matrix with samples in rows and snp in columns), caco,
+#' snps(column sum of smp.snp for each case control)
+#'
+#'
+#' @export
+build_gwas <- function(gds_data = "name.gds", caco = "name.Rda", gwas_generator = TRUE) {
+  if (missing(caco)) {
+    stop(" provide case_control list object or case_control rda (contains list of case_control) file name ")
+  }
+  if (!is.list(caco)) {
+    caco <- case_control(caco)
+  }
+  genoRA <- SNPRelate::snpgdsOpen(gds_data)
+  smp.id <- gdsfmt::read.gdsn(gdsfmt::index.gdsn(genoRA, "sample.id"))
+  snp.id <- gdsfmt::read.gdsn(gdsfmt::index.gdsn(genoRA, "snp.rs.id"))
+  snp.pos <- gdsfmt::read.gdsn(gdsfmt::index.gdsn(genoRA, "snp.position"))
+  smp.indx <- which(smp.id %in% unique(unlist(caco)))
+  smp.snp <- list()
+  for (j in 1:length(caco)) {
+    smp.snp[[j]] <- (gdsfmt::read.gdsn(gdsfmt::index.gdsn(genoRA, "genotype"))[which(smp.id[smp.indx] %in% caco[[j]]), ])
+    rownames(smp.snp[[j]]) <- smp.id[which(smp.id[smp.indx] %in% caco[[j]])]
+    colnames(smp.snp[[j]]) <- snp.id
+    smp.snp[[j]][smp.snp[[j]] == 3] <- NA
+  }
+  names(smp.snp) <- names(caco)
+  SNPRelate::snpgdsClose(genoRA)
+  output <- list(smp.id = smp.id, snp.id = snp.id, snp.pos = snp.pos, smp.indx = smp.indx, smp.snp = smp.snp, caco = caco)
+  class(output) <- "gwas"
+  if (gwas_generator) {
+    output$snps <- extract(output)
+  }
+  return(output)
+}
+
+
+#' Read .vcf structured text format files and reduce the size of file.
+#'
+#' @param phased_vcf A file name for a variant call format (vcf) file.
+#'
+#' @param caco An object of class caco. Output of \code{case_control} function.
+#'
+#' @return the output will be a a list of class phase contains two sparse matrix
+#' for each haplotype.
+#'
+#' @export
+build_phase <- function(phased_vcf = "name.vcf", caco) {
+  if (missing(caco) || is.null(phased_vcf)) stop("case_control and 'phased vcf' are needed")
+  phased <- vector(mode = "list", length = 2)
+  # read only colnames (because of memory issues)
+  tmp <- data.table::fread(phased_vcf, nrows = 0)
+  # select those samples who are in caco (only relevant samples)
+  # tmp2 <- data.table::fread(phased_vcf,select = which(colnames(tmp) %in% snp_smp$smp.id[snp_smp$smp.indx]))
+  tmp2 <- data.table::fread(phased_vcf, select = which(colnames(tmp) %in% unique(unlist(caco))))
+  # find row and column index of 1's (zeros generate in sparse matrix)
+  ind1 <- which(tmp2[, lapply(.SD, substr, 1, 1)] == "1", arr.ind = TRUE)
+  ind2 <- which(tmp2[, lapply(.SD, substr, 3, 3)] == "1", arr.ind = TRUE)
+  phased[[1]] <- Matrix::sparseMatrix(i = ind1[, 1], j = ind1[, 2], x = 1, dims = c(nrow(tmp2), ncol(tmp2)))
+  phased[[2]] <- Matrix::sparseMatrix(i = ind2[, 1], j = ind2[, 2], x = 1, dims = c(nrow(tmp2), ncol(tmp2)))
+  colnames(phased[[1]]) <- colnames(phased[[2]]) <- colnames(tmp2)
+  names(phased) <- c("Hap.1", "Hap.2")
+  class(phased) <- "phase"
+  return(phased)
+}
+
+#' Open a ibd file and extract information.
+#'
+#' @param ibd_data a file name for output of \href{http://faculty.washington.edu/browning/refined-ibd.html}{Refined IBD}
+#'
+#' @param gwas object of class gwas
+#' @param gwid_generator logical; if \code{TRUE} an object of class result_snps
+#' will be saved inside output list.
+#'
+#' @return the output will be a object(list) of class gwid contains
+#' profile object, IBD object and result_snps object.
+#'
+#' @export
+build_gwid <- function(ibd_data = "name.ibd", gwas = "object of class gwas", gwid_generator = TRUE) {
+  ibd <- data.table::fread(ibd_data)
+  V1 <- V2 <- V3 <- V4 <- V5 <- V6 <- V7 <- V8 <- V9 <- NULL
+  ibd <- ibd[V1 %in% unlist(unique(gwas[["caco"]])) & V3 %in% unlist(unique(gwas[["caco"]]))]
+  class(ibd) <- append("IBD", class(ibd))
+  seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))
+  profile <- ind <- vector(mode = "list", length = length(gwas[["caco"]])) # list length 6
+  for (j in seq_along(gwas[["caco"]])) {
+    ind[[j]] <- which(ibd$V1 %in% gwas[["caco"]][[j]] & ibd$V3 %in% gwas[["caco"]][[j]])
+    a1 <- ibd[V1 %in% gwas[["caco"]][[j]] & V3 %in% gwas[["caco"]][[j]]]
+    a2 <- seq2(match(a1$V6, gwas$snp.pos), match(a1$V7, gwas$snp.pos))
+    Un1 <- unlist(a2)
+    profile[[j]] <- Matrix::sparseMatrix(
+      i = rep(seq_along(a2), lengths(a2)),
+      j = Un1,
+      x = 1
+    )
+    if (ncol(profile[[j]]) < length(gwas$snp.pos)) {
+      if (min(Un1) > 1) {
+        mytemp <- Matrix(0, nrow = nrow(profile[[j]]), ncol = (min(Un1) - 1))
+        profile[[j]] <- cbind(mytemp, profile[[j]])
+      }
+
+      if (max(Un1) < length(gwas$snp.pos)) {
+        mytemp <- Matrix(0, nrow = nrow(profile[[j]]), ncol = (length(gwas$snp.pos) - max(Un1)))
+        profile[[j]] <- cbind(profile[[j]], mytemp)
+      }
+
+      profile[[j]] <- methods::as(profile[[j]], "sparseMatrix")
+    }
+  }
+
+  names(profile) <- names(ind) <- names(gwas[["caco"]])
+
+  class(profile) <- "profile"
+  output <- list(profile = profile, IND = ind)
+  output$snp_pos <- gwas[["snp.pos"]]
+  output$ibd <- ibd
+  class(output) <- "gwid"
+  if (gwid_generator) {
+    output$res <- extract(output)
+  }
+  return(output)
+}
diff --git a/R/case_control.R b/R/case_control.R
@@ -0,0 +1,54 @@
+csv2Rdat <- function(name = "", rep = 3) {
+  if (!file.exists(name)) message("File doesn't exist")
+  caco <- list()
+  output <- list()
+  input.data <- utils::read.csv(name)
+  check.column <- function(v) {
+    return(sum(c("1", "2") %in% as.integer(v)))
+  }
+  cacos <- names(which(apply(input.data, 2, check.column) == 2))
+
+  for (i in 1:length(cacos)) {
+    caco$cases <- input.data$SampleName[which(input.data[, cacos[i]] == 2)]
+    cont.ind <- which(input.data[, cacos[i]] == 1)
+    if (length(caco$cases) * rep > length(cont.ind)) caco$cases <- sample(caco$cases, trunc(length(cont.ind) / rep))
+    conts <- sample(cont.ind, length(caco$cases) * rep)
+    for (j in 1:rep) {
+      caco[[paste0("cont", j)]] <- input.data$SampleName[conts[rep(1:rep, each = length(caco$cases)) == j]]
+    }
+    output[[cacos[i]]] <- caco
+
+  }
+  output
+}
+
+
+#' Reload saved case-control list file
+#'
+#' @param case_control_rda A character string giving the name of the case-control
+#' file to load. The file is a list of character vectors including subject names
+#' in each case-control groups or csv file including subject name for a disease.
+#'
+#' @param ... name of a column (disease name) of csv file.
+#'
+#' @return The output will be a list of character vectors include subject names
+#' and groups. The class of returned object is caco.
+#'
+#' @export
+case_control <- function(case_control_rda, ...) {
+  if (missing(case_control_rda)) {
+    stop("provide a case_control list (rda file)")
+  }
+  substrRight <- function(x, n) {
+    substr(x, nchar(x) - n + 1, nchar(x))
+  }
+  if (substrRight(case_control_rda,3) == "csv") {
+    output <- csv2Rdat(case_control_rda)[[...]]
+    class(output) <- "caco"
+    return(output)
+  } else {
+    output <- get(load(case_control_rda))
+    class(output) <- "caco"
+    return(output)
+  }
+}