Skip to content

Commit

Permalink
add more export functions
Browse files Browse the repository at this point in the history
  • Loading branch information
delomast committed May 19, 2021
1 parent dc466bb commit dbb9612
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 3 deletions.
Binary file modified EFGLmh_0.1.0_manual.pdf
Binary file not shown.
Binary file modified How_to_use_EFGLmh.pdf
Binary file not shown.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ export(cleanGrandma)
export(combineEFGLdata)
export(construct_EFGLdata)
export(dumpTable)
export(exportCKMRsimAF)
export(exportCKMRsimLG)
export(exportGenAlEx)
export(exportGenePop)
export(exportGrandma)
Expand All @@ -16,6 +18,7 @@ export(exportProgenyStyle)
export(exportRubias_baseline)
export(exportRubias_mixture)
export(exportSNPPIT)
export(exportStructure)
export(genoSuccess)
export(getInds)
export(getLoci)
Expand Down
141 changes: 141 additions & 0 deletions R/export_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,147 @@ exportGenAlEx <- function(x, filename, pops = NULL, loci = NULL, title = "", use
}
}

#' write a Structure input file.
#'
#' Columns are Ind, Pop (converted to integers), then genotypes. Missing alleles
#' are coded as -9
#'
#' @param x an EFGLdata object
#' @param filename the name of the file to write
#' @param pops a vector of pops to include. If not specified,
#' all pops are used.
#' @param loci a vector of loci to include. If not specified,
#' all loci are used.
#' @return nothing, just writes a file
#' @export
exportStructure <- function(x, filename, pops = NULL, loci = NULL){
if(ncol(x$genotypes) < 3) stop("no genotypes")

if(is.null(loci)) loci <- getLoci(x)
if(is.null(pops)) pops <- getPops(x)
if(any(!pops %in% getPops(x))) stop("not all pops are in this EFGLdata object")
loci2 <- c(paste0(loci, ".A1"), paste0(loci, ".A2"))
l <- colnames(x$genotypes)[3:ncol(x$genotypes)]
if(any(!loci2 %in% l)) stop("one or more loci were not found in input")

g <- x$genotypes %>% filter(Pop %in% pops) %>% arrange(Pop) %>%
mutate(Pop = as.numeric(as.factor(Pop))) # turn pops into integers
gp <- g %>% select(Ind, Pop)

# convert genotypes to numerical formatting
to_remove <- c()
for(i in loci){
a <- g %>% select(paste0(i, ".A1"), paste0(i, ".A2"))
alleleIndex <- a %>% tidyr::gather(locus, allele, 1:2) %>%
filter(!is.na(allele)) %>% pull(allele) %>% unique
if(length(alleleIndex) < 1){
warning("all missing data for locus", i, ". Skipping this locus.")
to_remove <- c(to_remove, i)
next
}
alleleIndex <- tibble(allele = alleleIndex,
index = 1:length(alleleIndex))
a1 <- a %>% pull(1)
a2 <- a %>% pull(2)
a1 <- alleleIndex$index[match(a1, alleleIndex$allele)]
a2 <- alleleIndex$index[match(a2, alleleIndex$allele)]
a1[is.na(a1)] <- -9
a2[is.na(a2)] <- -9
gp <- gp %>% tibble::add_column(!!i := a1,
!!paste0(i, ".A2") := a2)
}
loci <- loci[!loci %in% to_remove]

cat(loci, file = filename, sep = "\t", append = FALSE) # row of locus names
cat("\n", file = filename, append = TRUE)
# Ind, Pop, genotypes
write.table(gp, file = filename, sep = "\t", append = TRUE,
col.names = FALSE, row.names = FALSE, quote = FALSE)
}

#' return a CKMRsim allele frequency input tibble.
#'
#' No chromosome and position information is used.
#' All populations specified are combined into one allele frequency output.
#' Any loci with only missing genotypes are omitted from the output. The output
#' should then be run through the CKMRsim function \code{reindex_markers()}.
#'
#' @param x an EFGLdata object
#' @param pops a vector of pops to include. If not specified,
#' all pops are used.
#' @param loci a vector of loci to include. If not specified,
#' all loci are used.
#' @return a tibble
#' @export
exportCKMRsimAF <- function(x, pops = NULL, loci = NULL){
if(ncol(x$genotypes) < 3) stop("no genotypes")

if(is.null(loci)) loci <- getLoci(x)
if(is.null(pops)) pops <- getPops(x)
if(any(!pops %in% getPops(x))) stop("not all pops are in this EFGLdata object")
loci2 <- c(paste0(loci, ".A1"), paste0(loci, ".A2"))
l <- colnames(x$genotypes)[3:ncol(x$genotypes)]
if(any(!loci2 %in% l)) stop("one or more loci were not found in input")

g <- x$genotypes %>% filter(Pop %in% pops) %>% select(-Pop) %>%
gather(Locus, Allele, 2:ncol(.)) %>%
mutate(Locus = gsub("\\.A[12]$", "", Locus)) %>%
filter(Locus %in% loci, !is.na(Allele)) %>%
group_by(Locus) %>% count(Allele) %>%
mutate(Freq = n / sum(n)) %>% ungroup() %>%
mutate(Chrom = "Unk", Pos = 1, LocIdx = NA, AlleIdx = NA) %>%
select(Chrom, Locus, Pos, Allele, LocIdx, AlleIdx, Freq)

locRemoved <- loci[!loci %in% g$Locus]
if(length(locRemoved) > 0) warning("Removed ", paste(locRemoved, collapse = " "), " for all missing data.")

# remove spaces from marker names if applicable
if(any(grepl(" ", loci))){
warning("Removing spaces from locus names in output.")
if(length(unique(gsub(" ", "", loci))) != length(unique(loci))) stop("Marker names are not unique after removal of spaces.")
g$Locus <- gsub(" ", "", g$Locus)
}

return(g)
}

#' Return a long genotype tibble for input to CKMRsim for relationship inference.
#'
#' @param x an EFGLdata object
#' @param pops a vector of pops to include. If not specified,
#' all pops are used.
#' @param loci a vector of loci to include. If not specified,
#' all loci are used.
#' @return a tibble
#' @export
exportCKMRsimLG <- function(x, pops = NULL, loci = NULL){
if(ncol(x$genotypes) < 3) stop("no genotypes")

if(is.null(loci)) loci <- getLoci(x)
if(is.null(pops)) pops <- getPops(x)
if(any(!pops %in% getPops(x))) stop("not all pops are in this EFGLdata object")
loci2 <- c(paste0(loci, ".A1"), paste0(loci, ".A2"))
l <- colnames(x$genotypes)[3:ncol(x$genotypes)]
if(any(!loci2 %in% l)) stop("one or more loci were not found in input")

g <- x$genotypes %>% filter(Pop %in% pops) %>% select(-Pop) %>%
gather(Locus, Allele, 2:ncol(.)) %>%
mutate(gene_copy = ifelse(grepl("\\.A1$", Locus), 1, 2),
Locus = gsub("\\.A[12]$", "", Locus)) %>%
filter(Locus %in% loci) %>% rename(Indiv = Ind) %>%
select(Indiv, Locus, gene_copy, Allele) %>% arrange(Indiv, Locus, gene_copy)

# remove spaces from marker names if applicable
if(any(grepl(" ", loci))){
warning("Removing spaces from locus names in output.")
if(length(unique(gsub(" ", "", loci))) != length(unique(loci))) stop("Marker names are not unique after removal of spaces.")
g$Locus <- gsub(" ", "", g$Locus)
}

return(g)
}


#' write a SNPPIT input file. Will warn about skipping loci with > 2 alleles.
#' @param x an EFGLdata object
#' @param filename the name of the file to write
Expand Down
26 changes: 26 additions & 0 deletions man/exportCKMRsimAF.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/exportCKMRsimLG.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/exportStructure.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions test.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,36 @@ numInds(d_fil)
exportSNPPIT(d_fil, filename = "testSNPPIT.txt", baseline = c("OmyDWOR19S", "OmyEFSW19S"), mixture = c("OmyLYON19S", "OmyOXBO19S"))

dupTable

d <- readInData(exampleData)
d$genotypes <- d$genotypes %>% mutate(test.A1 = NA, test.A2 = NA)
af <- exportCKMRsimAF(d)

d <- readInData(exampleData)
af <- exportCKMRsimAF(d)

af <- CKMRsim::reindex_markers(af)
ex1_ckmr <- CKMRsim::create_ckmr(
D = af,
kappa_matrix = CKMRsim::kappas[c("PO", "FS", "HS", "U"), ],
ge_mod_assumed = CKMRsim::ge_model_TGIE,
ge_mod_true = CKMRsim::ge_model_TGIE,
ge_mod_assumed_pars_list = list(epsilon = 0.005),
ge_mod_true_pars_list = list(epsilon = 0.005)
)
CKMRsim::simulate_Qij(ex1_ckmr,
calc_relats = c("PO", "FS", "U"),
sim_relats = c("PO", "FS", "HS", "U"), reps = 10)
lg_1 <- exportCKMRsimLG(d, pops = "OmyDWOR19S")
lg_2 <- exportCKMRsimLG(d, pops = getPops(d)[getPops(d) != "OmyDWOR19S"])

po_pairwise_logls <- CKMRsim::pairwise_kin_logl_ratios(D1 = lg_1,
D2 = lg_2,
CK = ex1_ckmr,
numer = "PO",
denom = "U",
num_cores = 1)
summary(po_pairwise_logls$logl_ratio)
po_pairwise_logls %>% filter(logl_ratio > 5, num_loc > 300)

exportStructure(d, "testStruc.txt")
24 changes: 21 additions & 3 deletions vignettes/How_to_use_EFGLmh.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ data1$genotypes
data1$metadata
```

This (hopefully) makes it simple for you to pull out or modify data manually if there is not a specially written function in the package addressing what you want to accomplish. Simply refer to `data1$genotypes` or `data1$metadata` and treat it as you would treat any dataframe or tibble. One thing to remember if you modify things manually: the genotypes and metadata tibbles in an `EFGLdata` object should have the same individuals in the same order. You can check that you haven't messed things up by using the `construct_EFGLdata` function.
This (hopefully) makes it simple for you to pull out or modify data manually if there is not a specially written function in the package addressing what you want to accomplish. Simply refer to `data1$genotypes` or `data1$metadata` and treat it as you would treat any dataframe or tibble. One thing to remember if you modify things manually: the genotypes and metadata tibbles in an `EFGLdata` object should have the same individuals in the same order. You can check that you haven't messed things up by using the `construct_EFGLdata` function (performs some basic checks).
```{r, eval=FALSE}
# do some modification on data1
# ...
Expand Down Expand Up @@ -262,7 +262,7 @@ all_data

# exporting data

These functions export data in formats used by other packages and programs. They are listed here mainly so you can have a list of the export functions in one place and see an example. For a full explanation of the options within each of these functions, consult the manual.
These functions export data in formats used by other packages and programs. They are listed here mainly so you can have a list of the export functions in one place and see an example. For a full explanation of the options within each of these functions, consult the manual. Most have options to subset loci and populations.

A rubias baseline
```{r}
Expand Down Expand Up @@ -298,6 +298,15 @@ A hierfstat input dataframe
hfstat_in <- exportHierFstat(all_data)
```

A CKMRsim allele frequency tibble. Note that loci with all missing genotypes will be removed. And all pops (or all pops specified with the `pops` argument) are combined.
```{r}
ckmr_af <- exportCKMRsimAF(all_data)
```

A long format tibble of genotypes. Meant for input into CKMRsim. Note that pop information is not included in the export.
```{r}
ckmr_af <- exportCKMRsimLG(all_data, pops = c("OmyOXBO19S"))
```

Write a GenePop file
```{r, eval=FALSE}
Expand All @@ -309,13 +318,22 @@ Write a GenAlEx file
exportGenAlEx(all_data, "genalexInput.txt")
```

Write a Structure file
```{r, eval=FALSE}
exportStructure(all_data, "genalexInput.txt")
```

Write a "Progeny-export-style" file that can be later loaded back into EFGLmh with `readInData()`. This file will have Pop, Ind, metadata, genotypes (2 column per call).
```{r, eval=FALSE}
exportProgenyStyle(all_data, "genalexInput.txt")
```

Write a SNPPIT input file (only biallelic markers used)
```{r, eval=FALSE}
exportSNPPIT(all_data, "snppitInput.txt", baseline = c("OmyOXBO19S", "newPop"),
mixture = c("OmyLYON19S", "specialInds"), errorRate = .005)
```


Write PLINK input files (only biallelic markers used)
```{r, eval=FALSE}
exportPlink(data1, "testPlink.ped", map = "testPlink.map")
Expand Down

0 comments on commit dbb9612

Please sign in to comment.