Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
aaeba1e
commit d948e36
Showing
21 changed files
with
474 additions
and
256 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,23 @@ | ||
Package: textreadr | ||
Title: Read Text Documents into R | ||
Version: 1.0.2 | ||
Version: 1.2.0 | ||
Authors@R: c(person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), | ||
person("Bryan", "Goodrich", role = "ctb"), person("Dason", "Kurkiewicz", role = "ctb")) | ||
Maintainer: Tyler Rinker <tyler.rinker@gmail.com> | ||
Description: A small collection of convenience tools for reading text documents into R. | ||
Depends: R (>= 3.2.2) | ||
Depends: R (>= 3.3.0) | ||
Suggests: tesseract, testthat | ||
Imports: antiword, curl, data.table, pdftools, readxl, rvest, striprtf, | ||
textshape, tools, utils, xml2 | ||
License: GPL-2 | ||
LazyData: TRUE | ||
RoxygenNote: 7.1.0 | ||
RoxygenNote: 7.1.2 | ||
BugReports: https://github.com/trinker/textreadr/issues?state=open | ||
URL: https://github.com/trinker/textreadr | ||
NeedsCompilation: no | ||
Packaged: 2020-06-16 22:56:19 UTC; trinker | ||
Packaged: 2021-10-08 22:11:53 UTC; TylerRinker | ||
Author: Tyler Rinker [aut, cre], | ||
Bryan Goodrich [ctb], | ||
Dason Kurkiewicz [ctb] | ||
Repository: CRAN | ||
Date/Publication: 2020-06-17 05:50:02 UTC | ||
Date/Publication: 2021-10-09 15:30:02 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#' Read in .odt Content
#'
#' Read in the content from a .odt file.  The file is treated as a zip
#' archive: its `content.xml` member is extracted to a temporary directory,
#' parsed, and the text of every `text:p` (paragraph) node is returned.
#'
#' @param file The path to the .odt file.  If the path has an `odt` extension
#' and starts with `http` or `ftp` it is first passed to [download()] and the
#' resulting local file is read.
#' @param skip The number of lines to skip.  Lines are skipped after empty
#' elements have been removed (when `remove.empty = TRUE`).
#' @param remove.empty logical.  If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical.  If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... ignored.
#' @return Returns a character vector with one element per paragraph, or `''`
#' if no paragraphs remain after filtering.
#' @keywords odt
#' @export
#' @examples
#' \dontrun{
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
#' file <- download(url)
#' (txt <- read_odt(file))
#' }
read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

    ## fetch remote .odt files first; '^(ht|f)tp' covers http(s) and ftp(s)
    ## (the earlier '[fh]ttp' pattern could never match an ftp URL)
    filetype <- tools::file_ext(file)
    if (filetype %in% c('odt') && grepl('^(ht|f)tp', file)){
        file <- download(file)
    }

    ## create temp dir
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up; add = TRUE so any other registered exit handlers are kept
    on.exit(unlink(tmp, recursive = TRUE), add = TRUE)

    ## unzip the odt; only content.xml is needed, so extract just that member
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, files = "content.xml", exdir = tmp)
    if (!file.exists(xmlfile)) {
        stop(
            "Could not extract 'content.xml'; is `file` a valid .odt file?",
            call. = FALSE
        )
    }

    ## read in the unzipped odt content
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of each paragraph node
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    ## formatting
    if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
    if (skip > 0) pvalues <- pvalues[-seq_len(skip)]
    if (isTRUE(trim)) pvalues <- trimws(pvalues)

    ## always hand back at least one (empty) string rather than character(0)
    if (length(pvalues) == 0) pvalues <- ''

    pvalues

}
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.