Skip to content

Commit

Permalink
version 1.2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
trinker authored and cran-robot committed Oct 9, 2021
1 parent aaeba1e commit d948e36
Show file tree
Hide file tree
Showing 21 changed files with 474 additions and 256 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
@@ -1,23 +1,23 @@
Package: textreadr
Title: Read Text Documents into R
Version: 1.0.2
Version: 1.2.0
Authors@R: c(person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")),
person("Bryan", "Goodrich", role = "ctb"), person("Dason", "Kurkiewicz", role = "ctb"))
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: A small collection of convenience tools for reading text documents into R.
Depends: R (>= 3.2.2)
Depends: R (>= 3.3.0)
Suggests: tesseract, testthat
Imports: antiword, curl, data.table, pdftools, readxl, rvest, striprtf,
textshape, tools, utils, xml2
License: GPL-2
LazyData: TRUE
RoxygenNote: 7.1.0
RoxygenNote: 7.1.2
BugReports: https://github.com/trinker/textreadr/issues?state=open
URL: https://github.com/trinker/textreadr
NeedsCompilation: no
Packaged: 2020-06-16 22:56:19 UTC; trinker
Packaged: 2021-10-08 22:11:53 UTC; TylerRinker
Author: Tyler Rinker [aut, cre],
Bryan Goodrich [ctb],
Dason Kurkiewicz [ctb]
Repository: CRAN
Date/Publication: 2020-06-17 05:50:02 UTC
Date/Publication: 2021-10-09 15:30:02 UTC
36 changes: 20 additions & 16 deletions MD5
@@ -1,28 +1,30 @@
daeb648327e4a45750e568b65b0da93f *DESCRIPTION
a110a2362269d19dbe63c5bab55da360 *NAMESPACE
5e829a4b9c903664fd9e7a806780b4cd *NEWS
9510e45f9b48e52c1a90508275d5fbe4 *DESCRIPTION
5a21160422db036542bba2a9bc738f0b *NAMESPACE
46826baa0226fe4f81bad1c71804cc51 *NEWS
d8302f1bc911cabf2eaba4834ef5b6ab *R/as_transcript.R
a1d5ff34974699faee0e63bf81084eee *R/browse.R
1895a3051f97c2063c961593bf46bf9c *R/browse.R
862f96290fca5dc1a370833773e0576c *R/download.R
f377f916caa75535d6133d0bbc17eecf *R/loop_utilities.R
ef038ac83c1c325c8ea0a54174f35f52 *R/peek.R
50dbc8eb0f4fad392b5de5829457377e *R/read_dir.R
6e694aa76b32f61b15c31b70cbf99bed *R/read_dir_transcript.R
fbc21a0bc3dee4284c60f5f7f532d5b0 *R/read_doc.R
1284f6a0808e78401afd7eca9f726e2f *R/read_document.R
6da3eb25b3bd531a86d07154b0ce3fb2 *R/read_docx.R
40107b81cab50bf8aff4a1ad473a63f9 *R/read_html.R
f1a6b0641d25721e335ca65e810733ca *R/read_pdf.R
fb806a2420d6df30609223a304a0f775 *R/read_document.R
80be04c6eaa5fe4187034d8f4d020c1b *R/read_docx.R
7ca09b9c118669fd7ed1339868c573dc *R/read_html.R
2490d8d03cda966e197be1a95aacce6c *R/read_odt.R
97bfbfcd674a67b6d51015ffd9654ad0 *R/read_pdf.R
51211abf67c2c55e6946d23669f54f01 *R/read_pptx.R
f522cab841182af3477d5708b3a88442 *R/read_rtf.R
f95b32724603ce2297ec71d1cdf63613 *R/read_transcript.R
3c44d45c4df1d3ddab610dd04a580963 *R/read_transcript.R
06efc7b4e04653fdce4e1ddb0f61329d *R/textreadr-package.R
803751bedca4c226d008a1b599d8008d *R/textreadr_class.R
ea601d99e969069699ba02ceb1bc7213 *R/un_zip.R
200032871870b7e7814517ad0779e5e8 *R/utils.R
dc1c8d9c5cca9c011f128b2788e0504a *README.md
5fd5870f851cd835cf479fb95716d347 *data/presidential_debates_2012.rda
11fcdd3f98d8c74f4c78fa83edaf4084 *inst/CITATION
01bdf20949a13d4f2bb5798c5f01760c *README.md
a1f68293ca82e91b0004c570b8b4d769 *data/presidential_debates_2012.rda
53de65566c0c3bf77d2a81c2fef72b0e *inst/CITATION
6ef761b98e46491d2b0b4a40c886cc42 *inst/docs/Hello_World.odt
2523a08f53a2ff6d09acde4606030dbf *inst/docs/Hello_World.pptx
e011839dce9cca2cdce99a7c41858a3b *inst/docs/Maas2011/neg/0_3.txt
4404b68a9abcd771a5ff136ada1dd678 *inst/docs/Maas2011/neg/10_2.txt
Expand Down Expand Up @@ -78,6 +80,7 @@ fb0484e92c1b788cf1d7837230d7840f *inst/docs/trans3.docx
f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/trans4.xlsx
3932e81a118b86b9c50d6155868cd12a *inst/docs/trans5.xls
74d5f6d329035a126d75a5973e4820d4 *inst/docs/trans6.doc
834e148a77f08cb667237265a33afc0c *inst/docs/trans8.odt
bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/transcripts/trans1.docx
ec67af8023c2798b5d90315202f1e416 *inst/docs/transcripts/trans2.docx
fb0484e92c1b788cf1d7837230d7840f *inst/docs/transcripts/trans3.docx
Expand All @@ -86,7 +89,7 @@ f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/transcripts/trans4.xlsx
74d5f6d329035a126d75a5973e4820d4 *inst/docs/transcripts/trans6.doc
d4903fd1b16222f2f6f74360bb61e732 *inst/extract_text_app/app/extract_text.Rmd
279f5417d416a631102dcd216f74f289 *man/as_transcript.Rd
139229f92b144f69f0dc8c2862da0cfe *man/browse.Rd
1b7eca13fe04160bbd5c3cdb2065f079 *man/browse.Rd
984cce99e94e7bbb695bfacd9a77a953 *man/download.Rd
15ff98991dba60ae84370858e3fd6dfd *man/loop_utilities.Rd
6c5b7186e562a9824c87b43c8d877ebf *man/peek.Rd
Expand All @@ -97,11 +100,12 @@ d21b922884ea26675d6392fa517f0e68 *man/read_dir_transcript.Rd
20a8d7b195e8982980e91d18c964f289 *man/read_doc.Rd
06c6f56bcdea1fa3eb62ccab8bbf026b *man/read_document.Rd
6a533788da4247a64984084675653b0d *man/read_docx.Rd
e68f871db9c8e718e19825eea13e0d03 *man/read_html.Rd
67790a1f3834b24859e28e44ecc37e02 *man/read_pdf.Rd
8e032dc176e223f7d08fb8e7dd9d6df9 *man/read_html.Rd
d99e3cca1a6364be1cafbad892da4efc *man/read_odt.Rd
fd3a773c1bed8af6297065d484a6b658 *man/read_pdf.Rd
7cb3c6d877dbc3984bbe149b57ddf994 *man/read_pptx.Rd
c04fef4b063ce4980acae1cc588fbf63 *man/read_rtf.Rd
01ca2eae48594b94895454deb7d371ed *man/read_transcript.Rd
4197a0cf8dfbae42f0f70bd4c8ed32de *man/read_transcript.Rd
71a95cbb83cf79cbc37192718c7c460d *man/textreadr.Rd
4a492ee036feda95480a04aadffd71b2 *man/un_zip.Rd
6176535a9928df24c9ed83f37f38c960 *tests/testthat.R
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -13,6 +13,7 @@ export(read_doc)
export(read_document)
export(read_docx)
export(read_html)
export(read_odt)
export(read_pdf)
export(read_pptx)
export(read_rtf)
Expand Down
13 changes: 13 additions & 0 deletions NEWS
Expand Up @@ -18,6 +18,19 @@ And constructed with the following guidelines:



textreadr 1.0.3 - 1.2.0
----------------------------------------------------------------

BUG FIXES

* `read_docx` would return the same word as 2 separate words if different
  characters within the word had different styling (pseudocode example:
  '<w:p><bold>h</bold>ello world<w:p>' returned 'h ello world').

NEW FEATURES

* `read_odt` added to read in .odt files.



textreadr 0.9.1 - 1.0.2
Expand Down
2 changes: 1 addition & 1 deletion R/browse.R
Expand Up @@ -3,7 +3,7 @@
#' Use the operating system defaults to open directories and files.
#'
#' @param x A vector (typically of length one) of paths to directories or files.
#' @references <http://stackoverflow.com/q/12135732/1000343>
#' @references <https://stackoverflow.com/q/12135732/1000343>
#' @note This function is operating system and setting dependent. Results may
#' not be consistent across operating systems. Depending upon the default
#' programs for file types the results may vary as well. Some files may not be
Expand Down
1 change: 1 addition & 0 deletions R/read_document.R
Expand Up @@ -76,6 +76,7 @@ read_document <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE,
fun <- switch(filetype,
pdf = {function(x, ...) {read_pdf(x, remove.empty = FALSE, trim = FALSE, ocr = ocr, ...)[["text"]]}},
docx = {function(x, ...) {read_docx(x, remove.empty = FALSE, trim = FALSE, ...)}},
odt = {function(x, ...) {read_odt(x, remove.empty = FALSE, trim = FALSE, ...)}},
doc = {function(x, ...) {read_doc(x, remove.empty = FALSE, trim = FALSE, format=format, ...)}},
rtf = {function(x, ...) {read_rtf(x, remove.empty = FALSE, trim = FALSE, ...)}},
html = {function(x, ...) {read_html(x, remove.empty = FALSE, trim = FALSE, ...)}},
Expand Down
17 changes: 5 additions & 12 deletions R/read_docx.R
Expand Up @@ -42,20 +42,13 @@ read_docx <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {
## read in the unzipped docx
doc <- xml2::read_xml(xmlfile)

### extract the content
# children <- lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children)
# pvalues <- unlist(lapply(children, function(x) {
# paste(xml2::xml_text(xml2::xml_find_all(x, 'w:t')), collapse = ' ')
# }))

## extract the content
rm_na <- function(x) x[!is.na(x)]

pvalues <- unlist(lapply(lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children), function(x) {

paste(rm_na(unlist(xml2::xml_text(xml2::xml_find_all(x, './/w:t')))), collapse = ' ')

}))
# pvalues <- unlist(lapply(lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children), function(x) {
# paste(rm_na(unlist(xml2::xml_text(xml2::xml_find_all(x, './/w:t')))), collapse = ' ')
# }))
pvalues <- xml2::xml_text(xml2::xml_find_all(doc, '//w:p'))

## formatting
if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
Expand All @@ -65,4 +58,4 @@ read_docx <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

pvalues

}
}
2 changes: 1 addition & 1 deletion R/read_html.R
Expand Up @@ -16,7 +16,7 @@
#' @rdname read_html
#' @export
#' @references The xpath is taken from Tony Breyal's response on StackOverflow:
#' <http://stackoverflow.com/questions/3195522/is-there-a-simple-way-in-r-to-extract-only-the-text-elements-of-an-html-page/3195926#3195926>
#' <https://stackoverflow.com/questions/3195522/is-there-a-simple-way-in-r-to-extract-only-the-text-elements-of-an-html-page/3195926#3195926>
#' @examples
#' html_dat <- read_html(
#' system.file("docs/textreadr_creed.html", package = "textreadr")
Expand Down
61 changes: 61 additions & 0 deletions R/read_odt.R
@@ -0,0 +1,61 @@
#' Read in .odt Content
#'
#' Read in the content from a .odt (Open Document Text) file.  A .odt file is
#' a zip archive; the document body is stored in its content.xml entry.
#'
#' @param file The path to the .odt file.
#' @param skip The number of lines to skip.
#' @param remove.empty logical.  If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical.  If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... ignored.
#' @return Returns a character vector.
#' @keywords odt
#' @export
#' @examples
#' \dontrun{
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
#' file <- download(url)
#' (txt <- read_odt(file))
#' }
read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

    ## if a URL to a .odt was supplied, download it locally first
    filetype <- tools::file_ext(file)
    if (filetype %in% c('odt') && grepl('^([fh]ttp)', file)){

        file <- download(file)

    }

    ## create temp dir to unzip the .odt (a zip archive) into
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up the temp dir when the function exits
    on.exit(unlink(tmp, recursive=TRUE))

    ## unzip the odt; the document body lives in content.xml
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, exdir = tmp)
    if (!file.exists(xmlfile)) {
        stop("'", file, "' does not appear to be a valid .odt file (no content.xml found).")
    }

    ## read in the unzipped odt content
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of the paragraph (text:p) nodes
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    ## formatting
    if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
    if (skip > 0) pvalues <- pvalues[-seq(skip)]
    if (isTRUE(trim)) pvalues <- trimws(pvalues)
    if (length(pvalues) == 0) pvalues <- ''

    pvalues

}




2 changes: 1 addition & 1 deletion R/read_pdf.R
Expand Up @@ -14,7 +14,7 @@
#' [tesseract::ocr()] function. This will create temporary .png
#' files and will require a much larger compute time.
#' @param ... Other arguments passed to [pdftools::pdf_text()][pdftools::pdftools].
#' @note A word of caution from [Carl Witthoft](http://stackoverflow.com/a/9187015/1000343)"
#' @note A word of caution from [Carl Witthoft](https://stackoverflow.com/a/9187015/1000343)"
#' "Just a warning to others who may be hoping to extract data: PDF is a
#' container, not a format. If the original document does not contain actual
#' text, as opposed to bitmapped images of text or possibly even uglier things
Expand Down
60 changes: 56 additions & 4 deletions R/read_transcript.R
Expand Up @@ -60,6 +60,8 @@
#' (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr"))
#' (doc5 <- system.file("docs/trans5.xls", package = "textreadr"))
#' (doc6 <- system.file("docs/trans6.doc", package = "textreadr"))
#' ##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr"))
#' (doc8 <- system.file("docs/trans8.odt", package = "textreadr"))
#'
#' dat1 <- read_transcript(doc1)
#' dat2 <- read_transcript(doc1, col.names = c("person", "dialogue"))
Expand All @@ -76,8 +78,8 @@
#'
#' ## MS doc format
#' \dontrun{
#' dat7 <- read_transcript(doc6) ## need to skip Researcher
#' dat8 <- read_transcript(doc6, skip = 1)
#' dat6b <- read_transcript(doc6) ## need to skip Researcher
#' dat6c <- read_transcript(doc6, skip = 1)
#' }
#'
#' ## rtf format
Expand All @@ -87,7 +89,10 @@
#' )
#' dat9 <- read_transcript(rtf_doc, skip = 1)
#' }
#'
#'
#' ## odt format
#' read_transcript(doc8)
#'
#' ## text string input
#' trans <- "sam: Computer is fun. Not too fun.
#' greg: No it's not, it's dumb.
Expand Down Expand Up @@ -147,7 +152,7 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
}

if (is.null(sep)) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf')) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf', 'odt')) {
sep <- ":"
} else {
sep <- ","
Expand Down Expand Up @@ -179,6 +184,14 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
paste(which(sep_hits), collapse=", "))
}
},
odt = {
x <- read.odt(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar)
sep_hits <- grepl(sep, x[, 2])
if(any(sep_hits)) {
warning(sprintf("The following text contains the \"%s\" separator and may not have split correctly:\n", sep),
paste(which(sep_hits), collapse=", "))
}
},
rtf = {
x <- read.rtf(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar, ...)
sep_hits <- grepl(sep, x[, 2])
Expand Down Expand Up @@ -305,6 +318,45 @@ function(file, skip = 0, sep = ":", max.person.nchar = 20) {
}



## Parse a .odt transcript into a two-column (person, dialogue) data.frame.
## Internal helper for read_transcript; mirrors read.docx but for ODT input.
read.odt <-
function(file, skip = 0, sep = ":", max.person.nchar = 20) {

    ## create temp dir to unzip the .odt (a zip archive) into
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up the temp dir when the function exits
    on.exit(unlink(tmp, recursive=TRUE))

    ## unzip the odt; the document body lives in content.xml
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, exdir = tmp)

    ## Import XML
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of the paragraph (text:p) nodes
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    pvalues <- pvalues[!grepl("^\\s*$", pvalues)] # Remove empty lines
    if (skip > 0) pvalues <- pvalues[-seq(skip)]  # Ignore these many lines

    ## warn when the separator appears unusually far from the line start,
    ## which suggests in-text use of the separator rather than person/text
    if (any(grepl(paste0("^.{", max.person.nchar, ",}", sep), pvalues))) {
        warning(sprintf(paste0(
            "I've detected the separator beyond %s characters from the line start. Parsing may be incorrect...\n",
            "  Consider manually searching the .odt for use of the separator in-text rather than to separate person/text."
        ), max.person.nchar))
    }

    ## keys flags the lines that begin a new speaker turn ("person<sep> ...");
    ## gregexpr with an anchored pattern yields at most one match per line,
    ## so each list element has length 1 (vapply makes that contract explicit)
    keys <- vapply(gregexpr(paste0("^.*?", sep), pvalues), function(m) m[1] > 0, logical(1))
    speaker <- regmatches(pvalues, gregexpr(paste0("^.*?", sep), pvalues))
    pvalues <- gsub(paste0("^.*?", sep), "", pvalues) # Remove speaker from lines
    ## carry each speaker forward across continuation lines until the next key line
    speaker <- rep(speaker[which(keys)], diff(c(which(keys), length(speaker) + 1)))
    speaker <- unlist(speaker) # Make sure it's a vector
    speaker <- substr(speaker, 1, nchar(speaker) - nchar(sep)) # Remove ending separator
    transcript <- data.frame(X1 = trimws(speaker), X2 = trimws(pvalues), stringsAsFactors = FALSE)
    return(transcript)
}

read.rtf <-
function(file, skip = 0, sep = ":", max.person.nchar = 20, ...) {

Expand Down

0 comments on commit d948e36

Please sign in to comment.