Skip to content

Commit

Permalink
version 1.2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
trinker authored and cran-robot committed Oct 9, 2021
1 parent aaeba1e commit d948e36
Show file tree
Hide file tree
Showing 21 changed files with 474 additions and 256 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
@@ -1,23 +1,23 @@
Package: textreadr
Title: Read Text Documents into R
Version: 1.0.2
Version: 1.2.0
Authors@R: c(person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")),
person("Bryan", "Goodrich", role = "ctb"), person("Dason", "Kurkiewicz", role = "ctb"))
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: A small collection of convenience tools for reading text documents into R.
Depends: R (>= 3.2.2)
Depends: R (>= 3.3.0)
Suggests: tesseract, testthat
Imports: antiword, curl, data.table, pdftools, readxl, rvest, striprtf,
textshape, tools, utils, xml2
License: GPL-2
LazyData: TRUE
RoxygenNote: 7.1.0
RoxygenNote: 7.1.2
BugReports: https://github.com/trinker/textreadr/issues?state=open
URL: https://github.com/trinker/textreadr
NeedsCompilation: no
Packaged: 2020-06-16 22:56:19 UTC; trinker
Packaged: 2021-10-08 22:11:53 UTC; TylerRinker
Author: Tyler Rinker [aut, cre],
Bryan Goodrich [ctb],
Dason Kurkiewicz [ctb]
Repository: CRAN
Date/Publication: 2020-06-17 05:50:02 UTC
Date/Publication: 2021-10-09 15:30:02 UTC
36 changes: 20 additions & 16 deletions MD5
@@ -1,28 +1,30 @@
daeb648327e4a45750e568b65b0da93f *DESCRIPTION
a110a2362269d19dbe63c5bab55da360 *NAMESPACE
5e829a4b9c903664fd9e7a806780b4cd *NEWS
9510e45f9b48e52c1a90508275d5fbe4 *DESCRIPTION
5a21160422db036542bba2a9bc738f0b *NAMESPACE
46826baa0226fe4f81bad1c71804cc51 *NEWS
d8302f1bc911cabf2eaba4834ef5b6ab *R/as_transcript.R
a1d5ff34974699faee0e63bf81084eee *R/browse.R
1895a3051f97c2063c961593bf46bf9c *R/browse.R
862f96290fca5dc1a370833773e0576c *R/download.R
f377f916caa75535d6133d0bbc17eecf *R/loop_utilities.R
ef038ac83c1c325c8ea0a54174f35f52 *R/peek.R
50dbc8eb0f4fad392b5de5829457377e *R/read_dir.R
6e694aa76b32f61b15c31b70cbf99bed *R/read_dir_transcript.R
fbc21a0bc3dee4284c60f5f7f532d5b0 *R/read_doc.R
1284f6a0808e78401afd7eca9f726e2f *R/read_document.R
6da3eb25b3bd531a86d07154b0ce3fb2 *R/read_docx.R
40107b81cab50bf8aff4a1ad473a63f9 *R/read_html.R
f1a6b0641d25721e335ca65e810733ca *R/read_pdf.R
fb806a2420d6df30609223a304a0f775 *R/read_document.R
80be04c6eaa5fe4187034d8f4d020c1b *R/read_docx.R
7ca09b9c118669fd7ed1339868c573dc *R/read_html.R
2490d8d03cda966e197be1a95aacce6c *R/read_odt.R
97bfbfcd674a67b6d51015ffd9654ad0 *R/read_pdf.R
51211abf67c2c55e6946d23669f54f01 *R/read_pptx.R
f522cab841182af3477d5708b3a88442 *R/read_rtf.R
f95b32724603ce2297ec71d1cdf63613 *R/read_transcript.R
3c44d45c4df1d3ddab610dd04a580963 *R/read_transcript.R
06efc7b4e04653fdce4e1ddb0f61329d *R/textreadr-package.R
803751bedca4c226d008a1b599d8008d *R/textreadr_class.R
ea601d99e969069699ba02ceb1bc7213 *R/un_zip.R
200032871870b7e7814517ad0779e5e8 *R/utils.R
dc1c8d9c5cca9c011f128b2788e0504a *README.md
5fd5870f851cd835cf479fb95716d347 *data/presidential_debates_2012.rda
11fcdd3f98d8c74f4c78fa83edaf4084 *inst/CITATION
01bdf20949a13d4f2bb5798c5f01760c *README.md
a1f68293ca82e91b0004c570b8b4d769 *data/presidential_debates_2012.rda
53de65566c0c3bf77d2a81c2fef72b0e *inst/CITATION
6ef761b98e46491d2b0b4a40c886cc42 *inst/docs/Hello_World.odt
2523a08f53a2ff6d09acde4606030dbf *inst/docs/Hello_World.pptx
e011839dce9cca2cdce99a7c41858a3b *inst/docs/Maas2011/neg/0_3.txt
4404b68a9abcd771a5ff136ada1dd678 *inst/docs/Maas2011/neg/10_2.txt
Expand Down Expand Up @@ -78,6 +80,7 @@ fb0484e92c1b788cf1d7837230d7840f *inst/docs/trans3.docx
f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/trans4.xlsx
3932e81a118b86b9c50d6155868cd12a *inst/docs/trans5.xls
74d5f6d329035a126d75a5973e4820d4 *inst/docs/trans6.doc
834e148a77f08cb667237265a33afc0c *inst/docs/trans8.odt
bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/transcripts/trans1.docx
ec67af8023c2798b5d90315202f1e416 *inst/docs/transcripts/trans2.docx
fb0484e92c1b788cf1d7837230d7840f *inst/docs/transcripts/trans3.docx
Expand All @@ -86,7 +89,7 @@ f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/transcripts/trans4.xlsx
74d5f6d329035a126d75a5973e4820d4 *inst/docs/transcripts/trans6.doc
d4903fd1b16222f2f6f74360bb61e732 *inst/extract_text_app/app/extract_text.Rmd
279f5417d416a631102dcd216f74f289 *man/as_transcript.Rd
139229f92b144f69f0dc8c2862da0cfe *man/browse.Rd
1b7eca13fe04160bbd5c3cdb2065f079 *man/browse.Rd
984cce99e94e7bbb695bfacd9a77a953 *man/download.Rd
15ff98991dba60ae84370858e3fd6dfd *man/loop_utilities.Rd
6c5b7186e562a9824c87b43c8d877ebf *man/peek.Rd
Expand All @@ -97,11 +100,12 @@ d21b922884ea26675d6392fa517f0e68 *man/read_dir_transcript.Rd
20a8d7b195e8982980e91d18c964f289 *man/read_doc.Rd
06c6f56bcdea1fa3eb62ccab8bbf026b *man/read_document.Rd
6a533788da4247a64984084675653b0d *man/read_docx.Rd
e68f871db9c8e718e19825eea13e0d03 *man/read_html.Rd
67790a1f3834b24859e28e44ecc37e02 *man/read_pdf.Rd
8e032dc176e223f7d08fb8e7dd9d6df9 *man/read_html.Rd
d99e3cca1a6364be1cafbad892da4efc *man/read_odt.Rd
fd3a773c1bed8af6297065d484a6b658 *man/read_pdf.Rd
7cb3c6d877dbc3984bbe149b57ddf994 *man/read_pptx.Rd
c04fef4b063ce4980acae1cc588fbf63 *man/read_rtf.Rd
01ca2eae48594b94895454deb7d371ed *man/read_transcript.Rd
4197a0cf8dfbae42f0f70bd4c8ed32de *man/read_transcript.Rd
71a95cbb83cf79cbc37192718c7c460d *man/textreadr.Rd
4a492ee036feda95480a04aadffd71b2 *man/un_zip.Rd
6176535a9928df24c9ed83f37f38c960 *tests/testthat.R
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -13,6 +13,7 @@ export(read_doc)
export(read_document)
export(read_docx)
export(read_html)
export(read_odt)
export(read_pdf)
export(read_pptx)
export(read_rtf)
Expand Down
13 changes: 13 additions & 0 deletions NEWS
Expand Up @@ -18,6 +18,19 @@ And constructed with the following guidelines:



textreadr 1.0.3 - 1.2.0
----------------------------------------------------------------

BUG FIXES

* `read_docx` would return the same word as 2 separate words if different
  characters within the word had different styling (pseudocode example:
  '<w:p><bold>h</bold>ello world<w:p>' returned 'h ello world').

NEW FEATURES

* `read_odt` added to read in .odt files.



textreadr 0.9.1 - 1.0.2
Expand Down
2 changes: 1 addition & 1 deletion R/browse.R
Expand Up @@ -3,7 +3,7 @@
#' Use the operating system defaults to open directories and files.
#'
#' @param x A vector (typically of length one) of paths to directories or files.
#' @references <http://stackoverflow.com/q/12135732/1000343>
#' @references <https://stackoverflow.com/q/12135732/1000343>
#' @note This function is operating system and setting dependent. Results may
#' not be consistent across operating systems. Depending upon the default
#' programs for file types the results may vary as well. Some files may not be
Expand Down
1 change: 1 addition & 0 deletions R/read_document.R
Expand Up @@ -76,6 +76,7 @@ read_document <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE,
fun <- switch(filetype,
pdf = {function(x, ...) {read_pdf(x, remove.empty = FALSE, trim = FALSE, ocr = ocr, ...)[["text"]]}},
docx = {function(x, ...) {read_docx(x, remove.empty = FALSE, trim = FALSE, ...)}},
odt = {function(x, ...) {read_odt(x, remove.empty = FALSE, trim = FALSE, ...)}},
doc = {function(x, ...) {read_doc(x, remove.empty = FALSE, trim = FALSE, format=format, ...)}},
rtf = {function(x, ...) {read_rtf(x, remove.empty = FALSE, trim = FALSE, ...)}},
html = {function(x, ...) {read_html(x, remove.empty = FALSE, trim = FALSE, ...)}},
Expand Down
17 changes: 5 additions & 12 deletions R/read_docx.R
Expand Up @@ -42,20 +42,13 @@ read_docx <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {
## read in the unzipped docx
doc <- xml2::read_xml(xmlfile)

### extract the content
# children <- lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children)
# pvalues <- unlist(lapply(children, function(x) {
# paste(xml2::xml_text(xml2::xml_find_all(x, 'w:t')), collapse = ' ')
# }))

## extract the content
rm_na <- function(x) x[!is.na(x)]

pvalues <- unlist(lapply(lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children), function(x) {

paste(rm_na(unlist(xml2::xml_text(xml2::xml_find_all(x, './/w:t')))), collapse = ' ')

}))
# pvalues <- unlist(lapply(lapply(xml2::xml_find_all(doc, '//w:p'), xml2::xml_children), function(x) {
# paste(rm_na(unlist(xml2::xml_text(xml2::xml_find_all(x, './/w:t')))), collapse = ' ')
# }))
pvalues <- xml2::xml_text(xml2::xml_find_all(doc, '//w:p'))

## formatting
if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
Expand All @@ -65,4 +58,4 @@ read_docx <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

pvalues

}
}
2 changes: 1 addition & 1 deletion R/read_html.R
Expand Up @@ -16,7 +16,7 @@
#' @rdname read_html
#' @export
#' @references The xpath is taken from Tony Breyal's response on StackOverflow:
#' <http://stackoverflow.com/questions/3195522/is-there-a-simple-way-in-r-to-extract-only-the-text-elements-of-an-html-page/3195926#3195926>
#' <https://stackoverflow.com/questions/3195522/is-there-a-simple-way-in-r-to-extract-only-the-text-elements-of-an-html-page/3195926#3195926>
#' @examples
#' html_dat <- read_html(
#' system.file("docs/textreadr_creed.html", package = "textreadr")
Expand Down
61 changes: 61 additions & 0 deletions R/read_odt.R
@@ -0,0 +1,61 @@
#' Read in .odt Content
#'
#' Read in the content from a .odt (Open Document Text) file.  A .odt file is
#' a zip archive; the document body is stored in its content.xml entry.
#'
#' @param file The path to the .odt file.
#' @param skip The number of lines to skip.
#' @param remove.empty logical.  If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical.  If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... ignored.
#' @return Returns a character vector.
#' @keywords odt
#' @export
#' @examples
#' \dontrun{
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
#' file <- download(url)
#' (txt <- read_odt(file))
#' }
read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

    ## if a URL to a .odt was supplied, download it locally first
    filetype <- tools::file_ext(file)
    if (filetype %in% c('odt') && grepl('^([fh]ttp)', file)){

        file <- download(file)

    }

    ## create temp dir to unzip the .odt (a zip archive) into
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up the temp dir when the function exits
    on.exit(unlink(tmp, recursive=TRUE))

    ## unzip the odt; the document body lives in content.xml
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, exdir = tmp)
    if (!file.exists(xmlfile)) {
        stop("'", file, "' does not appear to be a valid .odt file (no content.xml found).")
    }

    ## read in the unzipped odt content
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of the paragraph (text:p) nodes
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    ## formatting
    if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
    if (skip > 0) pvalues <- pvalues[-seq(skip)]
    if (isTRUE(trim)) pvalues <- trimws(pvalues)
    if (length(pvalues) == 0) pvalues <- ''

    pvalues

}




2 changes: 1 addition & 1 deletion R/read_pdf.R
Expand Up @@ -14,7 +14,7 @@
#' [tesseract::ocr()] function. This will create temporary .png
#' files and will require a much larger compute time.
#' @param ... Other arguments passed to [pdftools::pdf_text()][pdftools::pdftools].
#' @note A word of caution from [Carl Witthoft](http://stackoverflow.com/a/9187015/1000343)"
#' @note A word of caution from [Carl Witthoft](https://stackoverflow.com/a/9187015/1000343)"
#' "Just a warning to others who may be hoping to extract data: PDF is a
#' container, not a format. If the original document does not contain actual
#' text, as opposed to bitmapped images of text or possibly even uglier things
Expand Down
60 changes: 56 additions & 4 deletions R/read_transcript.R
Expand Up @@ -60,6 +60,8 @@
#' (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr"))
#' (doc5 <- system.file("docs/trans5.xls", package = "textreadr"))
#' (doc6 <- system.file("docs/trans6.doc", package = "textreadr"))
#' ##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr"))
#' (doc8 <- system.file("docs/trans8.odt", package = "textreadr"))
#'
#' dat1 <- read_transcript(doc1)
#' dat2 <- read_transcript(doc1, col.names = c("person", "dialogue"))
Expand All @@ -76,8 +78,8 @@
#'
#' ## MS doc format
#' \dontrun{
#' dat7 <- read_transcript(doc6) ## need to skip Researcher
#' dat8 <- read_transcript(doc6, skip = 1)
#' dat6b <- read_transcript(doc6) ## need to skip Researcher
#' dat6c <- read_transcript(doc6, skip = 1)
#' }
#'
#' ## rtf format
Expand All @@ -87,7 +89,10 @@
#' )
#' dat9 <- read_transcript(rtf_doc, skip = 1)
#' }
#'
#'
#' ## odt format
#' read_transcript(doc8)
#'
#' ## text string input
#' trans <- "sam: Computer is fun. Not too fun.
#' greg: No it's not, it's dumb.
Expand Down Expand Up @@ -147,7 +152,7 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
}

if (is.null(sep)) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf')) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf', 'odt')) {
sep <- ":"
} else {
sep <- ","
Expand Down Expand Up @@ -179,6 +184,14 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
paste(which(sep_hits), collapse=", "))
}
},
odt = {
x <- read.odt(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar)
sep_hits <- grepl(sep, x[, 2])
if(any(sep_hits)) {
warning(sprintf("The following text contains the \"%s\" separator and may not have split correctly:\n", sep),
paste(which(sep_hits), collapse=", "))
}
},
rtf = {
x <- read.rtf(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar, ...)
sep_hits <- grepl(sep, x[, 2])
Expand Down Expand Up @@ -305,6 +318,45 @@ function(file, skip = 0, sep = ":", max.person.nchar = 20) {
}



## Parse a .odt transcript into a two-column (person, dialogue) data.frame.
## Internal helper for read_transcript; mirrors read.docx but for ODT input.
read.odt <-
function(file, skip = 0, sep = ":", max.person.nchar = 20) {

    ## create temp dir to unzip the .odt (a zip archive) into
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up the temp dir when the function exits
    on.exit(unlink(tmp, recursive=TRUE))

    ## unzip the odt; the document body lives in content.xml
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, exdir = tmp)

    ## Import XML
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of the paragraph (text:p) nodes
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    pvalues <- pvalues[!grepl("^\\s*$", pvalues)] # Remove empty lines
    if (skip > 0) pvalues <- pvalues[-seq(skip)]  # Ignore these many lines

    ## warn when the separator appears unusually far from the line start,
    ## which suggests in-text use of the separator rather than person/text
    if (any(grepl(paste0("^.{", max.person.nchar, ",}", sep), pvalues))) {
        warning(sprintf(paste0(
            "I've detected the separator beyond %s characters from the line start. Parsing may be incorrect...\n",
            "  Consider manually searching the .odt for use of the separator in-text rather than to separate person/text."
        ), max.person.nchar))
    }

    ## keys flags the lines that begin a new speaker turn ("person<sep> ...");
    ## gregexpr with an anchored pattern yields at most one match per line,
    ## so each list element has length 1 (vapply makes that contract explicit)
    keys <- vapply(gregexpr(paste0("^.*?", sep), pvalues), function(m) m[1] > 0, logical(1))
    speaker <- regmatches(pvalues, gregexpr(paste0("^.*?", sep), pvalues))
    pvalues <- gsub(paste0("^.*?", sep), "", pvalues) # Remove speaker from lines
    ## carry each speaker forward across continuation lines until the next key line
    speaker <- rep(speaker[which(keys)], diff(c(which(keys), length(speaker) + 1)))
    speaker <- unlist(speaker) # Make sure it's a vector
    speaker <- substr(speaker, 1, nchar(speaker) - nchar(sep)) # Remove ending separator
    transcript <- data.frame(X1 = trimws(speaker), X2 = trimws(pvalues), stringsAsFactors = FALSE)
    return(transcript)
}

read.rtf <-
function(file, skip = 0, sep = ":", max.person.nchar = 20, ...) {

Expand Down

0 comments on commit d948e36

Please sign in to comment.