version 0.3.1

cran · Feb 19, 2017 · c9db109 · c9db109
1 parent 092076b
commit c9db109
Show file tree

Hide file tree

Showing 18 changed files with 325 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,22 +1,24 @@
 Package: textreadr
 Title: Read Text Documents into R
-Version: 0.3.0
+Version: 0.3.1
 Authors@R: c(person("Tyler", "Rinker", email =
-        "tyler.rinker@gmail.com", role = c("aut", "cre")))
+        "tyler.rinker@gmail.com", role = c("aut", "cre")),
+        person("Bryan", "Goodrich", role = "ctb"))
 Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
 Description: A small collection of convenience tools for reading text
         documents into R.
 Depends: R (>= 3.2.2)
 Suggests: testthat
 Imports: curl, pdftools, readxl, textshape, tools, utils, XML
-Date: 2017-01-10
+Date: 2017-02-19
 License: GPL-2
 LazyData: TRUE
 RoxygenNote: 5.0.1
 BugReports: https://github.com/trinker/textreadr/issues?state=open
 URL: https://github.com/trinker/textreadr
 NeedsCompilation: no
-Packaged: 2017-01-11 03:36:44 UTC; Tyler
-Author: Tyler Rinker [aut, cre]
+Packaged: 2017-02-19 19:48:33 UTC; Tyler
+Author: Tyler Rinker [aut, cre],
+  Bryan Goodrich [ctb]
 Repository: CRAN
-Date/Publication: 2017-01-11 09:00:19
+Date/Publication: 2017-02-19 23:11:59
diff --git a/MD5 b/MD5
@@ -1,22 +1,23 @@
-fb0195977bd2d97b5766fcca39f266f4 *DESCRIPTION
-d5a43133cd863fdbb87c0a045760d980 *NAMESPACE
-90b08bdd39e4c9eb6adc8e912e75a89b *NEWS
+8ef3f63a21185aeb69115b6a7377ada9 *DESCRIPTION
+7caed8dff99d545ccfae1524a038e2a7 *NAMESPACE
+ead46602dd0f1f4c19c71404643f0d4f *NEWS
 754bab72a74f2f5086dcc215237c0309 *R/antiword_loc.R
 1a5e0719213c416f085d57b7c2fb3ce9 *R/check_antiword_installed.R
 2f78d20e39c9954761cf7ee8a3bdcd64 *R/download.R
 8fc28e2c3ca067e1f35df28c31c64d4d *R/peek.R
 16b46b358bf68ca09608608225b8f3e4 *R/read_dir.R
+9aaa9f31a40399f43238a0688baabc81 *R/read_dir_transcript.R
 2c80b99180133c06ddeb742d33fc854d *R/read_doc.R
 10551a14b33a0b73f063fef116edcc79 *R/read_document.R
 8879e0fd1c14480924da8c80eeef0bc5 *R/read_docx.R
 7152c5cc8d6ed8ce0f747fc9188d9f43 *R/read_pdf.R
-7e48ecc0f158553f9318f00722573cdc *R/read_transcript.R
+7aab833aada3ba3e0a0900eda7ac1b05 *R/read_transcript.R
 06efc7b4e04653fdce4e1ddb0f61329d *R/textreadr-package.R
 216061909b866e52dd33372fb002dc94 *R/textreadr_class.R
 8fd9b10b5eff4bfb04eb73feaa76020d *R/utils.R
-4a4d4edc5f0d905ea87d39668363202c *README.md
-54fda9091185266384057bbfc6cb19df *data/presidential_debates_2012.rda
-9cd91e2e0b0a1779f6ec9415a7785004 *inst/CITATION
+066582babc26ee147ceedf69b6468754 *README.md
+120d0410043e32e96db875bf3db2a8f5 *data/presidential_debates_2012.rda
+25683e10e66dc7f24179e6f3f4634d07 *inst/CITATION
 e011839dce9cca2cdce99a7c41858a3b *inst/docs/Maas2011/neg/0_3.txt
 4404b68a9abcd771a5ff136ada1dd678 *inst/docs/Maas2011/neg/10_2.txt
 534c4bcca4c6068905c7b86c6b68b2f8 *inst/docs/Maas2011/neg/11_3.txt
@@ -65,19 +66,26 @@ bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/trans1.docx
 ec67af8023c2798b5d90315202f1e416 *inst/docs/trans2.docx
 fb0484e92c1b788cf1d7837230d7840f *inst/docs/trans3.docx
 f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/trans4.xlsx
-3932e81a118b86b9c50d6155868cd12a *inst/docs/trans5.xls
+50ac740138c3ed2fcadabdc2791d664f *inst/docs/trans5.xls
 74d5f6d329035a126d75a5973e4820d4 *inst/docs/trans6.doc
+bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/transcripts/trans1.docx
+ec67af8023c2798b5d90315202f1e416 *inst/docs/transcripts/trans2.docx
+fb0484e92c1b788cf1d7837230d7840f *inst/docs/transcripts/trans3.docx
+f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/transcripts/trans4.xlsx
+3932e81a118b86b9c50d6155868cd12a *inst/docs/transcripts/trans5.xls
+74d5f6d329035a126d75a5973e4820d4 *inst/docs/transcripts/trans6.doc
 661616fb187c7cd8c95faddee2236204 *man/check_antiword_installed.Rd
 c04ff740fbcf527389cbaefef6d85fe8 *man/download.Rd
 65c255955130ab5bc9475fdee726d6fa *man/peek.Rd
 cc8fcdb21de5d7ad2ac67bb3a13411f1 *man/presidential_debates_2012.Rd
 6891bc1093d957f4d85302f02fc721df *man/print.textreadr.Rd
 3cfb6a3aa97d612e80c1fb685f1bc758 *man/read_dir.Rd
+adada96ce5520b4a58e53a7068250db5 *man/read_dir_transcript.Rd
 f38e809472479721c41792bff6a7dc74 *man/read_doc.Rd
 ea867bb0114b2ed603af05215c03db38 *man/read_document.Rd
 eebe4957389412f901e985c8e253f89c *man/read_docx.Rd
 03dbcc95805b0770a2e3ab7078bf0469 *man/read_pdf.Rd
-add2781f1fa0e59b3ca22d50358ef91c *man/read_transcript.Rd
+ffcf25354b0fa52fcb24570e8eb79016 *man/read_transcript.Rd
 cd71cd2817cb1319bca3ea860e6db97d *man/textreadr.Rd
 6176535a9928df24c9ed83f37f38c960 *tests/testthat.R
 84e113cf420544b29a1c69b692f313b7 *tests/testthat/test-read_transcript.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(check_antiword_installed)
 export(download)
 export(peek)
 export(read_dir)
+export(read_dir_transcript)
 export(read_doc)
 export(read_document)
 export(read_docx)

diff --git a/NEWS b/NEWS
@@ -18,6 +18,15 @@ And constructed with the following guidelines:
 
 
 
+textreadr 0.3.1
+----------------------------------------------------------------
+
+NEW FEATURES
+
+* `read_dir_transcript` added to complement `read_dir`, aimed at a directory of
+  transcripts.
+
+
 
 textreadr 0.0.1 - 0.3.0
 ----------------------------------------------------------------

diff --git a/R/read_dir_transcript.R b/R/read_dir_transcript.R
@@ -0,0 +1,123 @@
+#' Read In Multiple Transcript Files From a Directory
+#'
+#' Read in multiple transcript files from a directory and create a
+#' \code{\link[base]{data.frame}}.
+#'
+#' @param path Path to the directory.
+#' @param col.names A character vector specifying the column names of the
+#' transcript columns (document, person, dialogue).
+#' @param pattern An optional regular expression. Only file names which match
+#' the regular expression will be returned.
+#' @param all.files Logical. If \code{FALSE}, only the names of visible files
+#' are returned. If \code{TRUE}, all file names will be returned.
+#' @param recursive Logical. Should the listing recurse into directories?
+#' @param skip Integer; the number of lines of the data file to skip before
+#' beginning to read data (length one, or one value per file in \code{path}).
+#' @param merge.broke.tot logical.  If \code{TRUE} and if the file being read in
+#' is .docx with broken space between a single turn of talk read_transcript
+#' will attempt to merge these into a single turn of talk.
+#' @param header logical.  If \code{TRUE} the file contains the names of the
+#' variables as its first line.
+#' @param dash A character string to replace the en and em dashes special
+#' characters (default is to remove).
+#' @param ellipsis A character string to replace the ellipsis special characters.
+#' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with curly
+#' braces (default is \code{FALSE}).  If \code{FALSE} curly quotes are removed.
+#' @param rm.empty.rows logical.  If \code{TRUE}
+#' \code{\link[textreadr]{read_transcript}} attempts to remove empty rows.
+#' @param na A character string to be interpreted as an \code{NA} value.
+#' @param sep The field separator character. Values on each line of the file are
+#' separated by this character.  The default of \code{NULL} instructs
+#' \code{\link[textreadr]{read_transcript}} to use a separator suitable for the file
+#' type being read in.
+#' @param comment.char A character vector of length one containing a single
+#' character or an empty string. Use \code{""} to turn off the interpretation of
+#' comments altogether.
+#' @param max.person.nchar The max number of characters long names are expected
+#' to be.  This information is used to warn the user if a separator appears
+#' beyond this length in the text.
+#' @param \ldots Other arguments passed to \code{\link[textreadr]{read_transcript}}.
+#' @return Returns a data frame of documents, dialogue, and people.
+#' @export
+#' @seealso \code{\link[textreadr]{read_transcript}}
+#' @examples
+#' skips <- c(0, 1, 1, 0, 0, 1)
+#' path <- system.file("docs/transcripts", package = 'textreadr')
+#' textreadr::peek(read_dir_transcript(path, skip = skips), Inf)
+#'
+#' \dontrun{
+#' ## with additional cleaning
+#' library(tidyverse); library(textshape); library(textclean)
+#'
+#' path %>%
+#'     read_dir_transcript(skip = skips) %>%
+#'     textclean::filter_row("Person", "^\\[") %>%
+#'     mutate(
+#'         Person = stringi::stri_replace_all_regex(Person, "(^/\\s*)|(:\\s*$)", "") %>%
+#'             trimws(),
+#'         Dialogue = stringi::stri_replace_all_regex(Dialogue, "(^/\\s*)", "")
+#'     ) %>%
+#'     peek(Inf)
+#' }
+read_dir_transcript <- function(path, col.names = c("Document", "Person", "Dialogue"),
+    pattern = NULL, all.files = FALSE,
+    recursive = FALSE, skip = 0, merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...",
+    quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", sep = NULL,
+    comment.char = "", max.person.nchar = 20, ...) {
+
+    to_read_in <- list_files(path, all.files = all.files, full.names = TRUE, recursive = recursive)
+    if (identical(character(0), to_read_in)) {
+        stop("The following location does not appear to contain files:\n   -", path)
+    }
+    # Recycle every length-one argument so that there is one value per file.
+    if (length(skip) == 1) skip <- rep(skip, length(to_read_in))
+    if (length(merge.broke.tot) == 1) merge.broke.tot <- rep(merge.broke.tot, length(to_read_in))
+    if (length(header) == 1) header<- rep(header, length(to_read_in))
+    if (length(dash) == 1) dash <- rep(dash, length(to_read_in))
+    if (length(ellipsis) == 1) ellipsis <- rep(ellipsis, length(to_read_in))
+    if (length(quote2bracket) == 1) quote2bracket <- rep(quote2bracket, length(to_read_in))
+    if (length(rm.empty.rows) == 1) rm.empty.rows <- rep(rm.empty.rows, length(to_read_in))
+    if (length(na) == 1) na <- rep(na, length(to_read_in))
+    if (is.null(sep)) sep <- lapply(seq_along(to_read_in), function(i) NULL)
+    if (length(sep) == 1) sep <- rep(sep, length(to_read_in))
+    if (length(comment.char) == 1) comment.char <- rep(comment.char, length(to_read_in))
+    if (length(max.person.nchar) == 1) max.person.nchar <- rep(max.person.nchar, length(to_read_in))
+    # Pair the recycled arguments with their names for the length check below.
+    arg_list <- list(skip, merge.broke.tot, header, dash, ellipsis, quote2bracket,
+        rm.empty.rows, na, sep, comment.char, max.person.nchar
+    )
+
+    arg_nms <- c('skip', 'merge.broke.tot', 'header', 'dash', 'ellipsis', 'quote2bracket',
+        'rm.empty.rows', 'na', 'sep', 'comment.char', 'max.person.nchar'
+    )
+
+    lapply(seq_along(arg_list), function(i){
+
+        if(length(arg_list[[i]]) != length(to_read_in)) stop(paste0('`', arg_nms[i], '` is not of length 1 or the same length as files in `path`'))
+    })
+
+    # NOTE(review): `pattern` is accepted in the signature but is never
+    # referenced in this body; the files listed above are read in unfiltered.
+    # Confirm whether filtering `to_read_in` by `pattern` was intended.
+    # Read each file with its own settings; try() keeps one failure from stopping the rest.
+    reads <- Map(function(x, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11) {
+        try(read_transcript(x, col.names = col.names[-1], skip = y1,
+            merge.broke.tot = y2, header = y3, dash = y4, ellipsis = y5,
+            quote2bracket = y6, rm.empty.rows = y7, na = y8, sep = y9,
+            comment.char = y10, max.person.nchar = y11, ...))
+    }, to_read_in, skip, merge.broke.tot, header, dash, ellipsis,
+       quote2bracket, rm.empty.rows, na, sep, comment.char, max.person.nchar)
+
+    names(reads) <- tools::file_path_sans_ext(basename(to_read_in))
+    # Warn about files whose read raised an error, then drop them from the result.
+    goods <- !sapply(reads, inherits, 'try-error')
+    if (any(!goods)) {
+        warning(paste0("The following files did not read in correctly:\n",
+            paste0('  - ', to_read_in[!goods], collapse = "\n")
+        ))
+    }
+    textshape::tidy_list(reads[goods], col.names[1])
+
+}
+
+
diff --git a/R/read_transcript.R b/R/read_transcript.R
@@ -1,4 +1,4 @@
-#' read_transcripts Into R
+#' Read Transcripts Into R
 #'
 #' Read .docx, .csv, .xlsx, .xlsx, or .txt transcript style files into R.
 #'