Skip to content

Commit

Permalink
version 0.3.1
Browse files Browse the repository at this point in the history
  • Loading branch information
trinker authored and cran-robot committed Feb 19, 2017
1 parent 092076b commit c9db109
Show file tree
Hide file tree
Showing 18 changed files with 325 additions and 60 deletions.
14 changes: 8 additions & 6 deletions DESCRIPTION
@@ -1,22 +1,24 @@
Package: textreadr
Title: Read Text Documents into R
Version: 0.3.0
Version: 0.3.1
Authors@R: c(person("Tyler", "Rinker", email =
"tyler.rinker@gmail.com", role = c("aut", "cre")))
"tyler.rinker@gmail.com", role = c("aut", "cre")),
person("Bryan", "Goodrich", role = "ctb"))
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: A small collection of convenience tools for reading text
documents into R.
Depends: R (>= 3.2.2)
Suggests: testthat
Imports: curl, pdftools, readxl, textshape, tools, utils, XML
Date: 2017-01-10
Date: 2017-02-19
License: GPL-2
LazyData: TRUE
RoxygenNote: 5.0.1
BugReports: https://github.com/trinker/textreadr/issues?state=open
URL: https://github.com/trinker/textreadr
NeedsCompilation: no
Packaged: 2017-01-11 03:36:44 UTC; Tyler
Author: Tyler Rinker [aut, cre]
Packaged: 2017-02-19 19:48:33 UTC; Tyler
Author: Tyler Rinker [aut, cre],
Bryan Goodrich [ctb]
Repository: CRAN
Date/Publication: 2017-01-11 09:00:19
Date/Publication: 2017-02-19 23:11:59
26 changes: 17 additions & 9 deletions MD5
@@ -1,22 +1,23 @@
fb0195977bd2d97b5766fcca39f266f4 *DESCRIPTION
d5a43133cd863fdbb87c0a045760d980 *NAMESPACE
90b08bdd39e4c9eb6adc8e912e75a89b *NEWS
8ef3f63a21185aeb69115b6a7377ada9 *DESCRIPTION
7caed8dff99d545ccfae1524a038e2a7 *NAMESPACE
ead46602dd0f1f4c19c71404643f0d4f *NEWS
754bab72a74f2f5086dcc215237c0309 *R/antiword_loc.R
1a5e0719213c416f085d57b7c2fb3ce9 *R/check_antiword_installed.R
2f78d20e39c9954761cf7ee8a3bdcd64 *R/download.R
8fc28e2c3ca067e1f35df28c31c64d4d *R/peek.R
16b46b358bf68ca09608608225b8f3e4 *R/read_dir.R
9aaa9f31a40399f43238a0688baabc81 *R/read_dir_transcript.R
2c80b99180133c06ddeb742d33fc854d *R/read_doc.R
10551a14b33a0b73f063fef116edcc79 *R/read_document.R
8879e0fd1c14480924da8c80eeef0bc5 *R/read_docx.R
7152c5cc8d6ed8ce0f747fc9188d9f43 *R/read_pdf.R
7e48ecc0f158553f9318f00722573cdc *R/read_transcript.R
7aab833aada3ba3e0a0900eda7ac1b05 *R/read_transcript.R
06efc7b4e04653fdce4e1ddb0f61329d *R/textreadr-package.R
216061909b866e52dd33372fb002dc94 *R/textreadr_class.R
8fd9b10b5eff4bfb04eb73feaa76020d *R/utils.R
4a4d4edc5f0d905ea87d39668363202c *README.md
54fda9091185266384057bbfc6cb19df *data/presidential_debates_2012.rda
9cd91e2e0b0a1779f6ec9415a7785004 *inst/CITATION
066582babc26ee147ceedf69b6468754 *README.md
120d0410043e32e96db875bf3db2a8f5 *data/presidential_debates_2012.rda
25683e10e66dc7f24179e6f3f4634d07 *inst/CITATION
e011839dce9cca2cdce99a7c41858a3b *inst/docs/Maas2011/neg/0_3.txt
4404b68a9abcd771a5ff136ada1dd678 *inst/docs/Maas2011/neg/10_2.txt
534c4bcca4c6068905c7b86c6b68b2f8 *inst/docs/Maas2011/neg/11_3.txt
Expand Down Expand Up @@ -65,19 +66,26 @@ bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/trans1.docx
ec67af8023c2798b5d90315202f1e416 *inst/docs/trans2.docx
fb0484e92c1b788cf1d7837230d7840f *inst/docs/trans3.docx
f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/trans4.xlsx
3932e81a118b86b9c50d6155868cd12a *inst/docs/trans5.xls
50ac740138c3ed2fcadabdc2791d664f *inst/docs/trans5.xls
74d5f6d329035a126d75a5973e4820d4 *inst/docs/trans6.doc
bdb0af2e1b5c7554ed5645dd6d592ea7 *inst/docs/transcripts/trans1.docx
ec67af8023c2798b5d90315202f1e416 *inst/docs/transcripts/trans2.docx
fb0484e92c1b788cf1d7837230d7840f *inst/docs/transcripts/trans3.docx
f8f5c51c7c6c40fb00d4903bb8272688 *inst/docs/transcripts/trans4.xlsx
3932e81a118b86b9c50d6155868cd12a *inst/docs/transcripts/trans5.xls
74d5f6d329035a126d75a5973e4820d4 *inst/docs/transcripts/trans6.doc
661616fb187c7cd8c95faddee2236204 *man/check_antiword_installed.Rd
c04ff740fbcf527389cbaefef6d85fe8 *man/download.Rd
65c255955130ab5bc9475fdee726d6fa *man/peek.Rd
cc8fcdb21de5d7ad2ac67bb3a13411f1 *man/presidential_debates_2012.Rd
6891bc1093d957f4d85302f02fc721df *man/print.textreadr.Rd
3cfb6a3aa97d612e80c1fb685f1bc758 *man/read_dir.Rd
adada96ce5520b4a58e53a7068250db5 *man/read_dir_transcript.Rd
f38e809472479721c41792bff6a7dc74 *man/read_doc.Rd
ea867bb0114b2ed603af05215c03db38 *man/read_document.Rd
eebe4957389412f901e985c8e253f89c *man/read_docx.Rd
03dbcc95805b0770a2e3ab7078bf0469 *man/read_pdf.Rd
add2781f1fa0e59b3ca22d50358ef91c *man/read_transcript.Rd
ffcf25354b0fa52fcb24570e8eb79016 *man/read_transcript.Rd
cd71cd2817cb1319bca3ea860e6db97d *man/textreadr.Rd
6176535a9928df24c9ed83f37f38c960 *tests/testthat.R
84e113cf420544b29a1c69b692f313b7 *tests/testthat/test-read_transcript.R
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -7,6 +7,7 @@ export(check_antiword_installed)
export(download)
export(peek)
export(read_dir)
export(read_dir_transcript)
export(read_doc)
export(read_document)
export(read_docx)
Expand Down
9 changes: 9 additions & 0 deletions NEWS
Expand Up @@ -18,6 +18,15 @@ And constructed with the following guidelines:



textreadr 0.3.1
----------------------------------------------------------------

NEW FEATURES

* `read_dir_transcript` added to complement `read_dir`; it is aimed at reading
a directory of transcripts.



textreadr 0.0.1 - 0.3.0
----------------------------------------------------------------
Expand Down
123 changes: 123 additions & 0 deletions R/read_dir_transcript.R
@@ -0,0 +1,123 @@
#' Read In Multiple Transcript Files From a Directory
#'
#' Read in multiple transcript files from a directory and create a
#' \code{\link[base]{data.frame}}.
#'
#' The reading arguments (\code{skip}, \code{merge.broke.tot}, \code{header},
#' \code{dash}, \code{ellipsis}, \code{quote2bracket}, \code{rm.empty.rows},
#' \code{na}, \code{sep}, \code{comment.char}, and \code{max.person.nchar}) may
#' each be of length one, in which case the value is used for every file, or
#' supply one value per file (in the order the files are listed).  Any other
#' length is an error.  Files that fail to read are dropped from the result
#' and reported with a warning.
#'
#' @param path Path to the directory.
#' @param col.names  A character vector specifying the column names of the
#' transcript columns (document, person, dialogue).
#' @param pattern An optional regular expression.  Only files whose names
#' (ignoring the directory part) match the regular expression will be read.
#' @param all.files Logical. If \code{FALSE}, only the names of visible files
#' are returned. If \code{TRUE}, all file names will be returned.
#' @param recursive Logical. Should the listing recurse into directories?
#' @param skip Integer; the number of lines of the data file to skip before
#' beginning to read data.
#' @param merge.broke.tot logical.  If \code{TRUE} and if the file being read in
#' is .docx with broken space between a single turn of talk read_transcript
#' will attempt to merge these into a single turn of talk.
#' @param header logical.  If \code{TRUE} the file contains the names of the
#' variables as its first line.
#' @param dash A character string to replace the en and em dashes special
#' characters (default is to remove).
#' @param ellipsis A character string to replace the ellipsis special characters.
#' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with curly
#' braces (default is \code{FALSE}).  If \code{FALSE} curly quotes are removed.
#' @param rm.empty.rows logical.  If \code{TRUE}
#' \code{\link[textreadr]{read_transcript}}  attempts to remove empty rows.
#' @param na A character string to be interpreted as an \code{NA} value.
#' @param sep The field separator character. Values on each line of the file are
#' separated by this character.  The default of \code{NULL} instructs
#' \code{\link[textreadr]{read_transcript}} to use a separator suitable for the file
#' type being read in.
#' @param comment.char A character vector of length one containing a single
#' character or an empty string. Use \code{""} to turn off the interpretation of
#' comments altogether.
#' @param max.person.nchar The max number of characters long names are expected
#' to be.  This information is used to warn the user if a separator appears beyond
#' this length in the text.
#' @param \ldots Other arguments passed to
#' \code{\link[textreadr]{read_transcript}}.
#' @return Returns a dataframe of documents, dialogue, and people.
#' @export
#' @seealso \code{\link[textreadr]{read_transcript}}
#' @examples
#' skips <- c(0, 1, 1, 0, 0, 1)
#' path <- system.file("docs/transcripts", package = 'textreadr')
#' textreadr::peek(read_dir_transcript(path, skip = skips), Inf)
#'
#' \dontrun{
#' ## with additional cleaning
#' library(tidyverse)
#' library(textshape)
#' library(textclean)
#'
#' path %>%
#'     read_dir_transcript(skip = skips) %>%
#'     textclean::filter_row("Person", "^\\[") %>%
#'     mutate(
#'         Person = stringi::stri_replace_all_regex(Person, "(^/\\s*)|(:\\s*$)", "") %>%
#'             trimws(),
#'         Dialogue = stringi::stri_replace_all_regex(Dialogue, "(^/\\s*)", "")
#'     ) %>%
#'     peek(Inf)
#' }
read_dir_transcript <- function(path, col.names = c("Document", "Person", "Dialogue"),
    pattern = NULL, all.files = FALSE, recursive = FALSE, skip = 0,
    merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...",
    quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", sep = NULL,
    comment.char = "", max.person.nchar = 20, ...) {

    to_read_in <- list_files(path, all.files = all.files, full.names = TRUE,
        recursive = recursive)
    if (length(to_read_in) == 0) {
        stop("The following location does not appear to contain files:\n -", path)
    }

    ## Honor `pattern`: keep only files whose base name matches the regex.
    if (!is.null(pattern)) {
        to_read_in <- to_read_in[grepl(pattern, basename(to_read_in))]
        if (length(to_read_in) == 0) {
            stop("No files in the following location match `pattern`:\n -", path)
        }
    }

    n_files <- length(to_read_in)

    ## A NULL `sep` cannot be recycled as a vector, so expand it to a list
    ## holding one NULL per file; `Map` then hands NULL to each read.
    if (is.null(sep)) sep <- vector("list", n_files)

    ## Per-file arguments, named after the `read_transcript` parameters they feed.
    arg_list <- list(
        skip = skip, merge.broke.tot = merge.broke.tot, header = header,
        dash = dash, ellipsis = ellipsis, quote2bracket = quote2bracket,
        rm.empty.rows = rm.empty.rows, na = na, sep = sep,
        comment.char = comment.char, max.person.nchar = max.person.nchar
    )

    ## Recycle length-one values to one value per file.
    arg_list <- lapply(arg_list, function(x) {
        if (length(x) == 1) rep(x, n_files) else x
    })

    ## Anything that is still not one-per-file is a user error; report the
    ## first offending argument.
    bad_args <- names(arg_list)[lengths(arg_list) != n_files]
    if (length(bad_args) > 0) {
        stop(paste0('`', bad_args[1], '` is not of length 1 or the same length as files in `path`'))
    }

    ## Read each file with its own settings.  `try` keeps one unreadable file
    ## from aborting the whole directory; failures are reported below.
    read_one <- function(file, skip, merge.broke.tot, header, dash, ellipsis,
        quote2bracket, rm.empty.rows, na, sep, comment.char, max.person.nchar) {
        try(read_transcript(file, col.names = col.names[-1], skip = skip,
            merge.broke.tot = merge.broke.tot, header = header, dash = dash,
            ellipsis = ellipsis, quote2bracket = quote2bracket,
            rm.empty.rows = rm.empty.rows, na = na, sep = sep,
            comment.char = comment.char, max.person.nchar = max.person.nchar, ...))
    }
    reads <- do.call(Map, c(list(f = read_one, file = to_read_in), arg_list))

    ## Name each result after its file (no directory, no extension); these
    ## names are passed through to `tidy_list` with the results.
    names(reads) <- tools::file_path_sans_ext(basename(to_read_in))

    goods <- !vapply(reads, inherits, logical(1), what = 'try-error')
    if (any(!goods)) {
        warning(paste0("The following files did not read in correctly:\n",
            paste0('  - ', to_read_in[!goods], collapse = "\n")
        ))
    }

    textshape::tidy_list(reads[goods], col.names[1])

}


2 changes: 1 addition & 1 deletion R/read_transcript.R
@@ -1,4 +1,4 @@
#' read_transcripts Into R
#' Read Transcripts Into R
#'
#' Read .docx, .csv, .xlsx, .xls, or .txt transcript style files into R.
#'
Expand Down

0 comments on commit c9db109

Please sign in to comment.