Skip to content

Commit

Permalink
improve import from redirect URLs or URLs w/o extensions (closes #36)
Browse files Browse the repository at this point in the history
  • Loading branch information
leeper committed Dec 22, 2015
1 parent 2e1fa8b commit 2172307
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Authors@R: c(person("Chung-hong", "Chan", role = "aut", email = "chainsawtiney@g
person("Ista", "Zahn", role = "aut"))
Description: Streamlined data import and export by making assumptions that the user is probably willing to make: 'import()' and 'export()' determine the data structure from the file extension, reasonable defaults are used for data import and export (e.g., 'stringsAsFactors=FALSE'), web-based import is natively supported (including from SSL/HTTPS), compressed files can be read directly without explicit decompression, and fast import packages are used where appropriate.
Depends: R (>= 2.15.0)
Imports: tools, utils, urltools, foreign, haven, longurl, openxlsx, readODS, jsonlite, XML, curl (>= 0.6), data.table (>= 1.9.5), readxl, yaml
Imports: tools, utils, urltools, foreign, haven, openxlsx, readODS, jsonlite, XML, curl (>= 0.6), data.table (>= 1.9.5), readxl, yaml
Suggests: bit64, testthat, knitr, magrittr
License: GPL-2
VignetteBuilder: knitr
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,3 @@ importFrom(jsonlite, fromJSON, toJSON)
importFrom(openxlsx, read.xlsx, write.xlsx)
importFrom(readxl, read_excel)
importFrom(curl, curl_download, curl_fetch_memory)
importFrom(longurl, expand_urls)
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# CHANGES TO v0.2.5 #

* If the file format of a remote file cannot be identified from the supplied URL or from the final URL reported by `curl::curl_fetch_memory()`, the HTTP `Content-Disposition` header is checked for a filename. (#36)
* Removed the longurl dependency. It is no longer needed because file formats can now be identified from the final (post-redirect) URL returned by `curl::curl_fetch_memory()`.
* Fixed a bug related to importing European-style ("csv2") format files. (#44)
* Updated CSVY import to embed variable-level metadata. (#52)
* Use `urltools::url_parse()` to extract file extensions from complex URLs (e.g., those with query arguments). (#56)
Expand Down
51 changes: 39 additions & 12 deletions R/import.R
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,52 @@ import.clipboard <- function(header = TRUE, sep = "\t", ...) {
}
}

import <- function(file, format, setclass, expandurl = TRUE, ...) {
import <- function(file, format, setclass, ...) {
if (grepl("^http.*://", file)) {
if(missing(format)) {
if (isTRUE(expandurl)) {
l_url <- expand_urls(file, warn = FALSE, .progress = FALSE)
if (!is.na(l_url$expanded_url[1])) {
file <- l_url$expanded_url[1]
}
}
fmt <- get_ext(file)
} else {
if (!missing(format)) {
fmt <- get_type(format)
}
# try to extract format from URL
try(fmt <- get_ext(file), silent = TRUE)
if (inherits(fmt, "try-error")) {
fmt <- "TMP"
}
# save file locally
temp_file <- tempfile(fileext = paste0(".", fmt))
on.exit(unlink(temp_file))
u <- curl_fetch_memory(file)
writeBin(object = u$content, con = temp_file)
#parse_headers(u$headers) # placeholder
file <- temp_file

if (fmt == "TMP") {
# try to extract format from curl's final URL
try(fmt <- get_ext(u$url), silent = TRUE)
if (inherits(fmt, "try-error")) {
# try to extract format from headers
h1 <- parse_headers(u$headers)
# check `Content-Disposition` header
if (any(grepl("^Content-Disposition", h1))) {
h <- h1[grep("filename", h1)]
if (length(h)) {
file <- regmatches(h, regexpr("(?<=\")(.*)(?<!\")", h, perl = TRUE))
if (!length(file)) {
file <- regmatches(h, regexpr("(?<=filename=)(.*)", h, perl = TRUE))
}
file <- paste0(dirname(temp_file), "/", file)
file.rename(temp_file, file)
}
}
# check `Content-Type` header
#if (any(grepl("^Content-Type", h1))) {
# h <- h1[grep("^Content-Type", h1)]
# ## PARSE MIME TYPE
#}
} else {
file <- sub("TMP$", fmt, temp_file)
file.rename(temp_file, file)
}
} else {
file <- temp_file
}
}
if (grepl("zip$", file)) {
file <- parse.zip(file)
Expand Down
3 changes: 1 addition & 2 deletions man/import.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
\alias{import}
\title{Read data.frame or matrix from a file}
\usage{
import(file, format, setclass, expandurl = TRUE, ...)
import(file, format, setclass, ...)
}
\arguments{
\item{file}{A character string naming a file, URL, or single-file .zip or .tar archive.}
\item{format}{An optional character string code of file format, which can be used to override the format inferred from \code{file}. Shortcuts include: \dQuote{,} (for comma-separated values), \dQuote{;} (for semicolon-separated values), and \dQuote{|} (for pipe-separated values).}
\item{setclass}{An optional character vector specifying one or more classes to set on the import. By default, all the return object is always a \dQuote{data.frame}. Reasonable values for this might be \dQuote{tbl_df} (if using dplyr) or \dQuote{data.table} (if using data.table). Warnings will be produced if a class is used from a package that is not loaded and/or available.}
\item{expandurl}{Logical, whether or not to use \code{\link{longurl}} to expand a \code{file} that is specified through a shortened URL so that its \code{format} can be automatically determined.}
\item{...}{Additional arguments passed to the underlying import functions. For example, this can control column classes for delimited file types, or control the use of haven for Stata and SPSS or readxl for Excel (.xlsx) format. See details below.}
}
\value{An R data.frame. If \code{setclass} is used, this data.frame may have additional class attribute values.}
Expand Down

0 comments on commit 2172307

Please sign in to comment.