Skip to content

Commit

Permalink
improve import from redirect URLs or URLs w/o extensions (closes #36)
Browse files Browse the repository at this point in the history
  • Loading branch information
leeper committed Dec 22, 2015
1 parent 2e1fa8b commit 2172307
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Authors@R: c(person("Chung-hong", "Chan", role = "aut", email = "chainsawtiney@g
person("Ista", "Zahn", role = "aut"))
Description: Streamlined data import and export by making assumptions that the user is probably willing to make: 'import()' and 'export()' determine the data structure from the file extension, reasonable defaults are used for data import and export (e.g., 'stringsAsFactors=FALSE'), web-based import is natively supported (including from SSL/HTTPS), compressed files can be read directly without explicit decompression, and fast import packages are used where appropriate.
Depends: R (>= 2.15.0)
Imports: tools, utils, urltools, foreign, haven, longurl, openxlsx, readODS, jsonlite, XML, curl (>= 0.6), data.table (>= 1.9.5), readxl, yaml
Imports: tools, utils, urltools, foreign, haven, openxlsx, readODS, jsonlite, XML, curl (>= 0.6), data.table (>= 1.9.5), readxl, yaml
Suggests: bit64, testthat, knitr, magrittr
License: GPL-2
VignetteBuilder: knitr
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,3 @@ importFrom(jsonlite, fromJSON, toJSON)
importFrom(openxlsx, read.xlsx, write.xlsx)
importFrom(readxl, read_excel)
importFrom(curl, curl_download, curl_fetch_memory)
importFrom(longurl, expand_urls)
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# CHANGES TO v0.2.5 #

* If the file format of a remote file cannot be identified from the supplied URL or from the final URL reported by `curl::curl_fetch_memory()`, the HTTP `Content-Disposition` header is checked for a filename. (#36)
* Removed the longurl dependency. It is no longer needed because file formats can now be identified from the final (post-redirect) URL returned by `curl::curl_fetch_memory()`.
* Fixed a bug related to importing European-style ("csv2") format files. (#44)
* Updated CSVY import to embed variable-level metadata. (#52)
* Use `urltools::url_parse()` to extract file extensions from complex URLs (e.g., those with query arguments). (#56)
Expand Down
51 changes: 39 additions & 12 deletions R/import.R
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,52 @@ import.clipboard <- function(header = TRUE, sep = "\t", ...) {
}
}

import <- function(file, format, setclass, expandurl = TRUE, ...) {
import <- function(file, format, setclass, ...) {
if (grepl("^http.*://", file)) {
if(missing(format)) {
if (isTRUE(expandurl)) {
l_url <- expand_urls(file, warn = FALSE, .progress = FALSE)
if (!is.na(l_url$expanded_url[1])) {
file <- l_url$expanded_url[1]
}
}
fmt <- get_ext(file)
} else {
if (!missing(format)) {
fmt <- get_type(format)
}
# try to extract format from URL
try(fmt <- get_ext(file), silent = TRUE)
if (inherits(fmt, "try-error")) {
fmt <- "TMP"
}
# save file locally
temp_file <- tempfile(fileext = paste0(".", fmt))
on.exit(unlink(temp_file))
u <- curl_fetch_memory(file)
writeBin(object = u$content, con = temp_file)
#parse_headers(u$headers) # placeholder
file <- temp_file

if (fmt == "TMP") {
# try to extract format from curl's final URL
try(fmt <- get_ext(u$url), silent = TRUE)
if (inherits(fmt, "try-error")) {
# try to extract format from headers
h1 <- parse_headers(u$headers)
# check `Content-Disposition` header
if (any(grepl("^Content-Disposition", h1))) {
h <- h1[grep("filename", h1)]
if (length(h)) {
file <- regmatches(h, regexpr("(?<=\")(.*)(?<!\")", h, perl = TRUE))
if (!length(file)) {
file <- regmatches(h, regexpr("(?<=filename=)(.*)", h, perl = TRUE))
}
file <- paste0(dirname(temp_file), "/", file)
file.rename(temp_file, file)
}
}
# check `Content-Type` header
#if (any(grepl("^Content-Type", h1))) {
# h <- h1[grep("^Content-Type", h1)]
# ## PARSE MIME TYPE
#}
} else {
file <- sub("TMP$", fmt, temp_file)
file.rename(temp_file, file)
}
} else {
file <- temp_file
}
}
if (grepl("zip$", file)) {
file <- parse.zip(file)
Expand Down
3 changes: 1 addition & 2 deletions man/import.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
\alias{import}
\title{Read data.frame or matrix from a file}
\usage{
import(file, format, setclass, expandurl = TRUE, ...)
import(file, format, setclass, ...)
}
\arguments{
\item{file}{A character string naming a file, URL, or single-file .zip or .tar archive.}
\item{format}{An optional character string code of file format, which can be used to override the format inferred from \code{file}. Shortcuts include: \dQuote{,} (for comma-separated values), \dQuote{;} (for semicolon-separated values), and \dQuote{|} (for pipe-separated values).}
\item{setclass}{An optional character vector specifying one or more classes to set on the import. By default, all the return object is always a \dQuote{data.frame}. Reasonable values for this might be \dQuote{tbl_df} (if using dplyr) or \dQuote{data.table} (if using data.table). Warnings will be produced if a class is used from a package that is not loaded and/or available.}
\item{expandurl}{Logical, whether or not to use \code{\link{longurl}} to expand a \code{file} that is specified through a shortened URL so that its \code{format} can be automatically determined.}
\item{...}{Additional arguments passed to the underlying import functions. For example, this can control column classes for delimited file types, or control the use of haven for Stata and SPSS or readxl for Excel (.xlsx) format. See details below.}
}
\value{An R data.frame. If \code{setclass} is used, this data.frame may have additional class attribute values.}
Expand Down

0 comments on commit 2172307

Please sign in to comment.