diff --git a/DESCRIPTION b/DESCRIPTION
index 8ec2e74..3a24e6c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dplyr
 Type: Package
 Title: dplyr: a grammar of data manipulation
-Version: 0.1.2
+Version: 0.1.3
 Author: Hadley Wickham , Romain Francois
 Maintainer: Hadley Wickham 
@@ -16,7 +16,7 @@ Suggests: hflights, RSQLite, RSQLite.extfuns, RMySQL, RPostgreSQL, ggplot2, mgcv
 VignetteBuilder: knitr
 LazyData: yes
-LinkingTo: Rcpp (>= 0.11.0), BH (>= 1.51.0-2)
+LinkingTo: Rcpp (>= 0.11.1), BH (>= 1.51.0-2)
 License: MIT + file LICENSE
 Collate: 'RcppExports.R' 'all-equal.r' 'base.R' 'bench-compare.r'
     'cbind.r' 'chain.r' 'compute-collect.r' 'copy-to.r'
@@ -35,7 +35,7 @@ Collate: 'RcppExports.R' 'all-equal.r' 'base.R' 'bench-compare.r'
     'translate-sql-helpers.r' 'translate-sql-base.r'
     'translate-sql-window.r' 'translate-sql.r' 'type-sum.r'
     'utils-format.r' 'utils.r' 'view.r' 'zzz.r'
-Packaged: 2014-02-24 14:57:47 UTC; hadley
+Packaged: 2014-03-14 21:16:41 UTC; hadley
 NeedsCompilation: yes
 Repository: CRAN
-Date/Publication: 2014-02-24 16:36:07
+Date/Publication: 2014-03-15 00:36:22
diff --git a/MD5 b/MD5
index a0e573b..022a38a 100644
--- a/MD5
+++ b/MD5
@@ -1,6 +1,6 @@
-9562a9616e03bc929613661404fcfa2b *DESCRIPTION
+ffb471b74ddb3ed14c68b94e80c8bd2f *DESCRIPTION
 efcdf377730211753577de79ade6efdd *LICENSE
-f85798b8637ba953e3809a652e16bb9e *NAMESPACE
+dfa9b298237419bb0f9ae6b0ba2fb481 *NAMESPACE
 a3c77dc900f7307b5c9be0f028c98c54 *R/RcppExports.R
 05ca53535d5563dad7a4ab2977d38981 *R/all-equal.r
 ddb4cf3f09b89bda6a1c338bad01eaaf *R/base.R
@@ -14,7 +14,7 @@ ebb12675ad0f086ae1f96b5853f03e1c *R/data-hflights.r
 5744860420ce850f3f6f83c02d0d59d4 *R/data-nasa.r
 dbc4f2616f56258bcf6b76f71fa826ea *R/data-temp.r
 273c08017cd317e97b96410c2474924c *R/data.r
-4136061f3154d4981dada3bca96f80e0 *R/dbi-s3.r
+443e253d381bd462e77b8b6e5ba67ece *R/dbi-s3.r
 2ad6e93a659bdb0307275e035028b20d *R/desc.r
 10f579a57cc418ae8e83eca8c265b0cc *R/do.r
 58978c7bf590a7369381542205230dcd *R/dplyr.r
@@ -28,14 +28,14 @@ d6c7d60c41541ff6f61ef15065efc869 *R/grouped-dt.r
 acb7601f471d804cf744a93944ae5e24 *R/inline.r
 2e8f63b45d4c33f74238609015ec43f6 *R/join-df.r
 fdad6d946b904a191bd0e82ac7addcdb *R/join-dt.r
-bdadffe13d2a6ac6b720f606b04919c4 *R/join-sql.r
+9a0977d3fa2c7dd664942940de0d1f04 *R/join-sql.r
 4294e1dcc0e52baadb5924d69ba7723f *R/join.r
 284f0bc5a4ab77b391101a4bd076cd8e *R/lead-lag.R
 fcf6a838baefa7aa2baafdd320905ff2 *R/location.R
 e77ce019a92042c7e9aff43b471b274c *R/manip-cube.r
 8f9a897715df8722119073e827f541dd *R/manip-df.r
-35898e73ac28af17c9ecc267dd5a044b *R/manip-dt.r
-32dd0477fa684a840bc0c1429956e6df *R/manip-grouped-dt.r
+cad6208c82713597384ad24dd8626e91 *R/manip-dt.r
+58dbefbbeef55f115ae53f789a8b1c70 *R/manip-grouped-dt.r
 da6282cdc3e02970636b1f26db808cb8 *R/manip-sql.r
 a3d5227861d70494c3f4104d7ab1c274 *R/manip.r
 be30476e59730e96b59f0441bf618e2e *R/nth-value.R
@@ -69,7 +69,7 @@ a0d09b171ea3b2ae82265876b9bedec3 *R/tbl-df.r
 34f0eea158e652e12db73e9c39e2f884 *R/translate-sql-window.r
 b15b25978391ae58cac82a91b659f865 *R/translate-sql.r
 e51520b2617e10c424fd0b45beaa259d *R/type-sum.r
-85f584c1d3d83e9c3f7f9bc3866e17b2 *R/utils-format.r
+ab8a65154aade259dba4b996121aab8c *R/utils-format.r
 6491444d141099b758785078e6ccaf5b *R/utils.r
 4d9128b66f58cee5fcb57a8e470c82c4 *R/view.r
 1de6fcb5d9c556ccd931f51f425d946d *R/zzz.r
@@ -80,21 +80,21 @@ db40a0145d2a88069865e7f18d3dcf1f *data/nasa.rda
 f3b987de99285483ea0cae5028207cfe *demo/bench-merge.R
 d328eee82d3c54cb13af35bb696caaa6 *demo/bench-rbind.R
 d6bff9b2006cfdf6fe61a1b6cffcd285 *demo/bench-set.R
-ee27bbc32ea6fa4f471537ab2a629450 *inst/doc/databases.R
-fec565e75f4edbf88fd294bf88fe4183 *inst/doc/databases.Rmd
-3cc882bcfad57b5abf4b651d7f4ce4f9 *inst/doc/databases.html
+f2c42b95eb8ebac57ee3bb91a62c4324 *inst/doc/databases.R
+3503bfe827ec6123450c6de9da45f445 *inst/doc/databases.Rmd
+d0119a9073373dad4968fcc0c4993320 *inst/doc/databases.html
 53c28c818e61a19907db6a0b76786943 *inst/doc/hybrid-evaluation.R
 5b90937d6069340c27f29589f8dd478f *inst/doc/hybrid-evaluation.Rmd
 35e707c0233a6d87e5aec274bf8cb32a *inst/doc/hybrid-evaluation.html
 7880dcbbc2ef5a9066a47fdf72d9b53f *inst/doc/introduction.R
-05e037a31750ef8fd422dcff0b40b111 *inst/doc/introduction.Rmd
-da8a118d325a2cf376a1f8f70b994db6 *inst/doc/introduction.html
-aaa2bd4eeb0c7ba04eedaf49172b523e *inst/doc/window-functions.R
-ff626a70b5119f8bcbe51f06cf57f892 *inst/doc/window-functions.Rmd
-ae44f8b30264822383b34920767f7b9b *inst/doc/window-functions.html
+fc3fd6d6c9a7bae5339eba50b18e0279 *inst/doc/introduction.Rmd
+8d930dd5c36db6a1e44572eee4048335 *inst/doc/introduction.html
+6ee55a112c571a3ffa65271b6a6946b5 *inst/doc/window-functions.R
+18c6d2bdc11f9774d76bbc372b0c23ec *inst/doc/window-functions.Rmd
+9d0ebffd3c9c19aaf3893d852abe0f03 *inst/doc/window-functions.html
 2e55b2b2c065264bf676715799917446 *inst/include/dplyr.h
 d712283197ffe37c10fb60ef728a4652 *inst/include/dplyr/BoolResult.h
-38447847a9619459158c71a8d3a362e6 *inst/include/dplyr/Collecter.h
+95bbd7e79710004cbcea0d4c3e864473 *inst/include/dplyr/Collecter.h
 a95ac349adb05141ec17bc25ce8dcfd9 *inst/include/dplyr/DataFrameJoinVisitors.h
 72ae8dca809b468f4c674062b8d90079 *inst/include/dplyr/DataFrameVisitors.h
 e7f539d2e686294c257d5dd6edab558b *inst/include/dplyr/DataFrameVisitorsIndexMap.h
@@ -102,9 +102,9 @@ ec9e401da57308dad4df9f919a70262b *inst/include/dplyr/DataFrameVisitorsIndexSet.h
 9c2ddb8a034638bbf84c729edb7764e1 *inst/include/dplyr/EmptySubset.h
 0b201d156dd149ac2214246d83e0587c *inst/include/dplyr/FullDataFrame.h
 07d812af781c7a597cf4e7cc7d1f8037 *inst/include/dplyr/Gatherer.h
-180bf19356abec5515724dbd863af7b6 *inst/include/dplyr/GroupedDataFrame.h
+899a9a8db739204288f693caf1ea69fd *inst/include/dplyr/GroupedDataFrame.h
 81d6731f78b1e17697aaa93a465955d4 *inst/include/dplyr/JoinVisitor.h
-c3bf69c1340361c1854ab5777db52675 *inst/include/dplyr/JoinVisitorImpl.h
+8cf938eb62881cf95dc7786f4b481dcf *inst/include/dplyr/JoinVisitorImpl.h
 5f2cab6f3f7d5d4d4c6c115856513617 *inst/include/dplyr/NamedListAccumulator.h
 ba58b85eebf33f253f856976b8b401c3 *inst/include/dplyr/Order.h
 5633c9fe5ef847bf5906eb29815ddff5 *inst/include/dplyr/OrderVisitor.h
@@ -149,7 +149,7 @@ a4a7b9d8ff53dfa73eb0feca626a6ea1 *inst/include/dplyr/Result/max.h
 d3e3361e8ab107064f07e0822e3f1f65 *inst/include/dplyr/Result/min.h
 baf9bbb433b610536594c214d68b42dc *inst/include/dplyr/SummarisedVariable.h
 60bede6dba3153744e89c578d67394a1 *inst/include/dplyr/VectorVisitor.h
-252a2841f207733058e3b3dca2f2c243 *inst/include/dplyr/VectorVisitorImpl.h
+b8d26aff32b8d40719eebdda220b05dd *inst/include/dplyr/VectorVisitorImpl.h
 0a2ea8b5c04bb2b4b3b4d0e769a8b690 *inst/include/dplyr/check_supported_type.h
 e829621788476eb0ade32804c9da173b *inst/include/dplyr/comparisons.h
 7d4d427a6dc08dc4e08140771945406c *inst/include/dplyr/registration.h
@@ -184,7 +184,7 @@ cd00f5b80e3fe023bb254a819e17a572 *inst/include/tools/hash.h
 41148b4d529eef9ebf82f4719cdb48dd *inst/include/tools/tools.h
 e8314bb53785945b394173724249edb0 *inst/include/tools/wrap_subset.h
 da3fe6cebc883b4d0581bdcda1ca7146 *inst/tests/helper-data.r
-376fed4bfd27f2211423537539b88319 *inst/tests/test-arrange.r
+1a8aa7229346de4146476cd2992cacbc *inst/tests/test-arrange.r
 4d52dee57e752f870273d56d01904474 *inst/tests/test-cbind.R
 39988efc666e80566c47eb579443096d *inst/tests/test-copying.R
 7428eb39633a9cdcba500c2550774a43 *inst/tests/test-count.r
@@ -195,13 +195,13 @@ b8182a80cabc5e0dd47a5a76a9d4c66c *inst/tests/test-equiv-manip.r
 e4a0af09c1235803412874a1e43d3088 *inst/tests/test-filter-windowed.R
 9667de562a8713b78df2577583c0f593 *inst/tests/test-filter.r
 8bfc92dd97695014b3b0fe5567ef71bf *inst/tests/test-group-by.r
-9d71f585287a49567ca67e8337afd40c *inst/tests/test-joins.r
+00770ba2f1c5bca8f81bc901c0c6b86f *inst/tests/test-joins.r
 f24c693643de1ea17b87c1df9d32d1ec *inst/tests/test-lead-lag.R
 3566efc0ada3ac3ad8c09f6d9e3ce2c2 *inst/tests/test-mutate-windowed.R
 0775fdf22738928c152a63a0448bfdf2 *inst/tests/test-mutate.r
 881e7e5e30b63f953657d5b89532e760 *inst/tests/test-nth-value.R
-42cb5694e5d14710fa3ecaf6ce519035 *inst/tests/test-rbind.r
-4bdde1886d71f2f89d58b3563db309df *inst/tests/test-select.r
+e284b22d0a7d0ca0264526e6dfc9847e *inst/tests/test-rbind.r
+8d106d9d27aec54eede28bfdfbd9c918 *inst/tests/test-select.r
 bf6c33082e25991bc81b4bbc54ea13fd *inst/tests/test-sql-escape.r
 0f1b6d7ab07eb412c9fecf2c76748921 *inst/tests/test-sql-translation.r
 e1a5cffac4ebe5f0d9a773f5c6b0c37a *inst/tests/test-sqlite-do.r
@@ -234,7 +234,7 @@ f79bf9c4ca8f9b3b810419356f654590 *man/hflights_df.Rd
 7c40154e5820f7993d5d12a50d70fa20 *man/join.Rd
 28ab0e094aafee09c186b8b8cf0ba5a2 *man/join.tbl_df.Rd
 0e46ac1917fd5bc3e9e36218705afe81 *man/join.tbl_dt.Rd
-09d836d3bf214774f4a9f20eba0fc686 *man/join.tbl_sql.Rd
+69f8f22450c6ee248b0db2571c5930e4 *man/join.tbl_sql.Rd
 6a81c20b2f495f643fc4edd5055592e9 *man/lahman.Rd
 4741950e34ee9a9e0d33b127f8fcb319 *man/lead-lag.Rd
 89c5b5da3f33d9e875975668b0b50e2a *man/location.Rd
@@ -282,19 +282,19 @@ dde90f02a4ecf1906e681d2ad580bccf *src/Makevars
 dde90f02a4ecf1906e681d2ad580bccf *src/Makevars.win
 f01fca84c3e964251c648d7aca863b80 *src/RcppExports.cpp
 a6c5f62c1c14b7794bc7bfcc59ca8c77 *src/address.cpp
-8fb59ae44735657c3c6176840653d6ff *src/dplyr.cpp
+f22b7e38a7ed37e5a5cd19a936d2fea1 *src/dplyr.cpp
 90995377fb4770719a3271327bc5d3fd *src/filter.cpp
 e2cbde53c2cbc6feefbbbd265d9347c7 *src/init.cpp
 6f882eb21a671ae3f49e2a7c9529c43c *src/window.cpp
 3a7a51bb5059fdf3d254d644385012d5 *tests/test-all.R
-fec565e75f4edbf88fd294bf88fe4183 *vignettes/databases.Rmd
+3503bfe827ec6123450c6de9da45f445 *vignettes/databases.Rmd
 82084b31c1380fe79dd08e41069b8de3 *vignettes/disabled/benchmark-baseball.Rmd
 5b90937d6069340c27f29589f8dd478f *vignettes/hybrid-evaluation.Rmd
-05e037a31750ef8fd422dcff0b40b111 *vignettes/introduction.Rmd
+fc3fd6d6c9a7bae5339eba50b18e0279 *vignettes/introduction.Rmd
 50c26b952a43373d699173780d05a337 *vignettes/joins.graffle
 391a1e1601255c9e368dbd32b68b6126 *vignettes/notes/mysql-setup.Rmd
 5c93a8a98d068f0f241879dbe7e21ef9 *vignettes/notes/postgres-setup.Rmd
 ef5a210df50e79ac302dd92dd1e15ffb *vignettes/notes/vagrant-setup.Rmd
-ff626a70b5119f8bcbe51f06cf57f892 *vignettes/window-functions.Rmd
+18c6d2bdc11f9774d76bbc372b0c23ec *vignettes/window-functions.Rmd
 83cdde894e0c44ffda5a9dbae3c80092 *vignettes/windows.graffle
 2cc473a6bd316193615aee5045fcc835 *vignettes/windows.png
diff --git a/NAMESPACE b/NAMESPACE
index 76b2a93..65ac4e7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -182,6 +182,7 @@ S3method(sql_create_indexes,MySQLConnection)
 S3method(sql_insert_into,MySQLConnection)
 S3method(sql_insert_into,PostgreSQLConnection)
 S3method(sql_insert_into,SQLiteConnection)
+S3method(sql_select,DBIConnection)
 S3method(src_tbls,src_local)
 S3method(src_tbls,src_sql)
 S3method(summarise,data.frame)
diff --git a/R/dbi-s3.r b/R/dbi-s3.r
index 0484c23..f92639b 100644
--- a/R/dbi-s3.r
+++ b/R/dbi-s3.r
@@ -397,7 +397,12 @@ sql_analyze.MySQLConnection <- function(con, table) {
   qry_run(con, sql)
 }
 
-sql_select <- function(con, select, from, where = NULL, group_by = NULL,
+sql_select <- function(con, ...) {
+  UseMethod("sql_select")
+}
+
+#' @export
+sql_select.DBIConnection <- function(con, select, from, where = NULL, group_by = NULL,
                        having = NULL, order_by = NULL, limit = NULL,
                        offset = NULL) {
 
diff --git a/R/join-sql.r b/R/join-sql.r
index afc4820..d2ac94f 100644
--- a/R/join-sql.r
+++ b/R/join-sql.r
@@ -64,7 +64,7 @@
 #' semi_join(people, hof)
 #'
 #' # All people not in the hall of fame
-#' semi_join(people, hof, anti = TRUE)
+#' anti_join(people, hof)
 #'
 #' # Find all managers
 #' manager <- tbl(lahman_sqlite(), "Managers")
@@ -118,21 +118,24 @@ join_sql <- function(x, y, type, by = NULL, copy = FALSE,
                      auto_index = FALSE, ...) {
   type <- match.arg(type, c("left", "right", "inner", "full"))
   by <- by %||% common_by(x, y)
-  
+
   y <- auto_copy(x, y, copy, indexes = if (auto_index) list(by))
 
   # Ensure tables have unique names
   x_names <- auto_names(x$select)
   y_names <- auto_names(y$select)
-  
+
   uniques <- unique_names(x_names, y_names, by)
-  if (!is.null(uniques)) {
+  if (is.null(uniques)) {
+    sel_vars <- c(x_names, y_names)
+  } else {
     x <- update(x, select = setNames(x$select, uniques$x))
-    y <- update(x, select = setNames(y$select, uniques$y))
+    y <- update(y, select = setNames(y$select, uniques$y))
+
+    sel_vars <- unique(c(uniques$x, uniques$y))
   }
+  vars <- lapply(c(by, setdiff(sel_vars, by)), as.name)
 
-  vars <- lapply(c(by, setdiff(c(x_names, y_names), by)), as.name)
-  
   join <- switch(type, left = sql("LEFT"), inner = sql("INNER"),
     right = stop("Right join not supported", call. = FALSE),
     full = stop("Full join not supported", call. = FALSE))
@@ -154,7 +157,7 @@ is.join <- function(x) {
 
 semi_join_sql <- function(x, y, anti = FALSE, by = NULL, copy = FALSE,
                           auto_index = FALSE, ...) {
-  
+
   by <- by %||% common_by(x, y)
   y <- auto_copy(x, y, copy, indexes = if (auto_index) list(by))
 
@@ -162,8 +165,8 @@ semi_join_sql <- function(x, y, anti = FALSE, by = NULL, copy = FALSE,
   by_escaped <- escape(ident(by), collapse = NULL, con = con)
   left <- escape(ident("_LEFT"), con = con)
   right <- escape(ident("_RIGHT"), con = con)
-  
-  join <- sql(paste0(left, ".", by_escaped, " = ", right, ".", by_escaped, 
+
+  join <- sql(paste0(left, ".", by_escaped, " = ", right, ".", by_escaped,
     collapse = " AND "))
 
   from <- build_sql(
diff --git a/R/manip-dt.r b/R/manip-dt.r
index 47be50b..ed49379 100644
--- a/R/manip-dt.r
+++ b/R/manip-dt.r
@@ -123,7 +123,10 @@ arrange.tbl_dt <- function(.data, ...) {
 #' @export
 select.data.table <- function(.data, ...) {
   vars <- select_vars(names(.data), ..., env = parent.frame())
-  .data[, vars, drop = FALSE, with = FALSE]
+
+  out <- .data[, vars, drop = FALSE, with = FALSE]
+  setnames(out, names(vars))
+  out
 }
 
 #' @export
diff --git a/R/manip-grouped-dt.r b/R/manip-grouped-dt.r
index a5a80ff..83e0918 100644
--- a/R/manip-grouped-dt.r
+++ b/R/manip-grouped-dt.r
@@ -141,6 +141,7 @@ arrange.grouped_dt <- function(.data, ...) {
 select.grouped_dt <- function(.data, ...) {
   vars <- select_vars(names(.data), ..., env = parent.frame())
   out <- .data[, vars, drop = FALSE, with = FALSE]
+  setnames(out, names(vars))
 
   grouped_dt(
     data = out,
diff --git a/R/utils-format.r b/R/utils-format.r
index 5935f6f..fcc5bc7 100644
--- a/R/utils-format.r
+++ b/R/utils-format.r
@@ -15,7 +15,7 @@ dim_desc <- function(x) {
   d <- dim(x)
   d2 <- format(d, big.mark = ",", justify = "none", trim = TRUE)
   d2[is.na(d)] <- "??"
-  
+
   paste0("[", paste0(d2, collapse = " x "), "]")
 }
 
@@ -24,16 +24,18 @@ dim_desc <- function(x) {
 trunc_mat <- function(x, n = NULL) {
   rows <- nrow(x)
   if (!is.na(rows) && rows == 0) return()
-  
+
   if (is.null(n)) {
     if (is.na(rows) || rows > getOption("dplyr.print_max")) {
-      n <- getOption("dplyr.print_min") 
+      n <- getOption("dplyr.print_min")
     } else {
       n <- rows
     }
   }
-  
+
   df <- as.data.frame(head(x, n))
+  if (nrow(df) == 0) return()
+
   mat <- format(df, justify = "left")
   width <- getOption("width")
 
@@ -50,7 +52,7 @@ trunc_mat <- function(x, n = NULL) {
     df[[1]] <- substr(df[[1]], 1, width)
   }
   shrunk <- format(df[, !too_wide, drop = FALSE])
-  
+
   needs_dots <- is.na(rows) || rows > n
   if (needs_dots) {
     dot_width <- pmin(w[-1][!too_wide], 3)
@@ -64,14 +66,14 @@ trunc_mat <- function(x, n = NULL) {
     vars <- colnames(mat)[too_wide]
     types <- vapply(df[too_wide], type_sum, character(1))
     var_types <- paste0(vars, " (", types, ")", collapse = ", ")
-    
+
     cat(wrap("Variables not shown: ", var_types), "\n", sep = "")
   }
 }
 
 wrap <- function(..., indent = 0) {
   x <- paste0(..., collapse = "")
-  wrapped <- strwrap(x, indent = indent, exdent = indent + 2, 
+  wrapped <- strwrap(x, indent = indent, exdent = indent + 2,
     width = getOption("width"))
   paste0(wrapped, collapse = "\n")
 }
diff --git a/inst/doc/databases.R b/inst/doc/databases.R
index 43ba7d6..bbda378 100644
--- a/inst/doc/databases.R
+++ b/inst/doc/databases.R
@@ -83,13 +83,13 @@ translate_sql(1L)
 
 
 ## ------------------------------------------------------------------------
-translate_sql(glob(x, y)) 
+translate_sql(glob(x, y))
 translate_sql(x %like% "ab*")
 
 
 ## ------------------------------------------------------------------------
 planes <- group_by(hflights_sqlite, TailNum)
-delay <- summarise(planes, 
+delay <- summarise(planes,
   count = n(),
   dist = mean(Distance),
   delay = mean(ArrDelay)
@@ -107,12 +107,12 @@ if (has_lahman("postgres")) {
 ## ------------------------------------------------------------------------
 if (has_lahman("postgres")) {
   daily <- group_by(hflights_postgres, Year, Month, DayofMonth)
-  
+
   # Find the most and least delayed flight each day
-  bestworst <- filter(daily, ArrDelay == min(ArrDelay) || 
+  bestworst <- filter(daily, ArrDelay == min(ArrDelay) ||
     ArrDelay == max(ArrDelay))
   bestworst$query
-  
+
   # Rank each flight within a daily
   ranked <- mutate(daily, rank = rank(desc(ArrDelay)))
   ranked$query
diff --git a/inst/doc/databases.Rmd b/inst/doc/databases.Rmd
index 4d696d7..e2e98cd 100644
--- a/inst/doc/databases.Rmd
+++ b/inst/doc/databases.Rmd
@@ -19,9 +19,9 @@ As well as working with local in-memory data like data frames and data tables, d
 
 Since R almost exclusively works with in-memory data, if you do have a lot of data in a database, you can't just dump it into R. Instead, you'll have to work with subsets or aggregates, and dplyr aims to make that as easy as possible. If you're working with large data, it's also likely that you'll need support to get the data into the database and to ensure you have the right indices for good performance. dplyr provides some simple tools to help with these tasks but they are no substitute for a local expert.
 
-The motivation for supporting databases in dplyr is that you never pull down the right subset or aggregate from the database the first time, and usually you have to iterate between R and SQL many times before you get the perfect dataset. Switching between languages is cognitively challenging (especially because R and SQL are so perilously similar), so dplyr allows you to write R code that is automatically translated to SQL. The goal of dplyr is not to replace every SQL function with an R function: that would be difficult and error prone. Instead, dplyr only generates `SELECT` statements, the SQL you write most often as an analyst. 
+The motivation for supporting databases in dplyr is that you never pull down the right subset or aggregate from the database the first time, and usually you have to iterate between R and SQL many times before you get the perfect dataset. Switching between languages is cognitively challenging (especially because R and SQL are so perilously similar), so dplyr allows you to write R code that is automatically translated to SQL. The goal of dplyr is not to replace every SQL function with an R function: that would be difficult and error prone. Instead, dplyr only generates `SELECT` statements, the SQL you write most often as an analyst.
 
-To get the most out of this chapter, you'll need to be familiar with querying SQL databases using the `SELECT` statement. 
+To get the most out of this chapter, you'll need to be familiar with querying SQL databases using the `SELECT` statement.
 If you have some familiarity with SQL and you'd like to learn more, I found [how indexes work in SQLite](http://www.sqlite.org/queryplanner.html) and [10 easy steps to a complete understanding of SQL](http://tech.pro/tutorial/1555/10-easy-steps-to-a-complete-understanding-of-sql) to be particularly helpful.
 
 ## Getting started
@@ -88,7 +88,7 @@ c3 <- mutate(c2, Speed = Distance / AirTime * 60)
 c4 <- arrange(c3, Year, Month, DayofMonth, UniqueCarrier)
 ```
 
-Suprisingly, this sequence of operations never actually touches the database. It's not until you ask for the data (e.g. by printing `c4`) that dplyr generates the SQL and requests the results from the database, and even then it only pulls down 10 rows. 
+Suprisingly, this sequence of operations never actually touches the database. It's not until you ask for the data (e.g. by printing `c4`) that dplyr generates the SQL and requests the results from the database, and even then it only pulls down 10 rows.
 
 ```{r}
 c4
@@ -118,7 +118,7 @@ There are three ways to force the computation of a query:
 
 * `collect()` executes the query and returns the results to R.
 
-* `compute()` executes the query and stores the results in a temporary table 
+* `compute()` executes the query and stores the results in a temporary table
   in the database.
 
 * `collapse()` turns the query into a table expresion.
@@ -129,17 +129,17 @@ You are most likely to use `collect()`: once you have interactively converged on
 
 dplyr tries to prevent you from accidentally performing expensive query operations:
 
-* `nrow()` is always `NA`: in general, there's no way to determine how 
+* `nrow()` is always `NA`: in general, there's no way to determine how
   many rows a query will return unless you actually run it.
 
 * Printing a tbl only runs the query enough to get the first 10 rows
 
-* You can use `tail()` on database tbls: you can't find the last rows 
-  without executing the whole query. 
+* You can use `tail()` on database tbls: you can't find the last rows
+  without executing the whole query.
 
 ## SQL translation
 
-When doing simple mathematical operations of the form you normally use when filtering, mutating and summarising it's relatively straightforward to translate R code to SQL (or indeed to any programming language). 
+When doing simple mathematical operations of the form you normally use when filtering, mutating and summarising it's relatively straightforward to translate R code to SQL (or indeed to any programming language).
 
 To experiment with the translation, use `translate_sql()`. The following examples work through some basic differences between R and SQL.
 
@@ -164,8 +164,8 @@ translate_sql(1L)
 dplyr knows how to convert the following R functions to SQL:
 
 * basic math operators: `+`, `-`, `*`, `/`, `%%`, `^`
-* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`, 
-  `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`, 
+* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`,
+  `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`,
   `log`, `log10`, `round`, `sign`, `sin`, `sinh`, `sqrt`, `tan`, `tanh`
 * logical comparisons: `<`, `<=`, `!=`, `>=`, `>`, `==`, `%in%`
 * boolean operations: `&`, `&&`, `|`, `||`, `!`, `xor`
@@ -183,7 +183,7 @@ translate_sql(mean(x, trim = T))
 Any function that dplyr does't know how to convert it leaves as is - that means if you want to use any other function that database provides, you can use it as is. Here a couple of examples that will work with [SQLite](http://www.sqlite.org/lang_corefunc.html):
 
 ```{r}
-translate_sql(glob(x, y)) 
+translate_sql(glob(x, y))
 translate_sql(x %like% "ab*")
 ```
 
@@ -193,7 +193,7 @@ SQLite lacks window functions, which are needed for grouped mutation and filteri
 
 ```{r}
 planes <- group_by(hflights_sqlite, TailNum)
-delay <- summarise(planes, 
+delay <- summarise(planes,
   count = n(),
   dist = mean(Distance),
   delay = mean(ArrDelay)
@@ -231,12 +231,12 @@ The following examples shows the grouped filter and mutate possible with Postgre
 ```{r}
 if (has_lahman("postgres")) {
   daily <- group_by(hflights_postgres, Year, Month, DayofMonth)
-  
+
   # Find the most and least delayed flight each day
-  bestworst <- filter(daily, ArrDelay == min(ArrDelay) || 
+  bestworst <- filter(daily, ArrDelay == min(ArrDelay) ||
     ArrDelay == max(ArrDelay))
   bestworst$query
-  
+
   # Rank each flight within a daily
   ranked <- mutate(daily, rank = rank(desc(ArrDelay)))
   ranked$query
@@ -253,7 +253,7 @@ In terms of functionality, MySQL lies somewhere between SQLite and PostgreSQL. I
 
 Bigquery is a hosted database server provided by google. To connect, you need to provide your `project`, `dataset` and optionally a project for `billing` (if billing for `project` isn't enabled). After you create the src, your web browser will open and ask you to authenticate. Your credentials are stored in a local cache, so you should only need to do this once.
 
-Bigquery supports only a single SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis, and within SELECT bigquery provides comprehensive coverage similar level to postgresql. 
+Bigquery supports only a single SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis, and within SELECT bigquery provides comprehensive coverage similar level to postgresql.
 
 ## Picking a database
 
diff --git a/inst/doc/databases.html b/inst/doc/databases.html
index 2ddd0da..adf69a2 100644
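For context on the databases vignette hunks above: verbs on a database tbl are lazy, and nothing runs until you ask for rows. A minimal sketch, not part of the diff, assuming RSQLite is installed and using `mtcars` as a stand-in table:

```r
library(dplyr)

db <- src_sqlite(tempfile(), create = TRUE)  # throwaway SQLite database
mt <- copy_to(db, mtcars, "mtcars")          # copy a local data frame into it

# Each verb only builds up a query; no SQL is sent yet
q <- mt %.% select(mpg, cyl, wt) %.% filter(cyl == 4)

q$query             # inspect the SELECT that dplyr generated
res <- collect(q)   # force execution and pull the rows back into R
```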
diff --git a/inst/doc/introduction.Rmd b/inst/doc/introduction.Rmd
index 90942bf..ad50a90 100644
--- a/inst/doc/introduction.Rmd
+++ b/inst/doc/introduction.Rmd
@@ -269,7 +269,7 @@ filter(
 )
 ```
 
-This is difficult to read because the order of the operations is from inside to out, and the arguments are a long way away from the function. To get around this problem, dplyr provides the `%.%` operator. `x %.% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations so you can read from left-to-riht, top-to-bottom:
+This is difficult to read because the order of the operations is from inside to out, and the arguments are a long way away from the function. To get around this problem, dplyr provides the `%.%` operator. `x %.% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations so you can read from left-to-right, top-to-bottom:
 
 ```{r, eval = FALSE}
 hflights %.%
@@ -309,11 +309,11 @@ Compared to DBI and the database connection algorithms:
 
 * it hides, as much as possible, the fact that you're working with a remote database
 * you don't need to know any sql (although it helps!)
-* it shims over the many differences between the difference DBI implementations
+* it shims over the many differences between the different DBI implementations
 
 ## Multidimensional arrays / cubes
 
-`tbl_cube()` provides an experimental interface to multidimenssional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
+`tbl_cube()` provides an experimental interface to multidimensional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
 
 # Comparisons
 
diff --git a/inst/doc/introduction.html b/inst/doc/introduction.html
index ef45f96..623af29 100644
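For reference, the chaining idiom that the corrected introduction text above describes, written out in full as a sketch (assumes the `hflights` package used by the vignette):

```r
library(dplyr)
library(hflights)

# x %.% f(y) becomes f(x, y), so the steps read top to bottom
hflights %.%
  group_by(Year, Month, DayofMonth) %.%
  summarise(
    arr = mean(ArrDelay, na.rm = TRUE),
    dep = mean(DepDelay, na.rm = TRUE)
  ) %.%
  filter(arr > 30 | dep > 30)
```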
diff --git a/inst/doc/window-functions.R b/inst/doc/window-functions.R
index 90ccb57..762870d 100644
--- a/inst/doc/window-functions.R
+++ b/inst/doc/window-functions.R
@@ -10,7 +10,7 @@ batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
 batting <- arrange(batting, playerID, yearID, teamID)
 players <- group_by(batting, playerID)
 
-# For each player, find the two years with most home runs
+# For each player, find the two years with most hits
 filter(players, min_rank(desc(H)) <= 2 & H > 0)
 # Within each player, rank each year by the number of games played
 mutate(players, G_rank = min_rank(G))
@@ -67,7 +67,7 @@ mutate(players, G_delta = G - lag(G))
 
 ## ----, results = "hide"--------------------------------------------------
 # Find when a player changed teams
-filter(players, teamID != lag(teamID)); TRUE
+filter(players, teamID != lag(teamID))
 
 
 ## ------------------------------------------------------------------------
diff --git a/inst/doc/window-functions.Rmd b/inst/doc/window-functions.Rmd
index 3dd3997..ba2d595 100644
--- a/inst/doc/window-functions.Rmd
+++ b/inst/doc/window-functions.Rmd
@@ -20,7 +20,7 @@ batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
 batting <- arrange(batting, playerID, yearID, teamID)
 players <- group_by(batting, playerID)
 
-# For each player, find the two years with most home runs
+# For each player, find the two years with most hits
 filter(players, min_rank(desc(H)) <= 2 & H > 0)
 # Within each player, rank each year by the number of games played
 mutate(players, G_rank = min_rank(G))
@@ -137,7 +137,7 @@ You can use them to:
   
     ```{r, results = "hide"}
     # Find when a player changed teams
-    filter(players, teamID != lag(teamID)); TRUE
+    filter(players, teamID != lag(teamID))
    ```
 
 `lead()` and `lag()` have an optional argument `order_by`. If set, instead of using the row order to determine which value comes before another, they will use another variable. This important if you have not already sorted the data, or you want to sort one way and lag another.
diff --git a/inst/doc/window-functions.html b/inst/doc/window-functions.html
index 96cf328..366b51c 100644
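A small illustration of the ranking helpers behind the corrected comment above (`min_rank()` combined with `desc()`); the values are made up:

```r
library(dplyr)

x <- c(10, 4, 4, 7)
min_rank(x)        # 4 1 1 3 -- ties share the smallest available rank
min_rank(desc(x))  # 1 3 3 2 -- rank from largest to smallest, as used for "top two years"
```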
diff --git a/inst/include/dplyr/Collecter.h b/inst/include/dplyr/Collecter.h
index 2b4f6c6..5b1b331 100644
--- a/inst/include/dplyr/Collecter.h
+++ b/inst/include/dplyr/Collecter.h
@@ -120,9 +120,13 @@ namespace dplyr {
             
             SEXP* levels_ptr = Rcpp::internal::r_vector_start(levels) ;
             int* source_ptr = Rcpp::internal::r_vector_start(source) ;
-            for( int i=0; i<index.size(); i++){
-                SEXP x = levels_ptr[ source_ptr[i] - 1 ] ;
-                data[ index[i] ] = levels_map.find(x)->second ;
+            for( int i=0; i<index.size(); i++){
+                if( source_ptr[i] == NA_INTEGER ){
+                    data[ index[i] ] = NA_INTEGER ;
+                } else {
+                    SEXP x = levels_ptr[ source_ptr[i] - 1 ] ;
+                    data[ index[i] ] = levels_map.find(x)->second ;
+                }
             } 
         }
         
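The R-level behaviour this Collecter.h change supports (an `NA` code in a factor column must survive row binding; see the `rbind_list()` test for #279 further down). A rough sketch, assuming factors are created by `data.frame()`'s default `stringsAsFactors`:

```r
library(dplyr)

xx <- data.frame(a = NA_real_, b = "c", c = "d")   # b and c become factors
zz <- data.frame(a = 1,        b = NA,  c = "b")

res <- rbind_list(xx, zz)   # warns because the factor levels differ; values are combined
res$b                       # "c" NA -- the NA is preserved rather than mangled
```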
diff --git a/inst/include/dplyr/GroupedDataFrame.h b/inst/include/dplyr/GroupedDataFrame.h
index ff7c462..b93c575 100644
--- a/inst/include/dplyr/GroupedDataFrame.h
+++ b/inst/include/dplyr/GroupedDataFrame.h
@@ -93,7 +93,7 @@ namespace Rcpp {
     
     template <>
     inline bool is<GroupedDataFrame>( SEXP x){
-        return Rf_inherits(x, "grouped_df" ) ;
+        return Rf_inherits(x, "grouped_df" ) && Rf_getAttrib(x, Rf_install("vars") ) != R_NilValue ;
     }
     
     inline GroupedDataFrameIndexIterator::GroupedDataFrameIndexIterator( const GroupedDataFrame& gdf_ ) : 
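For orientation, the stricter grouped_df check above keys on the `vars` attribute that `group_by()` attaches; a quick, illustrative R-level look:

```r
library(dplyr)

gdf <- group_by(mtcars, cyl)
class(gdf)          # "grouped_df" "tbl_df" "tbl" "data.frame"
attr(gdf, "vars")   # list of grouping symbols, e.g. list(quote(cyl));
                    # objects carrying the class but lacking this attribute
                    # are no longer treated as grouped data frames
```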
diff --git a/inst/include/dplyr/JoinVisitorImpl.h b/inst/include/dplyr/JoinVisitorImpl.h
index 15d5c00..a375510 100644
--- a/inst/include/dplyr/JoinVisitorImpl.h
+++ b/inst/include/dplyr/JoinVisitorImpl.h
@@ -152,7 +152,11 @@ namespace dplyr{
         boost::hash string_hash ;
     
         inline SEXP get(int i){
-            return i>=0 ? left_levels_ptr[ left[i] - 1] : right_levels_ptr[right[-i-1] - 1] ;    
+            if( i >= 0 ){
+                return ( left[i] == NA_INTEGER ) ? NA_STRING : left_levels_ptr[ left[i] - 1] ;
+            } else {
+                return ( right[-i-1] == NA_INTEGER ) ? NA_STRING : right_levels_ptr[right[-i-1] - 1] ;                  
+            }
         }
         
     } ;
diff --git a/inst/include/dplyr/VectorVisitorImpl.h b/inst/include/dplyr/VectorVisitorImpl.h
index 25b0b35..1d4c332 100644
--- a/inst/include/dplyr/VectorVisitorImpl.h
+++ b/inst/include/dplyr/VectorVisitorImpl.h
@@ -18,6 +18,7 @@ namespace dplyr {
     template <> inline std::string VectorVisitorType<REALSXP>(){ return "numeric" ; }
     template <> inline std::string VectorVisitorType<LGLSXP>() { return "logical" ; }
     template <> inline std::string VectorVisitorType<STRSXP>() { return "character" ; }
+    template <> inline std::string VectorVisitorType<VECSXP>() { return "list" ; }
     
     /** 
      * Implementations 
@@ -102,14 +103,28 @@ namespace dplyr {
         inline SEXP subset_int_index( const Container& index ) const {
             int n = output_size(index) ;
             VECTOR out = Rcpp::no_init(n) ;
-            // TODO: find a way to mark that we don't need the NA handling
-            for( int i=0; i
+    template 
+    SEXP VectorVisitorImpl::subset_int_index( const Container& index ) const {
+        int n = output_size(index) ;
+        List out(n) ;
+        for( int i=0; i 
     class PromoteClassVisitor : public VisitorImpl {
     public:
@@ -268,6 +283,8 @@ namespace dplyr {
                 return new VectorVisitorImpl( vec ) ;
             case LGLSXP:  return new VectorVisitorImpl( vec ) ;
             case STRSXP:  return new VectorVisitorImpl( vec ) ;
+                
+            case VECSXP:  return new VectorVisitorImpl( vec ) ;
             default: break ;
         }
         
diff --git a/inst/tests/test-arrange.r b/inst/tests/test-arrange.r
index d33c4ea..8945f15 100644
--- a/inst/tests/test-arrange.r
+++ b/inst/tests/test-arrange.r
@@ -80,3 +80,10 @@ test_that("arrange uses the white list", {
 
 })
 
+test_that("arrange handles list columns (#282)", {
+  df <- data.frame( a = 2:1 )
+  df$b <- list( "foo", "bar" )
+  res <- arrange(df, a)
+  expect_equal(res$b, list( "bar", "foo" ) )
+})
+
diff --git a/inst/tests/test-joins.r b/inst/tests/test-joins.r
index ba9f764..ef5cf66 100644
--- a/inst/tests/test-joins.r
+++ b/inst/tests/test-joins.r
@@ -122,3 +122,11 @@ test_that("univariate left join has all columns, all rows", {
   expect_equal(j1$z.y, c(1, 1, 2, 3, NA))
   expect_equal(j2$z.y, c(1, 2, 3, 3, NA))
 })
+
+test_that("inner_join does not segfault on NA in factors (#306)", {
+  a <- data.frame(x=c("p", "q", NA), y=c(1, 2, 3), stringsAsFactors=TRUE)
+  b <- data.frame(x=c("p", "q", "r"), z=c(4,5,6), stringsAsFactors=TRUE)
+  res <- inner_join(a, b)
+  expect_equal( nrow(res), 2L )
+})
+
diff --git a/inst/tests/test-rbind.r b/inst/tests/test-rbind.r
index 3384594..4396a7e 100644
--- a/inst/tests/test-rbind.r
+++ b/inst/tests/test-rbind.r
@@ -80,3 +80,13 @@ test_that( "rbind handles NULL",{
   expect_equal(nrow(res), 30L)
 })
 
+test_that( "rbind handles NA in factors #279", {
+  xx <- as.data.frame(list(a=as.numeric(NA), b="c", c="d")) 
+  zz <- as.data.frame(list(a=1, b=as.character(NA), c="b"))
+  expect_warning( res <- rbind_list( xx, zz ) )
+  
+  expect_equal(res$a, c(NA,1.0))
+  expect_equal(res$b, c("c", NA))
+  expect_equal(res$c, c("d","b"))
+  
+})
diff --git a/inst/tests/test-select.r b/inst/tests/test-select.r
index 3959c56..33a72f1 100644
--- a/inst/tests/test-select.r
+++ b/inst/tests/test-select.r
@@ -62,3 +62,26 @@ test_that("num_range selects numeric ranges", {
   expect_equal(select_vars(vars, num_range("x", 10:11, width = 2)), vars[5:6])
 })
 
+# Data table -------------------------------------------------------------------
+
+test_that("select changes columns in copy of data table", {dt <- data.table(x = 1:4, y = letters[1:4])
+
+  expect_equal(names(select(dt, x, z = y)), c("x", "z"))
+  expect_equal(names(dt), c("x", "y"))
+
+
+  gdt <- dt %.% group_by(x)
+  expect_equal(names(select(gdt, x, z = y)), c("x", "z"))
+  expect_equal(names(gdt), c("x", "y"))
+})
+
+test_that("select can be before group_by (#309)",{
+  df <- data.frame(id=c(1,1,2,2,2,3,3,4,4,5), year=c(2013,2013,2012,2013,2013,2013,2012,2012,2013,2013), var1=rnorm(10))
+  dfagg <- df %.%
+    group_by(id, year) %.%
+    select(id, year, var1) %.%
+    summarise(var1=mean(var1))
+  expect_equal(names(dfagg), c("id", "year", "var1"))
+  expect_equal(attr(dfagg, "vars" ), list(quote(id)))
+  
+})
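A sketch of the mechanics the data.table tests above exercise: `select_vars()` returns a named character vector, and the fixed `select.data.table()` applies those names to a copy with data.table's `setnames()`. The `vars` value here is written out by hand for illustration:

```r
library(data.table)

dt   <- data.table(x = 1:4, y = letters[1:4])
vars <- c(x = "x", z = "y")                    # what select_vars(names(dt), x, z = y) yields

out <- dt[, vars, drop = FALSE, with = FALSE]  # copy of the selected columns
setnames(out, names(vars))                     # rename the copy only
names(out)   # "x" "z"
names(dt)    # "x" "y" -- the original data.table is untouched
```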
diff --git a/man/join.tbl_sql.Rd b/man/join.tbl_sql.Rd
index dd6f604..16de4a2 100644
--- a/man/join.tbl_sql.Rd
+++ b/man/join.tbl_sql.Rd
@@ -93,7 +93,7 @@ hof <- tbl(lahman_sqlite(), "HallOfFame")
 semi_join(people, hof)
 
 # All people not in the hall of fame
-semi_join(people, hof, anti = TRUE)
+anti_join(people, hof)
 
 # Find all managers
 manager <- tbl(lahman_sqlite(), "Managers")
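The man page change above replaces `semi_join(..., anti = TRUE)` with the dedicated `anti_join()` verb. A hedged sketch using the same Lahman SQLite setup as the example (it assumes `people` comes from the "Master" table, which is not shown in this hunk):

```r
library(dplyr)

people <- tbl(lahman_sqlite(), "Master")       # assumed source of `people`
hof    <- tbl(lahman_sqlite(), "HallOfFame")

semi_join(people, hof)   # people with at least one Hall of Fame record
anti_join(people, hof)   # people with no Hall of Fame record at all
```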
diff --git a/src/dplyr.cpp b/src/dplyr.cpp
index 3435881..f20d3f7 100644
--- a/src/dplyr.cpp
+++ b/src/dplyr.cpp
@@ -1387,21 +1387,23 @@ DataFrame select_grouped( GroupedDataFrame gdf, const CharacterVector& keep, Cha
   // handle vars  attribute : make a shallow copy of the list and alter 
   //   its names attribute
   List vars = shallow_copy( copy.attr("vars") ); 
+  
   int nv = vars.size() ;
   for( int i=0; i<nv; i++){
diff --git a/vignettes/databases.Rmd b/vignettes/databases.Rmd
index 4d696d7..e2e98cd 100644
--- a/vignettes/databases.Rmd
+++ b/vignettes/databases.Rmd
@@ -164,8 +164,8 @@ translate_sql(1L)
 dplyr knows how to convert the following R functions to SQL:
 
 * basic math operators: `+`, `-`, `*`, `/`, `%%`, `^`
-* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`, 
-  `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`, 
+* math functions: `abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atan2`,
+  `atanh`, `ceiling`, `cos`, `cosh`, `cot`, `coth`, `exp`, `floor`,
   `log`, `log10`, `round`, `sign`, `sin`, `sinh`, `sqrt`, `tan`, `tanh`
 * logical comparisons: `<`, `<=`, `!=`, `>=`, `>`, `==`, `%in%`
 * boolean operations: `&`, `&&`, `|`, `||`, `!`, `xor`
@@ -183,7 +183,7 @@ translate_sql(mean(x, trim = T))
 Any function that dplyr does't know how to convert it leaves as is - that means if you want to use any other function that database provides, you can use it as is. Here a couple of examples that will work with [SQLite](http://www.sqlite.org/lang_corefunc.html):
 
 ```{r}
-translate_sql(glob(x, y)) 
+translate_sql(glob(x, y))
 translate_sql(x %like% "ab*")
 ```
 
@@ -193,7 +193,7 @@ SQLite lacks window functions, which are needed for grouped mutation and filteri
 
 ```{r}
 planes <- group_by(hflights_sqlite, TailNum)
-delay <- summarise(planes, 
+delay <- summarise(planes,
   count = n(),
   dist = mean(Distance),
   delay = mean(ArrDelay)
@@ -231,12 +231,12 @@ The following examples shows the grouped filter and mutate possible with Postgre
 ```{r}
 if (has_lahman("postgres")) {
   daily <- group_by(hflights_postgres, Year, Month, DayofMonth)
-  
+
   # Find the most and least delayed flight each day
-  bestworst <- filter(daily, ArrDelay == min(ArrDelay) || 
+  bestworst <- filter(daily, ArrDelay == min(ArrDelay) ||
     ArrDelay == max(ArrDelay))
   bestworst$query
-  
+
   # Rank each flight within a daily
   ranked <- mutate(daily, rank = rank(desc(ArrDelay)))
   ranked$query
@@ -253,7 +253,7 @@ In terms of functionality, MySQL lies somewhere between SQLite and PostgreSQL. I
 
 Bigquery is a hosted database server provided by google. To connect, you need to provide your `project`, `dataset` and optionally a project for `billing` (if billing for `project` isn't enabled). After you create the src, your web browser will open and ask you to authenticate. Your credentials are stored in a local cache, so you should only need to do this once.
 
-Bigquery supports only a single SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis, and within SELECT bigquery provides comprehensive coverage similar level to postgresql. 
+Bigquery supports only a single SQL statement: [SELECT](https://developers.google.com/bigquery/query-reference). Fortunately this is all you need for data analysis, and within SELECT bigquery provides comprehensive coverage similar level to postgresql.
 
 ## Picking a database
 
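A few `translate_sql()` calls in the spirit of the vignette hunks above; the commented outputs are indicative of the SQL dplyr generates:

```r
library(dplyr)

translate_sql(x == 1 & y > 2)   # "x" = 1.0 AND "y" > 2.0
translate_sql(mean(x))          # AVG("x")
translate_sql(glob(x, y))       # GLOB("x", "y") -- unknown functions pass through untouched
```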
diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd
index 90942bf..ad50a90 100644
--- a/vignettes/introduction.Rmd
+++ b/vignettes/introduction.Rmd
@@ -269,7 +269,7 @@ filter(
 )
 ```
 
-This is difficult to read because the order of the operations is from inside to out, and the arguments are a long way away from the function. To get around this problem, dplyr provides the `%.%` operator. `x %.% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations so you can read from left-to-riht, top-to-bottom:
+This is difficult to read because the order of the operations is from inside to out, and the arguments are a long way away from the function. To get around this problem, dplyr provides the `%.%` operator. `x %.% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple operations so you can read from left-to-right, top-to-bottom:
 
 ```{r, eval = FALSE}
 hflights %.%
@@ -309,11 +309,11 @@ Compared to DBI and the database connection algorithms:
 
 * it hides, as much as possible, the fact that you're working with a remote database
 * you don't need to know any sql (although it helps!)
-* it shims over the many differences between the difference DBI implementations
+* it shims over the many differences between the different DBI implementations
 
 ## Multidimensional arrays / cubes
 
-`tbl_cube()` provides an experimental interface to multidimenssional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
+`tbl_cube()` provides an experimental interface to multidimensional arrays or data cubes. If you're using this form of data in R, please get in touch so I can better understand your needs.
 
 # Comparisons
 
diff --git a/vignettes/window-functions.Rmd b/vignettes/window-functions.Rmd
index 3dd3997..ba2d595 100644
--- a/vignettes/window-functions.Rmd
+++ b/vignettes/window-functions.Rmd
@@ -20,7 +20,7 @@ batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H)
 batting <- arrange(batting, playerID, yearID, teamID)
 players <- group_by(batting, playerID)
 
-# For each player, find the two years with most home runs
+# For each player, find the two years with most hits
 filter(players, min_rank(desc(H)) <= 2 & H > 0)
 # Within each player, rank each year by the number of games played
 mutate(players, G_rank = min_rank(G))
@@ -137,7 +137,7 @@ You can use them to:
   
     ```{r, results = "hide"}
     # Find when a player changed teams
-    filter(players, teamID != lag(teamID)); TRUE
+    filter(players, teamID != lag(teamID))
     ```
 
 `lead()` and `lag()` have an optional argument `order_by`. If set, instead of using the row order to determine which value comes before another, they will use another variable. This important if you have not already sorted the data, or you want to sort one way and lag another.
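Finally, a short sketch of the `order_by` argument described in the closing paragraph above (made-up data):

```r
library(dplyr)

df <- data.frame(year = c(2002, 2000, 2001), value = c(4, 0, 1))

with(df, lag(value))                    # NA 4 0 -- previous value in row order
with(df, lag(value, order_by = year))   # previous value in year order, realigned to the rows
```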