gonum · sbinet · Jan 15, 2019 · Jan 15, 2019 · Jan 15, 2019 · Jan 8, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -4,18 +4,22 @@ language: go
 
 # Versions of go that are explicitly supported by gonum plus go tip.
 go:
- - 1.9.x
  - 1.10.x
  - 1.11.x
  - master
 
 matrix:
+ fast_finish: true
  allow_failures:
    - go: master
 
 before_install:
  # Required for format check.
  - go get golang.org/x/tools/cmd/goimports
+ # Required for imports check.
+ - go get gonum.org/v1/tools/cmd/check-imports
+ # Required for copyright header check.
+ - go get gonum.org/v1/tools/cmd/check-copyright
  # Required for coverage.
  - go get golang.org/x/tools/cmd/cover
  - go get github.com/mattn/goveralls
@@ -25,14 +29,16 @@ go_import_path: gonum.org/v1/exp
 # Get deps, build, test, and ensure the code is gofmt'ed.
 # If we are building as gonum, then we have access to the coveralls api key, so we can run coverage as well.
 script:
+ - ${TRAVIS_BUILD_DIR}/.travis/check-copyright.sh
  - ${TRAVIS_BUILD_DIR}/.travis/check-formatting.sh
  - go get -d -t -v ./...
  - go build -v ./...
  - go test -v ./...
- - go test -a -tags bounds -x -v ./...
- - go test -a -tags noasm -x -v ./...
- - go test -a -tags appengine -x -v ./...
+ - go test -a -tags bounds -v ./...
+ - go test -a -tags noasm -v ./...
+ - go test -a -tags appengine -v ./...
  - if [[ $TRAVIS_SECURE_ENV_VARS = "true" ]]; then bash ./.travis/test-coverage.sh; fi
+ - ${TRAVIS_BUILD_DIR}/.travis/check-imports.sh
  # This is run last since it alters the tree.
  - ${TRAVIS_BUILD_DIR}/.travis/check-generate.sh
 
diff --git a/.travis/check-copyright.sh b/.travis/check-copyright.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+set -e
+check-copyright -notice "Copyright ©20[0-9]{2} The Gonum Authors\. All rights reserved\."
diff --git a/.travis/check-imports.sh b/.travis/check-imports.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+set -e
+check-imports -b "math/rand,github.com/gonum/.*"
diff --git a/.travis/test-coverage.sh b/.travis/test-coverage.sh
@@ -13,7 +13,7 @@ testCover() {
 	# switch to the directory to check
 	pushd $d > /dev/null
 	# create the coverage profile
-	coverageresult=`go test -v -coverprofile=$PROFILE_OUT`
+	coverageresult=`go test -v $TAGS -coverprofile=$PROFILE_OUT`
 	# output the result so we can check the shell output
 	echo ${coverageresult}
 	# append the results to acc.out if coverage didn't fail, else set the retval to 1 (failed)
@@ -27,8 +27,8 @@ testCover() {
 # Init acc.out
 echo "mode: set" > $ACC_OUT
 
-# Run test coverage on all directories containing go files except testlapack and testblas.
-find . -type d -not -path '*testlapack*' -and -not -path '*testblas*' | while read d; do testCover $d || exit; done
+# Run test coverage on all directories containing go files except testlapack, testblas and testgraph.
+find . -type d -not -path '*testlapack*' -and -not -path '*testblas*' -and -not -path '*testgraph*' | while read d; do testCover $d || exit; done
 
 # Upload the coverage profile to coveralls.io
 [ -n "$COVERALLS_TOKEN" ] && goveralls -coverprofile=$ACC_OUT -service=travis-ci -repotoken $COVERALLS_TOKEN
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Gonum exp [![Build Status](https://travis-ci.org/gonum/exp.svg?branch=master)](https://travis-ci.org/gonum/exp)
+# Gonum exp [![Build Status](https://travis-ci.org/gonum/exp.svg?branch=master)](https://travis-ci.org/gonum/exp) [![Coverage Status](https://coveralls.io/repos/gonum/exp/badge.svg?branch=master&service=github)](https://coveralls.io/github/gonum/exp?branch=master) [![GoDoc](https://godoc.org/gonum.org/v1/exp?status.svg)](https://godoc.org/gonum.org/v1/exp) [![Go Report Card](https://goreportcard.com/badge/github.com/gonum/exp)](https://goreportcard.com/report/github.com/gonum/exp)
 
 ## Issues
 

diff --git a/dframe/README.md b/dframe/README.md
@@ -0,0 +1,184 @@
+# dframe
+
+`dframe` is a work-in-progress [Data Frame](https://en.wikipedia.org/wiki/Pandas_%28software%29) à la [pandas](https://pandas.pydata.org/pandas-docs/stable/index.html).
+
+`dframe` leverages [Apache Arrow](https://arrow.apache.org/) and its [Go backend](https://godoc.org/github.com/apache/arrow/go/arrow).
+
+## Proposal
+
+We propose to introduce a new `Frame` type inside the `dframe` package: a 2-dim data structure to handle:
+
+- tabular data with heterogeneous columns (like a `SQL` table)
+- arbitrary matrix data with row and column labels
+- any other form of observational/statistical dataset.
+
+To foster cross-pollination and integration with the Gonum and wider Go scientific ecosystem, other "companion" packages, each tailored to a few focused operations, are expected to appear:
+
+- integration with `gonum/plot`,
+- integration with `gonum/stat`,
+- integration with `gonum/mat` (_e.g.:_ creation of `dframe.Frame`s from `gonum/mat.Vector` or `gonum/mat.Matrix`, and vice versa),
+- `hdf5` loading/saving of `dframe.Frame`s,
+- integration with `encoding/csv` or `npyio`,
+- integration with `database/sql`,
+- etc...
+
+### Previous work
+
+The data frame concept comes from `R`'s `data.frame` and Python's `pandas.DataFrame`:
+
+- https://www.rdocumentation.org/packages/base/versions/3.4.3/topics/data.frame
+- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
+
+A few data frame-like implementations in Go have also been investigated:
+
+- [kniren/gota](https://github.com/kniren/gota)
+- [tobgu/qframe](https://github.com/tobgu/qframe)
+
+Some inspiration will be drawn from this previous body of work, both in terms of API design and of lessons learned about performance.
+
+### dframe
+
+The main type should be:
+
+```go
+package dframe
+
+type Frame struct {
+	// contains filtered or unexported fields
+}
+
+// Err returns the first error encountered during operations on a Frame.
+func (df *Frame) Err() error { ... }
+
+// NumRows returns the number of rows of this Frame.
+func (df *Frame) NumRows() int { ... }
+
+// NumCols returns the number of columns of this Frame.
+func (df *Frame) NumCols() int { ... }
+
+// Column returns the i-th column of this Frame.
+func (df *Frame) Column(i int) *array.Column { ... }
+
+// ColumnNames returns the list of column names of this Frame.
+func (df *Frame) ColumnNames() []string { ... }
+```
+
+`dframe.Frame` is expected to be built on top of `arrow/array.Interface`.
+Leveraging [Arrow](https://arrow.apache.org) for `dframe` enables interoperability with many analysis frameworks, possibly written in languages other than Go.
+Arrow arrays are well specified: their memory layout is standardized and the IPC mechanism to send or receive them over the wire is also specified.
+This increases confidence that the data we write, or the analysis pipelines we build with Arrow, could be migrated to something else (another language, another framework) should the need arise.
+The Go Arrow package is not yet feature complete compared to the other language implementations (C++, Java).
+However, the Go implementation already ships with SIMD-optimized operations and has the infrastructure for zero-copy support.
+
+`tobgu/qframe` presents a `QFrame` type that is essentially immutable.
+Operations on a `QFrame`, such as copying columns, dropping columns, sorting them or applying some kind of operation on columns, return a new `QFrame`, leaving the original untouched.
+
+Arrow uses a ref-counting mechanism for all the types that involve memory allocation (mainly to address workloads involving memory allocated on a GPGPU, by a SQL database or through a memory-mapped file).
+This ref-counting mechanism is presented to the user as a pair of methods `Retain`/`Release` that increment and decrement that reference count.
+It would seem that this mechanism prevents us from exposing an API with "chained methods":
+
+```go
+o := df.Slice(0, 10).Select("col1", "col2").Apply("col1 + col2")
+```
+Each intermediate `Frame` -- the one returned by `Slice`, the one returned by `Select`, ... -- would be "leaked" as it is missing a call to `Release()` to correctly decrement its reference count.
+If we want an immutable `Frame` without leaking memory, the code above should instead be rewritten as:
+
+```go
+sli := df.Slice(0, 10)
+defer sli.Release()
+
+sel := sli.Select("col1", "col2")
+defer sel.Release()
+
+o := sel.Apply("col1 + col2")
+defer o.Release()
+```
+It is not yet clear (to me!) whether an immutable `Frame` makes much sense in Go combined with this ref-counting mechanism coming from Arrow.
+
+However, introducing a `dframe.Tx` transaction could tackle the memory leak.
+One can achieve the above goal if one only allows modifications of the underlying `Frame` through a transaction, where all operations are applied to a single temporary `Frame`:
+
+```go
+// Exec runs the provided function inside an atomic read/write transaction,
+// applied on this Frame.
+func (df *Frame) Exec(f func(tx *Tx) error) error { ... }
+
+func example(df *dframe.Frame) {
+	err := df.Exec(func(tx *dframe.Tx) error {
+		tx.Slice(0, 10).Select("col1", "col2").Apply("col1 + col2")
+		return nil
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+```
+
+Or, without a "chained methods" API:
+
+```go
+func example(df *dframe.Frame) {
+	err := df.Exec(func(tx *dframe.Tx) error {
+		tx.Slice(0, 10)
+		tx.Select("col1", "col2")
+		tx.Apply("col1 + col2")
+		return nil
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+```
+Introducing a transaction has another nice feature: if the set of operations fails for some reason, one can roll back to the original state of the `Frame`.
+
+Finally, with a transaction context, one can build some kind of AST of the operations that should be applied to a `Frame` and optionally optimize it behind the scenes, since the complete set of operations to be carried out is known.
+
+```go
+// Open opens an already existing Frame using the provided driver technology,
+// located at the provided source.
+//
+// Possible drivers: hdf5, npyio, csv, json, hdfs, spark, sql, ...
+func Open(drv, src string) (*Frame, error) { ... }
+
+// Create creates a new Frame at the provided destination, using the provided driver technology.
+func Create(drv, dst string, schema *arrow.Schema, opts ...Option) (*Frame, error) { ... }
+
+// New creates a new in-memory data frame with the provided memory schema.
+func New(schema *arrow.Schema, opts ...Option) (*Frame, error) { ... }
+
+// FromMem creates a new data frame from the provided in-memory data.
+func FromMem(dict Dict, opts ...Option) (*Frame, error) { ... }
+
+// FromArrays creates a new data frame from the provided schema and arrays.
+func FromArrays(schema *arrow.Schema, arrs []array.Interface, opts ...Option) (*Frame, error) { ... }
+
+// FromCols creates a new data frame from the provided columns.
+func FromCols(cols []array.Column, opts ...Option) (*Frame, error) { ... }
+
+// FromTable creates a new data frame from the provided arrow table.
+func FromTable(tbl array.Table, opts ...Option) (*Frame, error) { ... }
+
+// FromFrame returns a new data frame created by applying the provided
+// transaction on the provided frame.
+func FromFrame(df *Frame, f func(tx *Tx) error) (*Frame, error) { ... }
+
+// Exec runs the provided function inside an atomic read/write transaction,
+// applied on this Frame.
+func (df *Frame) Exec(f func(tx *Tx) error) error { ... }
+
+// RExec runs the provided function inside an atomic read-only transaction,
+// applied on this Frame.
+func (df *Frame) RExec(f func(tx *Tx) error) error { ... }
+```
+
+### Operations
+
+One should be able to carry out the following operations on a `dframe.Frame`:
+
+- retrieve the list of columns that a `Frame` is made of,
+- create new columns that are the result of an operation on a set of already existing columns of that `Frame`,
+- drop columns from a `Frame`,
+- append new data to a `Frame` (either a new column or a new row),
+- select a subset of columns from a `Frame`,
+- create different versions of a `Frame`: _e.g._ create `sub` from `Frame` `df` where `sub` is a subset of `df`.
+