
V0.9.0 (#195)

Ongoing notes:

* **CUDA**: Better CUDA support (IN PROGRESS)
    * ~ColMajor used by default if engine is CUDA.~ (ColMajor is supported, but RowMajor remains the default for all the major cuBLAS versions. Careful reasoning about the parameters obviates the need for ColMajor by default, which would cause more headaches. ColMajor is still supported.)
    * Transposition is done automatically when transferring data back to the CPU.
    * cuDNN operations supported (IN PROGRESS) (note: these are the ones I use most often, hence they get more attention):
        * [x] Conv2d
        * [x] Dropout
        * [x] Maxpool2d 
        * [x] BatchNorm
        * [x] Rectify
    * Other CUDA related optimizations
        *  [x] full cuBLAS support
* **New Ops**:
    * BatchNorm 
    * InvSqrt
    * CUDA enabled ops in `ops/nn` (preview for how things will start to look in v0.10.0)
* **New Features**:
    * Limited shape inference. Working towards a calculus for shapes (first raised in #96 and #97).
* **Optimizations**:
    * Basic ops are optimized to use engine functions where available; otherwise they fall back to `Apply`, which incurs a penalty from repeatedly calling functions.
    * Faster VMs (1 of 2 VMs): ~greedy goroutines grab gigs from a priority queue. This causes faster execution of code in general.~ (this has been moved to a future 0.9.x version):
```
benchmark                           old ns/op      new ns/op      delta
BenchmarkTapeMachineExecution-8     3129074510     2695304022     -13.86%

benchmark                           old allocs     new allocs     delta
BenchmarkTapeMachineExecution-8     25745          25122          -2.42%

benchmark                           old bytes      new bytes      delta
BenchmarkTapeMachineExecution-8     4804578705     4803784111     -0.02%
```
* **Code generation**: some of the exported API is now auto-generated
* **New Solver** : @ynqa added the Momentum solver.
* **Breaking API**: `Solver`s now take a slice of `ValueGrad` instead of `Nodes`. `ValueGrad` is an interface, which a `*Node` fulfils. An additional utility function, `NodesToValueGrads`, has been added to aid with refactoring (see the sketch after this list). This was done for two reasons:
    * ~The support for the BatchNorm operation, which is a very impure and highly stateful function. The BatchNorm Op has internal states that need to have their gradients updated as well. But the internal state of BatchNorm isn't really part of the expression graph, and really it shouldn't be.~ Turns out there was a better API for `BatchNorm`.
    * In the next version, v0.10.0, we aim to do [better package organization](#91) for manageability. With this API-breaking change, the solver is now less dependent on the other parts of Gorgonia and can be easily separated.
* **Breaking Semantics**: A `gorgonia.VM` now implements `io.Closer`. It should be treated as a resource as well as a computation device - the VM must be `Close()`d in order for the resources acquired by the VM to actually be released. Turns out, automatic resource management is too difficult. Who'd have thunk it?
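
Putting both breaking changes together, here is a minimal training-loop sketch (illustrative only, not taken from this commit). It assumes a tiny graph with a single learnable `w`; `NewVanillaSolver` and `WithLearnRate` are existing Gorgonia APIs, but treat the exact wiring as a sketch rather than canonical usage:

```go
package main

import (
	"log"

	G "gorgonia.org/gorgonia"
)

func main() {
	g := G.NewGraph()
	w := G.NewMatrix(g, G.Float64, G.WithShape(2, 2), G.WithName("w"), G.WithInit(G.GlorotU(1)))
	x := G.NewMatrix(g, G.Float64, G.WithShape(2, 2), G.WithName("x"), G.WithInit(G.RangedFrom(0)))
	wx, _ := G.Mul(w, x)
	cost, _ := G.Sum(wx)

	// Gradients of the learnables are needed before the solver can step.
	if _, err := G.Grad(cost, w); err != nil {
		log.Fatal(err)
	}

	// Breaking semantics: a VM is now an io.Closer and must be Close()d
	// for the resources it acquired to actually be released.
	m := G.NewTapeMachine(g, G.BindDualValues(w))
	defer m.Close()

	solver := G.NewVanillaSolver(G.WithLearnRate(0.01))
	for i := 0; i < 10; i++ {
		if err := m.RunAll(); err != nil {
			log.Fatal(err)
		}
		// Breaking API: Solvers now take []ValueGrad instead of Nodes.
		// NodesToValueGrads converts the learnable *Nodes for the new API.
		if err := solver.Step(G.NodesToValueGrads(G.Nodes{w})); err != nil {
			log.Fatal(err)
		}
		m.Reset()
	}
}
```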
chewxy committed Aug 19, 2018
1 parent 32c6fd8 commit 978b1c3c0f795d09d0710238cb5f82853b99a460
Showing with 8,579 additions and 2,501 deletions.
  1. +1 −2 .travis.yml
  2. +4 −4 .travis/linux/OpenBLAS/install.sh
  3. +1 −1 .travis/linux/install.sh
  4. +20 −13 Gopkg.lock
  5. +3 −3 README.md
  6. +126 −0 api_gen.go
  7. +0 −3 batch_cuda.go
  8. +11 −2 bench_concurrent_training_test.go
  9. +1 −0 blas.go
  10. +9 −6 broadcast_test.go
  11. +12 −5 cmd/cudagen/main.go
  12. +177 −0 cmd/genapi/main.go
  13. +107 −0 cmd/gencudaengine/main.go
  14. +127 −0 cmd/gencudaengine/tmpl.go
  15. +1 −2 collections.go
  16. +2 −2 collections_test.go
  17. +87 −70 compile.go
  18. +1 −0 complex_test.go
  19. +30 −0 const.go
  20. +86 −1 cuda modules/src/elembinop.cu
  21. +1 −0 cuda modules/src/elemunaryop.cu
  22. +59 −0 cuda modules/src/misc.cu
  23. +1 −0 cuda modules/src/sigmoid32.cu
  24. +1 −0 cuda modules/src/sigmoid64.cu
  25. +78 −292 cuda.go
  26. +33 −0 cuda/arena.go
  27. +701 −0 cuda/arith.go
  28. +32 −5 { → cuda}/bfc.go
  29. +1 −1 { → cuda}/bfc_test.go
  30. +11 −0 cuda/builtin.go
  31. +701 −0 cuda/cmp.go
  32. +62 −0 cuda/debug.go
  33. +179 −0 cuda/engine.go
  34. +18 −0 cuda/errors.go
  35. +41 −0 cuda/extension.go
  36. +281 −0 cuda/external.go
  37. +65 −0 cuda/interfaces.go
  38. +282 −0 cuda/linalg.go
  39. +21 −0 cuda/release.go
  40. +138 −0 cuda/utils.go
  41. +27 −9 cuda_test.go
  42. +4 −0 debug.go
  43. +39 −7 differentiation.go
  44. +1 −2 differentiation_test.go
  45. +1 −2 dual_test.go
  46. +71 −0 engine_test.go
  47. +8 −21 equalities.go
  48. +15 −0 errors.go
  49. +1 −0 example_autodiff_test.go
  50. +1 −2 example_basic_test.go
  51. +5 −6 example_concurrent_training_test.go
  52. +28 −11 example_linearregression_test.go
  53. +1 −0 example_symdiff_test.go
  54. +5 −2 examples/charRNN/model.go
  55. +17 −12 examples/convnet/main.go
  56. +349 −0 examples/convnet_cuda/main.go
  57. +0 −30 examples/cuda/main.go
  58. +2 −0 examples/logisticregression/main.go
  59. +4 −3 examples/stacked autoencoder/stackedDA.go
  60. +0 −1 execution.go
  61. +4 −1 gorgonia.go
  62. +4 −0 graph_test.go
  63. +9 −0 interfaces.go
  64. +48 −0 known_issues_test.go
  65. +13 −1 math_nooptim.go
  66. +104 −6 nn.go
  67. +207 −35 nn_test.go
  68. +33 −1 node.go
  69. +5 −1 node_test.go
  70. +7 −1 noextern.go
  71. +1 −1 noextern_test.go
  72. +3 −16 op_infidel.go
  73. +434 −54 op_math.go
  74. +129 −79 op_math_cuda.go
  75. +49 −49 op_math_cuda_test.go
  76. +6 −0 op_math_noextern.go
  77. +358 −87 op_math_test.go
  78. +527 −63 op_nn.go
  79. +12 −31 op_reduction.go
  80. +14 −16 op_reduction_test.go
  81. +38 −486 op_tensor.go
  82. +4 −258 op_tensor_test.go
  83. +35 −162 operations.go
  84. +86 −51 operations_test.go
  85. +6 −16 operatorLinAlg.go
  86. +6 −14 operatorPointwise_binary.go
  87. +4 −0 operatorPointwise_binary_test.go
  88. +173 −223 operatorPointwise_unary.go
  89. +30 −23 operatorPointwise_unary_const.go
  90. +5 −2 operatorPointwise_unary_test.go
  91. +153 −0 ops/nn/activation_cuda.go
  92. +89 −0 ops/nn/api_cuda.go
  93. +151 −0 ops/nn/api_cuda_test.go
  94. +32 −0 ops/nn/api_nocuda.go
  95. +220 −0 ops/nn/batchnorm_cuda.go
  96. +265 −0 ops/nn/convolution_cuda.go
  97. +177 −0 ops/nn/dropout_cuda.go
  98. +188 −0 ops/nn/maxpool_cuda.go
  99. +45 −0 ops/nn/scratch.go
  100. +46 −0 ops/nn/scratch_cuda.go
  101. +82 −0 ops/nn/utils.go
  102. +353 −80 solvers.go
  103. +63 −14 solvers_test.go
  104. +3 −5 testsetup_test.go
  105. +71 −8 utils.go
  106. +25 −0 values.go
  107. +6 −3 vm.go
  108. +23 −6 vm_genera.go
  109. +12 −70 vm_genera_cuda.go
  110. +2 −0 vm_genera_cuda_test.go
  111. +22 −11 vm_genera_test.go
  112. +44 −16 vm_tape.go
  113. +25 −90 vm_tape_cuda.go
  114. +7 −1 vm_tape_nocuda.go
  115. +30 −0 weights.go
@@ -23,14 +23,13 @@ cache:
- .travis/OpenBLAS.cache

before_install:
- source ${TRAVIS_BUILD_DIR}/.travis/$TRAVIS_OS_NAME/install_cuda.sh
- go get github.com/mattn/goveralls

go_import_path: gorgonia.org/gorgonia

# Install the appropriate blas library (if any) and associated gonum software.
install:
- travis_wait source ${TRAVIS_BUILD_DIR}/.travis/$TRAVIS_OS_NAME/$BLAS_LIB/install.sh
- travis_wait source ${TRAVIS_BUILD_DIR}/.travis/$TRAVIS_OS_NAME/install.sh

script:
- source ${TRAVIS_BUILD_DIR}/.travis/$TRAVIS_OS_NAME/$BLAS_LIB/test.sh
@@ -50,12 +50,12 @@ set -ex
# # copy the cache files into /usr
# sudo cp -r ${CACHE_DIR}/* /usr/

travis_retry sudo apt-get install liblapack-dev liblapack3 libopenblas-base libopenblas-dev
# travis_retry sudo apt-get install liblapack-dev liblapack3 libopenblas-base libopenblas-dev

# install gonum/blas against OpenBLAS
export CGO_LDFLAGS="-L/usr/lib -lopenblas"
go get github.com/gonum/blas
go install -v -x github.com/gonum/blas
# export CGO_LDFLAGS="-L/usr/lib -lopenblas"
go get gonum.org/v1/gonum/blas
go install -v -x gonum.org/v1/gonum/blas


# run the OS common installation script
@@ -1,3 +1,3 @@
set -ex
go get -u -t -v ./...
go get -d -u -t -v ./...
set +ex


@@ -314,9 +314,9 @@ Furthermore, there are some additional requirements:

1. [CUDA toolkit 9.0](https://developer.nvidia.com/cuda-toolkit) is required. Installing this installs the `nvcc` compiler which is required to run your code with CUDA.
2. Be sure to follow the [post-installation steps](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions)
2. `go install gorgonia.org/gorgonia/cmd/cudagen`. This installs the `cudagen` program. Running `cudagen` will generate the relevant CUDA related code for Gorgonia.
3. The CUDA ops must be manually enabled in your code with the `UseCudaFor` option.
4. `runtime.LockOSThread()` must be called in the main function where the VM is running. CUDA requires thread affinity, and therefore the OS thread must be locked.
3. `go install gorgonia.org/gorgonia/cmd/cudagen`. This installs the `cudagen` program. Running `cudagen` will generate the relevant CUDA related code for Gorgonia. Note that you will need a folder at `src\gorgonia.org\gorgonia\cuda modules\target`
4. The CUDA ops must be manually enabled in your code with the `UseCudaFor` option.
5. `runtime.LockOSThread()` must be called in the main function where the VM is running. CUDA requires thread affinity, and therefore the OS thread must be locked.

Because `nvcc` only plays well with `gcc` version 6 and below (the current version is 7), this is also quite helpful: `sudo ln -s /path/to/gcc-6 /usr/local/cuda-9.0/bin/gcc`
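
As an illustrative (not canonical) sketch of the last two requirements, a `main` function that locks the OS thread and opts into CUDA might look like the following; the argument-free call to `UseCudaFor` is assumed to enable CUDA for all supported ops:

```go
package main

import (
	"log"
	"runtime"

	G "gorgonia.org/gorgonia"
)

func main() {
	// CUDA requires thread affinity, so the OS thread running the VM is locked.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	g := G.NewGraph()
	// ... build the expression graph here ...

	// CUDA ops are opt-in: UseCudaFor enables them on the VM.
	m := G.NewLispMachine(g, G.UseCudaFor())
	defer m.Close()

	if err := m.RunAll(); err != nil {
		log.Fatal(err)
	}
}
```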

@@ -0,0 +1,126 @@
package gorgonia

// Code generated by genapi, which is an API generation tool for Gorgonia. DO NOT EDIT.

// Abs performs a pointwise abs.
func Abs(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(absOpType, a), a) }

// Sign performs a pointwise sign.
func Sign(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(signOpType, a), a) }

// Ceil performs a pointwise ceil.
func Ceil(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(ceilOpType, a), a) }

// Floor performs a pointwise floor.
func Floor(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(floorOpType, a), a) }

// Sin performs a pointwise sin.
func Sin(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(sinOpType, a), a) }

// Cos performs a pointwise cos.
func Cos(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(cosOpType, a), a) }

// Exp performs a pointwise exp.
func Exp(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(expOpType, a), a) }

// Log performs a pointwise log.
func Log(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(lnOpType, a), a) }

// Log2 performs a pointwise log2.
func Log2(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(log2OpType, a), a) }

// Neg performs a pointwise neg.
func Neg(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(negOpType, a), a) }

// Square performs a pointwise square.
func Square(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(squareOpType, a), a) }

// Sqrt performs a pointwise sqrt.
func Sqrt(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(sqrtOpType, a), a) }

// Inverse performs a pointwise inverse.
func Inverse(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(inverseOpType, a), a) }

// InverseSqrt performs a pointwise inversesqrt.
func InverseSqrt(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(inverseSqrtOpType, a), a) }

// Cube performs a pointwise cube.
func Cube(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(cubeOpType, a), a) }

// Tanh performs a pointwise tanh.
func Tanh(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(tanhOpType, a), a) }

// Sigmoid performs a pointwise sigmoid.
func Sigmoid(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(sigmoidOpType, a), a) }

// Log1p performs a pointwise log1p.
func Log1p(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(log1pOpType, a), a) }

// Expm1 performs a pointwise expm1.
func Expm1(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(expm1OpType, a), a) }

// Softplus performs a pointwise softplus.
func Softplus(a *Node) (*Node, error) { return unaryOpNode(newElemUnaryOp(softplusOpType, a), a) }

// Add performs a pointwise add operation.
func Add(a, b *Node) (*Node, error) { return binOpNode(newElemBinOp(addOpType, a, b), a, b) }

// Sub performs a pointwise sub operation.
func Sub(a, b *Node) (*Node, error) { return binOpNode(newElemBinOp(subOpType, a, b), a, b) }

// HadamardProd performs a pointwise hadamardprod operation.
func HadamardProd(a, b *Node) (*Node, error) { return binOpNode(newElemBinOp(mulOpType, a, b), a, b) }

// HadamardDiv performs a pointwise hadamarddiv operation.
func HadamardDiv(a, b *Node) (*Node, error) { return binOpNode(newElemBinOp(divOpType, a, b), a, b) }

// Pow performs a pointwise pow operation.
func Pow(a, b *Node) (*Node, error) { return binOpNode(newElemBinOp(powOpType, a, b), a, b) }

// Lt performs a pointwise lt operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Lt(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(ltOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}

// Gt performs a pointwise gt operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Gt(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(gtOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}

// Lte performs a pointwise lte operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Lte(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(lteOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}

// Gte performs a pointwise gte operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Gte(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(gteOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}

// Eq performs a pointwise eq operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Eq(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(eqOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}

// Ne performs a pointwise ne operation.
// retSame indicates if the data type of the return value should be the same as the input data type. It defaults to Bool otherwise.
func Ne(a, b *Node, retSame bool) (*Node, error) {
	op := newElemBinOp(neOpType, a, b)
	op.retSame = retSame
	return binOpNode(op, a, b)
}
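
For context, a rough usage sketch (not part of the generated file) showing how the auto-generated functions compose; `Add` and `Sigmoid` are taken from the file above, everything else is standard Gorgonia API:

```go
package main

import (
	"fmt"
	"log"

	G "gorgonia.org/gorgonia"
)

func main() {
	g := G.NewGraph()
	a := G.NewMatrix(g, G.Float64, G.WithShape(2, 2), G.WithName("a"), G.WithInit(G.RangedFrom(0)))
	b := G.NewMatrix(g, G.Float64, G.WithShape(2, 2), G.WithName("b"), G.WithInit(G.Ones()))

	// Generated pointwise ops compose like the hand-written ones did.
	sum, err := G.Add(a, b)
	if err != nil {
		log.Fatal(err)
	}
	out, err := G.Sigmoid(sum)
	if err != nil {
		log.Fatal(err)
	}

	m := G.NewTapeMachine(g)
	defer m.Close()
	if err := m.RunAll(); err != nil {
		log.Fatal(err)
	}
	fmt.Println(out.Value())
}
```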


@@ -1,10 +1,11 @@
// +build concurrentTraining

package gorgonia_test

import (
	"io"
	"runtime"
	"testing"

	"gorgonia.org/tensor"
)

func BenchmarkTrainingConcurrent(b *testing.B) {
@@ -26,3 +27,11 @@ func BenchmarkTrainingNonConcurrent(b *testing.B) {

runtime.GC()
}

func BenchmarkTapeMachineExecution(b *testing.B) {
	m, c, machine := linregSetup(tensor.Float64)
	for i := 0; i < b.N; i++ {
		linregRun(m, c, machine, 100, false)
	}
	machine.(io.Closer).Close()
}
@@ -25,6 +25,7 @@ type BLAS interface {
type batchedBLAS interface {
	WorkAvailable() <-chan struct{}
	DoWork()
	Close() error
	BLAS
}

@@ -40,7 +40,11 @@ func TestBroadcastPattern(t *testing.T) {
assert.Equal([]int{1}, bcpat.on()[1])
}

func TestBroadcast2(t *testing.T) {
func TestBroadcast(t *testing.T) {
if CUDA {
t.SkipNow()
}

assert := assert.New(t)
var g *ExprGraph
var x, y, z *Node
@@ -53,13 +57,13 @@ func TestBroadcast2(t *testing.T) {
g = NewGraph()
x = NewMatrix(g, Float64, WithShape(2, 3), WithValue(xT), WithName("x"))
y = NewVector(g, Float64, WithShape(2), WithValue(yT), WithName("y"))
z, err = Broadcast(addOpType, x, y, NewBroadcastPattern(nil, []byte{1}))
if err != nil {
if z, err = Broadcast(addOpType, x, y, NewBroadcastPattern(nil, []byte{1})); err != nil {
ioutil.WriteFile("Broadcast.dot", []byte(g.ToDot()), 0644)
t.Fatal(err)
}

m = NewLispMachine(g, ExecuteFwdOnly())
defer m.Close()
if err := m.RunAll(); err != nil {
t.Fatal(err)
}
@@ -68,16 +72,15 @@ func TestBroadcast2(t *testing.T) {
g = NewGraph()
x = NewMatrix(g, Float64, WithShape(2, 3), WithValue(xT), WithName("x"))
y = NewVector(g, Float64, WithShape(2), WithValue(yT), WithName("y"))
z, err = Broadcast(addOpType, y, x, NewBroadcastPattern([]byte{1}, nil))
if err != nil {
if z, err = Broadcast(addOpType, y, x, NewBroadcastPattern([]byte{1}, nil)); err != nil {
ioutil.WriteFile("Broadcast.dot", []byte(g.ToDot()), 0644)
t.Fatalf("%+v", err)
}

m = NewLispMachine(g, ExecuteFwdOnly())
defer m.Close()
if err := m.RunAll(); err != nil {
t.Fatal(err)
}
assert.Equal([]float64{100, 101, 102, 203, 204, 205}, extractF64s(z.Value()))

}
