From 6225e920eeae98759a718688265d34ee255370ee Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Fri, 9 Oct 2020 15:09:34 -0500 Subject: [PATCH 01/12] Adding Softmax operation It's still missing the methods to allow to back propagate it --- op_softmax.go | 281 +++++++++++++++++++++++++++++++++++++++++++++ op_softmax_test.go | 39 +++++++ 2 files changed, 320 insertions(+) create mode 100644 op_softmax.go create mode 100644 op_softmax_test.go diff --git a/op_softmax.go b/op_softmax.go new file mode 100644 index 00000000..7a52bfc7 --- /dev/null +++ b/op_softmax.go @@ -0,0 +1,281 @@ +package gorgonia + +import ( + "fmt" + "hash" + "math" + + "github.com/chewxy/hm" + "github.com/pkg/errors" + "gorgonia.org/tensor" +) + +type softmaxOp struct { +} + +func newSoftmaxOp(inputShape tensor.Shape) *softmaxOp { + softmaxop := &softmaxOp{} + + return softmaxop +} + +// Softmax - implements the softmax operation described here: http://proceedings.mlr.press/v48/martins16.pdf +// Current implementation only supports float64 +func Softmax(x *Node) (*Node, error) { + xShape := x.Shape() + op := newSoftmaxOp(xShape) + + return ApplyOp(op, x) +} + +func (op *softmaxOp) Arity() int { + return 1 +} + +func (op *softmaxOp) ReturnsPtr() bool { return false } + +func (op *softmaxOp) CallsExtern() bool { return false } + +func (op *softmaxOp) WriteHash(h hash.Hash) { + fmt.Fprintf(h, "Softmax{}()") +} + +func (op *softmaxOp) Hashcode() uint32 { return simpleHash(op) } + +func (op *softmaxOp) String() string { + return fmt.Sprintf("Softmax{}()") +} + +func (op *softmaxOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { + s := inputs[0].(tensor.Shape).Clone() + return s, nil +} + +func (op *softmaxOp) Type() hm.Type { + a := hm.TypeVariable('a') + t := newTensorType(1, a) + + return hm.NewFnType(t, t) +} + +func (op *softmaxOp) OverwritesInput() int { return -1 } + +func (op *softmaxOp) checkInput(inputs ...Value) (tensor.Tensor, error) { + if err := checkArity(op, len(inputs)); err != nil { + return nil, err + } + + var in tensor.Tensor + var ok bool + + if in, ok = inputs[0].(tensor.Tensor); !ok { + return nil, errors.Errorf("Expected input to be a tensor") + } + + if in.Shape().Dims() != 1 { + return nil, errors.Errorf("Expected input to have 1 dimensions") + } + + return in, nil +} + +func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { + inputTensor, err := op.checkInput(inputs...) 
+ if err != nil { + return nil, fmt.Errorf("Can't check Softmax input: %w", err) + } + + var output interface{} + + switch arr := inputTensor.Data().(type) { + case []float64: + output = float64softMax(arr) + case []float32: + output = float32softMax(arr) + default: + return nil, fmt.Errorf("Softmax needs either []float32 or []float64, got %T", arr) + } + + return tensor.New(tensor.Of(inputTensor.Dtype()), tensor.WithShape(inputTensor.Size()), tensor.WithEngine(inputTensor.Engine()), tensor.WithBacking(output)), nil +} + +// FIXME: go2 +func float64softMax(arr []float64) interface{} { + output := make([]float64, len(arr)) + sum := 0.0 + + for i, v := range arr { + exp := math.Exp(v) + sum += exp + + output[i] = exp + } + + for i := range output { + output[i] /= sum + } + + return output +} + +func float32softMax(arr []float32) interface{} { + output := make([]float32, len(arr)) + sum := float32(0.0) + + for i, v := range arr { + exp := float32(math.Exp(float64(v))) + sum += exp + + output[i] = exp + } + + for i := range output { + output[i] /= sum + } + + return output +} + +type softmaxDiffOp struct { +} + +func newSoftmaxOpDiff() *softmaxDiffOp { + return &softmaxDiffOp{} +} + +func (op *softmaxDiffOp) Arity() int { + return 2 +} + +func (op *softmaxDiffOp) ReturnsPtr() bool { return false } + +func (op *softmaxDiffOp) CallsExtern() bool { return false } + +func (op *softmaxDiffOp) WriteHash(h hash.Hash) { + fmt.Fprintf(h, "SoftmaxDiff{}()") +} + +func (op *softmaxDiffOp) Hashcode() uint32 { return simpleHash(op) } + +func (op *softmaxDiffOp) String() string { + return fmt.Sprintf("SoftmaxDiff{}()") +} + +func (op *softmaxDiffOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { + s := inputs[0].(tensor.Shape).Clone() + + return s, nil +} + +func (op *softmaxDiffOp) Type() hm.Type { + aType := hm.TypeVariable('a') + + ta := newTensorType(1, aType) + + return hm.NewFnType(ta, ta, ta) // f(float64, float64) float64 +} + +func (op *softmaxDiffOp) OverwritesInput() int { return -1 } + +func (op *softmaxDiffOp) checkInput(inputs ...Value) (tensor.Tensor, tensor.Tensor, error) { + if err := checkArity(op, len(inputs)); err != nil { + return nil, nil, err + } + + var ( + in tensor.Tensor + + gradient tensor.Tensor + ok bool + ) + + switch t := inputs[0].(type) { + case *dualValue: + if in, ok = t.Value.(tensor.Tensor); !ok { + return nil, nil, errors.Errorf("input should be a tensor, got %T", inputs[0]) + } + case tensor.Tensor: + in = t + default: + return nil, nil, errors.Errorf("input type is not supported, got %T", inputs[0]) + } + + switch t := inputs[1].(type) { + case *dualValue: + if gradient, ok = t.Value.(tensor.Tensor); !ok { + return nil, nil, errors.Errorf("gradient should be a tensor, got %T", inputs[1]) + } + case tensor.Tensor: + gradient = t + default: + return nil, nil, errors.Errorf("gradient type is not supported, got %T", inputs[1]) + } + + if in.Shape().Dims() != 1 || gradient.Shape().Dims() != 1 { + return nil, nil, errors.Errorf("Expected input to have 1 dimensions") + } + + return in, gradient, nil +} + +func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { + inputTensor, gradTensor, err := op.checkInput(inputs...) 
+ if err != nil { + return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) + } + + if inputTensor.Size() != gradTensor.Size() { + return nil, fmt.Errorf("softmaxDiffOp.Do inputs sizes should be equal") + } + + if !isFloat32Or64Array(inputTensor.Data()) { + return nil, fmt.Errorf("softmaxDiffOp.Do expected input to be []float64 or []float32, got %T", inputTensor.Data()) + } + + if !isFloat32Or64Array(gradTensor.Data()) { + return nil, fmt.Errorf("softmaxDiffOp.Do expected input to be []float64, got %T", gradTensor.Data()) + } + + input := inputTensor.Data().([]float64) + value := gradTensor.Data().([]float64) + + output := make([]float64, len(input)*len(value)) + + for i := 0; i < len(value); i++ { + for j := 0; j < len(input); j++ { + if i == j { + output[i*j+j] = value[i] * (1 - input[i]) + } else { + output[i*j+j] = -value[i] * input[i] + } + } + } + + val := tensor.New( + tensor.Of(inputTensor.Dtype()), + tensor.WithShape(len(input), len(value)), + tensor.WithEngine(inputTensor.Engine()), + tensor.WithBacking(output), // FIXME + ) + + return val, nil +} + +func isFloat32Or64Array(v interface{}) bool { + if _, ok := v.([]float64); ok { + return true + } + + if _, ok := v.([]float32); ok { + return true + } + + return false +} + +// ensure it complies with the Op interface +var ( + _ Op = &softmaxOp{} + + _ Op = &softmaxDiffOp{} +) diff --git a/op_softmax_test.go b/op_softmax_test.go new file mode 100644 index 00000000..f654662e --- /dev/null +++ b/op_softmax_test.go @@ -0,0 +1,39 @@ +package gorgonia + +import ( + "testing" + + "github.com/stretchr/testify/require" + "gorgonia.org/tensor" +) + +var testCasesSoftMaxDo = []struct { + input []float64 + expected []float64 +}{ + { + []float64{0.2094, -1.0, 0.6411, 0.0, -0.3909}, []float64{0.2382105379413429, 0.07107636737487558, 0.36681399568548617, 0.19320559786800362, 0.13069350113029174}, + }, + { + []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, []float64{7.801341612780742e-05, 0.00021206245143623275, 0.0005764455082375902, 0.0015669413501390804, 0.004259388198344144, 0.0115782175399118, 0.031472858344688034, 0.08555209892803112, 0.23255471590259755, 0.6321492583604866}, + }, + { + []float64{0.1, 0.1, 0.1}, []float64{0.3333333333333333, 0.3333333333333333, 0.3333333333333333}, + }, + { + []float64{-0.1, 0.3, -1.1, 2.7}, []float64{0.05180179352659075, 0.07727919496508177, 0.019056814854240642, 0.8518621966540868}, + }, +} + +func TestSoftmaxDo(t *testing.T) { + c := require.New(t) + + for i, testCase := range testCasesSoftMaxDo { + tt := tensor.New(tensor.Of(tensor.Float64), tensor.WithShape(len(testCase.input)), tensor.WithBacking(testCase.input)) + op := newSoftmaxOp(tt.Shape()) + + out, err := op.Do(tt) + c.NoError(err, "failed test case: %d", i) + c.Equal(testCase.expected, out.Data(), "failed test case: %d", i) + } +} From f1ab8c3f069841205f2f03a7a776e052a960ba4f Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Sat, 10 Oct 2020 17:38:06 -0500 Subject: [PATCH 02/12] Improve implementation of the SoftMap operation --- op_softmax.go | 161 ++++++++++++++++---------------------------------- 1 file changed, 52 insertions(+), 109 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index 7a52bfc7..3ba0f3ba 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -3,7 +3,6 @@ package gorgonia import ( "fmt" "hash" - "math" "github.com/chewxy/hm" "github.com/pkg/errors" @@ -11,19 +10,23 @@ import ( ) type softmaxOp struct { + shape tensor.Shape + axes []int } -func newSoftmaxOp(inputShape tensor.Shape) *softmaxOp { - softmaxop := 
&softmaxOp{} +func newSoftmaxOp(inputShape tensor.Shape, axes ...int) *softmaxOp { + softmaxop := &softmaxOp{ + shape: inputShape, + axes: axes, + } return softmaxop } -// Softmax - implements the softmax operation described here: http://proceedings.mlr.press/v48/martins16.pdf -// Current implementation only supports float64 -func Softmax(x *Node) (*Node, error) { +// SoftMax2 - implements the softmax operation +func SoftMax2(x *Node, axis ...int) (*Node, error) { xShape := x.Shape() - op := newSoftmaxOp(xShape) + op := newSoftmaxOp(xShape, axis...) return ApplyOp(op, x) } @@ -85,55 +88,36 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { return nil, fmt.Errorf("Can't check Softmax input: %w", err) } - var output interface{} - - switch arr := inputTensor.Data().(type) { - case []float64: - output = float64softMax(arr) - case []float32: - output = float32softMax(arr) - default: - return nil, fmt.Errorf("Softmax needs either []float32 or []float64, got %T", arr) + aShape := inputTensor.Shape() + axis := aShape.Dims() - 1 // default: last dim + if aShape.IsColVec() || (aShape.IsVector() && !aShape.IsRowVec()) { + axis = 0 } - return tensor.New(tensor.Of(inputTensor.Dtype()), tensor.WithShape(inputTensor.Size()), tensor.WithEngine(inputTensor.Engine()), tensor.WithBacking(output)), nil -} - -// FIXME: go2 -func float64softMax(arr []float64) interface{} { - output := make([]float64, len(arr)) - sum := 0.0 - - for i, v := range arr { - exp := math.Exp(v) - sum += exp + if len(op.axes) > 0 { + if op.axes[0] >= axis+1 || op.axes[0] < 0 { + return nil, errors.Errorf("Cannot perform SoftMax on axis %d. Input has shape %v", op.axes[0], aShape) + } - output[i] = exp + axis = op.axes[0] } - for i := range output { - output[i] /= sum + exp, err := tensor.Exp(inputTensor) + if err != nil { + return nil, fmt.Errorf("error calculating exp for SoftMax: %w", err) } - return output -} - -func float32softMax(arr []float32) interface{} { - output := make([]float32, len(arr)) - sum := float32(0.0) - - for i, v := range arr { - exp := float32(math.Exp(float64(v))) - sum += exp - - output[i] = exp + sum, err := tensor.Sum(exp, axis) + if err != nil { + return nil, fmt.Errorf("error calculating sum for SoftMax: %w", err) } - for i := range output { - output[i] /= sum + div, err := tensor.Div(exp, sum) + if err != nil { + return nil, fmt.Errorf("error calculating div for SoftMax: %w", err) } - return output + return div, nil } type softmaxDiffOp struct { @@ -144,7 +128,7 @@ func newSoftmaxOpDiff() *softmaxDiffOp { } func (op *softmaxDiffOp) Arity() int { - return 2 + return 1 } func (op *softmaxDiffOp) ReturnsPtr() bool { return false } @@ -172,105 +156,64 @@ func (op *softmaxDiffOp) Type() hm.Type { ta := newTensorType(1, aType) - return hm.NewFnType(ta, ta, ta) // f(float64, float64) float64 + return hm.NewFnType(ta, ta) // f(float64) float64 } func (op *softmaxDiffOp) OverwritesInput() int { return -1 } -func (op *softmaxDiffOp) checkInput(inputs ...Value) (tensor.Tensor, tensor.Tensor, error) { +func (op *softmaxDiffOp) checkInput(inputs ...Value) (tensor.Tensor, error) { if err := checkArity(op, len(inputs)); err != nil { - return nil, nil, err + return nil, err } var ( in tensor.Tensor - gradient tensor.Tensor - ok bool + ok bool ) switch t := inputs[0].(type) { case *dualValue: if in, ok = t.Value.(tensor.Tensor); !ok { - return nil, nil, errors.Errorf("input should be a tensor, got %T", inputs[0]) + return nil, errors.Errorf("input should be a tensor, got %T", inputs[0]) } case 
tensor.Tensor: in = t default: - return nil, nil, errors.Errorf("input type is not supported, got %T", inputs[0]) - } - - switch t := inputs[1].(type) { - case *dualValue: - if gradient, ok = t.Value.(tensor.Tensor); !ok { - return nil, nil, errors.Errorf("gradient should be a tensor, got %T", inputs[1]) - } - case tensor.Tensor: - gradient = t - default: - return nil, nil, errors.Errorf("gradient type is not supported, got %T", inputs[1]) + return nil, errors.Errorf("input type is not supported, got %T", inputs[0]) } - if in.Shape().Dims() != 1 || gradient.Shape().Dims() != 1 { - return nil, nil, errors.Errorf("Expected input to have 1 dimensions") - } - - return in, gradient, nil + return in, nil } func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { - inputTensor, gradTensor, err := op.checkInput(inputs...) + inputTensor, err := op.checkInput(inputs...) if err != nil { return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) } - if inputTensor.Size() != gradTensor.Size() { - return nil, fmt.Errorf("softmaxDiffOp.Do inputs sizes should be equal") - } - - if !isFloat32Or64Array(inputTensor.Data()) { - return nil, fmt.Errorf("softmaxDiffOp.Do expected input to be []float64 or []float32, got %T", inputTensor.Data()) - } - - if !isFloat32Or64Array(gradTensor.Data()) { - return nil, fmt.Errorf("softmaxDiffOp.Do expected input to be []float64, got %T", gradTensor.Data()) - } - - input := inputTensor.Data().([]float64) - value := gradTensor.Data().([]float64) - - output := make([]float64, len(input)*len(value)) - - for i := 0; i < len(value); i++ { - for j := 0; j < len(input); j++ { - if i == j { - output[i*j+j] = value[i] * (1 - input[i]) - } else { - output[i*j+j] = -value[i] * input[i] - } - } + diag, err := tensor.Diag(inputTensor) + if err != nil { + return nil, fmt.Errorf("softmaxDiffOp.Do error calculating diag: %w", err) } - val := tensor.New( - tensor.Of(inputTensor.Dtype()), - tensor.WithShape(len(input), len(value)), - tensor.WithEngine(inputTensor.Engine()), - tensor.WithBacking(output), // FIXME - ) + sm := inputTensor.Clone().(tensor.Tensor) + sm.Reshape(inputTensor.Shape().TotalSize(), 1) - return val, nil -} + smT := sm.Clone().(tensor.Tensor) + smT.Transpose() -func isFloat32Or64Array(v interface{}) bool { - if _, ok := v.([]float64); ok { - return true + smDot, err := tensor.Dot(sm, smT) + if err != nil { + return nil, fmt.Errorf("softmaxDiffOp.Do error calculating dot product: %w", err) } - if _, ok := v.([]float32); ok { - return true + result, err := tensor.Sub(diag, smDot) + if err != nil { + return nil, fmt.Errorf("softmaxDiffOp.Do error calculating sub: %w", err) } - return false + return result, nil } // ensure it complies with the Op interface From 8bd8db48182a1a936488d9e59dbad2a96df66511 Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Sun, 11 Oct 2020 15:18:38 -0500 Subject: [PATCH 03/12] Replace SoftMax with the operation --- op_softmax.go | 62 ++++++++++++++++++++++++++++++++++++++++------ operations.go | 2 +- operations_test.go | 3 ++- 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index 3ba0f3ba..42629644 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -23,8 +23,8 @@ func newSoftmaxOp(inputShape tensor.Shape, axes ...int) *softmaxOp { return softmaxop } -// SoftMax2 - implements the softmax operation -func SoftMax2(x *Node, axis ...int) (*Node, error) { +// SoftMax - implements the softmax operation +func SoftMax(x *Node, axis ...int) (*Node, error) { xShape := x.Shape() op := 
newSoftmaxOp(xShape, axis...) @@ -120,6 +120,55 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { return div, nil } +// DoDiff calculates the diff and sets its value to the output node. Implementation for ADOp interface. +func (op *softmaxOp) DoDiff(ctx ExecutionContext, inputs Nodes, output *Node) error { + if len(inputs) != 1 { + return fmt.Errorf("SoftmaxOp.DoDiff needs 1 arguments") + } + + odv := output.boundTo.(*dualValue) + odvd := odv.Value.(tensor.Tensor) + diffOp := newSoftmaxOpDiff() + + result, err := diffOp.Do() + if err != nil { + return err + } + + sum, err := odvd.(*tensor.Dense).Add(result.(*tensor.Dense), tensor.UseUnsafe()) + if err != nil { + return err + } + + odv.d = sum + + return nil +} + +// SymDiff applies the diff op. Implementation for SDOp interface. +func (op *softmaxOp) SymDiff(inputs Nodes, output, grad *Node) (Nodes, error) { + err := checkArity(op, len(inputs)) + if err != nil { + return nil, err + } + + diffOp := newSoftmaxOpDiff() + nodes := make(Nodes, 1) + + nodes[0], err = ApplyOp(diffOp, output) + + return nodes, err +} + +// DiffWRT is an implementation for the SDOp interface +func (op *softmaxOp) DiffWRT(inputs int) []bool { + if inputs != 1 { + panic(fmt.Sprintf("softmax operator only supports one input, got %d instead", inputs)) + } + + return []bool{true} +} + type softmaxDiffOp struct { } @@ -192,10 +241,7 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) } - diag, err := tensor.Diag(inputTensor) - if err != nil { - return nil, fmt.Errorf("softmaxDiffOp.Do error calculating diag: %w", err) - } + diag := tensor.New(tensor.AsDenseDiag(inputTensor)) sm := inputTensor.Clone().(tensor.Tensor) sm.Reshape(inputTensor.Shape().TotalSize(), 1) @@ -218,7 +264,9 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { // ensure it complies with the Op interface var ( - _ Op = &softmaxOp{} + _ Op = &softmaxOp{} + _ ADOp = &softmaxOp{} + _ SDOp = &softmaxOp{} _ Op = &softmaxDiffOp{} ) diff --git a/operations.go b/operations.go index 9ffd3619..a15057fe 100644 --- a/operations.go +++ b/operations.go @@ -157,7 +157,7 @@ func unaryOpNode(op Op, a *Node) (retVal *Node, err error) { // e^(a[i]) / sum((e^(a[i]))) // For a more numerically stable SoftMax, use StableSoftMax. 
// TODO: MULTI RANK SOFTMAX -func SoftMax(a *Node, axes ...int) (retVal *Node, err error) { +func SoftMaxOld(a *Node, axes ...int) (retVal *Node, err error) { aShape := a.Shape() axis := aShape.Dims() - 1 // default: last dim if a.IsColVec() || (a.IsVector() && !a.IsRowVec()) { diff --git a/operations_test.go b/operations_test.go index ad1ca075..dcb0cba7 100644 --- a/operations_test.go +++ b/operations_test.go @@ -399,7 +399,8 @@ func TestMisha(t *testing.T) { func TestSoftMax(t *testing.T) { defer runtime.GC() - assert := assert.New(t) + + assert := require.New(t) g := NewGraph() xT := tensor.New(tensor.WithBacking([]float64{0.1, 0.2, -0.3, 0.4, 0.5})) x := NewVector(g, Float64, WithShape(5), WithValue(xT)) From 22ae75acd506440fe8964581121a5a4c3f595c44 Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Sun, 11 Oct 2020 21:59:14 -0500 Subject: [PATCH 04/12] Improve tensor operations in soft max And handle the errors properly --- op_softmax.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index 42629644..a294e854 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -244,12 +244,20 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { diag := tensor.New(tensor.AsDenseDiag(inputTensor)) sm := inputTensor.Clone().(tensor.Tensor) - sm.Reshape(inputTensor.Shape().TotalSize(), 1) + + err = sm.Reshape(inputTensor.Shape().TotalSize(), 1) + if err != nil { + return nil, fmt.Errorf("softmaxDiffOp.Do error reshaping the value: %w", err) + } smT := sm.Clone().(tensor.Tensor) - smT.Transpose() - smDot, err := tensor.Dot(sm, smT) + err = smT.T() + if err != nil { + return nil, fmt.Errorf("softmaxDiffOp.Do error transposing the value: %w", err) + } + + smDot, err := tensor.MatMul(sm, smT) if err != nil { return nil, fmt.Errorf("softmaxDiffOp.Do error calculating dot product: %w", err) } From 030cfb1ecef81b6e5939b7d1b75cb8b814b7e5df Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Mon, 12 Oct 2020 11:48:02 -0500 Subject: [PATCH 05/12] Fix broken test And add debugging statements --- complex_test.go | 7 ++++++- known_issues_test.go | 11 ++++++----- op_math.go | 10 +++++++++- op_softmax.go | 15 +++++++++------ 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/complex_test.go b/complex_test.go index 9c1e2ac7..bdcad4c8 100644 --- a/complex_test.go +++ b/complex_test.go @@ -1,6 +1,9 @@ package gorgonia -import "testing" +import ( + "runtime/debug" + "testing" +) func TestWeirdNetwork(t *testing.T) { const ( @@ -138,6 +141,8 @@ func TestWeirdNetwork(t *testing.T) { for i := 0; i < 2; i++ { if err = m.RunAll(); err != nil { t.Errorf("%d %v", i, err) + t.Log(string(debug.Stack())) + break } diff --git a/known_issues_test.go b/known_issues_test.go index 4b060117..9c2f3321 100644 --- a/known_issues_test.go +++ b/known_issues_test.go @@ -1,10 +1,10 @@ package gorgonia import ( - "log" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "gorgonia.org/tensor" ) @@ -303,11 +303,12 @@ func TestIssue363(t *testing.T) { } func TestIssue368(t *testing.T) { + c := require.New(t) + g := NewGraph() x := NewTensor(g, Float32, 2, WithShape(2, 5), WithInit(GlorotU(1.0))) + sm, err := SoftMax(x, 1) - if err != nil { - log.Fatal(err) - } - _ = sm + c.NoError(err) + c.NotNil(sm) } diff --git a/op_math.go b/op_math.go index f418030d..c59426e6 100644 --- a/op_math.go +++ b/op_math.go @@ -732,6 +732,7 @@ func (op linAlgBinOp) do(inputs []Value, opts ...tensor.FuncOpt) (retVal Value, if err = a.T(); err != nil { 
return nil, errors.Wrap(err, tFail) } + // untranspose defer a.T() } @@ -740,6 +741,7 @@ func (op linAlgBinOp) do(inputs []Value, opts ...tensor.FuncOpt) (retVal Value, if err = b.T(); err != nil { return nil, errors.Wrap(err, tFail) } + // untranspose defer b.T() } @@ -751,9 +753,11 @@ func (op linAlgBinOp) do(inputs []Value, opts ...tensor.FuncOpt) (retVal Value, retVal, err = tensor.MatVecMul(a, b, opts...) case vecDotOperator: var ret interface{} + if ret, err = tensor.Inner(a, b); err != nil { return nil, errors.Wrapf(err, "Failed to carry out linalgBinOp operation %v", op) } + retVal, _ = anyToScalar(ret) case outerProdOperator: retVal, err = tensor.Outer(a, b, opts...) @@ -761,8 +765,12 @@ func (op linAlgBinOp) do(inputs []Value, opts ...tensor.FuncOpt) (retVal Value, // checks were done when the op was created retVal, err = batchedMatMul(a, b, nil, op.transA, op.transB, false) } - return + if err != nil { + return nil, fmt.Errorf("linAlgBinOp %v %s %v error: %w", a.Shape(), op.āBinaryOperator, b.Shape(), err) + } + + return retVal, nil } func (op linAlgBinOp) preallocBatchMatMul(incr bool, prealloc Value, inputs ...Value) (retVal Value, err error) { diff --git a/op_softmax.go b/op_softmax.go index a294e854..dc68c816 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -50,15 +50,16 @@ func (op *softmaxOp) String() string { } func (op *softmaxOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { - s := inputs[0].(tensor.Shape).Clone() - return s, nil + s := inputs[0].(tensor.Shape) + + return tensor.Shape{s.TotalSize()}, nil } func (op *softmaxOp) Type() hm.Type { a := hm.TypeVariable('a') t := newTensorType(1, a) - return hm.NewFnType(t, t) + return hm.NewFnType(t, t) // f(float64) float64 } func (op *softmaxOp) OverwritesInput() int { return -1 } @@ -68,8 +69,10 @@ func (op *softmaxOp) checkInput(inputs ...Value) (tensor.Tensor, error) { return nil, err } - var in tensor.Tensor - var ok bool + var ( + in tensor.Tensor + ok bool + ) if in, ok = inputs[0].(tensor.Tensor); !ok { return nil, errors.Errorf("Expected input to be a tensor") @@ -94,6 +97,7 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { axis = 0 } + // FIXME: v0.10 if len(op.axes) > 0 { if op.axes[0] >= axis+1 || op.axes[0] < 0 { return nil, errors.Errorf("Cannot perform SoftMax on axis %d. 
Input has shape %v", op.axes[0], aShape) @@ -242,7 +246,6 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { } diag := tensor.New(tensor.AsDenseDiag(inputTensor)) - sm := inputTensor.Clone().(tensor.Tensor) err = sm.Reshape(inputTensor.Shape().TotalSize(), 1) From 64352142d9b33b2a8eced2aa9f2ccb9e26a6f701 Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Mon, 12 Oct 2020 18:35:04 -0500 Subject: [PATCH 06/12] WIP --- example_err_test.go | 2 ++ example_operations_test.go | 17 ++++++---- op_softmax.go | 66 +++++++++++++++++++------------------- 3 files changed, 45 insertions(+), 40 deletions(-) diff --git a/example_err_test.go b/example_err_test.go index 297ce11a..b32aca9e 100644 --- a/example_err_test.go +++ b/example_err_test.go @@ -48,6 +48,7 @@ func Example_errorHandling() { )), )), )) + fmt.Printf("nn2: %v\n", nn2) defer func() { @@ -55,6 +56,7 @@ func Example_errorHandling() { fmt.Printf("An error occurs (caught by recover()): %v\n", r) } }() + nn2PlusWrong := Must(Add(nn2, wrong2)) _ = nn2PlusWrong diff --git a/example_operations_test.go b/example_operations_test.go index 9180cf67..34d7f121 100644 --- a/example_operations_test.go +++ b/example_operations_test.go @@ -2,7 +2,6 @@ package gorgonia import ( "fmt" - "log" "strings" "gorgonia.org/tensor" @@ -29,12 +28,16 @@ func ExampleSoftMax() { sm := Must(SoftMax(c)) m := NewTapeMachine(g) if err := m.RunAll(); err != nil { - log.Fatal(err) + panic(err) } + fmt.Printf("a:\n%v\nsoftmax(a) - along last axis (default behaviour):\n%1.2f", a.Value(), sm1.Value()) fmt.Printf("b:\n%v\nsoftmax(b) - along axis 0:\n%1.2f", b.Value(), sm0.Value()) + tmp := fmt.Sprintf("c %v:\n%v\nsoftmax(c) - along last axis (default behaviour) %v:\n%1.2f", c.Value().Shape(), c.Value(), sm.Value().Shape(), sm.Value()) + fmt.Println(strings.Replace(tmp, "\n\n\n", "\n\n", -1)) + // the requirement to use tmp and strings.Replace is because when Go runs example tests, it strips excess newlines. 
// Output: @@ -76,12 +79,12 @@ func ExampleConcat() { z, err := Concat(2, x, y) if err != nil { - log.Fatal(err) + panic(err) } m := NewTapeMachine(g) if err := m.RunAll(); err != nil { - log.Fatal(err) + panic(err) } tmp := fmt.Sprintf("z %v\n%v", z.Value().Shape(), z.Value()) fmt.Println(strings.Replace(tmp, "\n\n", "\n", -1)) // this is because @@ -155,18 +158,18 @@ func ExampleUnconcat() { z, err := Concat(2, x, y) if err != nil { - log.Fatal(err) + panic(err) } unconcats, err := Unconcat(z, 2, 2) if err != nil { - log.Fatal(err) + panic(err) } a, b := unconcats[0], unconcats[1] m := NewTapeMachine(g) if err := m.RunAll(); err != nil { - log.Fatal(err) + panic(err) } tmp := fmt.Sprintf("a %v\n%v\nb %v\n%v", a.Value().Shape(), a.Value(), b.Value().Shape(), b.Value()) fmt.Println(strings.Replace(tmp, "\n\n", "\n", -1)) diff --git a/op_softmax.go b/op_softmax.go index dc68c816..3502fb3f 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -3,6 +3,7 @@ package gorgonia import ( "fmt" "hash" + "os" "github.com/chewxy/hm" "github.com/pkg/errors" @@ -78,10 +79,6 @@ func (op *softmaxOp) checkInput(inputs ...Value) (tensor.Tensor, error) { return nil, errors.Errorf("Expected input to be a tensor") } - if in.Shape().Dims() != 1 { - return nil, errors.Errorf("Expected input to have 1 dimensions") - } - return in, nil } @@ -116,12 +113,40 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { return nil, fmt.Errorf("error calculating sum for SoftMax: %w", err) } - div, err := tensor.Div(exp, sum) + ss := sum.Shape() + dimsDiff := exp.Shape().Dims() - ss.Dims() + if dimsDiff == 0 { + div, err := tensor.Div(exp, sum) + if err != nil { + return nil, fmt.Errorf("error calculating div for SoftMax: %w", err) + } + + return div, nil + } + + fmt.Fprintf(os.Stderr, "initial sum: %v axis=%d expShape=%v expDims=%d\nDIFF: %d\n", sum, axis, exp.Shape(), exp.Dims(), dimsDiff) + + newShape := tensor.Shape(tensor.BorrowInts(ss.Dims() + dimsDiff)) + copy(newShape, ss) + copy(newShape[axis+1:], newShape[axis:]) + newShape[axis] = 1 + + fmt.Fprintf(os.Stderr, "new shape: %v\n", newShape) + + if err = sum.Reshape(newShape...); err != nil { + return nil, fmt.Errorf("error reshaping sum for SoftMax: %w", err) + } + + fmt.Fprintf(os.Stderr, "sum reshaped: \n%v\nshape: %v\n", sum, sum.Shape()) + + sum, err = tensor.Repeat(sum, axis, exp.Shape()[1:]...) if err != nil { - return nil, fmt.Errorf("error calculating div for SoftMax: %w", err) + return nil, fmt.Errorf("error repeating sum for SoftMax: %w", err) } - return div, nil + fmt.Fprintf(os.Stderr, "sum repeated: \n%v\nshape: %v\nexp=\n%v\n", sum, sum.Shape(), exp) + + return tensor.Div(exp, sum) } // DoDiff calculates the diff and sets its value to the output node. Implementation for ADOp interface. 
@@ -245,32 +270,7 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) } - diag := tensor.New(tensor.AsDenseDiag(inputTensor)) - sm := inputTensor.Clone().(tensor.Tensor) - - err = sm.Reshape(inputTensor.Shape().TotalSize(), 1) - if err != nil { - return nil, fmt.Errorf("softmaxDiffOp.Do error reshaping the value: %w", err) - } - - smT := sm.Clone().(tensor.Tensor) - - err = smT.T() - if err != nil { - return nil, fmt.Errorf("softmaxDiffOp.Do error transposing the value: %w", err) - } - - smDot, err := tensor.MatMul(sm, smT) - if err != nil { - return nil, fmt.Errorf("softmaxDiffOp.Do error calculating dot product: %w", err) - } - - result, err := tensor.Sub(diag, smDot) - if err != nil { - return nil, fmt.Errorf("softmaxDiffOp.Do error calculating sub: %w", err) - } - - return result, nil + return inputTensor, nil } // ensure it complies with the Op interface From 1eac0e6258462cf2e021b9da6c9b313105d6dc14 Mon Sep 17 00:00:00 2001 From: chewxy Date: Tue, 13 Oct 2020 11:14:03 +1100 Subject: [PATCH 07/12] Fixed softmax broadcasting issue Co-authored-by: David Cuadrado <73729+dcu@users.noreply.github.com> --- op_softmax.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index 3502fb3f..dc0bb008 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -3,7 +3,6 @@ package gorgonia import ( "fmt" "hash" - "os" "github.com/chewxy/hm" "github.com/pkg/errors" @@ -114,7 +113,8 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { } ss := sum.Shape() - dimsDiff := exp.Shape().Dims() - ss.Dims() + es := exp.Shape() + dimsDiff := es.Dims() - ss.Dims() if dimsDiff == 0 { div, err := tensor.Div(exp, sum) if err != nil { @@ -124,28 +124,21 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { return div, nil } - fmt.Fprintf(os.Stderr, "initial sum: %v axis=%d expShape=%v expDims=%d\nDIFF: %d\n", sum, axis, exp.Shape(), exp.Dims(), dimsDiff) + // MULTIDIMENSIONAL SOFTMAX newShape := tensor.Shape(tensor.BorrowInts(ss.Dims() + dimsDiff)) copy(newShape, ss) copy(newShape[axis+1:], newShape[axis:]) newShape[axis] = 1 - fmt.Fprintf(os.Stderr, "new shape: %v\n", newShape) - if err = sum.Reshape(newShape...); err != nil { return nil, fmt.Errorf("error reshaping sum for SoftMax: %w", err) } - fmt.Fprintf(os.Stderr, "sum reshaped: \n%v\nshape: %v\n", sum, sum.Shape()) - - sum, err = tensor.Repeat(sum, axis, exp.Shape()[1:]...) - if err != nil { + if sum, err = tensor.Repeat(sum, axis, es[axis]); err != nil { return nil, fmt.Errorf("error repeating sum for SoftMax: %w", err) } - fmt.Fprintf(os.Stderr, "sum repeated: \n%v\nshape: %v\nexp=\n%v\n", sum, sum.Shape(), exp) - return tensor.Div(exp, sum) } From 1f860421de7669d40fa556472877aef29a3ddc73 Mon Sep 17 00:00:00 2001 From: David Cuadrado Date: Mon, 12 Oct 2020 19:20:03 -0500 Subject: [PATCH 08/12] Fix Example_errorHandling test for SoftMax --- example_err_test.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/example_err_test.go b/example_err_test.go index b32aca9e..098e22f8 100644 --- a/example_err_test.go +++ b/example_err_test.go @@ -61,9 +61,8 @@ func Example_errorHandling() { _ = nn2PlusWrong // Output: - // nn: ÷ false(%a, %f) :: Matrix float32 - // An error occurs: Type inference error. Op: + false. 
Children: [Matrix float32, Matrix float64], OpType:Matrix a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified - // nn2: ÷ false(%a, %f) :: Matrix float32 - // An error occurs (caught by recover()): Type inference error. Op: + false. Children: [Matrix float32, Matrix float64], OpType:Matrix a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified - + // nn: Softmax{}()(%9) :: Vector float32 + // An error occurs: Type inference error. Op: + false. Children: [Vector float32, Matrix float64], OpType:Vector a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified + // nn2: Softmax{}()(%9) :: Vector float32 + // An error occurs (caught by recover()): Type inference error. Op: + false. Children: [Vector float32, Matrix float64], OpType:Vector a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified } From 9e3af62bbe48c119f79563da0489b2557b6fb136 Mon Sep 17 00:00:00 2001 From: chewxy Date: Tue, 13 Oct 2020 12:00:20 +1100 Subject: [PATCH 09/12] Added Difffor softmax Updated softmax test to also check for values Co-authored-by: David Cuadrado <73729+dcu@users.noreply.github.com> --- op_softmax.go | 17 ++++++++++++++--- operations_test.go | 13 ++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index dc0bb008..2ef384b4 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -126,6 +126,7 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { // MULTIDIMENSIONAL SOFTMAX + // REPEAT SUM (traditionally called broadcasting) newShape := tensor.Shape(tensor.BorrowInts(ss.Dims() + dimsDiff)) copy(newShape, ss) copy(newShape[axis+1:], newShape[axis:]) @@ -152,7 +153,7 @@ func (op *softmaxOp) DoDiff(ctx ExecutionContext, inputs Nodes, output *Node) er odvd := odv.Value.(tensor.Tensor) diffOp := newSoftmaxOpDiff() - result, err := diffOp.Do() + result, err := diffOp.Do(odv) if err != nil { return err } @@ -262,8 +263,18 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { if err != nil { return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) } - - return inputTensor, nil + y := inputTensor.(*tensor.Dense) + s := y.Shape() + fst := tensor.ProdInts([]int(s)) + y.Reshape(fst, 1) + yᵀ := y.ShallowClone() + yᵀ.T() + yyᵀ, err := tensor.MatMul(y, yᵀ) + if err != nil { + return nil, err + } + diag := tensor.New(tensor.AsDenseDiag(y.Data())) + return diag.Sub(yyᵀ.(*tensor.Dense)) // jacobian } // ensure it complies with the Op interface diff --git a/operations_test.go b/operations_test.go index dcb0cba7..04a44449 100644 --- a/operations_test.go +++ b/operations_test.go @@ -399,8 +399,7 @@ func TestMisha(t *testing.T) { func TestSoftMax(t *testing.T) { defer runtime.GC() - - assert := require.New(t) + assert := assert.New(t) g := NewGraph() xT := tensor.New(tensor.WithBacking([]float64{0.1, 0.2, -0.3, 0.4, 0.5})) x := NewVector(g, Float64, WithShape(5), WithValue(xT)) @@ -412,7 +411,7 @@ func TestSoftMax(t *testing.T) { t.Error(err) } - m := NewTapeMachine(g) + m := NewTapeMachine(g, TraceExec()) defer m.Close() if err := m.RunAll(); err != nil { t.Error(err) @@ -454,6 +453,14 @@ func TestSoftMax(t *testing.T) { assert.Equal(smg, sm2g) assert.Equal(xG, x2G) + + correctGrad := []float64{ + -0, -0, -8.379839604304342, -0, -0, + } + + if 
!floatsEqual64(correctGrad, smg.Data().([]float64)) { + t.Errorf("Expected results to be %v. Got %v.", correctGrad, smg.Data()) + } } var sliceTests = []struct { From edad66702275e7509414adc9ce35f82e632abe91 Mon Sep 17 00:00:00 2001 From: chewxy Date: Tue, 13 Oct 2020 22:48:04 +1100 Subject: [PATCH 10/12] Updated softmax op and softmax diffop --- op_softmax.go | 168 ++++++++++++++++++++++++++++++++------------- operations_test.go | 38 +++++----- 2 files changed, 137 insertions(+), 69 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index 2ef384b4..84d7ad04 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -6,18 +6,24 @@ import ( "github.com/chewxy/hm" "github.com/pkg/errors" + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/gonum" "gorgonia.org/tensor" ) type softmaxOp struct { shape tensor.Shape - axes []int + axis int } func newSoftmaxOp(inputShape tensor.Shape, axes ...int) *softmaxOp { + axis := -1 + if len(axes) > 0 { + axis = axes[0] + } softmaxop := &softmaxOp{ shape: inputShape, - axes: axes, + axis: axis, } return softmaxop @@ -31,22 +37,20 @@ func SoftMax(x *Node, axis ...int) (*Node, error) { return ApplyOp(op, x) } -func (op *softmaxOp) Arity() int { - return 1 -} +func (op *softmaxOp) Arity() int { return 1 } func (op *softmaxOp) ReturnsPtr() bool { return false } func (op *softmaxOp) CallsExtern() bool { return false } func (op *softmaxOp) WriteHash(h hash.Hash) { - fmt.Fprintf(h, "Softmax{}()") + fmt.Fprintf(h, "Softmax{%v}()", op.axis) } func (op *softmaxOp) Hashcode() uint32 { return simpleHash(op) } func (op *softmaxOp) String() string { - return fmt.Sprintf("Softmax{}()") + return fmt.Sprintf("Softmax{%d}()", op.axis) } func (op *softmaxOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { @@ -57,9 +61,7 @@ func (op *softmaxOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { func (op *softmaxOp) Type() hm.Type { a := hm.TypeVariable('a') - t := newTensorType(1, a) - - return hm.NewFnType(t, t) // f(float64) float64 + return hm.NewFnType(a, a) // f(float64) float64 } func (op *softmaxOp) OverwritesInput() int { return -1 } @@ -92,14 +94,8 @@ func (op *softmaxOp) Do(inputs ...Value) (retVal Value, err error) { if aShape.IsColVec() || (aShape.IsVector() && !aShape.IsRowVec()) { axis = 0 } - - // FIXME: v0.10 - if len(op.axes) > 0 { - if op.axes[0] >= axis+1 || op.axes[0] < 0 { - return nil, errors.Errorf("Cannot perform SoftMax on axis %d. 
Input has shape %v", op.axes[0], aShape) - } - - axis = op.axes[0] + if op.axis != -1 { + axis = op.axis } exp, err := tensor.Exp(inputTensor) @@ -150,15 +146,16 @@ func (op *softmaxOp) DoDiff(ctx ExecutionContext, inputs Nodes, output *Node) er } odv := output.boundTo.(*dualValue) - odvd := odv.Value.(tensor.Tensor) + idv := inputs[0].boundTo.(*dualValue) + idvd := idv.d.(*tensor.Dense) diffOp := newSoftmaxOpDiff() - result, err := diffOp.Do(odv) + result, err := diffOp.Do(odv.Value, odv.d) if err != nil { return err } - sum, err := odvd.(*tensor.Dense).Add(result.(*tensor.Dense), tensor.UseUnsafe()) + sum, err := idvd.Add(result.(*tensor.Dense), tensor.UseUnsafe()) if err != nil { return err } @@ -178,7 +175,7 @@ func (op *softmaxOp) SymDiff(inputs Nodes, output, grad *Node) (Nodes, error) { diffOp := newSoftmaxOpDiff() nodes := make(Nodes, 1) - nodes[0], err = ApplyOp(diffOp, output) + nodes[0], err = ApplyOp(diffOp, output, grad) return nodes, err } @@ -193,15 +190,14 @@ func (op *softmaxOp) DiffWRT(inputs int) []bool { } type softmaxDiffOp struct { + axis int } func newSoftmaxOpDiff() *softmaxDiffOp { return &softmaxDiffOp{} } -func (op *softmaxDiffOp) Arity() int { - return 1 -} +func (op *softmaxDiffOp) Arity() int { return 2 } func (op *softmaxDiffOp) ReturnsPtr() bool { return false } @@ -224,57 +220,133 @@ func (op *softmaxDiffOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { } func (op *softmaxDiffOp) Type() hm.Type { - aType := hm.TypeVariable('a') - - ta := newTensorType(1, aType) + a := hm.TypeVariable('a') - return hm.NewFnType(ta, ta) // f(float64) float64 + return hm.NewFnType(a, a, a) // f(float64) float64 } func (op *softmaxDiffOp) OverwritesInput() int { return -1 } -func (op *softmaxDiffOp) checkInput(inputs ...Value) (tensor.Tensor, error) { +func (op *softmaxDiffOp) checkInput(inputs ...Value) (tensor.Tensor, tensor.Tensor, error) { if err := checkArity(op, len(inputs)); err != nil { - return nil, err + return nil, nil, err } var ( - in tensor.Tensor - - ok bool + in tensor.Tensor + grad tensor.Tensor + ok bool ) switch t := inputs[0].(type) { case *dualValue: if in, ok = t.Value.(tensor.Tensor); !ok { - return nil, errors.Errorf("input should be a tensor, got %T", inputs[0]) + return nil, nil, errors.Errorf("input should be a tensor, got %T", inputs[0]) } case tensor.Tensor: in = t default: - return nil, errors.Errorf("input type is not supported, got %T", inputs[0]) + return nil, nil, errors.Errorf("input type is not supported, got %T", inputs[0]) } - return in, nil + switch t := inputs[1].(type) { + case *dualValue: + if grad, ok = t.Value.(tensor.Tensor); !ok { + return nil, nil, errors.Errorf("input should be a tensor, got %T", inputs[1]) + } + case tensor.Tensor: + grad = t + default: + return nil, nil, errors.Errorf("input type is not supported, got %T", inputs[1]) + } + + return in, grad, nil } func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { - inputTensor, err := op.checkInput(inputs...) + y, grad, err := op.checkInput(inputs...) 
if err != nil { return nil, fmt.Errorf("Can't check SoftmaxDiff input: %w", err) } - y := inputTensor.(*tensor.Dense) + s := y.Shape() - fst := tensor.ProdInts([]int(s)) - y.Reshape(fst, 1) - yᵀ := y.ShallowClone() - yᵀ.T() - yyᵀ, err := tensor.MatMul(y, yᵀ) - if err != nil { - return nil, err + axis := op.axis + if axis == -1 { + axis = s.Dims() - 1 } - diag := tensor.New(tensor.AsDenseDiag(y.Data())) - return diag.Sub(yyᵀ.(*tensor.Dense)) // jacobian + /* + What follows is an a bit of a splayed out algorithm + Let's imagine Y, and dY are both (a,b,c)-shaped tensors. + We reshape it to a matrix. Let's examine the cases: + case axis = 0: + we reshape it to (1, a*b*c) + case axis = 1: + we reshape it to (a, b*c) + case axis = 2: + we reshape it to (a*b*c, 1) + + We'll call the result matrix M, with shape (N, D) + + Now, we'll do some work: + 1. Make scalars of shape (N,). + 2. Make mulars of shape (D,). To facilitate multiplication, we set the initial valus + to the identity of multiplication: 1. + 3. Populate scalars. This is abit tricky: + scalars[i] = Y[i] · dY[i] + TODO: insert mathematical explanation of what accumulating gradients magic is happening here. + 4. Reshape the scalars to (N, 1) + 5. Reshape the mulars to (1, D) + 6. Perform matrix multiplication... WITH A TWIST. We need to multiply all the results by -1. Then add a bias of 1. + + Step 6 can be done in the usual manner. However, the BLAS librarie contain `(D|S)gemm`, which allows you to set alpha and beta. + */ + + prodBefore := tensor.ProdInts([]int(s[:axis])) // N + prodAfter := tensor.ProdInts([]int(s[axis:])) // D + if prodBefore == 0 { // indicating an error + prodBefore = 1 + } + if prodAfter == 0 { + prodAfter = 1 + } + + scalars := tensor.New(tensor.WithShape(prodBefore), tensor.Of(y.Dtype())) + mulars := tensor.New(tensor.WithShape(prodAfter), tensor.Of(y.Dtype())) + mulars.Memset(one(y.Dtype()).Data()) // set all mulars to 1. + + impl := gonum.Implementation{} + var val interface{} + switch yy := y.Data().(type) { + case []float64: + gradData := grad.Data().([]float64) + mulData := mulars.Data().([]float64) + var scaleData []float64 + switch sd := scalars.Data().(type) { + case float64: + scaleData = make([]float64, 1) + scaleData[0] = sd + case []float64: + scaleData = sd + + } + for i := 0; i < prodBefore; i++ { + scaleData[i] = impl.Ddot(prodAfter, yy[i*prodAfter:], 1, gradData[i*prodAfter:], 1) + } + C := make([]float64, s.TotalSize()) // output + + // important note: here, alpha is -1 and beta is 1. 
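+ // Dgemm computes C = alpha*A*B + beta*C. Here A is scaleData viewed as an (N, 1)
+ // column (N = prodBefore), B is mulData viewed as a (1, D) row of ones (D = prodAfter),
+ // and C starts out zeroed, so the beta*C term contributes nothing. With alpha = -1,
+ // every entry in row i of C therefore becomes -scaleData[i], i.e. the negated
+ // dot product Y[i] · dY[i], ready for the elementwise multiplication by y below.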
+ impl.Dgemm(blas.NoTrans, blas.NoTrans, prodBefore, prodAfter, 1, -1, scaleData, 1, mulData, prodAfter, 1, C, prodAfter) + val = C + case []float32: + //TODO: use Sdot and Sgemm instead of Ddot and Dgemm + case []complex64: + panic("Complex64 not done yet") + case []complex128: + panic("Complex128 not done yet") + } + + retVal := tensor.New(tensor.WithShape(s.Clone()...), tensor.WithBacking(val)) + return tensor.Mul(retVal, y, tensor.UseUnsafe()) } // ensure it complies with the Op interface diff --git a/operations_test.go b/operations_test.go index 04a44449..4786589b 100644 --- a/operations_test.go +++ b/operations_test.go @@ -2,6 +2,7 @@ package gorgonia import ( "io/ioutil" + "log" "runtime" "testing" @@ -399,7 +400,6 @@ func TestMisha(t *testing.T) { func TestSoftMax(t *testing.T) { defer runtime.GC() - assert := assert.New(t) g := NewGraph() xT := tensor.New(tensor.WithBacking([]float64{0.1, 0.2, -0.3, 0.4, 0.5})) x := NewVector(g, Float64, WithShape(5), WithValue(xT)) @@ -417,49 +417,45 @@ func TestSoftMax(t *testing.T) { t.Error(err) } - var smg, xG Value + var xG Value var err error - if smg, err = sm.Grad(); err != nil { - t.Error(err) - } - if xG, err = x.Grad(); err != nil { t.Error(err) } // machine 2, graph 2 - - g2 := NewGraph() + h := NewGraph() xT2 := tensor.New(tensor.WithBacking([]float64{0.1, 0.2, -0.3, 0.4, 0.5})) - x2 := NewVector(g, Float64, WithShape(5), WithValue(xT2)) + x2 := NewVector(h, Float64, WithShape(5), WithValue(xT2)) sm2 := Must(SoftMax(x2)) logsm2 := Must(Neg(Must(Log(sm2)))) Must(Slice(logsm2, S(2))) - m2 := NewLispMachine(g2) + m2 := NewLispMachine(h) defer m2.Close() if err = m2.RunAll(); err != nil { + log.Printf("ERR %v", err) t.Error(err) } - var sm2g, x2G Value - if sm2g, err = sm2.Grad(); err != nil { - t.Error(err) - } - + var x2G Value if x2G, err = x2.Grad(); err != nil { t.Error(err) } - assert.Equal(smg, sm2g) - assert.Equal(xG, x2G) + if !floatsEqual64(xG.Data().([]float64), x2G.Data().([]float64)) { + t.Errorf("Expected both gradients of X to be the same.") + } - correctGrad := []float64{ - -0, -0, -8.379839604304342, -0, -0, + correctXGrad := []float64{ + 0.178025447751409, 0.1967485475322529, 0.11933402633223977, 0.24030921861990098, 0.2655827597641975, } - if !floatsEqual64(correctGrad, smg.Data().([]float64)) { - t.Errorf("Expected results to be %v. Got %v.", correctGrad, smg.Data()) + if !floatsEqual64(correctXGrad, x2G.Data().([]float64)) { + t.Errorf("Expected results to be %v. Got %v.", correctXGrad, x2G.Data()) + } + if !floatsEqual64(correctXGrad, xG.Data().([]float64)) { + t.Errorf("Expected results to be %v. Got %v.", correctXGrad, xG.Data()) } } From 78231d3f294087e4a275fdb98e174b078ad5fa6a Mon Sep 17 00:00:00 2001 From: chewxy Date: Tue, 13 Oct 2020 23:00:56 +1100 Subject: [PATCH 11/12] Fixed softmax for failing tests. Added float32 support for softmaxdiffop Co-authored-by: David Cuadrado <73729+dcu@users.noreply.github.com> --- example_err_test.go | 9 +++++---- op_softmax.go | 22 ++++++++++++++++++++-- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/example_err_test.go b/example_err_test.go index 098e22f8..bb70e77c 100644 --- a/example_err_test.go +++ b/example_err_test.go @@ -61,8 +61,9 @@ func Example_errorHandling() { _ = nn2PlusWrong // Output: - // nn: Softmax{}()(%9) :: Vector float32 - // An error occurs: Type inference error. Op: + false. 
Children: [Vector float32, Matrix float64], OpType:Vector a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified - // nn2: Softmax{}()(%9) :: Vector float32 - // An error occurs (caught by recover()): Type inference error. Op: + false. Children: [Vector float32, Matrix float64], OpType:Vector a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified + // nn: Softmax{-1}()(%9) :: Matrix float32 + // An error occurs: Type inference error. Op: + false. Children: [Matrix float32, Matrix float64], OpType:Matrix a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified + // nn2: Softmax{-1}()(%9) :: Matrix float32 + // An error occurs (caught by recover()): Type inference error. Op: + false. Children: [Matrix float32, Matrix float64], OpType:Matrix a → Matrix a → Matrix a: Unable to unify while inferring type of + false: Unification Fail: float64 ~ float32 cannot be unified + } diff --git a/op_softmax.go b/op_softmax.go index 84d7ad04..e93280dc 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -56,7 +56,7 @@ func (op *softmaxOp) String() string { func (op *softmaxOp) InferShape(inputs ...DimSizer) (tensor.Shape, error) { s := inputs[0].(tensor.Shape) - return tensor.Shape{s.TotalSize()}, nil + return s, nil } func (op *softmaxOp) Type() hm.Type { @@ -338,7 +338,25 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { impl.Dgemm(blas.NoTrans, blas.NoTrans, prodBefore, prodAfter, 1, -1, scaleData, 1, mulData, prodAfter, 1, C, prodAfter) val = C case []float32: - //TODO: use Sdot and Sgemm instead of Ddot and Dgemm + gradData := grad.Data().([]float32) + mulData := mulars.Data().([]float32) + var scaleData []float32 + switch sd := scalars.Data().(type) { + case float32: + scaleData = make([]float32, 1) + scaleData[0] = sd + case []float32: + scaleData = sd + + } + for i := 0; i < prodBefore; i++ { + scaleData[i] = impl.Sdot(prodAfter, yy[i*prodAfter:], 1, gradData[i*prodAfter:], 1) + } + C := make([]float32, s.TotalSize()) // output + + // important note: here, alpha is -1 and beta is 1. 
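+ // Same layout as the float64 path above: with alpha = -1, beta = 1 and a zeroed C,
+ // Sgemm fills every entry of row i of C with -scaleData[i].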
+ impl.Sgemm(blas.NoTrans, blas.NoTrans, prodBefore, prodAfter, 1, -1, scaleData, 1, mulData, prodAfter, 1, C, prodAfter) + val = C case []complex64: panic("Complex64 not done yet") case []complex128: From 75b0715cc583ed77c75a880e5382e120d34a7b24 Mon Sep 17 00:00:00 2001 From: chewxy Date: Tue, 13 Oct 2020 23:04:12 +1100 Subject: [PATCH 12/12] Fixed a bunch of things that @dcu picked up --- op_softmax.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/op_softmax.go b/op_softmax.go index e93280dc..80911772 100644 --- a/op_softmax.go +++ b/op_softmax.go @@ -148,7 +148,7 @@ func (op *softmaxOp) DoDiff(ctx ExecutionContext, inputs Nodes, output *Node) er odv := output.boundTo.(*dualValue) idv := inputs[0].boundTo.(*dualValue) idvd := idv.d.(*tensor.Dense) - diffOp := newSoftmaxOpDiff() + diffOp := newSoftmaxOpDiff(op.axis) result, err := diffOp.Do(odv.Value, odv.d) if err != nil { @@ -172,7 +172,7 @@ func (op *softmaxOp) SymDiff(inputs Nodes, output, grad *Node) (Nodes, error) { return nil, err } - diffOp := newSoftmaxOpDiff() + diffOp := newSoftmaxOpDiff(op.axis) nodes := make(Nodes, 1) nodes[0], err = ApplyOp(diffOp, output, grad) @@ -193,8 +193,8 @@ type softmaxDiffOp struct { axis int } -func newSoftmaxOpDiff() *softmaxDiffOp { - return &softmaxDiffOp{} +func newSoftmaxOpDiff(axis int) *softmaxDiffOp { + return &softmaxDiffOp{axis: axis} } func (op *softmaxDiffOp) Arity() int { return 2 } @@ -289,7 +289,7 @@ func (op *softmaxDiffOp) Do(inputs ...Value) (Value, error) { Now, we'll do some work: 1. Make scalars of shape (N,). - 2. Make mulars of shape (D,). To facilitate multiplication, we set the initial valus + 2. Make mulars of shape (D,). To facilitate multiplication, we set the initial values to the identity of multiplication: 1. 3. Populate scalars. This is abit tricky: scalars[i] = Y[i] · dY[i]