7 changes: 7 additions & 0 deletions CodingConventions.md
@@ -588,3 +588,10 @@ changes) while working on a feature and even in "WIP" pull requests,
as long as the pieces are recombined (e.g., through an interactive rebase)
into logical units when the feature is ready for merging.
Force-pushing in PR branches is fine.

Coding Conventions for writing Tensor Comprehensions
====================================================

Please see the following documentation
[entry](https://facebookresearch.github.io/TensorComprehensions/coding_conventions.html)
on how to write Tensor Comprehensions in a standard legible fashion.
86 changes: 43 additions & 43 deletions benchmarks/MLP_model.cc
@@ -64,23 +64,23 @@ DEFINE_uint32(Q, 2, "W4_h");
// float(E1, D) LUT1, int32(B, L1) I1,
// float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2)
// {
// O1(i, j) +=! LUT1(I1(i, k), j)
// O2(i, j) +=! LUT2(I2(i, k), j)
// O1(b, d) +=! LUT1(I1(b, r_l1), d)
// O2(b, d) +=! LUT2(I2(b, r_l2), d)
// }
// def _3FCRELU(
// float(B,M) I, float(O,N) W2, float(O) B2,
// float(P,O) W3, float(P) B3, float(Q,P) W4,
// float(Q) B4) -> (O1, O2, O3, O4)
// {
// O2(b, o) = B2(o)
// O2(b, o) += O1(b, n) * W2(o, n)
// O2(b, o) = fmax(O2(b, o), 0)
// O3(b, p) = B3(p)
// O3(b, p) += O2(b, o) * W3(p, o)
// O3(b, p) = fmax(O3(b, p), 0)
// O4(b, q) = B4(q)
// O4(b, q) += O3(b, p) * W4(q, p)
// O4(b, q) = fmax(O4(b, q), 0)
// }
// def prod_model(float(E1, D) LUT1, int32(B, L1) I1,
// float(E2, D) LUT2, int32(B, L2) I2,
@@ -91,15 +91,15 @@ DEFINE_uint32(Q, 2, "W4_h");
// float(Q,P) W4, float(Q) B4)
// -> (C1, C2, C3, I, O1, O2, O3, O4)
// {
// (C1, C2) = _2LUT(LUT1, I1, LUT2, I2)
// C3(b, wy) += I3(b, wxx) * W(wy, wxx)
// I(b, m) = Concat(C1, C2, C3) // not in TC atm
// O1(b, n) = B1(n)
// O1(b, n) += I(b, m) * W1(n, m)
// O1(b, n) = fmax(O1(b, n), 0)
// (C1, C2) = _2LUT(LUT1, I1, LUT2, I2)
// C3(b, wy) +=! I3(b, r_wx) * W(wy, r_wx)
// I(b, m) = Concat(C1, C2, C3) // not in TC atm
// O1(b, n) = B1(n)
// O1(b, n) +=! I(b, m) * W1(n, m)
// O1(b, n) = fmax(O1(b, n), 0)
// (O2, O3, O4) =
// _3FCRELU(I, W1, B1, W2, B2, W3, B3, W4, B4)
// # O4 goes out to binary classifier, omitted here
// }

class ProductionModel : public Benchmark {
@@ -191,9 +191,9 @@ void ProductionModel::run1LUT(

std::vector<at::Tensor> inputs = {LUT1, IDX1};
std::string tc = R"(
def _1LUT(float(E1, D) LUT1, int32(B, L1) I1) -> (O1) {
O1(i, j) +=! LUT1(I1(i, k), j)
}
def _1LUT(float(E1, D) LUT1, int32(B, L1) I1) -> (O1) {
O1(b, d) +=! LUT1(I1(b, r_l1), d)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -294,10 +294,10 @@ void ProductionModel::run2LUT(

std::vector<at::Tensor> inputs = {LUT1, IDX1, LUT2, IDX2};
std::string tc = R"(
def _2LUT(float(E1, D) LUT1, int32(B, L1) I1, float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2) {
O1(i, j) +=! LUT1(I1(i, k), j)
O2(i, j) +=! LUT2(I2(i, k), j)
}
def _2LUT(float(E1, D) LUT1, int32(B, L1) I1, float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2) {
O1(b, d) +=! LUT1(I1(b, r_l1), d)
O2(b, d) +=! LUT2(I2(b, r_l2), d)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -353,9 +353,9 @@ void ProductionModel::runC3(

std::vector<at::Tensor> inputs = {I, W};
std::string tc = R"TC(
def _C3(float(B,WX) I, float(WY, WX) W) -> (C3) {
C3(b, wy) +=! I(b, wxx) * W(wy, wxx)
}
def _C3(float(B,WX) I, float(WY, WX) W) -> (C3) {
C3(b, wy) +=! I(b, r_wx) * W(wy, r_wx)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -408,11 +408,11 @@ void ProductionModel::runMLP1(

std::vector<at::Tensor> inputs = {I, W1, B1};
std::string tc = R"TC(
def mlp1(float(B,M) I, float(M, N) W1, float(N) B1) -> (O1) {
O1(b, n) +=! I(b, mm) * W1(mm, n)
O1(b, n) = O1(b, n) + B1(n)
O1(b, n) = fmax(O1(b, n), 0)
}
def mlp1(float(B,M) I, float(M, N) W1, float(N) B1) -> (O1) {
O1(b, n) +=! I(b, r_m) * W1(r_m, n)
O1(b, n) = O1(b, n) + B1(n)
O1(b, n) = fmax(O1(b, n), 0)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -474,17 +474,17 @@ void ProductionModel::runMLP3(

std::vector<at::Tensor> inputs = {I, W2, B2, W3, B3, W4, B4};
std::string tc = R"TC(
def mlp3(float(B,N) I, float(O,N) W2, float(O) B2, float(P,O) W3, float(P) B3, float(Q,P) W4, float(Q) B4) -> (O2, O3, O4) {
O2(b, o) +=! I(b, n) * W2(o, n)
O2(b, o) = O2(b, o) + B2(o)
O2(b, o) = fmax(O2(b, o), 0)
O3(b, p) +=! O2(b, o) * W3(p, o)
O3(b, p) = O3(b, p) + B3(p)
O3(b, p) = fmax(O3(b, p), 0)
O4(b, q) +=! O3(b, p) * W4(q, p)
O4(b, q) = O4(b, q) + B4(q)
O4(b, q) = fmax(O4(b, q), 0)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
6 changes: 3 additions & 3 deletions benchmarks/batchmatmul.cc
@@ -76,9 +76,9 @@ void BatchMatMul::runBatchMatMul(

std::vector<at::Tensor> inputs = {X, Y};
std::string tc = R"(
def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
Z(b, n, k) +=! X(b, n, mm) * Y(b, mm, k)
}
def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
Z(b, n, k) +=! X(b, n, r_m) * Y(b, r_m, k)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
12 changes: 6 additions & 6 deletions benchmarks/group_convolution.cc
@@ -122,13 +122,13 @@ void GroupConvolution::runGroupConvolution(
.resize_({G, F});
std::vector<at::Tensor> inputs = {tI, tW, tB};
std::string tc = R"(
def group_convolution(float(N,G,C,H,W) I, float(G,F,C,KH,KW) W1, float(G,F) B)
-> (O)
{
O(n, g, f, h, w) +=!
I(n, g, c, h + kh, w + kw) * W1(g, f, c, kh, kw)
O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)
}
I(n, g, r_c, h + r_kh, w + r_kw) * W1(g, f, r_c, r_kh, r_kw)
O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)
}
)";

std::string suffix = std::string("_N_") + std::to_string(FLAGS_N) +
6 changes: 3 additions & 3 deletions benchmarks/tmm.cc
@@ -73,9 +73,9 @@ void TransposedMatMul::runTransposedMatMul(

std::vector<at::Tensor> inputs = {A, B};
std::string tc = R"TC(
def tmm(float(M,K) A, float(N,K) B) -> (C) {
C(m, n) +=! A(m, kk) * B(n, kk)
}
def tmm(float(M,K) A, float(N,K) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(n, r_k)
}
)TC";

std::string suffix = std::string("_M_") + std::to_string(FLAGS_M) +
29 changes: 14 additions & 15 deletions docs/doxygen/index.md
@@ -13,30 +13,30 @@ with a few basic functionalities.

Tensor Comprehension Notation
-----------------------------
TC borrow three ideas from Einstein notation that make expressions concise:
TC borrows three ideas from Einstein notation that make expressions concise:

1. Loop index variables are defined implicitly by using them in an expression and their range is aggressively inferred based on what they index.
2. Indices that appear on the right of an expression but not on the left are assumed to be reduction dimensions.
3. The evaluation order of points in the iteration space does not affect the output.

Let's start with a simple example: a matrix-vector product:

def mv(float(R,C) A, float(C) B) -> (o) {
o(i) +=! A(i,j) * B(j)
def mv(float(R,C) A, float(C) x) -> (o) {
o(r) +=! A(r,r_c) * x(r_c)
}

`A` and `x` are input tensors. `o` is an output tensor.
The statement `o(i) += A(i,j) * b(j)` introduces two index variables `i` and `j`.
Their range is inferred by their use indexing `A` and `B`. `i = [0,R)`, `j = [0,C)`.
Because `j` only appears on the right side,
stores into `o` will reduce over `j` with the reduction specified for the loop.
The statement `o(r) +=! A(r,r_c) * x(r_c)` introduces two index variables `r` and `r_c`.
Their range is inferred by their use indexing `A` and `x`. `r = [0,R)`, `r_c = [0,C)`.
Because `r_c` only appears on the right-hand side,
stores into `o` will reduce over `r_c` with the reduction specified for the loop.
Reductions can occur across multiple variables, but they all share the same kind of associative reduction (e.g. +=)
to maintain invariant (3). `mv` computes the same thing as this C++ loop:

for(int i = 0; i < R; i++) {
o(i) = 0.0f;
for(int j = 0; j < C; j++) {
o(i) += A(i,j) * B(j);
o(i) += A(i,j) * x(j);
}
}
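
Reductions may also run over more than one index at once. As a small illustrative sketch (the `rowsum` name and tensor sizes are assumptions, not taken from the library), the following sums a 3-D tensor over its last two dimensions; `r_n` and `r_k` are both reduced through the same `+` operator, as required by invariant (3):

    def rowsum(float(M,N,K) A) -> (o) {
        # illustrative sketch: r_n and r_k appear only on the RHS, so both are reduction indices
        o(m) +=! A(m, r_n, r_k)
    }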

@@ -50,7 +50,7 @@ We provide a few basic examples.
**Simple matrix-vector**:

def mv(float(R,C) A, float(C) B) -> (o) {
o(i) += A(i,j) * B(j)
o(r) +=! A(r,r_c) * B(r_c)
}

**Simple matrix-multiply:**
@@ -59,21 +59,20 @@ Note the layout for B is transposed and matches the
traditional layout of the weight matrix in a linear layer):

def mm(float(X,Y) A, float(Y,Z) B) -> (R) {
R(i,j) += A(i,j) * B(j,k)
R(x,z) +=! A(x,r_y) * B(r_y,z)
}

**Simple 2-D convolution (no stride, no padding):**

def conv(float(B,IP,H,W) input, float(OP,IP,KH,KW) weight) -> (output) {
output(b, op, h, w) += input(b, ip, h + kh, w + kw) * weight(op, ip, kh, kw)
output(b, op, h, w) +=! input(b, r_ip, h + r_kh, w + r_kw) * weight(op, r_ip, r_kh, r_kw)
}

**Simple 2D max pooling:**

Note the similarity with a convolution with a
"select"-style kernel):
Note the similarity with a convolution with a "select"-style kernel:

def maxpool2x2(float(B,C,H,W) input) -> (output) {
output(b,c,i,j) max= input(b,c,2*i + kw, 2*j + kh)
where kw = [0, 2[, kh = [0, 2[
output(b,c,h,w) max=! input(b,c,2*h + r_kw, 2*w + r_kh)
where r_kw in 0:2, r_kh in 0:2
}
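
**Simple 2D average pooling (illustrative sketch):**

Average pooling makes the "convolution with a constant kernel" analogy explicit. The sketch below is an assumption for illustration (the `avgpool2x2` name and the hard-coded 2x2 window are not taken from the library); it reuses the same explicit `where` ranges as the max-pooling example:

    def avgpool2x2(float(B,C,H,W) input) -> (output) {
        # illustrative sketch: fixed 2x2 window, stride 2; dividing each term by 4.0 averages the window
        output(b,c,h,w) +=! input(b,c,2*h + r_kh, 2*w + r_kw) / 4.0
            where r_kh in 0:2, r_kw in 0:2
    }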
120 changes: 120 additions & 0 deletions docs/source/coding_conventions.rst
@@ -0,0 +1,120 @@
Coding Conventions
==================

In order to increase readability across Tensor Comprehensions written by
multiple authors and to reduce the amount of surprising behavior, the
following conventions should be adopted when writing TC. Generally, increase
indentation by 4 spaces at each nesting level and align tensor names and
indices where appropriate so that memory access patterns emerge. Since these
two goals can easily conflict, use your best judgment to trade off between
them; examples are provided below.

Use indices named after parameters
----------------------------------

Use upper-case names for parameters and capital-case names for input/output tensors.
Use lower-case names for indices to match the name of the parameter
corresponding to the dimension upon which they iterate.
In other words, prefer:

.. code::

def copy2d(float(M, N) I) -> (O) {
O(m, n) = I(m, n)
}

to:

.. code::

def copy2d(float(M, N) I) -> (O) {
O(i, j) = I(i, j)
}

Prefix reduction index names with :code:`r_`
--------------------------------------------

By definition, reduction indices are those that appear on the RHS of a TC
expression but not on the LHS. In larger expressions it can be challenging to
spot the reduction variables by mentally parsing the set of indices on the
RHS and subtracting the set of indices on the LHS. To alleviate this, name
reduction variables with an :code:`r_` prefix.
In other words, prefer:

.. code::

def matmul(float(M, K) A, float(K, N) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(r_k, n)
}

to:

.. code::

def matmul(float(M, K) A, float(K, N) B) -> (C) {
C(m, n) +=! A(m, k) * B(k, n)
}

Filter non-rectangular regions with data-dependencies
-----------------------------------------------------

TC semantics are restricted to (hyper-)rectangular iteration spaces.
This is a hard requirement to ensure range inference is unambiguous (see inference_).
To simulate non-rectangular iteration spaces, one can use the following:

.. code::

def matmul(float(M, K) L, float(K, M) U) -> (LU) {
LU(m1, m2) +=! (r_k >= m1 and r_k <= m2) ? L(m1, r_k) * U(r_k, m2) : 0
}

However, non-(hyper)-rectangular iteration spaces (e.g. triangular) are
incompatible with range inference and will fail the semantic checks in the TC
compiler:

.. code::

def matmul(float(M, K) L, float(K, M) U) -> (LU) {
LU(m1, m2) +=! L(m1, r_k) * U(r_k, m2) where r_k in m1:M, r_k in 0:m2+1
}

The reader may remark that this is an inefficient way of writing
matrix-multiplication of triangular matrices.
Lowering such operations efficiently from TC is the subject of future work.

Prefix gradient tensors names with :code:`d_`
---------------------------------------------

When implementing backward operations, pass the original forward inputs
first, followed by the gradients of the forward outputs in the same order as
those outputs, and name each gradient tensor after the corresponding forward
tensor, prefixed by :code:`d_`. For instance:

.. code::

def conv(float(N,C,H,W) I, float(M,C,KH,KW) Wt) -> (O) {
...
}

def conv_bw(float(N,C,H,W) I, float(M,C,KH,KW) Wt, float(N,M,HO,WO) d_O) -> (d_I) {
...
}
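
As a minimal sketch of this convention (the :code:`mv`/:code:`mv_bw` pair
below is illustrative, not an existing operator in the codebase), a
matrix-vector product and its backward pass could look like:

.. code::

    # illustrative sketch: mv_bw repeats the forward inputs (A, x) in order,
    # followed by d_o; each gradient reuses the forward tensor name with a d_ prefix
    def mv(float(R,C) A, float(C) x) -> (o) {
        o(r) +=! A(r, r_c) * x(r_c)
    }
    def mv_bw(float(R,C) A, float(C) x, float(R) d_o) -> (d_A, d_x) {
        d_A(r, c)  =  d_o(r) * x(c)
        d_x(c)   +=! d_o(r_r) * A(r_r, c)
    }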

A more complex example
----------------------

The following shows a possible implementation for a more complex forward and
backward example. Notice the proper alignment of indices in the backward pass
and the emergence of an antidiagonal pattern in the reduction accesses:

.. code::

def matmul(float(M,K) A, float(K,N) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(r_k, n)
}
def matmul_bw(float(M,K) A, float(K,N) B, float(M,N) d_C) -> (d_A, d_B){
d_A(m, k) +=! d_C( m, r_n) * B( k, r_n)
d_B(k, n) +=! d_C(r_m, n) * A(r_m, k)
}

Reasoning on such reduction patterns at the level of TC has already proven
valuable in other circumstances.