7 changes: 7 additions & 0 deletions CodingConventions.md
@@ -588,3 +588,10 @@ changes) while working on a feature and even in "WIP" pull requests,
as long as the pieces are recombined (e.g., through an interactive rebase)
into logical units when the feature is ready for merging.
Force-pushing in PR branches is fine.

Coding Conventions for writing Tensor Comprehensions
====================================================

Please see the following documentation
[entry](https://facebookresearch.github.io/TensorComprehensions/coding_conventions.html)
on how to write Tensor Comprehensions in a standard legible fashion.
86 changes: 43 additions & 43 deletions benchmarks/MLP_model.cc
@@ -64,23 +64,23 @@ DEFINE_uint32(Q, 2, "W4_h");
// float(E1, D) LUT1, int32(B, L1) I1,
// float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2)
// {
// O1(i, j) +=! LUT1(I1(i, k), j)
// O2(i, j) +=! LUT2(I2(i, k), j)
// O1(b, d) +=! LUT1(I1(b, r_l1), d)
// O2(b, d) +=! LUT2(I2(b, r_l2), d)
// }
// def _3FCRELU(
// float(B,M) I, float(O,N) W2, float(O) B2,
// float(P,O) W3, float(P) B3, float(Q,P) W4,
// float(Q) B4) -> (O1, O2, O3, O4)
// {
// O2(b, o) = B2(o)
// O2(b, o) += O1(b, n) * W2(o, n)
// O2(b, o) = fmax(O2(b, o), 0)
// O3(b, p) = B3(p)
// O3(b, p) += O2(b, o) * W3(p, o)
// O3(b, p) = fmax(O3(b, p), 0)
// O4(b, q) = B4(q)
// O4(b, q) += O3(b, p) * W4(q, p)
// O4(b, q) = fmax(O4(b, q), 0)
// }
// def prod_model(float(E1, D) LUT1, int32(B, L1) I1,
// float(E2, D) LUT2, int32(B, L2) I2,
@@ -91,15 +91,15 @@ DEFINE_uint32(Q, 2, "W4_h");
// float(Q,P) W4, float(Q) B4)
// -> (C1, C2, C3, I, O1, O2, O3, O4)
// {
// (C1, C2) = _2LUT(LUT1, I1, LUT2, I2)
// C3(b, wy) += I3(b, wxx) * W(wy, wxx)
// I(b, m) = Concat(C1, C2, C3) // not in TC atm
// O1(b, n) = B1(n)
// O1(b, n) += I(b, m) * W1(n, m)
// O1(b, n) = fmax(O1(b, n), 0)
// (C1, C2) = _2LUT(LUT1, I1, LUT2, I2)
// C3(b, wy) +=! I3(b, r_wx) * W(wy, r_wx)
// I(b, m) = Concat(C1, C2, C3) // not in TC atm
// O1(b, n) = B1(n)
// O1(b, n) +=! I(b, m) * W1(n, m)
// O1(b, n) = fmax(O1(b, n), 0)
// (O2, O3, O4) =
// _3FCRELU(I, W1, B1, W2, B2, W3, B3, W4, B4)
// # O4 goes out to binary classifier, omitted here
// }

class ProductionModel : public Benchmark {
@@ -191,9 +191,9 @@ void ProductionModel::run1LUT(

std::vector<at::Tensor> inputs = {LUT1, IDX1};
std::string tc = R"(
def _1LUT(float(E1, D) LUT1, int32(B, L1) I1) -> (O1) {
O1(i, j) +=! LUT1(I1(i, k), j)
}
def _1LUT(float(E1, D) LUT1, int32(B, L1) I1) -> (O1) {
O1(b, d) +=! LUT1(I1(b, r_l1), d)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -294,10 +294,10 @@ void ProductionModel::run2LUT(

std::vector<at::Tensor> inputs = {LUT1, IDX1, LUT2, IDX2};
std::string tc = R"(
def _2LUT(float(E1, D) LUT1, int32(B, L1) I1, float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2) {
O1(i, j) +=! LUT1(I1(i, k), j)
O2(i, j) +=! LUT2(I2(i, k), j)
}
def _2LUT(float(E1, D) LUT1, int32(B, L1) I1, float(E2, D) LUT2, int32(B, L2) I2) -> (O1, O2) {
O1(b, d) +=! LUT1(I1(b, r_l1), d)
O2(b, d) +=! LUT2(I2(b, r_l2), d)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -353,9 +353,9 @@ void ProductionModel::runC3(

std::vector<at::Tensor> inputs = {I, W};
std::string tc = R"TC(
def _C3(float(B,WX) I, float(WY, WX) W) -> (C3) {
C3(b, wy) +=! I(b, wxx) * W(wy, wxx)
}
def _C3(float(B,WX) I, float(WY, WX) W) -> (C3) {
C3(b, wy) +=! I(b, r_wx) * W(wy, r_wx)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -408,11 +408,11 @@ void ProductionModel::runMLP1(

std::vector<at::Tensor> inputs = {I, W1, B1};
std::string tc = R"TC(
def mlp1(float(B,M) I, float(M, N) W1, float(N) B1) -> (O1) {
O1(b, n) +=! I(b, mm) * W1(mm, n)
O1(b, n) = O1(b, n) + B1(n)
O1(b, n) = fmax(O1(b, n), 0)
}
def mlp1(float(B,M) I, float(M, N) W1, float(N) B1) -> (O1) {
O1(b, n) +=! I(b, r_m) * W1(r_m, n)
O1(b, n) = O1(b, n) + B1(n)
O1(b, n) = fmax(O1(b, n), 0)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
@@ -474,17 +474,17 @@ void ProductionModel::runMLP3(

std::vector<at::Tensor> inputs = {I, W2, B2, W3, B3, W4, B4};
std::string tc = R"TC(
def mlp3(float(B,N) I, float(O,N) W2, float(O) B2, float(P,O) W3, float(P) B3, float(Q,P) W4, float(Q) B4) -> (O2, O3, O4) {
O2(b, o) +=! I(b, n) * W2(o, n)
O2(b, o) = O2(b, o) + B2(o)
O2(b, o) = fmax(O2(b, o), 0)
O3(b, p) +=! O2(b, o) * W3(p, o)
O3(b, p) = O3(b, p) + B3(p)
O3(b, p) = fmax(O3(b, p), 0)
O4(b, q) +=! O3(b, p) * W4(q, p)
O4(b, q) = O4(b, q) + B4(q)
O4(b, q) = fmax(O4(b, q), 0)
}
)TC";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
6 changes: 3 additions & 3 deletions benchmarks/batchmatmul.cc
@@ -76,9 +76,9 @@ void BatchMatMul::runBatchMatMul(

std::vector<at::Tensor> inputs = {X, Y};
std::string tc = R"(
def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
Z(b, n, k) +=! X(b, n, mm) * Y(b, mm, k)
}
def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
Z(b, n, k) +=! X(b, n, r_m) * Y(b, r_m, k)
}
)";

std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
12 changes: 6 additions & 6 deletions benchmarks/group_convolution.cc
@@ -122,13 +122,13 @@ void GroupConvolution::runGroupConvolution(
.resize_({G, F});
std::vector<at::Tensor> inputs = {tI, tW, tB};
std::string tc = R"(
def group_convolution(float(N,G,C,H,W) I, float(G,F,C,KH,KW) W1, float(G,F) B)
-> (O)
{
O(n, g, f, h, w) +=!
I(n, g, c, h + kh, w + kw) * W1(g, f, c, kh, kw)
O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)
}
I(n, g, r_c, h + r_kh, w + r_kw) * W1(g, f, r_c, r_kh, r_kw)
O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)
}
)";

std::string suffix = std::string("_N_") + std::to_string(FLAGS_N) +
6 changes: 3 additions & 3 deletions benchmarks/tmm.cc
@@ -73,9 +73,9 @@ void TransposedMatMul::runTransposedMatMul(

std::vector<at::Tensor> inputs = {A, B};
std::string tc = R"TC(
def tmm(float(M,K) A, float(N,K) B) -> (C) {
C(m, n) +=! A(m, kk) * B(n, kk)
}
def tmm(float(M,K) A, float(N,K) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(n, r_k)
}
)TC";

std::string suffix = std::string("_M_") + std::to_string(FLAGS_M) +
29 changes: 14 additions & 15 deletions docs/doxygen/index.md
@@ -13,30 +13,30 @@ with a few basic functionalities.

Tensor Comprehension Notation
-----------------------------
TC borrow three ideas from Einstein notation that make expressions concise:
TC borrows three ideas from Einstein notation that make expressions concise:

1. Loop index variables are defined implicitly by using them in an expression and their range is aggressively inferred based on what they index.
2. Indices that appear on the right of an expression but not on the left are assumed to be reduction dimensions.
3. The evaluation order of points in the iteration space does not affect the output.

Let's start with a simple example: a matrix-vector product:

def mv(float(R,C) A, float(C) B) -> (o) {
o(i) +=! A(i,j) * B(j)
def mv(float(R,C) A, float(C) x) -> (o) {
o(r) +=! A(r,r_c) * x(r_c)
}

`A` and `x` are input tensors. `o` is an output tensor.
The statement `o(i) += A(i,j) * b(j)` introduces two index variables `i` and `j`.
Their range is inferred by their use indexing `A` and `B`. `i = [0,R)`, `j = [0,C)`.
Because `j` only appears on the right side,
stores into `o` will reduce over `j` with the reduction specified for the loop.
The statement `o(r) +=! A(r,r_c) * x(r_c)` introduces two index variables `r` and `r_c`.
Their range is inferred by their use indexing `A` and `x`. `r = [0,R)`, `r_c = [0,C)`.
Because `r_c` only appears on the right-hand side,
stores into `o` will reduce over `r_c` with the reduction specified for the loop.
Reductions can occur across multiple variables, but they all share the same kind of associative reduction (e.g. +=)
to maintain invariant (3). `mv` computes the same thing as this C++ loop:

for(int i = 0; i < R; i++) {
o(i) = 0.0f;
for(int j = 0; j < C; j++) {
o(i) += A(i,j) * B(j);
o(i) += A(i,j) * x(j);
}
}
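
Reductions may also run over more than one index at once. As a small illustrative sketch (the `rowsum` name and tensor sizes are assumptions, not taken from the library), the following sums a 3-D tensor over its last two dimensions; `r_n` and `r_k` are both reduced through the same `+` operator, as required by invariant (3):

    def rowsum(float(M,N,K) A) -> (o) {
        # illustrative sketch: r_n and r_k appear only on the RHS, so both are reduction indices
        o(m) +=! A(m, r_n, r_k)
    }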

@@ -50,7 +50,7 @@ We provide a few basic examples.
**Simple matrix-vector**:

def mv(float(R,C) A, float(C) B) -> (o) {
o(i) += A(i,j) * B(j)
o(r) +=! A(r,r_c) * B(r_c)
}

**Simple matrix-multiply:**
@@ -59,21 +59,20 @@ Note the layout for B is transposed and matches the
traditional layout of the weight matrix in a linear layer):

def mm(float(X,Y) A, float(Y,Z) B) -> (R) {
R(i,j) += A(i,j) * B(j,k)
R(x,z) +=! A(x,r_y) * B(r_y,z)
}

**Simple 2-D convolution (no stride, no padding):**

def conv(float(B,IP,H,W) input, float(OP,IP,KH,KW) weight) -> (output) {
output(b, op, h, w) += input(b, ip, h + kh, w + kw) * weight(op, ip, kh, kw)
output(b, op, h, w) +=! input(b, r_ip, h + r_kh, w + r_kw) * weight(op, r_ip, r_kh, r_kw)
}

**Simple 2D max pooling:**

Note the similarity with a convolution with a
"select"-style kernel):
Note the similarity with a convolution with a "select"-style kernel:

def maxpool2x2(float(B,C,H,W) input) -> (output) {
output(b,c,i,j) max= input(b,c,2*i + kw, 2*j + kh)
where kw = [0, 2[, kh = [0, 2[
output(b,c,h,w) max=! input(b,c,2*h + r_kw, 2*w + r_kh)
where r_kw in 0:2, r_kh in 0:2
}
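
**Simple 2D average pooling (illustrative sketch):**

Average pooling makes the "convolution with a constant kernel" analogy explicit. The sketch below is an assumption for illustration (the `avgpool2x2` name and the hard-coded 2x2 window are not taken from the library); it reuses the same explicit `where` ranges as the max-pooling example:

    def avgpool2x2(float(B,C,H,W) input) -> (output) {
        # illustrative sketch: fixed 2x2 window, stride 2; dividing each term by 4.0 averages the window
        output(b,c,h,w) +=! input(b,c,2*h + r_kh, 2*w + r_kw) / 4.0
            where r_kh in 0:2, r_kw in 0:2
    }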
120 changes: 120 additions & 0 deletions docs/source/coding_conventions.rst
@@ -0,0 +1,120 @@
Coding Conventions
==================

In order to increase readability across Tensor Comprehensions written by
multiple authors and to reduce the amount of surprising behavior, the
following conventions should be adopted when writing TC. Generally, increase
indentation by 4 spaces at each nesting level and align tensor names and
indices where appropriate so that memory access patterns emerge. Since these
two goals can easily conflict, use your best judgment to trade off between
them; examples are provided below.

Use indices named after parameters
----------------------------------

Use upper-case names for parameters and capital-case names for input/output tensors.
Use lower-case names for indices to match the name of the parameter
corresponding to the dimension upon which they iterate.
In other words, prefer:

.. code::

def copy2d(float(M, N) I) -> (O) {
O(m, n) = I(m, n)
}

to:

.. code::

def copy2d(float(M, N) I) -> (O) {
O(i, j) = I(i, j)
}

Prefix reduction index names with :code:`r_`
--------------------------------------------

By definition, reduction indices are those that appear on the RHS of a TC
expression but not on the LHS. In larger expressions it can be challenging to
spot the reduction variables by mentally parsing the set of indices on the
RHS and subtracting the set of indices on the LHS. To alleviate this, name
reduction variables with an :code:`r_` prefix.
In other words, prefer:

.. code::

def matmul(float(M, K) A, float(K, N) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(r_k, n)
}

to:

.. code::

def matmul(float(M, K) A, float(K, N) B) -> (C) {
C(m, n) +=! A(m, k) * B(k, n)
}

Filter non-rectangular regions with data-dependencies
-----------------------------------------------------

TC semantics are restricted to (hyper-)rectangular iteration spaces.
This is a hard requirement to ensure range inference is unambiguous (see inference_).
To simulate non-rectangular iteration spaces, one can use the following:

.. code::

def matmul(float(M, K) L, float(K, M) U) -> (LU) {
LU(m1, m2) +=! (r_k >= m1 and r_k <= m2) ? L(m1, r_k) * U(r_k, m2) : 0
}

However, non-(hyper)-rectangular iteration spaces (e.g. triangular) are
incompatible with range inference and will fail the semantic checks in the TC
compiler:

.. code::

def matmul(float(M, K) L, float(K, M) U) -> (LU) {
LU(m1, m2) +=! L(m1, r_k) * U(r_k, m2) where r_k in m1:M, r_k in 0:m2+1
}

The reader may remark that this is an inefficient way of writing
matrix-multiplication of triangular matrices.
Lowering such operations efficiently from TC is the subject of future work.

Prefix gradient tensors names with :code:`d_`
---------------------------------------------

When implementing backward operations, pass the original forward inputs
first, followed by the gradients of the forward outputs in the same order as
those outputs, and name each gradient tensor after the corresponding forward
tensor, prefixed by :code:`d_`. For instance:

.. code::

def conv(float(N,C,H,W) I, float(M,C,KH,KW) Wt) -> (O) {
...
}

def conv_bw(float(N,C,H,W) I, float(M,C,KH,KW) Wt, float(N,M,HO,WO) d_O) -> (d_I) {
...
}
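
As a minimal sketch of this convention (the :code:`mv`/:code:`mv_bw` pair
below is illustrative, not an existing operator in the codebase), a
matrix-vector product and its backward pass could look like:

.. code::

    # illustrative sketch: mv_bw repeats the forward inputs (A, x) in order,
    # followed by d_o; each gradient reuses the forward tensor name with a d_ prefix
    def mv(float(R,C) A, float(C) x) -> (o) {
        o(r) +=! A(r, r_c) * x(r_c)
    }
    def mv_bw(float(R,C) A, float(C) x, float(R) d_o) -> (d_A, d_x) {
        d_A(r, c)  =  d_o(r) * x(c)
        d_x(c)   +=! d_o(r_r) * A(r_r, c)
    }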

A more complex example
----------------------

The following shows a possible implementation for a more complex forward and
backward example. Notice the proper alignment of indices in the backward pass
and the emergence of an antidiagonal pattern in the reduction accesses:

.. code::

def matmul(float(M,K) A, float(K,N) B) -> (C) {
C(m, n) +=! A(m, r_k) * B(r_k, n)
}
def matmul_bw(float(M,K) A, float(K,N) B, float(M,N) d_C) -> (d_A, d_B){
d_A(m, k) +=! d_C( m, r_n) * B( k, r_n)
d_B(k, n) +=! d_C(r_m, n) * A(r_m, k)
}

Reasoning on such reduction patterns at the level of TC has already proven
valuable in other circumstances.