-
Notifications
You must be signed in to change notification settings - Fork 657
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
super-kernel functionality #67
Comments
I believe this is what #54 was aiming for, just fyi. |
So I've had a bit of a revelation: if we vectorize this, we could remove the need to jump from kernel to kernel. Here is a quick gist that may actually work: function relu(value) {
return Math.max(0, value[this.thread.y][this.thread.x]);
}
function add(left, right) {
// Element-wise sum of two matrices at the current GPU thread's
// (y, x) output coordinate.
var row = this.thread.y;
var col = this.thread.x;
return left[row][col] + right[row][col];
}
function multiply(left, right) {
// Dot product of row this.thread.y of `left` with column
// this.thread.x of `right`; the shared (inner) dimension is
// hard-coded to 512 to match the benchmark's matrix size.
var total = 0;
for (var k = 0; k < 512; k++) {
total += left[this.thread.y][k] * right[k][this.thread.x];
}
return total;
}
// Construct a GPU instance targeting the WebGL backend.
// NOTE(review): later in this thread the constructor is invoked as
// `new GPU({...})` — confirm whether the `new`-less call is supported.
const gpu = GPU({ mode: 'webgl' });
// Proposed fused "super-kernel": relu/add/multiply are registered as
// device functions and composed inside ONE kernel, so intermediate
// matrices never round-trip between GPU and CPU.
const layerForward = gpu
.addFunction(relu)
.addFunction(add)
.addFunction(multiply)
.createKernel(function(weightMatrix, inputMatrix, transitionMatrix, previousOutputMatrix, biasMatrix) {
// relu(W·input + T·prevOutput + bias) — one RNN layer forward pass.
return relu(
add(
add(
multiply(
weightMatrix,
inputMatrix
),
multiply(
transitionMatrix,
previousOutputMatrix
)
),
biasMatrix
)
);
}); This is a recurrent neural net layer, more to come. |
Just a quick note: unfortunately in JavaScript, "this" refers to the current function, so it might not work unless we use compilation to hijack what "this" means, or pass this.thread.x/y/z as arguments. The first example could be implemented by having combineKernels use the outputToTexture flag transparently. For the first example, we could return array-size outputs, but in the second example, we can only return singular values. |
Bah, we can just pass x & y via args. It was just a pseudo script. |
Which do you think would be faster? |
Tried it locally, and the above is about 100 to 150 ms faster than multiple kernels! |
Actual code: var mat_size = 512;
function splitArray(array, part) {
// Partition `array` into consecutive chunks of length `part`;
// the final chunk may be shorter when array.length is not a
// multiple of `part`.
var chunks = [];
var offset = 0;
while (offset < array.length) {
chunks.push(array.slice(offset, offset + part));
offset += part;
}
return chunks;
}
function randomNestedArray(matSize) {
// Build a matSize x matSize matrix (array of row arrays) filled
// with uniform random numbers in [0, 1).
var total = matSize * matSize;
var flat = [];
for (var n = 0; n < total; n++) {
flat.push(Math.random());
}
// Chunk the flat list into matSize rows of matSize values each.
return splitArray(flat, matSize);
}
function relu(x) {
// Rectified linear unit: negative inputs clamp to 0,
// non-negative inputs pass through unchanged.
return Math.max(0, x);
}
function add(left, right) {
// Scalar addition of two matrix terms.
// BUG FIX: the original body returned `left * right` (a product),
// which contradicts the function's name, the earlier `add`
// definition in this thread, and its use for summing the two
// matrix products plus the bias term in the kernel below.
return left + right;
}
function multiply(left, right, y, x) {
// Dot product of row `y` of `left` with column `x` of `right`.
// The inner dimension is hard-coded to 512 to match mat_size;
// thread coordinates are passed explicitly as `y`/`x` args
// instead of being read from `this.thread`.
var acc = 0;
var k = 0;
while (k < 512) {
acc += left[y][k] * right[k][x];
k++;
}
return acc;
}
// CPU-mode instance used as a timing baseline.
var cpu = new GPU({ mode: 'cpu' });
// WebGL-backed instance for the GPU run.
var gpu = new GPU({ mode: 'webgl' });
// Build and time the fused "super-kernel" once per backend.
[cpu, gpu].forEach(function(gpu) {
const layerForward = gpu
.addFunction(relu)
.addFunction(add)
// .addFunction(multiply, {
// left: Array,
// right: Array,
// returns: Number
// })
.createKernel(function(weightMatrix, inputMatrix, transitionMatrix, previousOutputMatrix, biasMatrix) {
// The two matrix multiplies are inlined here (rather than calling
// the multiply device function, whose registration is commented
// out above); both dot products share one loop over the inner
// dimension, which is hard-coded to 512 to match mat_size.
var weightAndMatrix = 0;
var transitionAndPreviousOutput = 0;
for (var i = 0; i < 512; i++) {
weightAndMatrix += weightMatrix[this.thread.y][i] * inputMatrix[i][this.thread.x];
transitionAndPreviousOutput += transitionMatrix[this.thread.y][i] * previousOutputMatrix[i][this.thread.x];
}
// relu((W·input) ⊕ (T·prevOutput) ⊕ bias) — one RNN layer step.
return relu(
add(
add(
weightAndMatrix,
transitionAndPreviousOutput
),
biasMatrix[this.thread.y][this.thread.x]
)
);
}, { dimensions: [mat_size, mat_size] });
// Time a single forward pass on five fresh random matrices.
console.time(gpu.mode);
console.log(layerForward(
randomNestedArray(mat_size),
randomNestedArray(mat_size),
randomNestedArray(mat_size),
randomNestedArray(mat_size),
randomNestedArray(mat_size)
));
console.timeEnd(gpu.mode);
});
// Separate single-purpose kernels, used below to time the
// kernel-per-operation approach against the fused kernel above.
// Element-wise matrix addition.
var addKernel = gpu.createKernel(function(left, right) {
return left[this.thread.y][this.thread.x] + right[this.thread.y][this.thread.x];
}, { dimensions: [mat_size, mat_size] });
// Matrix product; inner dimension hard-coded to 512 (= mat_size).
var multiplyKernel = gpu.createKernel(function(left, right) {
var sum = 0;
for (var i = 0; i < 512; i++) {
sum += left[this.thread.y][i] * right[i][this.thread.x];
}
return sum;
}, { dimensions: [mat_size, mat_size] });
// Element-wise ReLU over a matrix.
var reluKernel = gpu.createKernel(function(value) {
return Math.max(0, value[this.thread.y][this.thread.x]);
}, { dimensions: [mat_size, mat_size] });
// Time the chained-kernels pipeline: each kernel's result returns to
// the CPU before being fed to the next kernel — the overhead the
// fused super-kernel above is designed to avoid.
console.time('nested');
console.log(
reluKernel(
addKernel(
addKernel(
multiplyKernel(
randomNestedArray(mat_size),
randomNestedArray(mat_size)
),
multiplyKernel(
randomNestedArray(mat_size),
randomNestedArray(mat_size)
)
),
randomNestedArray(mat_size)
)
)
);
console.timeEnd('nested'); |
Related issue here: #77 |
Not even sure it is possible, but I'd love to see something like this:
combineKernels would change multiply & add so that they don't return data to the CPU, but rather the function returned from combineKernels
would obtain the data from the GPU. What would this approach be called? The text was updated successfully, but these errors were encountered: