cirosantilli
diff --git a/‎opencl/introduction.md‎
Lines changed: 5 additions & 1 deletion b/‎opencl/introduction.md‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎opengl/README.md‎
Lines changed: 1 addition & 0 deletions b/‎opengl/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎opengl/compute-shader.md‎
Lines changed: 25 additions & 0 deletions b/‎opengl/compute-shader.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎opengl/glfw_color_array.c‎
Lines changed: 2 additions & 2 deletions b/‎opengl/glfw_color_array.c‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎opengl/glfw_compute_shader.c‎
Lines changed: 24 additions & 14 deletions b/‎opengl/glfw_compute_shader.c‎
Lines changed: 24 additions & 14 deletions
@@ -28,6 +28,10 @@ OpenCL, like any other language has versions. As of 2013 the latest version is O
 - <http://stackoverflow.com/questions/4005935/mix-opencl-with-opengl>
 - <http://stackoverflow.com/questions/7907510/opengl-vs-opencl-which-to-choose-and-why>
 - <http://stackoverflow.com/questions/8824269/gl-cl-interoperability-shared-texture>
-- <https://github.com/9prady9/CLGLInterop>
+- <https://github.com/9prady9/CLGLInterop> Works! Started minifying example with: <https://github.com/cirosantilli/CLGLInterop/tree/minify>
+- <https://github.com/nvpro-samples/gl_cl_interop_pingpong_st> Build failed with: <https://github.com/nvpro-samples/gl_cl_interop_pingpong_st/issues/1> likely only tested on Windows.
+- <https://github.com/halcy/simpleflow> VS build, fluid simulation, preview: <https://www.youtube.com/watch?v=KD2UqBCqfjA>
+- <https://github.com/Twinklebear/OpenCL-OpenGL-Interop> VS build
+- <http://stackoverflow.com/questions/33575715/opencl-opengl-interop-how-to-fill-a-climagegl>
 
 Also see compute shaders for OpenGL 4.X, they seem to integrate better.
@@ -38,6 +38,7 @@
     1.  Compute shader
         1.  [compute-shader.md](compute-shader.md)
         1.  [compute_shader.c](glfw_compute_shader.c)
+        1.  [compute_shader_.c](glfw_compute_shader.c)
     1.  GLUT
         1. [glutBitmapCharacter](bitmap_character.c)
         1. [Triangle rotate](triangle_rotate.c)
 
@@ -2,6 +2,8 @@
 
 Vs OpenCL: <http://wili.cc/blog/opengl-cs.html>
 
+Vs fragment shader: <http://computergraphics.stackexchange.com/questions/54/when-is-a-compute-shader-more-efficient-than-a-pixel-shader-for-image-filterinig>
+
 > But why did Khronos introduce compute shaders in OpenGL when they already had OpenCL and its OpenGL interoperability API? Well, OpenCL (and CUDA) are aimed for heavyweight GPGPU projects and offer more features. Also, OpenCL can run on many different types of hardware (apart from GPUs), which makes the API thick and complicated compared to light compute shaders. Finally, the explicit synchronization between OpenGL and OpenCL/CUDA is troublesome to do without crudely blocking (some of the required extensions are not even supported yet). With compute shaders, however, OpenGL is aware of all the dependencies and can schedule things smarter. This aspect of overhead might, in the end, be the most significant benefit for graphics algorithms which often execute for less than a millisecond. 
 
 Examples:
@@ -12,7 +14,30 @@ Examples:
 
     Most interesting files are `ParticleSystem.cpp` and `cs.glsl`.
 
+-   <https://community.arm.com/groups/arm-mali-graphics/blog/2014/04/17/get-started-with-compute-shaders>, runnable from their SDK
+
 Applications:
 
 - ray tracing
 - ignore objects too far away
+
+## Work group
+
+TODO: what is the advantage of work groups?
+
+Ideally, we would have a single work group, but that hits hardware design limitations (memory locality): <http://stackoverflow.com/questions/39380986/opengl-is-there-a-benefit-to-using-multiple-global-work-groups-for-compute-shad>
+
+- http://gamedev.stackexchange.com/questions/66198/optimal-number-of-work-groups-for-compute-shaders
+- https://www.cg.tuwien.ac.at/courses/Realtime/repetitorium/rtr_rep_2014_ComputeShader.pdf
+
+More work groups do not mean faster execution. TODO why? CL exposes `CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE`, but 
+
+### Shared memory
+
+Shared memory (SM).
+
+Per work group, faster access in group. This is what characterizes different groups.
+
+General algorithm: copy global memory to shared, and then process there.
+
+Only useful if the given memory is accessed several times.
@@ -8,8 +8,8 @@ Color interpolation on the fragment shader is automatic.
 
 #include "common.h"
 
-static const GLuint WIDTH = 500;
-static const GLuint HEIGHT = 500;
+static const GLuint WIDTH = 512;
+static const GLuint HEIGHT = 512;
 /* fragColor is passed on to the fragment shader. */
 static const GLchar *vertex_shader_source =
     "#version 330 core\n"
 
@@ -1,11 +1,17 @@
 /*
 Compute shader hello world.
 
+Does a simple computation, and writes it directly to the
+texture seen by the fragment shader.
+
 This could be done easily on a fragment shader,
 so this is just a useless sanity check example.
 
 The main advantage of compute shaders (which we are not doing here),
-shader can do is keep state data on the GPU between draw calls.
+is that they can keep state data on the GPU between draw calls.
+
+This is basically the upper limit speed of compute to texture operations,
+since we are only doing a very simple operation on the shader.
 
 TODO understand:
 
@@ -55,10 +61,10 @@ static const char *compute_shader_source =
     "layout (local_size_x = 1, local_size_y = 1) in;\n"
     "layout (rgba32f, binding = 0) uniform image2D img_output;\n"
     "void main () {\n"
-    "    ivec2 pixel_coords = ivec2(gl_GlobalInvocationID.xy);\n"
+    "    ivec2 gid = ivec2(gl_GlobalInvocationID.xy);\n"
     "    ivec2 dims = imageSize(img_output);\n"
-    "    vec4 pixel = vec4(pixel_coords.x / float(dims.x), pixel_coords.y / float(dims.y), 1.0, 1.0);\n"
-    "    imageStore(img_output, pixel_coords, pixel);\n"
+    "    vec4 pixel = vec4(gid.x / float(dims.x), gid.y / float(dims.y), 1.0, 1.0);\n"
+    "    imageStore(img_output, gid, pixel);\n"
     "}\n";
 
 int main(void) {
@@ -77,8 +83,8 @@ int main(void) {
         vao
     ;
     unsigned int
-        width = 512,
-        height = 512
+        width = WIDTH,
+        height = HEIGHT
     ;
 
     /* Window. */
@@ -126,6 +132,8 @@ int main(void) {
     glBindTexture(GL_TEXTURE_2D, texture);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    /* Same internal format as compute shader input.
+     * data=NULL to just allocate the memory but not set it to anything. */
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGBA, GL_FLOAT, NULL);
     /* Bind to image unit, to allow writing to it from the compute shader. */
     glBindImageTexture(0, texture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);
@@ -136,20 +144,22 @@ int main(void) {
     glDispatchCompute((GLuint)width, (GLuint)height, 1);
     glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
 
-    /* Draw. */
+    /* Global state. */
     glViewport(0, 0, width, height);
     glClearColor(1.0f, 1.0f, 1.0f, 1.0f);
-    glClear(GL_COLOR_BUFFER_BIT);
-    glUseProgram(program);
-    glUniform1i(textureSampler_location, 0);
-    glBindVertexArray(vao);
-    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
-    glBindVertexArray(0);
-    glfwSwapBuffers(window);
 
     /* Main loop. */
+    common_fps_init();
     while (!glfwWindowShouldClose(window)) {
+        glClear(GL_COLOR_BUFFER_BIT);
+        glUseProgram(program);
+        glUniform1i(textureSampler_location, 0);
+        glBindVertexArray(vao);
+        glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
+        glBindVertexArray(0);
+        glfwSwapBuffers(window);
         glfwPollEvents();
+        common_fps_print();
     }
 
     /* Cleanup. */