<a href="https://colab.research.google.com/github/dagmaros27/AIMS_Notebooks/blob/main/CUDA_Practical_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CUDA Programming on NVIDIA GPUs**

# **Practical 4**

Again make sure the correct Runtime is being used, by clicking on the Runtime option at the top, then "Change runtime type", and selecting an appropriate GPU such as the T4.

Then verify the details of the GPU which is available to you, and upload the usual two header files.

In [1]:
!nvidia-smi


Wed Jan 28 15:41:40 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!wget https://people.maths.ox.ac.uk/gilesm/cuda/headers/helper_cuda.h
!wget https://people.maths.ox.ac.uk/gilesm/cuda/headers/helper_string.h


--2026-01-28 15:41:44--  https://people.maths.ox.ac.uk/gilesm/cuda/headers/helper_cuda.h
Resolving people.maths.ox.ac.uk (people.maths.ox.ac.uk)... 129.67.184.129, 2001:630:441:201::8143:b881
Connecting to people.maths.ox.ac.uk (people.maths.ox.ac.uk)|129.67.184.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27832 (27K) [text/x-chdr]
Saving to: ‘helper_cuda.h’


2026-01-28 15:41:45 (199 KB/s) - ‘helper_cuda.h’ saved [27832/27832]

--2026-01-28 15:41:45--  https://people.maths.ox.ac.uk/gilesm/cuda/headers/helper_string.h
Resolving people.maths.ox.ac.uk (people.maths.ox.ac.uk)... 129.67.184.129, 2001:630:441:201::8143:b881
Connecting to people.maths.ox.ac.uk (people.maths.ox.ac.uk)|129.67.184.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14875 (15K) [text/x-chdr]
Saving to: ‘helper_string.h’


2026-01-28 15:41:45 (363 KB/s) - ‘helper_string.h’ saved [14875/14875]





---

The next step is to create the file reduction.cu which includes within it a reference C++ routine against which the CUDA results are compared.

In [3]:
%%writefile reduction.cu

////////////////////////////////////////////////////////////////////////
//
// Practical 4 -- initial code for shared memory reduction for
//                a single block which is a power of two in size
//
////////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

#include <helper_cuda.h>

////////////////////////////////////////////////////////////////////////
// CPU routine
////////////////////////////////////////////////////////////////////////

float reduction_gold(float* idata, int len)
{
  float sum = 0.0f;
  for(int i=0; i<len; i++) sum += idata[i];

  return sum;
}

////////////////////////////////////////////////////////////////////////
// GPU routine
////////////////////////////////////////////////////////////////////////

__global__ void reduction(float *g_odata, float *g_idata)
{
    // dynamically allocated shared memory

    extern  __shared__  float temp[];

    int tid = threadIdx.x;

    // first, each thread loads data into shared memory

    temp[tid] = g_idata[tid];

    // next, we perform binary tree reduction

    for (int d=blockDim.x/2; d>0; d=d/2) {
      __syncthreads();  // ensure previous step completed
      if (tid<d)  temp[tid] += temp[tid+d];
    }

    // finally, first thread puts result into global memory

    if (tid==0) g_odata[0] = temp[0];
}


////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////

int main( int argc, const char** argv)
{
  int num_blocks, num_threads, num_elements, mem_size, shared_mem_size;

  float *h_data, *d_idata, *d_odata;

  // initialise card

  findCudaDevice(argc, argv);

  num_blocks   = 1;  // start with only 1 thread block
  num_threads  = 512;
  num_elements = num_blocks*num_threads;
  mem_size     = sizeof(float) * num_elements;

  // allocate host memory to store the input data
  // and initialize to integer values between 0 and 10

  h_data = (float*) malloc(mem_size);

  for(int i = 0; i < num_elements; i++)
    h_data[i] = floorf(10.0f*(rand()/(float)RAND_MAX));

  // compute reference solution

  float sum = reduction_gold(h_data, num_elements);

  // allocate device memory input and output arrays

  checkCudaErrors( cudaMalloc((void**)&d_idata, mem_size) );
  checkCudaErrors( cudaMalloc((void**)&d_odata, sizeof(float)) );

  // copy host memory to device input array

  checkCudaErrors( cudaMemcpy(d_idata, h_data, mem_size,
                              cudaMemcpyHostToDevice) );

  // execute the kernel

  shared_mem_size = sizeof(float) * num_threads;
  reduction<<<num_blocks,num_threads,shared_mem_size>>>(d_odata,d_idata);
  getLastCudaError("reduction kernel execution failed");

  // copy result from device to host

  checkCudaErrors( cudaMemcpy(h_data, d_odata, sizeof(float),
                              cudaMemcpyDeviceToHost) );

  // check results

  printf("reduction error = %f\n",h_data[0]-sum);

  // cleanup memory

  free(h_data);
  checkCudaErrors( cudaFree(d_idata) );
  checkCudaErrors( cudaFree(d_odata) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();
}


Writing reduction.cu



---

We can now compile and run the executable.


In [4]:
!nvcc reduction.cu -o reduction -I. -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -lcudart

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z9reductionPfS_' for 'sm_70'
ptxas info    : Function properties for _Z9reductionPfS_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 368 bytes cmem[0]


In [5]:
!./reduction

GPU Device 0: "Turing" with compute capability 7.5

reduction error = 0.000000


## Question 4
Compile and run the executable reduction, and check that it gets the correct
result. Put the output in your notebook and explain why this shows the result
is correct, and how the code has performed the required check.

Answer:

The output shows a reduction error of 0.0, which indicates that the GPU kernel has produced the correct result. This check is performed by comparing the value computed on the GPU with the reference result computed on the CPU using `reduction_gold`. Since the program prints the difference between these two values and the difference is zero, it confirms that the shared-memory reduction kernel correctly sums all input elements.



---
By going back to the previous code block you can modify the code to complete the Practical 4 exercises. Remember to first make your own copy of the notebook so that you are able to edit it.

For the first exercise, it may be useful to know that the following line of code will round up the input n to the nearest power of 2, so then dividing it by 2 gives the largest power of 2 less than n.

`for (m=1; m<n; m=2*m) {} `

For students doing this as an assignment to be assessed, you should again add your name to the title of the notebook (as in "Practical 4 -- Mike Giles.ipynb"), make it shared (see the Share option in the top-right corner) and provide the shared link as the submission mechanism.



## Question 5

The code currently assumes the number of threads is a power of 2.
Extend it to handle the general case by finding the largest power of 2 less than
blockSize, and adding the elements beyond that point to the corresponding
first set of elements of that size. Test it with 192 threads.

In [6]:
%%writefile reduction.cu

////////////////////////////////////////////////////////////////////////
//
// Practical 4 -- initial code for shared memory reduction for
//                a single block which is a power of two in size
//
////////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

#include <helper_cuda.h>

////////////////////////////////////////////////////////////////////////
// CPU routine
////////////////////////////////////////////////////////////////////////

float reduction_gold(float* idata, int len)
{
  float sum = 0.0f;
  for(int i=0; i<len; i++) sum += idata[i];

  return sum;
}

////////////////////////////////////////////////////////////////////////
// GPU routine
////////////////////////////////////////////////////////////////////////

__global__ void reduction(float *g_odata, float *g_idata)
{
    // dynamically allocated shared memory

    extern  __shared__  float temp[];

    int tid = threadIdx.x;

    // first, each thread loads data into shared memory

    temp[tid] = g_idata[tid];

    int m;
    for (m=1; m<blockDim.x; m=2*m) {}
    m = m/2;

    if (tid + m < blockDim.x) {
        temp[tid] += temp[tid + m];
    }
    __syncthreads();

    // next, we perform binary tree reduction

    for (int d=m/2; d>0; d=d/2) {
      __syncthreads();  // ensure previous step completed
      if (tid<d)  temp[tid] += temp[tid+d];
    }

    // finally, first thread puts result into global memory

    if (tid==0) g_odata[0] = temp[0];
}


////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////

int main( int argc, const char** argv)
{
  int num_blocks, num_threads, num_elements, mem_size, shared_mem_size;

  float *h_data, *d_idata, *d_odata;

  // initialise card

  findCudaDevice(argc, argv);

  num_blocks   = 1;  // start with only 1 thread block
  num_threads  = 192;
  num_elements = num_blocks*num_threads;
  mem_size     = sizeof(float) * num_elements;

  // allocate host memory to store the input data
  // and initialize to integer values between 0 and 10

  h_data = (float*) malloc(mem_size);

  for(int i = 0; i < num_elements; i++)
    h_data[i] = floorf(10.0f*(rand()/(float)RAND_MAX));

  // compute reference solution

  float sum = reduction_gold(h_data, num_elements);

  // allocate device memory input and output arrays

  checkCudaErrors( cudaMalloc((void**)&d_idata, mem_size) );
  checkCudaErrors( cudaMalloc((void**)&d_odata, sizeof(float)) );

  // copy host memory to device input array

  checkCudaErrors( cudaMemcpy(d_idata, h_data, mem_size,
                              cudaMemcpyHostToDevice) );

  // execute the kernel

  shared_mem_size = sizeof(float) * num_threads;
  reduction<<<num_blocks,num_threads,shared_mem_size>>>(d_odata,d_idata);
  getLastCudaError("reduction kernel execution failed");

  // copy result from device to host

  checkCudaErrors( cudaMemcpy(h_data, d_odata, sizeof(float),
                              cudaMemcpyDeviceToHost) );

  // check results

  printf("reduction error = %f\n",h_data[0]-sum);

  // cleanup memory

  free(h_data);
  checkCudaErrors( cudaFree(d_idata) );
  checkCudaErrors( cudaFree(d_odata) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();
}


Overwriting reduction.cu


In [8]:
!nvcc reduction.cu -o reduction -I. -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -lcudart

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z9reductionPfS_' for 'sm_70'
ptxas info    : Function properties for _Z9reductionPfS_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 368 bytes cmem[0]


In [9]:
!./reduction

GPU Device 0: "Turing" with compute capability 7.5

reduction error = 0.000000


## Question 6

The code currently performs the reduction operation for a single thread block.
Modify the code to perform reduction using 1000 blocks each with 512
threads, with each block working with a different section of an input array of
size 512000.

In [11]:
%%writefile reduction.cu

////////////////////////////////////////////////////////////////////////
//
// Practical 4 -- initial code for shared memory reduction for
//                a single block which is a power of two in size
//
////////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

#include <helper_cuda.h>

////////////////////////////////////////////////////////////////////////
// CPU routine
////////////////////////////////////////////////////////////////////////

float reduction_gold(float* idata, int len)
{
  float sum = 0.0f;
  for(int i=0; i<len; i++) sum += idata[i];

  return sum;
}

////////////////////////////////////////////////////////////////////////
// GPU routine
////////////////////////////////////////////////////////////////////////

__global__ void reduction(float *g_odata, float *g_idata)
{
    // dynamically allocated shared memory

    extern  __shared__  float temp[];

    int tid = threadIdx.x;

    // first, each thread loads data into shared memory

    temp[tid] = g_idata[tid + blockDim.x * blockIdx.x];

    int m;
    for (m=1; m<blockDim.x; m=2*m) {}
    m = m/2;

    if (tid + m < blockDim.x) {
        temp[tid] += temp[tid + m];
    }
    __syncthreads();

    // next, we perform binary tree reduction

    for (int d=m/2; d>0; d=d/2) {
      __syncthreads();  // ensure previous step completed
      if (tid<d)  temp[tid] += temp[tid+d];
    }

    // finally, first thread puts result into global memory

    if (tid==0) atomicAdd(g_odata,temp[0]);
}


////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////

int main( int argc, const char** argv)
{
  int num_blocks, num_threads, num_elements, mem_size, shared_mem_size;

  float *h_data, *d_idata, *d_odata;

  // initialise card

  findCudaDevice(argc, argv);

  num_blocks   = 1000;  // start with only 1 thread block
  num_threads  = 512;
  num_elements = num_blocks*num_threads;
  mem_size     = sizeof(float) * num_elements;

  // allocate host memory to store the input data
  // and initialize to integer values between 0 and 10

  h_data = (float*) malloc(mem_size);

  for(int i = 0; i < num_elements; i++)
    h_data[i] = floorf(10.0f*(rand()/(float)RAND_MAX));

  // compute reference solution

  float sum = reduction_gold(h_data, num_elements);

  // allocate device memory input and output arrays

  checkCudaErrors( cudaMalloc((void**)&d_idata, mem_size) );
  checkCudaErrors( cudaMalloc((void**)&d_odata, sizeof(float)) );

  // copy host memory to device input array

  checkCudaErrors( cudaMemcpy(d_idata, h_data, mem_size,
                              cudaMemcpyHostToDevice) );

  // execute the kernel

  shared_mem_size = sizeof(float) * num_threads;
  reduction<<<num_blocks,num_threads,shared_mem_size>>>(d_odata,d_idata);
  getLastCudaError("reduction kernel execution failed");

  // copy result from device to host

  checkCudaErrors( cudaMemcpy(h_data, d_odata, sizeof(float),
                              cudaMemcpyDeviceToHost) );

  // check results

  printf("reduction error = %f\n",h_data[0]-sum);

  // cleanup memory

  free(h_data);
  checkCudaErrors( cudaFree(d_idata) );
  checkCudaErrors( cudaFree(d_odata) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();
}


Overwriting reduction.cu


In [12]:
!nvcc reduction.cu -o reduction -I. -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -lcudart

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z9reductionPfS_' for 'sm_70'
ptxas info    : Function properties for _Z9reductionPfS_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 368 bytes cmem[0]


In [13]:
!./reduction

GPU Device 0: "Turing" with compute capability 7.5

reduction error = 0.000000


## Question 7

Modify the block-level reduction to use shuffle instructions as described in
Lecture 4. Again your notebook should include your code, and results to show
that the calculation has been carried out successfully.

In [18]:
%%writefile reduction.cu

////////////////////////////////////////////////////////////////////////
//
// Practical 4 -- initial code for shared memory reduction for
//                a single block which is a power of two in size
//
////////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

#include <helper_cuda.h>

////////////////////////////////////////////////////////////////////////
// CPU routine
////////////////////////////////////////////////////////////////////////

float reduction_gold(float* idata, int len)
{
  float sum = 0.0f;
  for(int i=0; i<len; i++) sum += idata[i];

  return sum;
}

////////////////////////////////////////////////////////////////////////
// GPU routine
////////////////////////////////////////////////////////////////////////

__global__ void reduction(float *g_odata, float *g_idata)
{
    extern  __shared__  float temp[];

    int tid = threadIdx.x;

    // first, each thread loads data into shared memory
    temp[tid] = g_idata[tid + blockDim.x * blockIdx.x];

    int warpSize = 32;
    int laneId = tid % warpSize;
    int warpId = tid / warpSize;

    // warp reduction
    for (int i = warpSize/2; i > 0; i = i/2)
        temp[tid] += __shfl_down_sync(-1, temp[tid], i);

    __syncthreads();

    // store warp results at the end
    if (laneId == 0)
        temp[blockDim.x - (blockDim.x/warpSize) + warpId] = temp[tid];

    __syncthreads();

    // final reduction by first warp only
    if (tid < warpSize)
    {
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        float val = (tid < numWarps) ? temp[blockDim.x - numWarps + tid] : 0.0f;

        for (int offset = warpSize / 2; offset > 0; offset /= 2)
            val += __shfl_down_sync(-1, val, offset);

        if (tid == 0)
            atomicAdd(g_odata, val);
    }
}


////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////

int main( int argc, const char** argv)
{
  int num_blocks, num_threads, num_elements, mem_size, shared_mem_size;

  float *h_data, *d_idata, *d_odata;

  // initialise card

  findCudaDevice(argc, argv);

  num_blocks   = 1000;  // start with only 1 thread block
  num_threads  = 512;
  num_elements = num_blocks*num_threads;
  mem_size     = sizeof(float) * num_elements;

  // allocate host memory to store the input data
  // and initialize to integer values between 0 and 10

  h_data = (float*) malloc(mem_size);

  for(int i = 0; i < num_elements; i++)
    h_data[i] = floorf(10.0f*(rand()/(float)RAND_MAX));

  // compute reference solution

  float sum = reduction_gold(h_data, num_elements);

  // allocate device memory input and output arrays

  checkCudaErrors( cudaMalloc((void**)&d_idata, mem_size) );
  checkCudaErrors( cudaMalloc((void**)&d_odata, sizeof(float)) );
  checkCudaErrors( cudaMemset(d_odata, 0, sizeof(float)) );

  // copy host memory to device input array

  checkCudaErrors( cudaMemcpy(d_idata, h_data, mem_size,
                              cudaMemcpyHostToDevice) );

  // execute the kernel

  shared_mem_size = sizeof(float) * num_threads;
  reduction<<<num_blocks,num_threads,shared_mem_size>>>(d_odata,d_idata);
  getLastCudaError("reduction kernel execution failed");

  // copy result from device to host

  checkCudaErrors( cudaMemcpy(h_data, d_odata, sizeof(float),
                              cudaMemcpyDeviceToHost) );

  // check results

  printf("reduction error = %f\n",h_data[0]-sum);

  // cleanup memory

  free(h_data);
  checkCudaErrors( cudaFree(d_idata) );
  checkCudaErrors( cudaFree(d_odata) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();
}


Overwriting reduction.cu


In [19]:
!nvcc reduction.cu -o reduction -I. -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -lcudart

          temp[tid] += __shfl_down_sync(-1, temp[tid], i);
                                        ^


              val += __shfl_down_sync(-1, val, offset);
                                      ^

          temp[tid] += __shfl_down_sync(-1, temp[tid], i);
                                        ^


              val += __shfl_down_sync(-1, val, offset);
                                      ^

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z9reductionPfS_' for 'sm_70'
ptxas info    : Function properties for _Z9reductionPfS_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 368 bytes cmem[0]


In [20]:
!./reduction

GPU Device 0: "Turing" with compute capability 7.5

reduction error = 0.000000


In [21]:

from google.colab import runtime
runtime.unassign()