<hr style="border-width:4px; border-color:coral"></hr>

## Simple CUDA programs

<hr style="border-width:4px; border-color:coral"></hr>

The following is the simplest possible CUDA program: it launches a single kernel using one block that contains one thread.

In [5]:
%%file demo_00.cu

/*
 * Empty kernel: performs no work. It exists only so that the host code
 * has something to launch.
 */
__global__ void kernel(void)
{
}

/*
 * Launch the kernel with one block of one thread and wait for it to finish.
 *
 * Returns 0 when both the launch and the kernel execution succeed,
 * 1 otherwise.
 */
int main(void) 
{
    kernel<<<1,1>>>();

    /* A kernel launch returns no status; launch failures are reported here. */
    cudaError_t status = cudaGetLastError();

    /* Launches are asynchronous: wait for the kernel so that main does not
       exit before it has run, and pick up any execution error. */
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    return (status == cudaSuccess) ? 0 : 1;
}

Writing demo_00.cu


In [None]:
%%bash 

# Compile demo_00.cu with nvcc into the executable demo_00
nvcc  -o demo_00 demo_00.cu

# Run the executable through srun
srun demo_00

In [3]:
%%file hello_demo.cu

#include <stdio.h>

/*
 * Each thread prints its index within its block, the index of its block,
 * and the global index computed from the two.
 */
__global__ void kernel(void)
{
    /* Index of the first thread of this block in the global numbering */
    const int blockStart = blockIdx.x*blockDim.x;
    const int globalIdx = blockStart + threadIdx.x;

    printf("Thread %d in block %d : Hello, World from ix=%d\n", threadIdx.x, blockIdx.x, globalIdx);
}

/*
 * Launch the kernel on 3 blocks of 4 threads each and wait for it to finish.
 *
 * Returns 0 on success. If the launch or the kernel execution fails, the
 * CUDA error string is written to stderr and 1 is returned.
 */
int main(void) 
{
    dim3 grid(3);   /* 3 blocks */
    dim3 block(4);  /* 4 threads per block */
    kernel<<<grid,block>>>();

    /* A kernel launch returns no status; launch failures are reported here. */
    cudaError_t status = cudaGetLastError();

    /* Wait for the kernel to finish; this also reports errors raised while
       it was running. */
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}


Overwriting hello_demo.cu


In [4]:
%%bash 

# Compile hello_demo.cu for the sm_52 architecture
nvcc  -arch=sm_52 -o hello_demo hello_demo.cu

# Run the executable on node4 through srun
srun --nodelist=node4 hello_demo

Thread 0 in block 2 : Hello, World from ix=8
Thread 1 in block 2 : Hello, World from ix=9
Thread 2 in block 2 : Hello, World from ix=10
Thread 3 in block 2 : Hello, World from ix=11
Thread 0 in block 0 : Hello, World from ix=0
Thread 1 in block 0 : Hello, World from ix=1
Thread 2 in block 0 : Hello, World from ix=2
Thread 3 in block 0 : Hello, World from ix=3
Thread 0 in block 1 : Hello, World from ix=4
Thread 1 in block 1 : Hello, World from ix=5
Thread 2 in block 1 : Hello, World from ix=6
Thread 3 in block 1 : Hello, World from ix=7


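Reorganizing the above output by block, we see how each thread in a block maps to a global thread index `ix = blockIdx.x*blockDim.x + threadIdx.x`. Blocks are not guaranteed to execute in any particular order, which is why block 2 printed first above.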

**Block 0**

    Thread 0 in block 0 : Hello, World from ix=0
    Thread 1 in block 0 : Hello, World from ix=1
    Thread 2 in block 0 : Hello, World from ix=2
    Thread 3 in block 0 : Hello, World from ix=3

**Block 1**

    Thread 0 in block 1 : Hello, World from ix=4
    Thread 1 in block 1 : Hello, World from ix=5
    Thread 2 in block 1 : Hello, World from ix=6
    Thread 3 in block 1 : Hello, World from ix=7

**Block 2**

    Thread 0 in block 2 : Hello, World from ix=8
    Thread 1 in block 2 : Hello, World from ix=9
    Thread 2 in block 2 : Hello, World from ix=10
    Thread 3 in block 2 : Hello, World from ix=11


In [30]:
%%file add.cu

#include <stdio.h>

/* Device function: returns the sum of a and b. */
__device__ int addem(int a, int b)
{
    const int sum = a + b;
    return sum;
}

/*
 * Kernel: computes a + b with addem() and stores the result in the int
 * that c points to.
 */
__global__ void add(int a, int b, int *c)
{
    const int result = addem(a, b);
    *c = result;
}

/*
 * Report a failed CUDA runtime call on stderr.
 *
 * status : value returned by the CUDA runtime call
 * what   : short label for the call, used in the message
 *
 * Returns 0 if status is cudaSuccess, 1 otherwise.
 */
static int cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Add two integers on the device and print the result on the host.
 *
 * Returns 0 on success, 1 if any CUDA call fails (the result is then not
 * printed, since c would not hold a valid value).
 */
int main(void) 
{
    int a = 2;
    int b = 7;
    int c = 0;
    int *dev_c = NULL;

    /* Allocate memory on the device */
    if (cudaFailed(cudaMalloc( (void**)&dev_c, sizeof(int)), "cudaMalloc"))
    {
        return 1;
    }

    add<<<1,1>>>(a, b, dev_c );

    /* Check the launch, wait for the kernel, then copy the contents of
       dev_c back to c. Evaluation stops at the first failing step. */
    int failed =
        cudaFailed(cudaGetLastError(), "kernel launch") ||
        cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize") ||
        cudaFailed(cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");

    if (!failed)
    {
        printf( "%d + %d = %d\n", a,b,c);
    }

    /* Release the device memory on both the success and the failure path */
    cudaFree(dev_c);

    return failed;
}




Overwriting add.cu


In [31]:
%%bash 

# Compile add.cu with nvcc into the executable add
nvcc -o add add.cu

# On Redhawk
# NOTE(review): this is the same command as the commented-out R2 line below;
# confirm that the gpuq partition is what is intended here.
srun -p gpuq add

# On R2
# srun -p gpuq add

2 + 7 = 9


In [32]:
%%file simple_parallel.cu

#include <stdio.h>

/*
 * Kernel: each block writes its own block index into the matching slot
 * of c, i.e. c[blockIdx.x] = blockIdx.x.
 *
 * Only blockIdx.x is used to pick the slot. When launched with one thread
 * per block, threadIdx.x is always 0, so the block index alone identifies
 * the thread.
 */
__global__ void add(int *c)
{
    const int slot = blockIdx.x;
    c[slot] = slot;
}

/*
 * Report a failed CUDA runtime call on stderr.
 *
 * status : value returned by the CUDA runtime call
 * what   : short label for the call, used in the message
 *
 * Returns 0 if status is cudaSuccess, 1 otherwise.
 */
static int cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Fill an N-element device array using N blocks of one thread each,
 * copy it back to the host and print every element.
 *
 * Returns 0 on success, 1 if any CUDA call fails (nothing is printed then).
 */
int main(void) 
{
    /* const makes N a constant expression, so c[N] below is an ordinary
       fixed-size array rather than a variable-length array. */
    const int N = 10;

    /* Allocate memory on the device */
    int *dev_c = NULL;
    if (cudaFailed(cudaMalloc( (void**)&dev_c, N*sizeof(int)), "cudaMalloc"))
    {
        return 1;
    }

    /* Launch N thread blocks of 1 thread per block */
    dim3 grid(N);  /* N x 1 grid of blocks */
    dim3 block(1); /* 1 thread per block */
    add<<<grid,block>>>(dev_c);

    /* Check the launch, wait for the kernel, then copy the contents of
       dev_c back to c. Evaluation stops at the first failing step. */
    int c[N];
    int failed =
        cudaFailed(cudaGetLastError(), "kernel launch") ||
        cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize") ||
        cudaFailed(cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");

    if (!failed)
    {
        for(int i = 0; i < N; i++)
        {
            printf( "c[%d] = %d\n",i,c[i]);
        }
    }

    /* Release the device memory on both the success and the failure path */
    cudaFree(dev_c);

    return failed;
}

Overwriting simple_parallel.cu


In [33]:
%%bash

# Compile simple_parallel.cu with nvcc into the executable simple_parallel
nvcc -o simple_parallel simple_parallel.cu

# On Redhawk
srun  simple_parallel

# On R2 (uses the gpuq partition)
# srun -p gpuq simple_parallel

c[0] = 0
c[1] = 1
c[2] = 2
c[3] = 3
c[4] = 4
c[5] = 5
c[6] = 6
c[7] = 7
c[8] = 8
c[9] = 9
