<a href="https://colab.research.google.com/github/cibercitizen1/cuda_hello/blob/main/Welcome_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-srfbm87i
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-srfbm87i
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=61039acb2268b1cada36f25102f04ca3f4e6c753653e5ddcab52ed22c5c1c75e
  Stored in directory: /tmp/pip-ephem-wheel-cache-mmfilx6f/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [5]:
%%cu
// -------------------------------------------------------------
// mainHello_1.cu
// -------------------------------------------------------------
#include <cstdio>
#include <iostream>

// -------------------------------------------------------------
// -------------------------------------------------------------
using namespace std;

// -------------------------------------------------------------
// Z -> () -> Z (for a kernel)
//
// [Z] -> () -> [Z] (for all the kernels)
// -------------------------------------------------------------
__global__ void test_kernel(int* p_input, int* p_output) {

  //
  // We wave 1-dim data (i.e. an array)
  // We have arranged one thread for one cell
  // both in the input array and in the output one
  // 
  // Therefore, we have to find out our thread index,
  // which equates to the cell number in the array
  // we have to manipulate
  //
  // The calculation is as follows:
  
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

  // blockDim.x: the number of threads in the block for the x index
  // (which in this case is the only one)
  // times
  // blockIdx.x: number of block in for the x index
  // plus
  // threadIdx.x: the thread number within this block
  // Example: if we are the block number 3, each block has 16 threds
  // and the thread number is 7
  // The cell would be 3*16 + 7

  //
  // This is the calculation
  //
  p_output[idx] =  100 + p_input[idx];

}

// -------------------------------------------------------------
// -------------------------------------------------------------
int main() {

  //
  // input and output local arrays
  //
  const int N=1024;
  int numbers[N];
  int results[N];

	 int tam = N * sizeof(int);

  for (int i = 0; i <= N-1; i++) {
	numbers[i] = i;
	results[i] = -1;
  }

 
 

  //
  // get memory in the device
  //
  int* p_in;
  int* p_out;
  //
  cudaMalloc(&p_in, tam);
  cudaMalloc(&p_out, tam);

  //
  // timers, define and start to count
  //
  cudaEvent_t start; 
  cudaEvent_t end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  
  cudaEventRecord(start);

  //
  // copy to device
  //
  cudaMemcpy(p_in, numbers, tam, cudaMemcpyHostToDevice);

  dim3 total_blocks( 4 );
  dim3 threads_per_block( N/4 );
  
  // dim3 total_blocks( 1 );
  // dim3 threads_per_block( N );


  //
  // start up the kernel(s)
  //
  test_kernel<<<total_blocks, threads_per_block>>>(p_in, p_out);

  //
  // wait for completion
  //
  cudaEventSynchronize(end);

  //
  // copy from device
  //
  cudaMemcpy(&results[0], p_out,  tam, cudaMemcpyDeviceToHost);
  
  //
  // record end moment, and calculate the elapsed time
  //
  cudaEventRecord(end);
  float time = 0;
  cudaEventElapsedTime(&time, start, end);

  //
  // results
  //

  cout << "results[1] : " << results[1] << endl;

  cout << "results[31] : " << results[31] << endl;
  cout << "results[32] : " << results[32] << endl;

  cout << "results[63] : " << results[63] << endl;
  cout << "results[64] : " << results[64] << endl;
  cout << "results[65] : " << results[65] << endl;
  cout << "results[" << N-1 << "] : " << results[N-1] << endl;


  cout << "start: " << start << endl;
  cout << "end: " << end << endl;
  //cout << (end - start) << endl;
  cout << "The time required : ";
  cout << time << endl;
} // main()
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------

results[1] : 101
results[31] : 131
results[32] : 132
results[63] : 163
results[64] : 164
results[65] : 165
results[1023] : 1123
start: 0x55d0299d9e10
end: 0x55d0299d9c70
The time required : 0



In [7]:
!ls


sample_data  src


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
%%cu
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


#include <cuda.h>
#include <cuda_runtime_api.h>

// includes, project
// #include <cutil.h>

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    printf("CUDA Device Query (Runtime API) version (CUDART static linking)\n");

    int deviceCount = 0;

	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
		printf("cudaGetDeviceCount failed! CUDA Driver and Runtime version may be mismatched.\n");
		printf("\nTest FAILED!\n");
		return 0;
	}

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        printf("There is no device supporting CUDA\n");

    int dev;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
    #if CUDART_VERSION >= 2020
		int driverVersion = 0, runtimeVersion = 0;
		cudaDriverGetVersion(&driverVersion);
		printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
		cudaRuntimeGetVersion(&runtimeVersion);
		printf("  CUDA Runtime Version:                          %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
    #endif

        printf("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
        printf("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);

		printf("  Total amount of global memory:                 %u bytes\n", deviceProp.totalGlobalMem);
    #if CUDART_VERSION >= 2000
        printf("  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
        printf("  Number of cores:                               %d\n", 8 * deviceProp.multiProcessorCount);
    #endif
        printf("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem); 
        printf("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
    #if CUDART_VERSION >= 2000
        printf("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 2020
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
			                                                            "Default (multiple host threads can use this device simultaneously)" :
		                                                                deviceProp.computeMode == cudaComputeModeExclusive ?
																		"Exclusive (only one host thread at a time can use this device)" :
		                                                                deviceProp.computeMode == cudaComputeModeProhibited ?
																		"Prohibited (no host thread can use this device)" :
																		"Unknown");
    #endif
	}
    printf("\nTest PASSED\n");

    return 0;
}


CUDA Device Query (Runtime API) version (CUDART static linking)
There is 1 device supporting CUDA

Device 0: "Tesla T4"
  CUDA Driver Version:                           11.60
  CUDA Runtime Version:                          11.20
  CUDA Capability Major revision number:         7
  CUDA Capability Minor revision number:         5
  Total amount of global memory:                 2958950400 bytes
  Number of multiprocessors:                     40
  Number of cores:                               320
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Text

In [1]:
%%cu
// -------------------------------------------------------------
// mainHello_2.cu
// -------------------------------------------------------------
#include <cstdio>
#include <iostream>

// -------------------------------------------------------------
// -------------------------------------------------------------
using namespace std;

// -------------------------------------------------------------
// -------------------------------------------------------------
__global__ void test_kernel_OLD(int* p_input, int* p_output, int N) {

  //
  // find my index
  //
  
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  

  if ( idx == 13 ) {
    
    p_output[13] = -1;
    
    for (int i=1; i<=10*10*10*10; i++ ) {
      for (int j=1; j<=10*10*10*10; j++ ) {
        for (int k=1; k<=10; k++ ) {
          p_output[13] = -1 * p_input[13] * i * j + k;
        }
      }
    }
    
    if (p_output[13] < 0 ) {
      p_output[13] = 130;
    }
    
  } else {

    p_output[idx] = idx;
      
  }
  
  p_output[idx] = (p_output[ (idx+1)%N ]-1+N) % N;

} // ()

// -------------------------------------------------------------
// -------------------------------------------------------------
__global__ void test_kernel( int* p_data, const int N ) {
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

  p_data[idx] = p_data[ (idx+1) % N ];
} // ()

// -------------------------------------------------------------
// -------------------------------------------------------------
int main() {

  //
  // input and output local arrays
  //
  const int N=1024;
  int numbers[N];

	 int tam = N * sizeof(int);

  for (int i = 0; i <= N-1; i++) {
	  numbers[i] = i;
  }

  //
  // get memory in the device
  //
  int* p_data;
  //
  cudaMalloc(&p_data, tam);

  //
  // copy to device
  //
  cudaMemcpy(p_data, numbers, tam, cudaMemcpyHostToDevice);

  dim3 total_blocks( 2 );
  dim3 threads_per_block( N/2 );
  
  //
  // timer
  //
  cudaEvent_t end;
  cudaEventCreate(&end);

  //
  // start up the kernel(s)
  //
  test_kernel<<<total_blocks, threads_per_block>>>(p_data, N);

  //
  // wait for completion
  //
  cudaEventSynchronize(end);

  //
  // copy from device
  //
  cudaMemcpy(&numbers[0], p_data,  tam, cudaMemcpyDeviceToHost);
  
  //
  // results
  //

  cout << "results[0] : " << numbers[0] << endl;
  cout << "results[1] : " << numbers[1] << endl;

  cout << "results[12] : " << numbers[12] << endl;
  cout << "results[13] : " << numbers[13] << endl;
  cout << "results[14] : " << numbers[14] << endl;

  cout << "results[31] : " << numbers[31] << endl;
  cout << "results[32] : " << numbers[32] << endl;

  cout << "results[" << N-1 << "] : " << numbers[N-1] << endl;

} // main()
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------

UsageError: Cell magic `%%cu` not found.
