<a href="https://colab.research.google.com/github/cibercitizen1/cuda_hello/blob/main/Welcome_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yqfsoygc
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yqfsoygc
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=1b90098fec2f6d41af785690ba3605a4d10f6dbb7fe9eff0497c1b08a3da7a02
  Stored in directory: /tmp/pip-ephem-wheel-cache-xkcky86k/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [24]:
%%cu
// -------------------------------------------------------------
// mainHello_1.cu
// -------------------------------------------------------------
#include <cstdio>
#include <iostream>

// -------------------------------------------------------------
// -------------------------------------------------------------
using namespace std;

// -------------------------------------------------------------
// test_kernel: adds 100 to every cell of a 1-dim array.
//
// Z -> () -> Z       (what one thread does)
// [Z] -> () -> [Z]   (what the whole launch does)
//
// p_input  : array read by the kernel, one cell per thread
// p_output : array written by the kernel, one cell per thread
//
// Launch layout: 1-dim grid of 1-dim blocks. Only the .x
// components of blockIdx / blockDim / threadIdx are used.
//
// Precondition: there is no bounds check, so the launch must
// contain no more threads than there are cells in both arrays
// (the caller arranges exactly one thread per cell).
// -------------------------------------------------------------
__global__ void test_kernel(int* p_input, int* p_output) {

  //
  // Each thread handles exactly one cell, the same position in the
  // input array and in the output array. That position is the
  // global index of the thread:
  //
  //   (threads per block) * (number of this block)
  //   + (number of this thread inside its block)
  //
  // Example: block number 3, 16 threads per block, thread number 7
  // gives cell 3*16 + 7 = 55.
  //
  const unsigned int first_cell_of_block = blockDim.x * blockIdx.x;
  const int cell = first_cell_of_block + threadIdx.x;

  //
  // The actual work: read our cell, add 100, store the result.
  //
  const int value = p_input[cell];
  p_output[cell] = value + 100;

}

// -------------------------------------------------------------
// -------------------------------------------------------------
int main() {

  //
  // input and output local arrays
  //
  const int N=128;
  int numbers[N];
  int results[N];

	 int tam = N * sizeof(int);

  for (int i = 0; i <= N-1; i++) {
	numbers[i] = i;
	results[i] = -1;
  }

 
  //
  // timers, define and start to count
  //
  cudaEvent_t start; 
  cudaEvent_t end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  

  //
  // get memory in the device
  //
  int* p_in;
  int* p_out;
  //
  cudaMalloc(&p_in, tam);
  cudaMalloc(&p_out, tam);

  cudaEventRecord(start);

  //
  // copy to device
  //
  cudaMemcpy(p_in, numbers, tam, cudaMemcpyHostToDevice);

  dim3 total_blocks( 4 );
  dim3 threads_per_block( N/4 );
  
  // dim3 total_blocks( 1 );
  // dim3 threads_per_block( N );


  //
  // start up the kernel(s)
  //
  test_kernel<<<total_blocks, threads_per_block>>>(p_in, p_out);

  //
  // wait for completion
  //
  cudaEventSynchronize(end);

  //
  // copy from device
  //
  cudaMemcpy(&results[0], p_out,  tam, cudaMemcpyDeviceToHost);
  
  //
  // record end moment, and calculate the elapsed time
  //
  cudaEventRecord(end);
  float time = 0;
  cudaEventElapsedTime(&time, start, end);

  //
  // results
  //

  cout << "results[1] : " << results[1] << endl;

  cout << "results[31] : " << results[31] << endl;
  cout << "results[32] : " << results[32] << endl;

  cout << "results[63] : " << results[63] << endl;
  cout << "results[64] : " << results[64] << endl;
  cout << "results[65] : " << results[65] << endl;


  cout << "start: " << start << endl;
  cout << "end: " << end << endl;
  //cout << (end - start) << endl;
  cout << "The time required : ";
  cout << time << endl;
} // main()
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------
// -------------------------------------------------------------

results[1] : 101
results[31] : 131
results[32] : 132
results[63] : 163
results[64] : 164
results[65] : 165
start: 0x55e18480b6c0
end: 0x55e18480b860
The time required : 0



In [7]:
!ls


sample_data  src


In [9]:
# Mount Google Drive in the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
