<a href="https://colab.research.google.com/github/cychiang-ntpu/gpu-c-examples/blob/main/simple_cuda_c_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用 cuda C 做平行運算之簡易範例

## Step 1: 將原始碼寫入系統資料夾

In [8]:
%%writefile hello.cu
#include <stdio.h>
#include <sys/time.h>

#define USE_GPU

__global__
void saxpy(int n, float a, float *x, float *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = a*x[i] + y[i];
}


void saxcpu(int N, float a, float *x, float *y)
{
  for (int i = 0; i < N; i++) {
    y[i] = a*x[i] + y[i];
  }
}

int main(void)
{
  int N = 1<<20;
  float *x, *y, *d_x, *d_y;
  struct timeval t1, t2;
  double elapsedTime;
  x = (float*)malloc(N*sizeof(float));
  y = (float*)malloc(N*sizeof(float));

  printf("N=%d\n", N);

  cudaMalloc(&d_x, N*sizeof(float));
  cudaMalloc(&d_y, N*sizeof(float));

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }




#ifdef USE_GPU
  cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

  // start timer
  gettimeofday(&t1, NULL);

  // Perform SAXPY on 1M elements
  for(int j=0;j<1000;j++)
    saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);

  // stop timer
  gettimeofday(&t2, NULL);

  cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
#else
  // start timer
  gettimeofday(&t1, NULL);

  for(int j=0;j<1000;j++)
    saxcpu(N, 2.0f, x, y);

  // stop timer
  gettimeofday(&t2, NULL);
#endif


  // compute and print the elapsed time in millisec
  elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;      // sec to ms
  elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;   // us to ms
  printf("%f ms.\n", elapsedTime);

  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = max(maxError, abs(y[i]-4.0f));
  printf("Max error: %f\n", maxError);

  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}



Overwriting hello.cu


## Step 2: 使用 nvcc 進行編譯

In [9]:
!nvcc -o hello hello.cu

## Step 3: 執行

In [10]:
!./hello

N=1048576
0.133000 ms.
Max error: 2.000000
