### Check the GPU and the CUDA compiler version

In [None]:
!nvidia-smi

In [None]:
!nvcc --version

In [None]:
!git clone https://github.com/NVIDIA/cuda-samples.git

In [None]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && make


In [None]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && ls
!cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery

## nvcc for Jupyter notebook

In [None]:
!pip install nvcc4jupyter

In [None]:
%load_ext nvcc4jupyter

In [None]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define N 1024

/*
 * Element-wise product kernel: stores A[i] * B[i] into C[i].
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (block index * block size + thread index). Threads whose index is
 * N or greater exit without touching memory, so the launch may contain more
 * threads than there are elements.
 */
__global__ void vectorMultiply(const float *A, const float *B, float *C) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail of the grid: surplus threads have nothing to do.
    if (gid >= N) {
        return;
    }

    C[gid] = A[gid] * B[gid];
}

/*
 * Fills the first `size` entries of A and B with pseudo-random floats.
 *
 * Every value is rand() divided by RAND_MAX and scaled by 100, i.e. it lies
 * in the range [0, 100]. For each index the value for A is drawn before the
 * value for B, so the sequence depends only on the current rand() state.
 * Nothing is written when `size` is zero or negative.
 */
void initializeVectors(float *A, float *B, int size) {
    // Draws the next sample from rand() and maps it onto [0, 100].
    auto drawSample = []() -> float {
        const float raw = static_cast<float>(rand());
        return raw / RAND_MAX * 100.0f;
    };

    int k = 0;
    while (k < size) {
        A[k] = drawSample();  // A is always sampled first...
        B[k] = drawSample();  // ...then B, for the same index.
        ++k;
    }
}

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, error string) if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Multiplies two random N-element vectors on the GPU, times the kernel with
 * CUDA events, and verifies the result against a host-side computation.
 *
 * Returns 0 when every element matches; returns EXIT_FAILURE when a host
 * allocation fails or the verification finds a mismatch. Any failing CUDA
 * runtime call terminates the program through CUDA_CHECK.
 */
int main() {
    const size_t bytes = N * sizeof(float);

    // Host buffers: two inputs and one output.
    float *h_A = (float *)malloc(bytes);
    float *h_B = (float *)malloc(bytes);
    float *h_C = (float *)malloc(bytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    srand((unsigned int)time(0));
    initializeVectors(h_A, h_B, N);

    // Device buffers mirroring the host ones.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers all N elements even when N is not a
    // multiple of the block size.
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start)); // Start timing

    vectorMultiply<<<numBlocks, blockSize>>>(d_A, d_B, d_C);
    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop)); // Stop timing
    CUDA_CHECK(cudaEventSynchronize(stop)); // Wait for the stop event to complete

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop)); // Calculate elapsed time

    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify against the host. Stop at the first mismatch and remember it so
    // that success is not reported for a wrong result.
    int verified = 1;
    for (int i = 0; i < N; i++) {
        const float expected = h_A[i] * h_B[i];
        if (h_C[i] != expected) {
            printf("Error at index %d: %f != %f\n", i, h_C[i], expected);
            verified = 0;
            break;
        }
    }

    if (verified) {
        printf("Multiplication completed successfully in %f milliseconds!\n", milliseconds);
    } else {
        fprintf(stderr, "Verification failed (kernel ran for %f milliseconds)\n", milliseconds);
    }

    // Clean up
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    return verified ? 0 : EXIT_FAILURE;
}
