In [1]:
!pip install nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Source files will be saved in "C:\Users\diego\AppData\Local\Temp\tmpp9reof93".




In [2]:
%load_ext nvcc4jupyter

Source files will be saved in "C:\Users\diego\AppData\Local\Temp\tmphj81mhtd".


In [8]:
%%cuda
#include <stdio.h>

__global__ void hello(){
    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){
    hello<<<2, 2>>>();
    cudaDeviceSynchronize();
}

Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
Hello from block: 0, thread: 0
Hello from block: 0, thread: 1



In [3]:
%%cuda
#
#include <stdio.h>
#include <string>
#include <iostream>

int main() {

  int nDevices;
  cudaGetDeviceCount(&nDevices);
  
  printf("Number of devices: %d\n", nDevices);
  
  for (int i = 0; i < nDevices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device Number: %d\n", i);
    printf("  Device name: %s\n", prop.name);
    printf("  Memory Clock Rate (MHz): %d\n",
           prop.memoryClockRate/1024);
    printf("  Memory Bus Width (bits): %d\n",
           prop.memoryBusWidth);
    printf("  Peak Memory Bandwidth (GB/s): %.1f\n",
           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    printf("  Total global memory (Gbytes) %.1f\n",(float)(prop.totalGlobalMem)/1024.0/1024.0/1024.0);
    printf("  Shared memory per block (Kbytes) %.1f\n",(float)(prop.sharedMemPerBlock)/1024.0);
    printf("  minor-major: %d-%d\n", prop.minor, prop.major);
    printf("  Warp-size: %d\n", prop.warpSize);
    printf("  Concurrent kernels: %s\n", prop.concurrentKernels ? "yes" : "no");
    printf("  Concurrent computation/communication: %s\n\n",prop.deviceOverlap ? "yes" : "no");
  }

  cudaError_t cuda_error = cudaGetLastError();
  std::cout << cudaGetErrorString(cuda_error) << "\n" ;
  std::cout << cudaGetErrorString(cudaSuccess) ;
}

Number of devices: 1
Device Number: 0
  Device name: NVIDIA GeForce RTX 2060
  Memory Clock Rate (MHz): 6836
  Memory Bus Width (bits): 192
  Peak Memory Bandwidth (GB/s): 336.0
  Total global memory (Gbytes) 6.0
  Shared memory per block (Kbytes) 48.0
  minor-major: 5-7
  Warp-size: 32
  Concurrent kernels: yes
  Concurrent computation/communication: yes

no error
no error


In [21]:
%%cuda
#include <iostream>
using namespace std;


// Lets try to break things

int main(){
    char* var = "Hello";        // var points to a string literal "Hello"
    char* nul_char = (var + 5); // nul_char points to the null terminator character '\0' in the string "Hello"
    //*nul_char = 'a';            // Attempting to modify the null terminator character ('\0') to 'a'
    cout << var << '\n';    

    cudaError_t cuda_error = cudaGetLastError();
    cout << cudaGetErrorString(cuda_error) << "\n" ;
}

Hello
no error



In [11]:
%%cuda
#include <iostream>
using namespace std;


auto main() -> int {
    cout << "Hello Wolrd!\n";
}

Hello Wolrd!



1. Write a simple CUDA kernel that gives the sum of maximum element of 2 vectors and profile it for:

* Execution on 1 thread and 1 block
* Execution on all threads of 1 block
* Execution on all threads of n blocks. Deciding n is upto you.

In [63]:
%%cuda

#include <iostream>
#include <algorithm>
#include <cuda_runtime.h>
#include <cmath>

using namespace std;

__global__ void sum_of_maximums(float *a, float *b, int size_a, int size_b, float *result){


    float maximum_a = a[0];
    float maximum_b = b[0];

    for(int i = 0; i < size_a; i++){
        maximum_a = fmax(maximum_a, a[i]);
    }

    for(int i = 0; i < size_b; i++){
        maximum_b = fmax(maximum_b, b[i]);
    }

    *result = maximum_a + maximum_b;

    

}

int main(){
    int N = 1 << 20;
    float *a, *b;

    float *result;

    cudaMallocManaged(&a, N*sizeof(float));
    cudaMallocManaged(&b, N*sizeof(float));
    cudaMallocManaged(&result, sizeof(float));

    for(int i = 0; i < N; i++){
        a[i] = i;
        b[i] = i;
    }

    sum_of_maximums<<<1, 1>>>(a, b, N, N, result);
    cudaDeviceSynchronize();

    std::cout << "Result: " << *result << "\n";




    cudaFree(a);
    cudaFree(b);
    cudaFree(result);

    return 0;
}

Result: 2.09715e+06



In [45]:
%%cuda
#include <iostream>
#include <math.h>
#include <cuda_runtime.h>

using namespace std;

__global__ void add_basic(float *x, int n)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        x[i] = x[i] * 2;
    }
}

int main()
{
    // COMPLETE THIS
    int N = 1000000;

    float *x;

    cudaMallocManaged(&x, N * sizeof(float));
    for (int i = 0; i < N; i++)
    {
        x[i] = 1.0f;
    }

    add_basic <<< 32, 32 >>> (x, N);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 2.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
    {
        maxError = fmax(maxError, fabs(x[i] - 2.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(x);

    return 0;
}

Max error: 0



In [33]:
%%cuda

#include <iostream>
#include <algorithm>
#include <stdio.h>

int main(){
    printf("Hello World!\n");

    int N = 10;

    float *a = new float[N];

    for(int i = 0; i < N; i++){
        a[i] =  i;
    }

    float maximum = *std::max_element(a, a+N);

    std::cout << "Maximum: " << maximum << "\n";

    delete[] a;



    return 0;
}



Hello World!
Maximum: 9



In [35]:
%%cuda

#include <iostream>
#include <algorithm>
#include <stdio.h>

int main(){
    printf("Hello World!\n");

    int N = 10;

    float *a = static_cast<float*>(malloc(N*sizeof(float)));

    for(int i = 0; i < N; i++){
        a[i] =  i;
    }

    float maximum = *std::max_element(a, a+N);

    std::cout << "Maximum: " << maximum << "\n";

    free(a);    



    return 0;
}

Hello World!
Maximum: 9



In [27]:
%%cuda
#include <iostream>
#include <algorithm>

using namespace std;


auto main() -> int {

    const int N = 10;

    float a[N];

    for(int i = 0; i < N; i++){
        a[i] = i;
    }

    float maximum = *std::max_element(a, a+N);
    cout << "Maximum: " << maximum << "\n";

    //delete[] a;

    return 0;



}

Maximum: 9

