**CUDA**

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/colab_notebooks/

/content/drive/MyDrive/colab_notebooks


In [None]:
!pip install colab_dc333
import colab_dc333
colab_dc333.nvidia.update_12_4()

cuda kernels
CUDA C++ extends C++ by allowing the programmer to define C++ functions,
called kernels, that, when called, are executed N times in parallel
by N different CUDA threads, as opposed to only once like regular C++ functions.

A kernel is defined using the __global__ declaration specifier  
number threads =  <<<num threads/block, num blocks/grid>>>.

there are no for loops in cuda code. each iteration i in a for loop is
assigned to a cuda thread. The code inside a for loop is moved to a cuda fn
and this fn is the cuda kernel. Debug printing requires data be moved to the cpu

The jupyter nvcc addons dont work
<h4>%%writefile filename</h4>
<h4>nvcc filename.cu -o execname</h4>
<h4>./execname</h4>

In [None]:
#include <stdio.h>
#include <cuda_runtime.h>

// CUDA kernel to add two integers
__global__ void add_kernel(int a, int b, int *result) {
    *result = a + b;
}

int main() {
    int a = 3, b = 5;
    int result = 0;

    // Allocate memory on the device (GPU)
    int *d_result;
    cudaMalloc((void**)&d_result, sizeof(int));

    // Launch kernel with 1 block and 1 thread
    add_kernel<<<1, 1>>>(a, b, d_result);

    // Wait for kernel to finish
    cudaDeviceSynchronize();

    // Copy result back to host
    cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost);

    // Print result on host
    printf("Result of %d + %d = %d\n", a, b, result);

    // Free device memory
    cudaFree(d_result);

    return 0;
}

In [None]:
%%writefile thread_one.cu
#include "stdio.h"
#include <cuda_runtime.h>


__global__ void k1(int *bid, int *bdim, int* bidx){
  //what does if stmt look like in gpu?
  if (blockIdx.x == 2 && threadIdx.x == 2) {
        *bid = blockIdx.x;
        *bdim = blockDim.x;
        *bidx = threadIdx.x;
    }
}


int main(int argc, char** argv){
  // the 2 args in <<>> determine the itertation in k1 blockIdx.x,
  // blockDim.x, threadIdx.x
  const int numBlocks = 3;
  const int threadsPerBlock = 3;

  int cpuBlockId = 10;
  int cpuBlockDim = 10;
  int cpuThreadIdx = 10;


  int *bid, *bdim, *bidx;
  cudaMalloc((void**)&bid , sizeof(int));
  cudaMalloc((void**)&bdim , sizeof(int));
  cudaMalloc((void**)&bidx , sizeof(int ));



  k1<<<numBlocks,threadsPerBlock>>>(bid, bdim, bidx);
  cudaDeviceSynchronize();
  cudaMemcpy(&cpuBlockId, bid, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&cpuBlockDim, bdim, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&cpuThreadIdx, bidx, sizeof(int),  cudaMemcpyDeviceToHost);

  printf("cpuBlockId:%d \n",cpuBlockId);
  printf("cpuBlockDim:%d \n",cpuBlockDim);
  printf("cpuThreadIdx:%d \n",cpuThreadIdx);

  cudaFree(bid);
  cudaFree(bdim);
  cudaFree(bidx);
}






In [None]:
%%writefile threads_array.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void capture_info(int* block_ids, int* thread_ids, int* global_ids) {
    int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;

    block_ids[globalThreadId] = blockIdx.x;
    thread_ids[globalThreadId] = threadIdx.x;
    global_ids[globalThreadId] = globalThreadId;
}

int main() {
    const int numBlocks = 3;
    const int threadsPerBlock = 4;
    const int totalThreads = numBlocks * threadsPerBlock;

    // Allocate host arrays
    int h_block_ids[totalThreads];
    int h_thread_ids[totalThreads];
    int h_global_ids[totalThreads];

    // Allocate device arrays
    int *d_block_ids, *d_thread_ids, *d_global_ids;

    cudaMalloc((void**)&d_block_ids,  sizeof(int) * totalThreads);
    cudaMalloc((void**)&d_thread_ids, sizeof(int) * totalThreads);
    cudaMalloc((void**)&d_global_ids, sizeof(int) * totalThreads);

    // Launch kernel
    capture_info<<<numBlocks, threadsPerBlock>>>(d_block_ids, d_thread_ids, d_global_ids);
    cudaDeviceSynchronize();

    // Copy back to host
    cudaMemcpy(h_block_ids,  d_block_ids,  sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_thread_ids, d_thread_ids, sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_global_ids, d_global_ids, sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);

    // Print info
    for (int i = 0; i < totalThreads; ++i) {
        printf("GlobalThreadID: %2d | BlockID: %d | ThreadID: %d\n",
               h_global_ids[i], h_block_ids[i], h_thread_ids[i]);
    }

    // Cleanup
    cudaFree(d_block_ids);
    cudaFree(d_thread_ids);
    cudaFree(d_global_ids);

    return 0;
}

In [None]:
%%writefile allv2.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void capture_info(int* block_ids, int* thread_ids, int* global_ids) {
    int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;

    block_ids[globalThreadId]  = blockIdx.x;
    thread_ids[globalThreadId] = threadIdx.x;
    global_ids[globalThreadId] = globalThreadId;
}

int main() {
    const int numBlocks = 3;
    const int threadsPerBlock = 4;
    const int totalThreads = numBlocks * threadsPerBlock;

    // Allocate host arrays
    int* h_block_ids  = (int*)malloc(sizeof(int) * totalThreads);
    int* h_thread_ids = (int*)malloc(sizeof(int) * totalThreads);
    int* h_global_ids = (int*)malloc(sizeof(int) * totalThreads);

    // Allocate device arrays
    int *d_block_ids, *d_thread_ids, *d_global_ids;
    cudaMalloc((void**)&d_block_ids,  sizeof(int) * totalThreads);
    cudaMalloc((void**)&d_thread_ids, sizeof(int) * totalThreads);
    cudaMalloc((void**)&d_global_ids, sizeof(int) * totalThreads);

    // Launch kernel
    capture_info<<<numBlocks, threadsPerBlock>>>(d_block_ids, d_thread_ids, d_global_ids);

    // Wait for the kernel to finish
    cudaDeviceSynchronize();

    // Copy back to host
    cudaMemcpy(h_block_ids,  d_block_ids,  sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_thread_ids, d_thread_ids, sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_global_ids, d_global_ids, sizeof(int) * totalThreads, cudaMemcpyDeviceToHost);

    // Print results
    FILE* f = fopen("output.txt", "w");
    for (int i = 0; i < totalThreads; ++i) {
        fprintf(f, "GlobalThreadID: %2d | BlockID: %d | ThreadID: %d\n",
               h_global_ids[i], h_block_ids[i], h_thread_ids[i]);
    }
    fclose(f);
    // Cleanup
    free(h_block_ids);
    free(h_thread_ids);
    free(h_global_ids);
    cudaFree(d_block_ids);
    cudaFree(d_thread_ids);
    cudaFree(d_global_ids);

    return 0;
}

In [None]:
import torch
print(torch.backends.cudnn.version())
print(torch.backends.cudnn.is_available())

In [None]:
%%writefile vector_add_1.cu

#include <stdio.h>
#include <stdlib.h>

// Define a helper macro for checking CUDA errors
#define CUDA_CHECK(err) { \
    if (err != cudaSuccess) { \
        fprintf(stderr, "Fatal CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
}

// CUDA Kernel to perform vector addition
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    const int N = 1024; // Number of elements in the vector
    const size_t size = N * sizeof(float);

    // 1. Allocate host memory
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        return 1;
    }

    // 2. Initialize host vectors
    for (int i = 0; i < N; ++i) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    // 3. Allocate device memory
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // 4. Copy data from host to device
    printf("Copying data from host to device...\n");
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // 5. Define grid and block dimensions and launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    printf("Launching kernel with %d blocks and %d threads...\n", blocksPerGrid, threadsPerBlock);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // Check for any errors during kernel launch
    CUDA_CHECK(cudaGetLastError());

    // 6. Copy result back from device to host
    printf("Copying result from device to host...\n");
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Synchronize to make sure the copy is complete before we access h_c
    CUDA_CHECK(cudaDeviceSynchronize());

    // 7. Verification
    printf("Verification:\n");
    printf("First element: %f (Expected: 0.0 + 0.0 = 0.0)\n", h_c[0]);
    printf("Second element: %f (Expected: 1.0 + 2.0 = 3.0)\n", h_c[1]);
    printf("Last element: %f (Expected: 1023.0 + 2046.0 = 3069.0)\n", h_c[N-1]);

    // 8. Cleanup
    free(h_a);
    free(h_b);
    free(h_c);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    printf("Done.\n");
    return 0;
}

In [None]:
%%writefile add_cg.cu
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__global__ void modern_sum_reduction_kernel(const float* input, float* output, int n) {

    cg::thread_block block = cg::this_thread_block();
    float sum = 0.0f;

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for (; i < n; i += gridDim.x * blockDim.x) {
        sum += input[i];
    }

    // cg reduce the values from all threads in the block to sum
    float block_sum = cg::reduce(block, sum, cg::plus<float>());

    // The first thread writes the block's total sum to the output
    if (block.thread_rank() == 0) {
        output[blockIdx.x] = block_sum;
    }
}


int main(){
  const int blocksPerGrid = 3;
  const int threadsPerBlock = 2;
  const int n = 100

  //allocate host memory
  float *input = (float *)malloc(sizeof(float) * n);

  //initialize host memory
  for (int i;0;i<n; i++){
    input[i] = 10.0 + i;
  }
  //allocate gpu
  float *output;
  cudaMalloc(&output, sizeof(float))

  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(&finput, &output, n);
  cudaDeviceSynchronize();

  return 0
}

reduction.cu error:

run compute-sanitizer --tool memcheck ./r

=========
========= Program hit cudaErrorUnsupportedPtxVersion (error 222) due to "the provided PTX was compiled with an unsupported toolchain." on CUDA API call to cudaGetLastError.

FIXED after updating cuda to 12.4. nvcc --version and nvidia-smi have to match versions

/content# ./r
Problem Size: 4194304 elements.
Block Size: 256 threads.
Stage 1 Grid Size: 16384 blocks.
Launching Stage 1 reduction...
Launching Stage 2 reduction...

--- Results ---
GPU Sum:              131072.00
Expected Sum:         8388608.00
GPU Average:          0.0312
Expected Average:     2.0000

In [None]:
%%writefile reduction.cu

#include <iostream>
#include <vector>
#include <numeric>

// --- Helper for checking CUDA errors ---
#define CUDA_CHECK(err) { \
    if (err != cudaSuccess) { \
        fprintf(stderr, "Fatal CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
}

// --- The Reduction Kernel ---
// This kernel can be used for both Stage 1 and Stage 2.
// It uses shared memory for an efficient, intra-block reduction.
__global__ void sum_reduction_kernel(const float* input, float* output, int n) {
    // Statically allocate shared memory. Size is known at compile time.
    extern __shared__ float sdata[];

    // Each thread loads an element from global memory to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data into shared memory. Boundary check for safety.
    if (i < n) {
        sdata[tid] = input[i];
    } else {
        sdata[tid] = 0.0f; // Neutral element for addition
    }

    // Synchronize to make sure all data is loaded before starting the reduction
    __syncthreads();

    // Perform the reduction in shared memory
    // The loop reduces the active number of threads by half in each iteration
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        // Only the first 's' threads are active and add values
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        // Synchronize to make sure all additions in this step are complete
        __syncthreads();
    }

    // The first thread in the block writes the final result (the block's partial sum)
    // to the output array in global memory.
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}


int main() {
    // 1. --- Host Setup ---
    const int N = 1 << 22; // ~4 million elements
    const int BLOCK_SIZE = 256; // Threads per block
    const size_t data_size = N * sizeof(float);

    std::cout << "Problem Size: " << N << " elements." << std::endl;
    std::cout << "Block Size: " << BLOCK_SIZE << " threads." << std::endl;

    // Allocate host memory and initialize data
    std::vector<float> h_input(N);
    for (int i = 0; i < N; ++i) {
        h_input[i] = 2.0f; // Use a simple value for easy verification
    }

    // 2. --- Device Memory Allocation ---
    float *d_input, *d_intermediate, *d_output;
    CUDA_CHECK(cudaMalloc(&d_input, data_size));

    // Calculate grid size for the first stage
    int num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::cout << "Stage 1 Grid Size: " << num_blocks << " blocks." << std::endl;

    // Allocate memory for intermediate (partial) sums and the final result
    CUDA_CHECK(cudaMalloc(&d_intermediate, num_blocks * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_output, sizeof(float)));

    // 3. --- Copy Data to Device ---
    CUDA_CHECK(cudaMemcpy(d_input, h_input.data(), data_size, cudaMemcpyHostToDevice));

    // 4. --- Kernel Launches ---
    // The size of shared memory is passed as the third kernel launch parameter.
    size_t shared_mem_size = BLOCK_SIZE * sizeof(float);

    // >> STAGE 1: Reduce the large input array to an intermediate array of partial sums
    std::cout << "Launching Stage 1 reduction..." << std::endl;
    sum_reduction_kernel<<<num_blocks, BLOCK_SIZE, shared_mem_size>>>(d_input, d_intermediate, N);
    //CUDA_CHECK(cudaGetLastError());

    // >> STAGE 2: Reduce the intermediate array of partial sums to a single final value
    std::cout << "Launching Stage 2 reduction..." << std::endl;
    sum_reduction_kernel<<<1, BLOCK_SIZE, shared_mem_size>>>(d_intermediate, d_output, num_blocks);
    CUDA_CHECK(cudaGetLastError());

    // 5. --- Copy Final Result Back to Host ---
    float final_sum = 0.0f;
    CUDA_CHECK(cudaMemcpy(&final_sum, d_output, sizeof(float), cudaMemcpyDeviceToHost));

    // 6. --- Calculate Average and Verify ---
    float average = final_sum / N;
    float expected_sum = N * 2.0f;
    float expected_average = 2.0f;

    std::cout << "\n--- Results ---" << std::endl;
    printf("GPU Sum:              %.2f\n", final_sum);
    printf("Expected Sum:         %.2f\n", expected_sum);
    printf("GPU Average:          %.4f\n", average);
    printf("Expected Average:     %.4f\n", expected_average);

    // 7. --- Cleanup ---
    cudaFree(d_input);
    cudaFree(d_intermediate);
    cudaFree(d_output);

    return 0;
}

In [None]:
# recompile profile_gpt2cu for colab gpu
!make profile_gpt2cu NO_MULTI_GPU=1
# run profile
!ncu --set full --import-source yes -o profile -f ./profile_gpt2cu



ncu --set full --import-source yes -o profile -f ./profile_gpt2cu

Multi-GPU support is disabled. Using a single GPU.

==PROF== Connected to process 52856 (/content/drive/MyDrive/colab_notebooks/llm.c/profile_gpt2cu)

[System]
Device 0: Tesla T4
batch size: 24
sequence length: 1024
| Zero Optimization is disabled                                              |
allocating 237 MiB for parameter gradients

allocating 3762 MiB for activations

allocating 474 MiB for AdamW optimizer state m

allocating 474 MiB for AdamW optimizer state v

allocating 474 MiB for master copy of params

device memory usage: 5828 MiB / 15095 MiB

memory per sequence: 156 MiB
 -> estimated maximum batch size: 83

==PROF== Profiling "encoder_forward_kernel3" - 0: 0%....50%....100% - 30 passes

==PROF== Profiling "layernorm_forward_kernel6" - 1: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 2: 0%.

==WARNING== Launching the workload is taking more time than expected. If thisnallocating 474 MiB for AdamW optimizer state v

allocating 474 MiB for master copy of params

device memory usage: 5828 MiB / 15095 MiB

memory per sequence: 156 MiB
 -> estimated maximum batch size: 83

==PROF== Profiling "encoder_forward_kernel3" - 0: 0%....50%....100% - 30 passes

==PROF== Profiling "layernorm_forward_kernel6" - 1: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 2: 0%.

==WARNING== Launching the workload is taking more time than expected. If this continues to hang, terminate the profile and re-try by profiling the range of all related launches using '--replay-mode range'. See https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#replay for more details.
...50%....100% - 30 passes

==PROF== Profiling "permute_kernel" - 3: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 4: 0%....50%....100% - 30 passes

==PROF== Profiling "softmax_forward_kernel5" - 5: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 6: 0%....50%....100% - 30 passes

==PROF== Profiling "unpermute_kernel" - 7: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 8: 0%....50%....100% - 30 passes

==PROF== Profiling "fused_residual_forward_kernel5" - 9: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 10: 0%....50%....100% - 30 passes

==PROF== Profiling "gelu_forward_kernel2" - 11: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 12: 0%....50%....100% - 30 passes

==PROF== Profiling "fused_residual_forward_kernel5" - 13: 0%....50%....100% - 30 passes

==PROF== Profiling "magma_sgemmEx_kernel" - 14: 0%.

==WARNING== Launching the workload is taking more time than expected. If this continues to hang, terminate the profile and re-try by profiling the range of all related launches using '--replay-mode range'. See https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#replay for more details.


In [None]:
# --- 1. SETUP THE PROJECT ---
# Clone the repository and move into the directory
!git clone https://github.com/karpathy/llm.c.git
%cd llm.c

# --- 2. DOWNLOAD MODEL WEIGHTS ---
# The C code needs this file to run
!wget https://huggingface.co/karpathy/gpt2/resolve/main/gpt2_124M_bf16.bin

# --- 3. PATCH THE MAKEFILE FOR COLAB ---
# Fixes the "cuDNN not found" error by pointing to correct system paths
!sed -i 's|/usr/local/cuda/lib64|/usr/lib/x86_64-linux-gnu|g' Makefile
!sed -i 's|/usr/local/cuda/include|/usr/include|g' Makefile

# --- 4. COMPILE THE PROFILER ---
# This creates the './profile_gpt2cu' executable
!make profile_gpt2cu NO_MULTI_GPU=1

# --- 5. RUN THE PROFILER ---
# This runs the executable under ncu and creates 'profile.ncu-rep'
# This step will take a few minutes.
print("Starting profiling with ncu. This will take a moment...")
!sudo ncu --set full --import-source yes -o profile -f ./profile_gpt2cu
print("Profiling complete. Download 'profile.ncu-rep' from the file browser to view the results.")

In [None]:
#https://www.youtube.com/watch?v=IDOB9lQrcyw
# process blocks instead of complete rows and columns. BxB submatrix should
# fit in cache

In [None]:
// 1. Allocate host memory
// 2. --- Device Memory Allocation ---
// 3. --- Copy Data to Device ---
// 4. --- Kernel Launches ---
// 5. --- Copy Final Result Back to Host ---
// 6. --- Calculate Average and Verify ---
// 7. --- Cleanup ---


cuda has __constant__ symbol which is a read-only array stored in device memory.

In [None]:
%%writefile const.cu

#include <stdio.h>

#define N 8

// GPU constant memory symbol (read-only from device, set from host)
__constant__ int const_data[N];

// Kernel reads from constant memory
__global__ void useConstantMemory(int* output) {
    int tid = threadIdx.x;
    if (tid < N) {
        output[tid] = const_data[tid] * 2;  // Read from constant symbol
    }
}

int main() {
    int h_const_data[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    int h_output[N];

    int* d_output;

    // Allocate output buffer on device
    cudaMalloc((void**)&d_output, sizeof(int) * N);

    // Copy host data to __constant__ memory on device
    cudaMemcpyToSymbol(const_data, h_const_data, sizeof(int) * N);

    // Launch kernel
    useConstantMemory<<<1, N>>>(d_output);
    cudaDeviceSynchronize();

    // Copy result back
    cudaMemcpy(h_output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);

    // Print result
    printf("Output from kernel:\n");
    for (int i = 0; i < N; ++i) {
        printf("%d ", h_output[i]);  // Expect 2Ã— input
    }
    printf("\n");

    // Clean up
    cudaFree(d_output);
    return 0;
}

In [None]:
https://fengyao.notion.site/moe-posttraining
https://fengyao.notion.site/off-policy-rl