In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [None]:
!nvcc --version

In [None]:
!g++ -v

In [None]:
%%writefile cpphw.hpp

#include <iostream>

void printHelloWorld()

{

std::cout << "HelloWorld\nHelloWorld";

}

In [None]:
%%writefile cpphw.cpp

#include "cpphw.hpp"

int main()

{

printHelloWorld();

return 0;

}

In [None]:
ls

In [None]:
%%script bash
g++ cpphw.cpp -std=c++11 -o hw.out
./hw.out

# Day 1 (16 Feb, 2025)

In [None]:
%%writefile vect_add_cpu.cpp
// Sequential Vector Addition in CPP

#include <iostream>
#include <vector>
#include <ctime>
using namespace std;

int main(){
    clock_t start = clock();
    int n = 200000000;
    vector<int> A(n), B(n), C(n);
    for (int i=0; i<n; i++){
        A[i] = i+1;
        B[i] = (i+1)*2;
    }

    for (int i=0;i<n; i++){
        C[i] = A[i] + B[i];

    }

    clock_t end = clock();

    double duration = static_cast<double>(end - start) / CLOCKS_PER_SEC; // Time in seconds

    std::cout << "Time taken: " << duration * 1000 << " milliseconds" << std::endl; // Convert to milliseconds

    for (int i=0;i <20; i++){
        cout << C[i] << " ";
    }
    return 0;

}

In [None]:
%%script bash
g++ vect_add_cpu.cpp -std=c++11 -o cpu.out
./cpu.out

In [None]:
%%writefile vect_add_gpu.cu
// vector addition on gpu

#include <iostream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>

using namespace std;

__global__ void vector_addition(int *A, int *B, int *C, int n){
    // std::cout << "blockIdx.x = " << blockIdx.x << endl ;
    // std::cout << "blockDim.x = " << blockDim.x << endl ;
    // std::cout << "threadIdx.x = " << threadIdx.x << endl ;

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i<n){
        C[i] = A[i] + B[i];
    }
    
};

int main(){
    int n = 200000000;
    vector<int> A(n), B(n), C(n);
    for (int i=0; i<n; i++){
        A[i] = i+1;
        B[i] = (i+1)*2;
    }

    clock_t start = clock();
    int *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, n*sizeof(int));
    cudaMalloc((void **)&d_B, n*sizeof(int));
    cudaMalloc((void **)&d_C, n*sizeof(int));

    cudaMemcpy(d_A, A.data(), n*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B.data(), n*sizeof(int), cudaMemcpyHostToDevice);

    int blockSize = 2;
    int gridSize = n+blockSize - 1;
    
    vector_addition <<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    cudaMemcpy(C.data(), d_C, n*sizeof(int), cudaMemcpyDeviceToHost);

    clock_t end = clock();

    double duration = static_cast<double>(end - start) / CLOCKS_PER_SEC; // Time in seconds

    std::cout << "Time taken: " << duration * 1000 << " milliseconds" << std::endl; // Convert to milliseconds

    for (int i=0;i <20; i++){
        cout << C[i] << " ";
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    
    return 0;  
}



In [None]:
%%script bash
nvcc vect_add_gpu.cu -o kernel
./kernel

## Notes:
* Gird Dimension: gridDim.x, gridDim.y, gridDim.z is the number of blocks in each direction of Grid.
* Block Dimension: blockDim.x, blockDim.y, blockDim.z, is the number of threads in each direction of block.
* Block Index: blockIdx.x[y,z], index of the block within the grid
* Thread Index: threadIdx.x[y,z], index of thread within a block
  
threadId =

            blockDim.x\*blockIdx.x + blockDim.x\*threadIdx.y + threadIdx.x (simple 2D within a block) +
              
            gridDim.x\*blockDim.y\*blockIdx.y (blocks of threads in previous row) 


**Steps:**
1. Define the function to run on GPU.
2. Declare variables on CPU and allocate necessary memory on GPU
3. Copy the data from CPU to GPU on the allocated locations.
4. Call the kernel
5. Save the copy of the task result on the CPU.
6. free the memory on GPU


# Day 2 (17 Feb, 2025)

In [None]:
%%writefile vect_2D_add_gpu.cu
// vector addition on gpu

#include <iostream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>

using namespace std;

__global__ void vector_addition(int *A, int *B, int *C, int rows, int cols){
    // std::cout << "blockIdx.x = " << blockIdx.x << endl ;
    // std::cout << "blockDim.x = " << blockDim.x << endl ;
    // std::cout << "threadIdx.x = " << threadIdx.x << endl ;

    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < rows && col < cols){
        int index = row*cols + col;
       // std::cout << index << endl;
        C[index] = A[index] + B[index];
    }
    
};

void print_vector(vector<vector <int>> matrix){
  // vector<vector<int>> matrix(3);
    for (vector vec: matrix){
      for (int val : vec){
        cout << val << " ";
      }
      cout << endl;
    }
}

// Helper function to check for CUDA errors
void checkCudaError(cudaError_t error) {
    if (error != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(error) << std::endl;
        exit(EXIT_FAILURE);
    }
}

int main(){
    
    int rows = 30000;
    int cols = 30000;
    int n = rows*cols;
    vector<int> h_A(n), h_B(n), h_C(n);
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            h_A[i * cols + j] = 2; // Example initialization for A
            h_B[i * cols + j] = -1;
                }
    }

    clock_t start_cpu = clock();
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            h_C[i * cols + j] = h_A[i * cols + j] + h_B[i * cols + j];
                }
    }
    clock_t end_cpu = clock();
    double duration_cpu = static_cast<double>(end_cpu - start_cpu) / CLOCKS_PER_SEC; // Time in seconds
    std::cout << "CPU Time taken: " << duration_cpu * 1000 << " milliseconds" << std::endl; 

    clock_t start = clock();
    int *d_A, *d_B, *d_C;
    checkCudaError(cudaMalloc((void **)&d_A, n*sizeof(int)));
    checkCudaError(cudaMalloc((void **)&d_B, n*sizeof(int)));
    checkCudaError(cudaMalloc((void **)&d_C, n*sizeof(int)));

    checkCudaError(cudaMemcpy(d_A, h_A.data(), n*sizeof(int), cudaMemcpyHostToDevice));
    checkCudaError(cudaMemcpy(d_B, h_B.data(), n*sizeof(int), cudaMemcpyHostToDevice));

    // dim3 blockSize(rows, cols); // Block dimension (threads per block in x and y) - adjust based on matrix size and GPU capabilities
    // dim3 gridSize((cols + blockSize.x - 1) / blockSize.x, (rows + blockSize.y - 1) / blockSize.y);

    dim3 blockSize(16, 16); // Example block size
    blockSize.x = min(blockSize.x, cols); // Clamp block size to cols
    blockSize.y = min(blockSize.y, rows); // Clamp block size to rows
    dim3 gridSize((cols + blockSize.x - 1) / blockSize.x, (rows + blockSize.y - 1) / blockSize.y);
    
    vector_addition <<<gridSize, blockSize>>>(d_A, d_B, d_C, rows, cols);
    checkCudaError(cudaGetLastError());
    clock_t d2h_start = clock();
    checkCudaError(cudaMemcpy(h_C.data(), d_C, n*sizeof(int), cudaMemcpyDeviceToHost));
    clock_t d2h_end = clock();
    double d2h_duration = static_cast<double>(d2h_end - d2h_start) / CLOCKS_PER_SEC; // Time in seconds

    std::cout << "MemCpy D2H Time taken: " << d2h_duration * 1000 << " milliseconds" << std::endl; 

    clock_t end = clock();

    double duration_cuda = static_cast<double>(end - start) / CLOCKS_PER_SEC; // Time in seconds

    std::cout << "CUDA Time taken: " << duration_cuda * 1000 << " milliseconds" << std::endl; 
    // Convert to milliseconds

    cout << "CUDA Matrix Addition (2D):" << endl;
    
    for (int i = 0; i < 10; ++i) {
        for (int j = 0; j < 10; ++j) {
            cout << h_C[i * cols + j] << " ";
        }
        cout << endl;
    }
    checkCudaError(cudaFree(d_A));
    checkCudaError(cudaFree(d_B));
    checkCudaError(cudaFree(d_C));
    checkCudaError(cudaDeviceSynchronize());
    
    return 0;
}

In [None]:
%%script bash
nvcc vect_2D_add_gpu.cu -o vect_2d
./vect_2d

# Day 3 (18 Feb, 2025)

Chapter 3 Notes:
* dim3 dimGrid(32,1,1), dim dimBlock(128,1,1) or dim3 dog(32,1,1), dim3 cat(128,1,1)
* vec_add <<<dimGrid or dog, dimBlock or cat>>> // we can use both with dim3 type, it is vector of int type with 3 elements, x, y, z.
* "For convenience, CUDA provides a special shortcut for calling a kernel with one-dimensional (1D) grids and blocks. Instead of using dim3 variables, one can use arithmetic expressions to specify the configuration of 1D grids and blocks. In this case, the CUDA compiler simply takes the arithmetic expression as the x dimensions and assumes that the y and z dimensions are 1." Page 49
* "the gridDim and blockDim are built-in variables in a kernel and always reflect the dimensions of the grid and the blocks, respectively."
* the allowed values of gridDim.x range from 1 to 231 2 1,1 and those of gridDim.y and gridDim.z range from 1 to 216 2 1 (65,535).
* The total size of a block in current CUDA systems is limited to 1024 threads. These threads can be distributed across the three dimensions in any way as long as the total number of threads does not exceed 1024.
* We need to know the no of cols at the compile time to accept dynamically allocated  arrays, but we do not have this info, as a result we flattne the dynamically allocated 2D arrays into an equivalent 1D arrays.
* The linearized access to a 3D array P will be in the form of P[plane * m * n +row * m + col].

In [None]:
%%writefile matrix_mul.cu

#include <iostream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>

using namespace std;

__global__ void mat_mul(int *P_A, int *P_B,int *P_C, int Width){
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    int col = blockDim.x*blockIdx.x + threadIdx.x;

    if (row < Width && col < Width){
        for (int k=0; k<Width; k++){
            P_C[row*Width+col] += P_A[row*Width+k] * P_B[k*Width+col];
        }
    }
}

void checkCudaError(cudaError_t error) {
    if (error != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(error) << std::endl;
        exit(EXIT_FAILURE);
    }
}
int main(){
    int rows = 30000;
    int cols = 30000;
    int n = rows*cols;
    vector<int> h_A(n), h_B(n), h_C(n);
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            h_A[i * cols + j] = 1; // Example initialization for A
            h_B[i * cols + j] = 2;
                }
    }
    clock_t start = clock();
    int *d_A, *d_B, *d_C;
    checkCudaError(cudaMalloc((void **)&d_A, n*sizeof(int)));
    checkCudaError(cudaMalloc((void **)&d_B, n*sizeof(int)));
    checkCudaError(cudaMalloc((void **)&d_C, n*sizeof(int)));

    checkCudaError(cudaMemcpy(d_A, h_A.data(), n*sizeof(int), cudaMemcpyHostToDevice));
    checkCudaError(cudaMemcpy(d_B, h_B.data(), n*sizeof(int), cudaMemcpyHostToDevice));

    int width = rows;
    dim3 blockSize(32, 32); // Example block size
    blockSize.x = min(blockSize.x, cols); // Clamp block size to cols
    blockSize.y = min(blockSize.y, rows); // Clamp block size to rows
    dim3 gridSize((cols + blockSize.x - 1) / blockSize.x, (rows + blockSize.y - 1) / blockSize.y);

    clock_t gpu_start = clock();
    mat_mul <<<gridSize, blockSize>>>(d_A, d_B, d_C, width);
    clock_t gpu_end = clock();
    double gpu_duration = static_cast<double>(gpu_end - gpu_start) / CLOCKS_PER_SEC; // Time in seconds
    std::cout << "GPU Time taken: " << gpu_duration * 1000 << " milliseconds" << std::endl; 
    
    clock_t d2h_start = clock();
    checkCudaError(cudaMemcpy(h_C.data(), d_C, n*sizeof(int), cudaMemcpyDeviceToHost));
    clock_t d2h_end = clock();
    double d2h_duration = static_cast<double>(d2h_end - d2h_start) / CLOCKS_PER_SEC; // Time in seconds
    std::cout << "MemCpy D2H Time taken: " << d2h_duration * 1000 << " milliseconds" << std::endl; 

    checkCudaError(cudaGetLastError());
    cout << "CUDA Matrix Mul (2D):" << endl;
    
    for (int i = 0; i < 10; ++i) {
        for (int j = 0; j < 10; ++j) {
            cout << h_C[i * cols + j] << " ";
        }
        cout << endl;
    }
    return 0;
}

In [None]:
%%script bash
nvcc matrix_mul.cu -o mul
./mul

In [None]:
!nvcc --version

# Day 4 (19 Feb, 2025)

### Notes
#### Shared Memory
* On-chip memory that is physically located close to the GPU cores, unlike global memory (DRAM) which is off-chip.
* Low Latency, High bandwith, Shared within a block, Limited Size.

In [4]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Kaggle". Running its setup...
Updating the package lists...
Installing nvidia-cuda-toolkit, this may take a few minutes...
Source files will be saved in "/tmp/tmp_jsmuhxr".


In [59]:
%%writefile red.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <ctime>

using namespace std;
void checkCudaError(cudaError_t error){
    if (error != cudaSuccess){
        cerr << "Cuda Error: " << cudaGetErrorString(error) << endl;
        exit(EXIT_FAILURE);
    }
}

__global__ void reduction_tree(float *input, float*output, int block_size){

    __shared__ float shared_data[256];

    int thread_id = threadIdx.x;
    int block_id = blockIdx.x;
    shared_data[thread_id] = input[block_id*block_size + thread_id];
    __syncthreads();

    for (int stride=block_size / 2; stride > 0; stride/=2){
        if (thread_id < stride){
            shared_data[thread_id] += shared_data[thread_id + stride];
        }
        __syncthreads();
    }
    if (thread_id == 0){
        output[block_id] = shared_data[0];
    }
}

int main(){
    int array_size = 1024*1024*4*4;
    int block_size = 32;

    vector<float> h_input(array_size);
    vector<float> h_output((array_size + block_size - 1) / block_size, 0.0f);

    for (int i=0; i < array_size; i++){
        // h_input[i] = (float) (i%10+1);
        h_input[i] = 1;
    }
    
    float *d_input, *d_output;
    checkCudaError(cudaMalloc((void**)&d_input, array_size * sizeof(float)));
    checkCudaError(cudaMalloc((void**)&d_output, h_output.size() * sizeof(float)));

    checkCudaError(cudaMemcpy(d_input, h_input.data(), array_size * sizeof(float), cudaMemcpyHostToDevice));

    dim3 blockDim(block_size);
    dim3 gridDim((array_size+block_size-1)/block_size);

    clock_t cpu_start = clock();
    int array_sum = 0;
    for (int i=0; i < array_size; i++){
        array_sum += h_input[i];
    }
    clock_t cpu_end = clock();
    double cpu_duration = static_cast<double>(cpu_end - cpu_start) / CLOCKS_PER_SEC; // Time in seconds
    std::cout << "CPU Time taken: " << cpu_duration * 1000 << " milliseconds" << std::endl; 

    clock_t gpu_start = clock();
    reduction_tree <<<gridDim, blockDim>>>(d_input, d_output, block_size);
    clock_t gpu_end = clock();
    double gpu_duration = static_cast<double>(gpu_end - gpu_start) / CLOCKS_PER_SEC; // Time in seconds
    std::cout << "GPU Time taken: " << gpu_duration * 1000 << " milliseconds" << std::endl; 
    checkCudaError(cudaMemcpy(h_output.data(), d_output, h_output.size() * sizeof(float), cudaMemcpyDeviceToHost));

    int gpu_sum = 0;
    for (int i = 0; i < h_output.size(); i++){
        gpu_sum += h_output[i];
    }
    cout << "shared memory array size: " << 256 << endl;
    cout << "gpu sum " << gpu_sum<< endl;
    cout << "cpu sum " << array_sum << endl;
    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_output));
    
}

Overwriting red.cu


In [60]:
%%script bash
nvcc red.cu -o red
./red

CPU Time taken: 108.922 milliseconds
GPU Time taken: 0.193 milliseconds
shared memory array size: 256
gpu sum 16777216
cpu sum 16777216


In [None]:
{32: 0.225 milliseconds,
 64: 0.219,
 128: 0.182,
 256: 0.212,
 512:
}

# Day 5 (Feb 20, 2025)
* with blockSize > 256 when shared memory is 256, we are likely writing out of bounds of the sharedData array, leading to undefined behavior and potential errors.
* dynamic shared memory, you declare it as extern __shared__ float sharedData[]; in the kernel and then specify the size in bytes when you launch the kernel using the sharedMemConfig parameter of the kernel launch <<<>>>.

In [None]:
    for (int i=0; i < neurons; i++ ){
        for (int j=0; j < sample; j++){
            cout << i << j << " " ;
            cout <<  neurons*i + j << endl;
            
            h_input[neurons*i + j] = neurons*i + j;
        }
    }


In [35]:
%%writefile navie_layer_norm.cpp
#include <iostream>
#include <vector>
#include <ctime>
#include <stdio.h>
#include <cmath>

using namespace std;
void print_matrix(vector<float> matrix, int neurons, int sample ){
    for (int i=0; i < neurons; i++ )
        {
        for (int j=0; j < sample; j++)
        {
            cout <<  matrix[sample*i + j] << " ";
        }
        cout << endl ;
        }
    }
int main()
    {
    //ex1 - a11, a21, a31; ex2 - a12, a22, a32, mean1 = (a11 + a12)/2
    
    int sample = 100;  // cols
    int neurons = 7; // rows

    int array_size = sample * neurons;
    vector<float> h_input(array_size);
    
    for (int i=0; i < neurons; i++ ){
        for (int j=0; j < sample; j++){
            h_input[sample*i + j] = sample*i + j;
        }
    }

    printf("input matrix \n");
    print_matrix(h_input, neurons, sample);

    vector<float> h_mean(sample);
    vector<float> h_var(sample, 0);
    vector<float> h_std(sample, 0);
    
    for (int i=0; i < neurons; i++ ){
        float col_sum = 0;
        for (int j=0; j < sample; j++){
            h_mean[j] += h_input[sample*i + j]; 
            }
        }
    for (int j=0; j < sample; j++){
         h_mean[j] = h_mean[j]/neurons;
        }
    printf("col mean: \n");
    for (float val: h_mean){
        cout << (val) << " ";
    }
    printf("\n");

    for (int i=0; i < neurons; i++ ){
        for (int j=0; j < sample; j++){ 
            h_var[j] += ( h_input[sample*i + j] - h_mean[j]) * ( h_input[sample*i + j] - h_mean[j]); 
            }
        }

    for (int j=0; j < sample; j++){
         h_std[j] = sqrt(h_var[j]/neurons);
        }

    printf("col diff sqr sum: \n");
    for (float val: h_var){
        cout << (val) << " ";
    }
    printf("\n");

    printf("col std: \n");
    for (float val: h_std){
        cout << (val) << " ";
    }
    printf("\n\nprinting output matrix \n\n");

    vector<float> h_output(array_size,0);
    for (int i=0; i < neurons; i++ ){
        for (int j=0; j < sample; j++){
            h_output[sample*i + j] = (h_input[sample*i + j] - h_mean[j])/(h_std[j] + 1e-7);
            float val =  h_output[sample*i + j];
            cout << (val) << " ";
        }
        cout << endl;
    }

    return 0;
    }


Overwriting navie_layer_norm.cpp


In [36]:
%%script bash
g++ navie_layer_norm.cpp -o layer
./layer

input matrix 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 27

Overwriting layer_norm1_corrected.cpp


In [109]:
%%script bash
g++ layer_norm1_corrected.cpp -o layer2
./layer2

layer_norm1_corrected.cpp:9:1: error: extended character   is not valid in an identifier
    9 |     for (int i=0; i < neurons; i++ )
      | ^
layer_norm1_corrected.cpp:9:3: error: extended character   is not valid in an identifier
    9 |     for (int i=0; i < neurons; i++ )
      |   ^
layer_norm1_corrected.cpp:10:1: error: extended character   is not valid in an identifier
   10 |         {
      | ^
layer_norm1_corrected.cpp:10:3: error: extended character   is not valid in an identifier
   10 |         {
      |   ^
layer_norm1_corrected.cpp:10:5: error: extended character   is not valid in an identifier
   10 |         {
      |     ^
layer_norm1_corrected.cpp:10:7: error: extended character   is not valid in an identifier
   10 |         {
      |       ^
layer_norm1_corrected.cpp:11:1: error: extended character   is not valid in an identifier
   11 |         for (int j=0; j < sample; j++)
      | ^
layer_norm1_corrected.cpp:11:3: error: extended character   is not valid in an 

CalledProcessError: Command 'b'g++ layer_norm1_corrected.cpp -o layer2\n./layer2\n'' returned non-zero exit status 127.