<a href="https://colab.research.google.com/github/gummadhav/Let_us_Learn/blob/main/Custom_Accelerated_ML_Functions_Plugged_To_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpua6s7ute".


In [3]:
%%shell
nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0




In [4]:
%%shell
nvidia-smi

Wed Aug  6 16:19:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                



In [5]:
!pip install ninja

Collecting ninja
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/422.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m419.8/422.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.4


In [6]:
from pathlib import Path

cuda_code = r"""
#include <cuda_runtime.h>
#include <torch/extension.h>

__global__ void relu_kernel(float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = fmaxf(input[idx], 0.0f);
    }
}

void launch_relu(torch::Tensor input, torch::Tensor output) {
    int size = input.numel();
    int threads = 256;
    int blocks = (size + threads - 1) / threads;

    relu_kernel<<<blocks, threads>>>(
        input.data_ptr<float>(),
        output.data_ptr<float>(),
        size
    );
}
"""

cpp_code = r"""
#include <torch/extension.h>

void launch_relu(torch::Tensor input, torch::Tensor output);

void relu_forward(torch::Tensor input, torch::Tensor output) {
    launch_relu(input, output);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("relu_forward", &relu_forward, "Custom ReLU (CUDA)");
}
"""

Path("relu_kernel.cu").write_text(cuda_code)
Path("relu_wrapper.cpp").write_text(cpp_code)

300

In [7]:
from pathlib import Path
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension

setup_py_code = """
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='custom_relu',
    ext_modules=[
        CUDAExtension('custom_relu', [
            'relu_wrapper.cpp',
            'relu_kernel.cu',
        ])
    ],
    cmdclass={
        'build_ext': BuildExtension
    }
)
"""

Path("setup.py").write_text(setup_py_code)

326

In [8]:
%%shell
python setup.py install

running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
running bdist_egg
running egg_info
creating custom_relu.egg-info
writing custom_relu.egg-info/PKG-IN



In [9]:
from torch.utils.cpp_extension import load

custom_relu = load(
    name="custom_relu",
    sources=["relu_wrapper.cpp", "relu_kernel.cu"],
    verbose=True,
    with_cuda=True
)

Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py311_cu124/custom_relu...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/custom_relu/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module custom_relu...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module custom_relu...


In [10]:
import torch

# Create a dummy PyTorch tensor with both positive and negative values on CUDA
input_tensor = torch.randn(10, 10, device='cuda')
# Ensure some values are negative
input_tensor[input_tensor > 0.5] *= -1
input_tensor[input_tensor < -0.5] *= -1 # Ensure some are positive too

print("Input tensor (first 5x5):\n", input_tensor[:5, :5])


# Create an output tensor on CUDA, initialized with zeros
output_tensor_custom = torch.zeros_like(input_tensor, device='cuda')

# Call the custom relu_forward function
custom_relu.relu_forward(input_tensor, output_tensor_custom)

# Calculate the output using PyTorch's built-in relu for comparison
output_tensor_torch = torch.relu(input_tensor)

# Compare the outputs
are_equal = torch.equal(output_tensor_custom, output_tensor_torch)
are_allclose = torch.allclose(output_tensor_custom, output_tensor_torch)

print(f"\nCustom ReLU output matches PyTorch ReLU output (exact equal): {are_equal}")
print(f"Custom ReLU output matches PyTorch ReLU output (all close): {are_allclose}")

print("\nCustom ReLU output tensor (first 5x5):\n", output_tensor_custom[:5, :5])
print("\nPyTorch ReLU output tensor (first 5x5):\n", output_tensor_torch[:5, :5])

Input tensor (first 5x5):
 tensor([[ 0.4726,  0.9190, -0.0555,  1.1306,  1.2934],
        [-0.1905,  0.2162,  0.7895, -0.4247,  1.2878],
        [ 0.0042,  1.6427,  1.3950, -0.2774, -0.0644],
        [ 1.0406,  0.6978,  0.7731,  0.2646,  0.2793],
        [-0.4431,  0.5495,  0.2993,  0.5874,  1.6998]], device='cuda:0')

Custom ReLU output matches PyTorch ReLU output (exact equal): True
Custom ReLU output matches PyTorch ReLU output (all close): True

Custom ReLU output tensor (first 5x5):
 tensor([[0.4726, 0.9190, 0.0000, 1.1306, 1.2934],
        [0.0000, 0.2162, 0.7895, 0.0000, 1.2878],
        [0.0042, 1.6427, 1.3950, 0.0000, 0.0000],
        [1.0406, 0.6978, 0.7731, 0.2646, 0.2793],
        [0.0000, 0.5495, 0.2993, 0.5874, 1.6998]], device='cuda:0')

PyTorch ReLU output tensor (first 5x5):
 tensor([[0.4726, 0.9190, 0.0000, 1.1306, 1.2934],
        [0.0000, 0.2162, 0.7895, 0.0000, 1.2878],
        [0.0042, 1.6427, 1.3950, 0.0000, 0.0000],
        [1.0406, 0.6978, 0.7731, 0.2646, 0.279