## ****用cuda给pytorch写算子****

#### ***1. cuda文件***

****头文件：****  

In [None]:
#include <ATen/ATen.h> //主要是用到了一个类型at::Tensor

#include <cuda.h> //编写cuda代码必备
#include <cuda_runtime.h>

#include <vector> //可能用到了

##### 首先是写要在cpp文件中调用的函数：
    写法就是C++的写法，主要就是调用核函数
    （1）设置线程块数和线程数，初始化返回值
    （2）调用核函数：
        AT_DISPATCH_FLOATING_TYPES(); 案例如下
        参数依次代表：输入数据类型,
                    操作标识符（自定义）,
                    ([&]{核函数;})
        AT_DISPATCH_FLOATING_TYPES(输入数据类型,
                                    操作标识符（自定义）,
                                    ([&]{核函数;}));
    （3）可以有返回值

In [None]:
// Launch the forward kernel through the floating-type dispatch macro.
// Arguments: the input's type, a label for this operation, and a lambda
// in which `scalar_t` stands for the element type the kernel is instantiated with.
AT_DISPATCH_FLOATING_TYPES(
    input.type(), "sigmoid_forward_cuda", ([&] {
      sigmoid_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
          input.data<scalar_t>(),
          output.data<scalar_t>());
    }));

##### ****核函数和设备函数****
    待学

#### ****2. cpp文件****
    

##### **头文件：**  

In [None]:
#include <torch/torch.h>
#include <vector>
// The .cu file does not appear to need including here; its functions are
// declared in this file and called directly.

##### ****声明在cuda中写好的函数****   

In [None]:
// Forward declarations of the functions written in the CUDA source file.
at::Tensor sigmoid_cuda_forward(at::Tensor input);
// sigmoid_backward() calls this with (grad_output, output) and returns its
// result as an at::Tensor, so it must be declared alongside the forward one.
at::Tensor sigmoid_cuda_backward(at::Tensor grad_output, at::Tensor output);

##### ****写一些常用的检测输入输出类型的函数****

In [None]:
// Input validation helpers. `#x` stringifies the macro argument so the
// failure message names the expression that was checked.
#define CHECK_CUDA(x) AT_ASSERTM((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM((x).is_contiguous(), #x " must be contiguous")
// Wrapped in do { ... } while (0) so both checks behave as ONE statement;
// without it, `if (cond) CHECK_INPUT(t);` would guard only the first check.
#define CHECK_INPUT(x)     \
  do {                     \
    CHECK_CUDA(x);         \
    CHECK_CONTIGUOUS(x);   \
  } while (0)

##### ****把检测类型的函数和声明的函数再次封装****

In [None]:
// Checked wrapper around the CUDA forward function: validates `input` with
// CHECK_INPUT, then returns whatever sigmoid_cuda_forward produces for it.
at::Tensor sigmoid_forward(at::Tensor input)
{
    CHECK_INPUT(input);
    at::Tensor result = sigmoid_cuda_forward(input);
    return result;
}

// Checked wrapper around the CUDA backward function: validates both tensors
// with CHECK_INPUT, then returns the result of sigmoid_cuda_backward.
at::Tensor sigmoid_backward(at::Tensor grad_output, at::Tensor output)
{
    CHECK_INPUT(grad_output);
    CHECK_INPUT(output);
    at::Tensor result = sigmoid_cuda_backward(grad_output, output);
    return result;
}

##### ****在Python和C++之间定义一个接口****

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  
  m.def("方法名", &再次封装好的函数名, "描述信息");  
}

In [None]:
// Python bindings for the two checked wrappers.
// NOTE(review): TORCH_EXTENSION_NAME is not defined in this file — presumably
// supplied by the extension build; confirm.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    // m.def(<name seen from Python>, <address of the C++ function>, <description>)
    m.def("forward",
          &sigmoid_forward,
          "sigmoid forward (CUDA)");
    m.def("backward",
          &sigmoid_backward,
          "sigmoid backward (CUDA)");
}

####  ****3. setup.py文件****

****导入包****

In [None]:
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension

****setup()****  

In [None]:
# Build configuration: declares one CUDA extension and one C++-only extension.
setup(
    name='sigmoid_cuda_linear_cpp',  # distribution name; the author notes it seems to be arbitrary
    ext_modules=[  # list of extension modules to build
        # CUDAExtension(<module name>, [<.cpp and .cu source paths>]);
        # several .cpp / .cu files may be listed.
        CUDAExtension(
            name='sigmoid_cuda',
            sources=['sigmoid_cuda.cpp', 'sigmoid_cuda_kernel.cu'],
        ),
        # CppExtension(<module name>, [<.cpp source paths>])
        CppExtension(name='linear_cpp', sources=['linear.cpp']),
    ],
    # Same cmdclass entry is used regardless of which extensions are listed.
    cmdclass=dict(build_ext=BuildExtension),
)

#### ****4. 在python文件中调用****

****继承Function类，重写forward和backward方法****  
    写每个方法都需要@staticmethod进行修饰
    Function类写好之后，需要通过.apply转化为可以调用的函数

In [2]:
from torch.nn import Module, Parameter
from torch.autograd import Function

import torch
import linear_cpp
import sigmoid_cuda

class DenseFunction(Function):
    """Autograd function that chains ``linear_cpp`` and ``sigmoid_cuda``.

    ``forward`` runs ``linear_cpp.forward`` followed by ``sigmoid_cuda.forward``;
    ``backward`` runs ``sigmoid_cuda.backward`` followed by ``linear_cpp.backward``.
    Use it through ``DenseFunction.apply(input, weight, bias)``.
    """

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Compute the layer output and stash what ``backward`` needs.

        :param ctx: autograd context used to save tensors for ``backward``
        :param input: tensor handed to ``linear_cpp.forward``
        :param weight: weight tensor handed to ``linear_cpp.forward``
        :param bias: optional bias handed to ``linear_cpp.forward``
        :return: result of ``sigmoid_cuda.forward`` applied to the linear output
        """
        linear_out = linear_cpp.forward(input, weight, bias)
        output = sigmoid_cuda.forward(linear_out)
        ctx.save_for_backward(input, weight, bias, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``(input, weight, bias)`` in that order.

        :param ctx: autograd context populated by ``forward``
        :param grad_output: gradient flowing in from the layer output
        :return: ``(grad_input, grad_weight, grad_bias)`` from ``linear_cpp.backward``
        """
        # `saved_tensors` is the supported accessor; `saved_variables` is its
        # deprecated alias.
        input, weight, bias, output = ctx.saved_tensors
        grad_sigmoid = sigmoid_cuda.backward(grad_output, output)
        # NOTE(review): grad_output is already an argument of
        # sigmoid_cuda.backward above. If that kernel already multiplies by
        # grad_output, the product below applies it twice — confirm against
        # the kernel implementation.
        grad_output = grad_sigmoid * grad_output
        grad_input, grad_weight, grad_bias = linear_cpp.backward(grad_output, input, weight, bias)
        return grad_input, grad_weight, grad_bias
        
class Dense(Module):
    """Module wrapper that owns the parameters used by ``DenseFunction``.

    :param input_features: size of the second dimension of ``weight``
    :param output_features: size of the first dimension of ``weight`` (and of ``bias``)
    :param bias: when true, a learnable bias of size ``output_features`` is
        created; otherwise ``bias`` is registered as ``None``
    """

    def __init__(self, input_features, output_features, bias=True):
        super(Dense, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        self.weight = Parameter(torch.Tensor(output_features, input_features))
        if bias:
            self.bias = Parameter(torch.Tensor(output_features))
        else:
            self.register_parameter('bias', None)
        self.weight.data.uniform_(-0.1, 0.1)
        # Test the registered parameter, not the `bias` flag: the flag is a
        # bool, so `bias is not None` is always true and bias=False would
        # dereference a None parameter.
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)

    def forward(self, input):
        """Apply ``DenseFunction`` to ``input`` with this module's parameters."""
        return DenseFunction.apply(input, self.weight, self.bias)

In [None]:
# NOTE(review): `pointnet2` is not imported in the visible notebook cells —
# presumably the compiled extension module; it must be in scope before use.
class GatherOperation(Function):
    """Autograd wrapper around the ``pointnet2`` gather-points wrappers."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        :param ctx:
        :param features: (B, C, N)
        :param idx: (B, npoint) index tensor of the features to gather
        :return:
            output: (B, C, npoint)
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()

        B, npoint = idx.size()
        _, C, N = features.size()
        # Output buffer passed to the wrapper as its last argument.
        output = torch.cuda.FloatTensor(B, C, npoint)

        pointnet2.gather_points_wrapper(B, C, N, npoint, features, idx, output)

        # Kept on ctx so backward can rebuild the (B, C, N) gradient shape.
        ctx.for_backwards = (idx, C, N)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        """
        :param ctx: context carrying ``(idx, C, N)`` stored by ``forward``
        :param grad_out: gradient with respect to the forward output
        :return:
            grad_features: (B, C, N) gradient for ``features``
            None: no gradient is returned for ``idx``
        """
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        # Plain zero-filled tensor: the `Variable` wrapper is never imported
        # in the visible cells and only returns the tensor it is given.
        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
        grad_out_data = grad_out.data.contiguous()
        pointnet2.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features)
        return grad_features, None


# Callable form of the Function.
gather_operation = GatherOperation.apply