# Stage 2: Express in natural code

这个阶段的主要目标是扩展当前的 DeZero ，使它能够执行更复杂的计算。具体来说，我们将修改 DeZero 的基础代码，使它能够处理接收多个输入的函数和返回多个输出的函数。我们还将扩展 DeZero使它可以用自然的代码来表达，例如能够使用+和*等运算符。

In [1]:
import numpy as np
import heapq
import weakref
import contextlib

## Step 11: Variable length parameter (forward)

Stage1中设计的函数输入输出都只有一个变量，下面需要拓展DeZero，使其可以处理可变长的输入和输出。

In [2]:
class Variable:
    """An ndarray bundled with its gradient and the function that produced it.

    Instances carry three fields: ``data`` (the wrapped ``np.ndarray`` or
    ``None``), ``grad`` (filled in by ``backward``) and ``creator`` (the
    function object registered through ``set_creator``; ``None`` for
    variables created by hand).
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None

    def set_creator(self, func):
        """Record ``func`` as the function that produced this variable."""
        self.creator = func

    def backward(self):
        """Propagate gradients backwards along the chain of creators.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set.  A variable without a creator has nothing to
        propagate to, so the method returns right after seeding.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # ended up on the work list and the loop failed with AttributeError.
        if self.creator is None:
            return

        # NOTE(review): the loop reads the singular attributes ``f.input`` and
        # ``f.output``; confirm the function objects used with this class
        # provide them (a function storing ``inputs``/``outputs`` will not).
        funcs = [self.creator]
        while funcs:
            f = funcs.pop()
            x, y = f.input, f.output
            x.grad = f.backward(y.grad)

            if x.creator is not None:
                funcs.append(x.creator)

In [3]:
def as_array(x):
    """Return ``x`` as an ``np.ndarray`` when it is a scalar.

    Anything ``np.isscalar`` does not recognise is handed back untouched.
    """
    return np.array(x) if np.isscalar(x) else x

class Function:
    """Base class for operations that map a list of Variables to a list.

    Subclasses implement ``forward`` (and ``backward``) on raw arrays;
    ``__call__`` takes care of unwrapping and re-wrapping Variables.
    """

    def __call__(self, inputs):
        """Apply the function to ``inputs`` (a list of Variables).

        Returns the list of output Variables, each of which has this
        function registered as its creator.
        """
        arrays = [var.data for var in inputs]  # unwrap
        results = [Variable(as_array(raw)) for raw in self.forward(arrays)]
        # Link every output back to this function.
        for var in results:
            var.set_creator(self)
        self.inputs, self.outputs = inputs, results
        return results

    def forward(self, xs):
        """Compute outputs from the list of input arrays ``xs``; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [8]:
class Add(Function):
    """Addition of two arrays, list-in / tuple-out variant."""

    def forward(self, xs):
        """Add the two arrays contained in ``xs`` and return a 1-tuple."""
        lhs, rhs = xs
        return (lhs + rhs,)  # a tuple, so the caller can iterate over outputs

In [5]:
# Demo: Add receives a list of Variables and returns a list of Variables.
xs = [Variable(np.array(2)), Variable(np.array(3))]
f = Add()
ys = f(xs)
print(type(ys), type(ys[0]))  # <class 'list'> <class '__main__.Variable'>
y = ys[0]
print(y.data)  # 5

<class 'list'> <class '__main__.Variable'>
5


## Step 12: Variable length parameter (improve)

改进上述代码，使得Add类更容易使用，也更容易实现。

In [6]:
# 1. Make Add easier to use: callers pass the arguments directly instead of
#    packing them into a list.
class Function:
    """Base class for operations; accepts its inputs as positional arguments."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns a single Variable when ``forward`` yields one output and the
        list of output Variables otherwise.
        """
        arrays = [var.data for var in inputs]  # pull the data out of each input
        results = [Variable(as_array(raw)) for raw in self.forward(arrays)]

        for var in results:
            var.set_creator(self)
        self.inputs, self.outputs = inputs, results

        # A lone output is returned bare rather than inside a list.
        if len(results) > 1:
            return results
        return results[0]

    def forward(self, xs):
        """Compute outputs from the list of input arrays ``xs``; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [9]:
# Demo: the inputs are now passed as separate arguments and a single
# Variable comes back.
x0 = Variable(np.array(2))
x1 = Variable(np.array(3))
f = Add()
y = f(x0, x1)
print(y.data)  # 5

5


In [10]:
# 2. Make Add easier to implement: forward receives the arrays as separate
#    arguments and may return a bare result.
class Function:
    """Base class for operations with unpacked inputs and flexible outputs."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        ``forward`` is called with the unwrapped arrays as positional
        arguments; a non-tuple return value is treated as a single output.
        Returns one Variable for a single output, otherwise the list.
        """
        arrays = [var.data for var in inputs]  # pull the data out of each input
        raw = self.forward(*arrays)  # unpacking used to be done inside Add
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        for var in results:
            var.set_creator(self)
        self.inputs, self.outputs = inputs, results

        # A lone output is returned bare rather than inside a list.
        if len(results) > 1:
            return results
        return results[0]

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [11]:
class Add(Function):
    """Addition of two arrays."""

    def forward(self, x0, x1):
        """Return ``x0 + x1``."""
        return x0 + x1
    
def add(x0, x1):
    """Add ``x0`` and ``x1`` through a freshly created ``Add`` function."""
    op = Add()
    return op(x0, x1)

In [12]:
# Demo: the add() helper hides the Add instance.
x0 = Variable(np.array(2))
x1 = Variable(np.array(3))
y = add(x0, x1)
print(y.data)  # 5

5


## Step 13: Variable length parameter (backward)

加法的反向传播就是将输出端（上游）传来的梯度原封不动地传递给每一个输入
<center>
<table>
  <tr>
    <td><img src="./res/反向传播_偏导.png" width="400"/></td>
  </tr>
</table>
</center>

1. 将output的grad收集起来
2. 使用backward方法计算梯度
3. 如果backward的返回值不是元组，则转换为元组
4. 将梯度传递给输入变量

In [13]:
class Add(Function):
    """Addition of two arrays with backpropagation support."""

    def forward(self, x0, x1):
        """Return ``x0 + x1``."""
        return x0 + x1

    def backward(self, gy):
        """Return the output gradient ``gy`` once for each of the two inputs."""
        return (gy, gy)

In [14]:
class Variable:
    """An ndarray bundled with its gradient and the function that produced it.

    ``backward`` in this version handles functions with several inputs and
    several outputs.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None

    def set_creator(self, func):
        """Record ``func`` as the function that produced this variable."""
        self.creator = func

    def backward(self):
        """Run backpropagation starting from this variable.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set.  Every function reachable through ``creator``
        links has its ``backward`` called with the gradients of its outputs
        and the results are stored on the matching inputs.

        Gradients are assigned, not accumulated: a variable that enters a
        computation more than once keeps only the gradient written last.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # ended up on the work list and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = [self.creator]
        while funcs:
            f = funcs.pop()
            # 1. Collect the gradients of all outputs (there may be several).
            gys = [output.grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input and continue from there.
            for x, gx in zip(f.inputs, gxs):
                x.grad = gx
                if x.creator is not None:
                    funcs.append(x.creator)


In [15]:
class Square(Function):
    """Element-wise square."""

    def forward(self, x):
        """Return ``x ** 2``."""
        return x ** 2

    def backward(self, gy):
        """Return ``2 * x * gy`` where ``x`` is the data of the first input."""
        # self.inputs is a sequence even for a single-input function.
        return 2 * self.inputs[0].data * gy
    
def square(x):
    """Square ``x`` through a freshly created ``Square`` function."""
    op = Square()
    return op(x)

In [16]:
# Demo: z = x^2 + y^2, so dz/dx = 2x and dz/dy = 2y.
x = Variable(np.array(2.0))
y = Variable(np.array(3.0))

z = add(square(x), square(y))
z.backward()
print(z.data)  # 13.0
print(x.grad)  # 4.0
print(y.grad)  # 6.0

13.0
4.0
6.0


## Step 14: Use the same variable repeatedly

当前DeZero如果重复使用同一个变量，则不能正确求导

问题在于变量的backward代码，是直接用输出端传播的导数进行赋值的，如果是重复使用同一个变量，则传播的导数会被替换

<center>
<table>
  <tr>
    <td><img src="./res/重复变量.png" width="400"/></td>
  </tr>
</table>
</center>

In [17]:
# Demo of the problem: the same variable is used twice.
x = Variable(np.array(3.0))
# y = 2x, so the correct derivative is 2
y = add(x,x)
print('y', y.data)

y.backward()
print('x.grad', x.grad)  # prints 1.0 here: the second assignment replaces the first

y 6.0
x.grad 1.0


In [18]:
class Variable:
    """An ndarray bundled with its gradient and the function that produced it.

    ``backward`` in this version accumulates gradients, so a variable that is
    used several times in one computation receives the sum of all of them.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None

    def set_creator(self, func):
        """Record ``func`` as the function that produced this variable."""
        self.creator = func

    def backward(self):
        """Run backpropagation starting from this variable.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # ended up on the work list and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = [self.creator]
        while funcs:
            f = funcs.pop()
            # 1. Collect the gradients of all outputs.
            gys = [output.grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient (Add's
                    # backward returns the same ``gy`` twice).
                    x.grad = x.grad + gx
                if x.creator is not None:
                    funcs.append(x.creator)


In [19]:
# Demo: y = x + (x + x) = 3x, so the derivative is 3.
x = Variable(np.array(3.0))
y = add(x, add(x,x))
print('y', y.data)

y.backward()
print('x.grad', x.grad)

y 9.0
x.grad 3.0


但此时又出现另一个问题：当同一个变量被用于不同的计算时，新的梯度会累加到上一次计算留下的梯度上，因此需要添加clear_grad方法，在开始新的计算之前清空梯度

In [20]:
class Variable:
    """An ndarray bundled with its gradient and the function that produced it.

    Gradients accumulate across ``backward`` calls; ``clear_grad`` resets
    them before the variable is reused in a different computation.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None

    def set_creator(self, func):
        """Record ``func`` as the function that produced this variable."""
        self.creator = func

    def clear_grad(self):
        """Forget the stored gradient."""
        self.grad = None

    def backward(self):
        """Run backpropagation starting from this variable.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # ended up on the work list and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = [self.creator]
        while funcs:
            f = funcs.pop()
            # 1. Collect the gradients of all outputs.
            gys = [output.grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx
                if x.creator is not None:
                    funcs.append(x.creator)


In [21]:
# Demo: reuse x in a second computation after clearing its gradient.
x = Variable(np.array(3.0))
y = add(x, x)
y.backward()
print(x.grad)  # 2.0

x.clear_grad()  # a different computation follows, so drop the old gradient
y = add(add(x, x), x)
y.backward()
print(x.grad)  # 3.0

2.0
3.0


## Step 15: Complex calculation graph (theory)

现在的DeZero还不能处理复杂的计算图：
<center>
<table>
  <tr>
    <td><img src="./res/complex1.png" width="400"/></td>
  </tr>
  <tr>
    <td><img src="./res/complex2.png" width="400"/></td>
  </tr>
</table>
</center>

比如对于上图的变量a，就需要通过B和C传播得到a的梯度后，再从a向x传播梯度。  
但目前的实现还有问题：每次都将待处理的函数append到funcs的末尾，而取出函数时又是从末尾pop。后加入的函数总是先被处理，因此无法保证按照上述顺序计算。

需要从funcs列表中取出合适的函数：
1. 解析计算图，拓扑排序
2. 正向传播时记录辈分关系，优先取出“后代”

## Step 16: Complex calculation graph (implementation)

In [2]:
class Variable:
    """An ndarray bundled with its gradient, creator and generation.

    ``generation`` is 0 for hand-made variables and the creator's generation
    plus one otherwise; ``backward`` uses it to process functions in order.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None
        self.generation = 0

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def clear_grad(self):
        """Forget the stored gradient."""
        self.grad = None

    def backward(self):
        """Run backpropagation, always handling the latest generation first.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was queued and the sort key failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Queue ``f`` once and keep the queue sorted by generation."""
            if f not in seen_set:
                funcs.append(f)
                seen_set.add(f)
                funcs.sort(key=lambda func: func.generation)

        add_func(self.creator)

        while funcs:
            # The list is sorted ascending, so the last entry has the
            # largest generation.
            f = funcs.pop()
            # 1. Collect the gradients of all outputs.
            gys = [output.grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx
                if x.creator is not None:
                    add_func(x.creator)

In [8]:
def as_array(x):
    """Convert a scalar to an ``np.ndarray``; return anything else as is."""
    if not np.isscalar(x):
        return x
    return np.array(x)

class Function:
    """Base class for operations; records its generation when called."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns one Variable for a single output, otherwise the list of
        output Variables.
        """
        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        # The function sits in the latest generation among its inputs; this
        # is set before set_creator, which reads it.
        self.generation = max(var.generation for var in inputs)
        for var in results:
            var.set_creator(self)
        self.inputs, self.outputs = inputs, results
        if len(results) > 1:
            return results
        return results[0]

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [9]:
class Square(Function):
    """Element-wise square."""

    def forward(self, x):
        """Return ``x ** 2``."""
        return x ** 2

    def backward(self, gy):
        """Return ``2 * x * gy`` where ``x`` is the data of the first input."""
        source = self.inputs[0]
        return 2 * source.data * gy

def square(x):
    """Square ``x`` through a freshly created ``Square`` function."""
    func = Square()
    return func(x)

class Add(Function):
    """Addition of two arrays with backpropagation support."""

    def forward(self, x0, x1):
        """Return ``x0 + x1``."""
        return x0 + x1

    def backward(self, gy):
        """Return the output gradient ``gy`` once for each of the two inputs."""
        return (gy, gy)

def add(x0, x1):
    """Add ``x0`` and ``x1`` through a freshly created ``Add`` function."""
    func = Add()
    return func(x0, x1)

In [20]:
# Demo: a = x^2 feeds two branches, y = a^2 + a^2 = 2x^4, dy/dx = 8x^3.
x = Variable(np.array(2.0))
a = square(x)
y = add(square(a), square(a))
y.backward()

print(y.data)  # 32.0
print(x.grad)  # 64.0

32.0
64.0


有必要这么麻烦吗？使用层次遍历不行吗？还真不行，比如Step15的上半图，是一个拓扑排序问题。

In [17]:
# 使用优先级队列来优化时间复杂度
def as_array(x):
    """Return ``np.array(x)`` for scalars and ``x`` itself for everything else."""
    is_scalar = np.isscalar(x)
    return np.array(x) if is_scalar else x

class Function:
    """Base class for operations; orderable so it can live in a ``heapq``."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns one Variable for a single output, otherwise the list of
        output Variables.
        """
        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        # The function sits in the latest generation among its inputs; this
        # is set before set_creator, which reads it.
        self.generation = max(var.generation for var in inputs)
        for var in results:
            var.set_creator(self)
        self.inputs, self.outputs = inputs, results
        if len(results) > 1:
            return results
        return results[0]

    def __lt__(self, other):
        """Order functions so that a larger generation compares as smaller.

        ``heapq`` pops the smallest item; with the comparison reversed that
        is the function with the largest generation.
        """
        return other.generation < self.generation

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

class Variable:
    """An ndarray bundled with its gradient, creator and generation.

    ``backward`` keeps the pending functions in a ``heapq`` priority queue.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None
        self.generation = 0

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def clear_grad(self):
        """Forget the stored gradient."""
        self.grad = None

    def backward(self):
        """Run backpropagation in the order given by the functions' ``<``.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was pushed on the heap and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Push ``f`` on the heap unless it has been queued before."""
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            # heappop returns the smallest item according to the function's
            # own ``__lt__``.
            f = heapq.heappop(funcs)

            # 1. Collect the gradients of all outputs.
            gys = [output.grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx
                if x.creator is not None:
                    add_func(x.creator)

## Step 17: Memory management and circular references

本节关注DeZero的速度和内存使用，引入一些提高性能的技术。

python会自动从内存中删除不再需要的对象
- 引用计数：自动回收没有被引用的对象。但是循环引用时，引用计数无法回收。
- 分代垃圾回收(GC)：python会定期检查对象的引用关系，回收不再使用的对象。

如果依赖GC来回收循环引用的对象，内存要等到GC运行时才会被释放，程序整体的内存使用量会因此增加。内存在机器学习中是很重要的资源，因此要尽量避免循环引用。

目前的DeZero存在循环引用，比如Variable和Function相互引用，导致无法回收内存。
<center>
<table>
  <tr>
    <td><img src="./res/circular.png" width="400"/></td>
  </tr>
</table>
</center>

可以使用python标准模块的weakref，它在不增加引用计数的情况下引用对象，避免循环引用。


In [22]:
# Demo: a weak reference does not keep its target alive.
a = np.array([1, 2, 3])
b = weakref.ref(a)

print(b)  # <weakref at 0x7f8b1c1b3b80; to 'numpy.ndarray' at 0x7f8b1c1b3b30>
print(b())  # [1 2 3]

# Drop the only strong reference to the array.
a = None
print(b)  # <weakref at 0x7f8b1c1b3b80; dead>
print(b())  # None

<weakref at 0x0000024EC6CCDB88; to 'numpy.ndarray' at 0x0000024EC6CF1990>
[1 2 3]
<weakref at 0x0000024EC6CCDB88; dead>
None


In [5]:
def as_array(x):
    """Wrap a scalar in a 0-d ``np.ndarray``; pass other objects through."""
    return np.array(x) if np.isscalar(x) else x

class Function:
    """Base class for operations; holds its outputs only through weak references."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns one Variable for a single output, otherwise the list of
        output Variables.
        """
        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        # The function sits in the latest generation among its inputs; this
        # is set before set_creator, which reads it.
        self.generation = max(var.generation for var in inputs)
        for var in results:
            var.set_creator(self)
        self.inputs = inputs
        # Outputs already point at this function through ``creator``; weak
        # references here avoid a reference cycle.
        self.outputs = [weakref.ref(var) for var in results]
        if len(results) > 1:
            return results
        return results[0]

    def __lt__(self, other):
        """Order functions so that a larger generation compares as smaller.

        ``heapq`` pops the smallest item; with the comparison reversed that
        is the function with the largest generation.
        """
        return other.generation < self.generation

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [3]:
class Variable:
    """An ndarray bundled with its gradient, creator and generation.

    ``backward`` expects each function's ``outputs`` to be weak references.
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None
        self.generation = 0

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def clear_grad(self):
        """Forget the stored gradient."""
        self.grad = None

    def backward(self):
        """Run backpropagation in the order given by the functions' ``<``.

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was pushed on the heap and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Push ``f`` on the heap unless it has been queued before."""
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            f = heapq.heappop(funcs)
            # 1. Collect the gradients of all outputs. ``f.outputs`` holds
            #    weak references, so each one is called to get the Variable.
            # NOTE(review): a dead reference would yield None here; this
            # assumes every output is still alive - confirm.
            gys = [output().grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx
                if x.creator is not None:
                    add_func(x.creator)

In [26]:
# Demo: build the same graph ten times, rebinding x and y on every pass.
for i in range(10):
    # Once the names are rebound, the previous objects can be released.
    x = Variable(np.random.randn(10000))  # a large array
    y = square(square(square(x)))  # more large arrays

## Step 18: Mode for reducing memory usage

本节主要做两部分改进：
1. 减少反向传播消耗的内存使用量 -- 立即消除无用导数
2. 提供"不需要反向传播时的模式" -- 省去不必要的计算

在机器学习中，往往只有终端变量的导数才需要通过反向传播求得，中间变量的导数基本用不到。

通过retain_grad参数控制：如果为False（默认值），则在反向传播过程中不保留中间变量的梯度，从而减少内存消耗。

In [2]:
class Variable:
    """An ndarray bundled with its gradient, creator and generation.

    ``backward`` can drop the gradients of function outputs as soon as they
    have been used (``retain_grad=False``).
    """

    def __init__(self, data):
        """Wrap ``data``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.grad = None
        self.creator = None
        self.generation = 0

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def clear_grad(self):
        """Forget the stored gradient."""
        self.grad = None

    def backward(self, retain_grad=False):
        """Run backpropagation starting from this variable.

        Args:
            retain_grad: when false, the ``grad`` of every output of a
                processed function is reset to ``None`` (this includes the
                variable ``backward`` was called on, if it has a creator).

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was pushed on the heap and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Push ``f`` on the heap unless it has been queued before."""
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            f = heapq.heappop(funcs)
            # 1. Collect the gradients of all outputs. ``f.outputs`` holds
            #    weak references, so each one is called to get the Variable.
            # NOTE(review): a dead reference would yield None here; this
            # assumes every output is still alive - confirm.
            gys = [output().grad for output in f.outputs]
            # 2. Let the function turn them into input gradients.
            gxs = f.backward(*gys)
            # 3. A single gradient is normalised to a 1-tuple.
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            # 4. Hand each gradient to its input.
            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx
                if x.creator is not None:
                    add_func(x.creator)

            if not retain_grad:
                # Release the gradients of this function's outputs.
                for y in f.outputs:
                    y().grad = None

In [6]:
# Demo: with the default retain_grad=False only the input variables keep
# their gradients.
x0 = Variable(np.array(1.0))
x1 = Variable(np.array(1.0))
t = add(x0, x1)
y = add(x0, t)
y.backward()

print(y.grad, t.grad)  # None None
print(x0.grad, x1.grad)  # 2.0 1.0

None None
2.0 1.0


反向传播阶段需要正向传播的计算结果，所以我们需要保存这些结果。

不过有些时候并不需要求导，此时我们没有必要保留正向传播的计算结果。（比如推理阶段）

In [4]:
class Config:
    """Class-level settings shared by the whole framework."""
    # Function.__call__ records creators, inputs and outputs only while this
    # flag is true.
    enable_backprop = True

In [6]:
class Function:
    """Base class for operations; builds the graph only when backprop is enabled."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns one Variable for a single output, otherwise the list of
        output Variables.
        """
        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        if Config.enable_backprop:
            # Generation and graph links are only needed for backward(), so
            # all of this is skipped when backprop is switched off.
            self.generation = max(var.generation for var in inputs)
            for var in results:
                var.set_creator(self)
            self.inputs = inputs
            # Weak references avoid a cycle with ``creator`` on the outputs.
            self.outputs = [weakref.ref(var) for var in results]
        if len(results) > 1:
            return results
        return results[0]

    def __lt__(self, other):
        """Order functions so that a larger generation compares as smaller.

        ``heapq`` pops the smallest item; with the comparison reversed that
        is the function with the largest generation.
        """
        return other.generation < self.generation

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [11]:
# With backprop enabled every function keeps references to its inputs so
# that backward() can run.
Config.enable_backprop = True
x = Variable(np.ones((100, 100, 100)))
y = square(square(square(x)))
y.backward()

# With backprop disabled no graph links are recorded.
Config.enable_backprop = False
x = Variable(np.ones((100, 100, 100)))
y = square(square(square(x)))

# Put the switch back: it is global, and leaving it off would silently
# disable graph building for everything that runs afterwards.
Config.enable_backprop = True

使用上下文管理器让模式切换更加方便。

In [7]:
# Context manager that controls a Config attribute such as enable_backprop.
@contextlib.contextmanager
def using_config(name, value):
    """Set ``Config.<name>`` to ``value`` for the duration of a ``with`` block.

    The previous value is put back when the block is left, also when it is
    left through an exception.
    """
    previous = getattr(Config, name)
    setattr(Config, name, value)
    try:
        yield
    finally:
        setattr(Config, name, previous)

In [14]:
# Demo: backprop is off only inside the with block.
with using_config('enable_backprop', False):
    x = Variable(np.array(2.0))
    y = square(x)

In [8]:
# Shorthand for the most common use of using_config.
def no_grad():
    """Return a context manager that switches ``Config.enable_backprop`` off."""
    manager = using_config('enable_backprop', False)
    return manager

In [16]:
# Demo: same effect as using_config('enable_backprop', False).
with no_grad():
    x = Variable(np.array(2.0))
    y = square(x)

## Step 19: Make variables easier to use

1. 为变量设置名字，方便调试
2. 让变量看上去更像numpy的ndarray
3. 支持len函数和print函数

In [9]:
class Variable:
    """A named ndarray wrapper that tracks its gradient and creator.

    The class forwards ``shape``, ``ndim``, ``size``, ``dtype`` and ``len()``
    to the wrapped array and has a readable ``repr``.
    """

    def __init__(self, data, name=None):
        """Wrap ``data`` and optionally give the variable a ``name``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.name = name
        self.grad = None
        self.creator = None
        self.generation = 0

    @property
    def shape(self):
        """Shape of the wrapped array."""
        return self.data.shape

    @property
    def ndim(self):
        """Number of dimensions of the wrapped array."""
        return self.data.ndim

    @property
    def size(self):
        """Number of elements of the wrapped array."""
        return self.data.size

    @property
    def dtype(self):
        """Data type of the wrapped array."""
        return self.data.dtype

    def __len__(self):
        """Return ``len()`` of the wrapped array."""
        return len(self.data)

    def __repr__(self):
        """Return ``variable(...)`` with continuation lines aligned."""
        if self.data is None:
            return 'variable(None)'
        # Indent by nine spaces, the width of the 'variable(' prefix.
        p = str(self.data).replace('\n', '\n' + ' ' * 9)
        return 'variable(' + p + ')'

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def cleargrad(self):
        """Forget the stored gradient."""
        self.grad = None

    # Earlier versions of this class spelled the method ``clear_grad``;
    # keep that name working.
    clear_grad = cleargrad

    def backward(self, retain_grad=False):
        """Run backpropagation starting from this variable.

        Args:
            retain_grad: when false, the ``grad`` of every output of a
                processed function is reset to ``None`` (this includes the
                variable ``backward`` was called on, if it has a creator).

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was pushed on the heap and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Push ``f`` on the heap unless it has been queued before."""
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            f = heapq.heappop(funcs)
            # ``f.outputs`` holds weak references; call each to get the Variable.
            # NOTE(review): a dead reference would yield None here; this
            # assumes every output is still alive - confirm.
            gys = [output().grad for output in f.outputs]
            gxs = f.backward(*gys)
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx

                if x.creator is not None:
                    add_func(x.creator)

            if not retain_grad:
                # Release the gradients of this function's outputs.
                for y in f.outputs:
                    y().grad = None

In [10]:
# Demo: name, shape and the printable representation of a Variable.
x = Variable(np.array([[1, 2, 3], [4, 5, 6]]))
x.name = 'x'

print(x.name)
print(x.shape)
print(x)

x
(2, 3)
variable([[1 2 3]
          [4 5 6]])


## gather all things above

In [29]:
class Config:
    """Class-level settings shared by the whole framework."""
    # Function.__call__ records creators, inputs and outputs only while this
    # flag is true.
    enable_backprop = True

# Context manager that controls a Config attribute such as enable_backprop.
@contextlib.contextmanager
def using_config(name, value):
    """Temporarily override ``Config.<name>`` with ``value``.

    The attribute gets its earlier value back when the ``with`` block ends,
    whether normally or through an exception.
    """
    saved = getattr(Config, name)
    setattr(Config, name, value)
    try:
        yield
    finally:
        setattr(Config, name, saved)

# Shorthand for the most common use of using_config.
def no_grad():
    """Return a context manager that switches ``Config.enable_backprop`` off."""
    ctx = using_config('enable_backprop', False)
    return ctx

In [30]:
class Variable:
    """A named ndarray wrapper that tracks its gradient and creator.

    The class forwards ``shape``, ``ndim``, ``size``, ``dtype`` and ``len()``
    to the wrapped array and has a readable ``repr``.
    """

    def __init__(self, data, name=None):
        """Wrap ``data`` and optionally give the variable a ``name``.

        Raises:
            TypeError: if ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.name = name
        self.grad = None
        self.creator = None
        self.generation = 0

    @property
    def shape(self):
        """Shape of the wrapped array."""
        return self.data.shape

    @property
    def ndim(self):
        """Number of dimensions of the wrapped array."""
        return self.data.ndim

    @property
    def size(self):
        """Number of elements of the wrapped array."""
        return self.data.size

    @property
    def dtype(self):
        """Data type of the wrapped array."""
        return self.data.dtype

    def __len__(self):
        """Return ``len()`` of the wrapped array."""
        return len(self.data)

    def __repr__(self):
        """Return ``variable(...)`` with continuation lines aligned."""
        if self.data is None:
            return 'variable(None)'
        # Indent by nine spaces, the width of the 'variable(' prefix.
        p = str(self.data).replace('\n', '\n' + ' ' * 9)
        return 'variable(' + p + ')'

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def cleargrad(self):
        """Forget the stored gradient."""
        self.grad = None

    # Earlier versions of this class spelled the method ``clear_grad``;
    # keep that name working.
    clear_grad = cleargrad

    def backward(self, retain_grad=False):
        """Run backpropagation starting from this variable.

        Args:
            retain_grad: when false, the ``grad`` of every output of a
                processed function is reset to ``None`` (this includes the
                variable ``backward`` was called on, if it has a creator).

        ``self.grad`` is seeded with ones shaped like ``self.data`` when it
        has not been set; a variable without a creator stops there.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A hand-made variable has no creator. Without this guard ``None``
        # was pushed on the heap and the loop failed with AttributeError.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()  # functions already queued, so none is queued twice

        def add_func(f):
            """Push ``f`` on the heap unless it has been queued before."""
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            f = heapq.heappop(funcs)
            # ``f.outputs`` holds weak references; call each to get the Variable.
            # NOTE(review): a dead reference would yield None here; this
            # assumes every output is still alive - confirm.
            gys = [output().grad for output in f.outputs]
            gxs = f.backward(*gys)
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new array instead of using ``+=``: the stored
                    # object may also be another variable's gradient.
                    x.grad = x.grad + gx

                if x.creator is not None:
                    add_func(x.creator)

            if not retain_grad:
                # Release the gradients of this function's outputs.
                for y in f.outputs:
                    y().grad = None

In [31]:
def as_array(x):
    """Convert a scalar to an ``np.ndarray``; return anything else as is."""
    if not np.isscalar(x):
        return x
    return np.array(x)

class Function:
    """Base class for operations; builds the graph only when backprop is enabled."""

    def __call__(self, *inputs):
        """Apply the function to the given Variables.

        Returns one Variable for a single output, otherwise the list of
        output Variables.
        """
        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        if Config.enable_backprop:
            # Generation and graph links are only needed for backward(), so
            # all of this is skipped when backprop is switched off.
            self.generation = max(var.generation for var in inputs)
            for var in results:
                var.set_creator(self)
            self.inputs = inputs
            # Weak references avoid a cycle with ``creator`` on the outputs.
            self.outputs = [weakref.ref(var) for var in results]
        if len(results) > 1:
            return results
        return results[0]

    def __lt__(self, other):
        """Order functions so that a larger generation compares as smaller.

        ``heapq`` pops the smallest item; with the comparison reversed that
        is the function with the largest generation.
        """
        return other.generation < self.generation

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [34]:
class Square(Function):
    """Element-wise square."""

    def forward(self, x):
        """Return ``x ** 2``."""
        return x ** 2

    def backward(self, gy):
        """Return ``2 * x * gy`` where ``x`` is the data of the first input."""
        return 2 * self.inputs[0].data * gy

def square(x):
    """Square ``x`` through a freshly created ``Square`` function."""
    op = Square()
    return op(x)

class Add(Function):
    """Addition of two arrays with backpropagation support."""

    def forward(self, x0, x1):
        """Return ``x0 + x1``."""
        return x0 + x1

    def backward(self, gy):
        """Return the output gradient ``gy`` once for each of the two inputs."""
        return (gy, gy)

def add(x0, x1):
    """Add ``x0`` and ``x1`` through a freshly created ``Add`` function."""
    op = Add()
    return op(x0, x1)

## Step 20: Operator overloading Ⅰ

重载+和*运算符，使得DeZero可以使用自然的代码来表达。

In [35]:
class Mul(Function):
    """Element-wise multiplication of two arrays."""

    def forward(self, x0, x1):
        """Return ``x0 * x1``."""
        return x0 * x1

    def backward(self, gy):
        """Return ``(gy * x1, gy * x0)``: each input's gradient uses the other input."""
        lhs, rhs = self.inputs[0].data, self.inputs[1].data
        grad_lhs = gy * rhs
        grad_rhs = gy * lhs
        return grad_lhs, grad_rhs

def mul(x0, x1):
    """Multiply ``x0`` and ``x1`` through a freshly created ``Mul`` function."""
    op = Mul()
    return op(x0, x1)

In [16]:
# Demo: y = a * b + c, so dy/da = b and dy/db = a.
a = Variable(np.array(3.0))
b = Variable(np.array(2.0))
c = Variable(np.array(1.0))

y = add(mul(a, b), c)

y.backward()

print(y)  # variable(7.0)
print(a.grad)  # 2.0
print(b.grad)  # 3.0

variable(7.0)
2.0
3.0


In [36]:
# Route the + and * operators of Variable to add() and mul().
Variable.__add__ = add
Variable.__mul__ = mul

In [18]:
# Demo: the same computation written with operators.
a = Variable(np.array(3.0))
b = Variable(np.array(2.0))
c = Variable(np.array(1.0))

y = a * b + c
y.backward()

print(y)  # variable(7.0)
print(a.grad)  # 2.0
print(b.grad)  # 3.0

variable(7.0)
2.0
3.0


## Step 21: Operator overloading Ⅱ

目前还不支持variable和ndarray的运算，添加代码，使可以与更多类型进行运算。

In [32]:
def as_variable(obj):
    """Return ``obj`` itself if it is a Variable, otherwise ``Variable(obj)``."""
    if not isinstance(obj, Variable):
        obj = Variable(obj)
    return obj

In [33]:
class Function:
    """Base class for operations; accepts Variables and plain arrays as inputs."""

    def __call__(self, *inputs):
        """Apply the function to the given inputs.

        Every input is passed through ``as_variable`` first.  Returns one
        Variable for a single output, otherwise the list of output
        Variables.
        """
        # Inputs that are not Variables yet are wrapped here.
        inputs = [as_variable(item) for item in inputs]

        arrays = [var.data for var in inputs]
        raw = self.forward(*arrays)
        raw_outputs = raw if isinstance(raw, tuple) else (raw,)
        results = [Variable(as_array(item)) for item in raw_outputs]

        if Config.enable_backprop:
            # Generation and graph links are only needed for backward(), so
            # all of this is skipped when backprop is switched off.
            self.generation = max(var.generation for var in inputs)
            for var in results:
                var.set_creator(self)
            self.inputs = inputs
            # Weak references avoid a cycle with ``creator`` on the outputs.
            self.outputs = [weakref.ref(var) for var in results]
        if len(results) > 1:
            return results
        return results[0]

    def __lt__(self, other):
        """Order functions so that a larger generation compares as smaller.

        ``heapq`` pops the smallest item; with the comparison reversed that
        is the function with the largest generation.
        """
        return other.generation < self.generation

    def forward(self, xs):
        """Compute outputs from the input arrays; overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, gys):
        """Compute input gradients from output gradients; overridden by subclasses."""
        raise NotImplementedError()

In [37]:
# Demo: a Variable can now be combined with a plain ndarray.
x = Variable(np.array(2.0))
y = x + np.array(3.0)
print(y)  # variable(5.0)

variable(5.0)


In [38]:
class Mul(Function):
    """Element-wise multiplication of two arrays."""

    def forward(self, x0, x1):
        """Return ``x0 * x1``."""
        return x0 * x1

    def backward(self, gy):
        """Return ``(gy * x1, gy * x0)``: each input's gradient uses the other input."""
        lhs, rhs = self.inputs[0].data, self.inputs[1].data
        grad_lhs = gy * rhs
        grad_rhs = gy * lhs
        return grad_lhs, grad_rhs

def mul(x0, x1):
    """Multiply two operands through a freshly created ``Mul`` function.

    Each operand may be a Variable, an ndarray or a scalar; scalars are
    turned into ndarrays with ``as_array`` before the function is applied.
    """
    # Convert both sides. Only x1 used to be converted, so a direct call such
    # as mul(3.0, x) passed a bare float on and it was rejected by Variable's
    # ndarray type check. as_array returns non-scalars unchanged.
    x0 = as_array(x0)
    x1 = as_array(x1)
    return Mul()(x0, x1)

class Add(Function):
    """Addition of two arrays with backpropagation support."""

    def forward(self, x0, x1):
        """Return ``x0 + x1``."""
        return x0 + x1

    def backward(self, gy):
        """Return the output gradient ``gy`` once for each of the two inputs."""
        return (gy, gy)

def add(x0, x1):
    """Add two operands through a freshly created ``Add`` function.

    Each operand may be a Variable, an ndarray or a scalar; scalars are
    turned into ndarrays with ``as_array`` before the function is applied.
    """
    # Convert both sides. Only x1 used to be converted, so a direct call such
    # as add(3.0, x) passed a bare float on and it was rejected by Variable's
    # ndarray type check. as_array returns non-scalars unchanged.
    x0 = as_array(x0)
    x1 = as_array(x1)
    return Add()(x0, x1)

In [39]:
# Overload the operators for both the left-hand and the right-hand position.
# Addition and multiplication are commutative, so the reflected versions can
# use the same functions.
Variable.__add__ = add
Variable.__radd__ = add
Variable.__mul__ = mul
Variable.__rmul__ = mul

In [40]:
# Demo: Python floats on either side of a Variable.
x = Variable(np.array(2.0))
y = 3.0 * x + 1.0
print(y)  # variable(7.0)

variable(7.0)


但是目前如果左项是ndarray，右项是variable，会报错，因为python会先调用ndarray的__add__方法，而ndarray没有实现与variable的运算。  
将variable的运算符优先级提高（设置__array_priority__），使ndarray的__add__方法让出处理权，从而调用variable的__radd__方法。

In [41]:
class Variable:
    """A node of the computational graph that wraps an ``np.ndarray``.

    Besides the data itself a variable stores its gradient, the function that
    created it and its generation (depth in the graph), which ``backward``
    uses to process functions in the right order.
    """

    # Higher than ndarray's own priority so that NumPy defers binary operators
    # with an ndarray on the left to Variable's reflected operator methods.
    __array_priority__ = 200

    def __init__(self, data, name=None):
        """Wrap ``data`` in a new variable.

        Args:
            data: An ``np.ndarray`` or ``None``.
            name: Optional label; only stored, not used by this class.

        Raises:
            TypeError: If ``data`` is neither ``None`` nor an ``np.ndarray``.
        """
        if data is not None and not isinstance(data, np.ndarray):
            raise TypeError('{} is not supported'.format(type(data)))

        self.data = data
        self.name = name
        self.grad = None       # gradient, filled in by backward()
        self.creator = None    # function that produced this variable
        self.generation = 0    # 0 for variables not created by a function

    @property
    def shape(self):
        """Shape of the wrapped array."""
        return self.data.shape

    @property
    def ndim(self):
        """Number of dimensions of the wrapped array."""
        return self.data.ndim

    @property
    def size(self):
        """Number of elements in the wrapped array."""
        return self.data.size

    @property
    def dtype(self):
        """Data type of the wrapped array."""
        return self.data.dtype

    def __len__(self):
        """Return ``len()`` of the wrapped array."""
        return len(self.data)

    def __repr__(self):
        """Return ``variable(<data>)`` with continuation lines aligned."""
        if self.data is None:
            return 'variable(None)'
        # Indent continuation lines by len('variable(') == 9 so that the rows
        # of a multi-line array line up under the first one.
        p = str(self.data).replace('\n', '\n' + ' ' * 9)
        return 'variable(' + p + ')'

    def set_creator(self, func):
        """Record ``func`` as creator and place this variable one generation after it."""
        self.creator = func
        self.generation = func.generation + 1

    def cleargrad(self):
        """Reset the stored gradient to ``None``."""
        self.grad = None

    def backward(self, retain_grad=False):
        """Propagate gradients from this variable back through the graph.

        If this variable has no gradient yet it is seeded with ones of the
        same shape as ``data``. Functions are then processed via a heap, whose
        ordering comes from the functions' own ``__lt__``; gradients reaching
        the same input from several paths are summed.

        Args:
            retain_grad: When false (the default), the ``grad`` of every
                processed function's outputs is reset to ``None`` after it has
                been used, which includes this variable itself.
        """
        if self.grad is None:
            self.grad = np.ones_like(self.data)

        # A variable that was not produced by a function has nothing to
        # propagate to; without this guard ``None`` would be pushed onto the
        # heap and the loop below would fail on ``None.outputs``.
        if self.creator is None:
            return

        funcs = []
        seen_set = set()

        def add_func(f):
            # Queue every function only once, even if it is reachable through
            # several paths.
            if f not in seen_set:
                heapq.heappush(funcs, f)
                seen_set.add(f)

        add_func(self.creator)

        while funcs:
            f = heapq.heappop(funcs)
            gys = [output().grad for output in f.outputs]  # outputs are weak references
            gxs = f.backward(*gys)
            if not isinstance(gxs, tuple):
                gxs = (gxs,)

            for x, gx in zip(f.inputs, gxs):
                if x.grad is None:
                    x.grad = gx
                else:
                    # Build a new object instead of using += so that an array
                    # shared with another variable's grad is not modified.
                    x.grad = x.grad + gx

                if x.creator is not None:
                    add_func(x.creator)

            if not retain_grad:
                for y in f.outputs:
                    y().grad = None  # y is a weak reference

## Step 22: Operator overloading Ⅲ

重载更多的运算符，使得DeZero可以使用自然的代码来表达：
1. 取负（负号运算，如 `-x`）
2. 减法
3. 除法
4. 幂运算
5. +=和-=运算

重载的步骤：
1. 继承Function类并实现所需的函数类（Mul类）
2. 使其能作为Python函数使用（mul函数）
3. 为Variable类添加运算符重载方法（Variable.\_\_mul\_\_=mul）

In [42]:
class Neg(Function):
    """Negation of a single input: ``y = -x``."""

    def forward(self, x):
        """Return the negated raw input."""
        negated = -x
        return negated

    def backward(self, gy):
        """Return the negated upstream gradient, since d(-x)/dx = -1."""
        grad_x = -gy
        return grad_x

def neg(x):
    """Negate ``x`` using a fresh ``Neg`` function instance."""
    negate = Neg()
    return negate(x)

# Unary minus on a Variable (``-x``) dispatches to ``neg``.
Variable.__neg__ = neg

In [43]:
x = Variable(np.array(2.0))
y = -x
print(y)  # variable(-2.0)

variable(-2.0)


In [45]:
class Sub(Function):
    """Subtraction of two inputs: ``y = x0 - x1``."""

    def forward(self, x0, x1):
        """Return the first raw input minus the second."""
        return x0 - x1

    def backward(self, gy):
        """Return ``gy`` for ``x0`` and ``-gy`` for ``x1``."""
        grad_x0 = gy
        grad_x1 = -gy
        return grad_x0, grad_x1
    
def sub(x0, x1):
    """Compute ``x0 - x1`` using a fresh ``Sub`` function instance.

    ``x1`` is passed through ``as_array`` before being handed to ``Sub``.
    """
    subtrahend = as_array(x1)
    return Sub()(x0, subtrahend)

def rsub(x0, x1):
    """Reflected subtraction: compute ``x1 - x0``.

    Subtraction is not commutative, so the operands are swapped before they
    are handed to ``Sub``. ``x1`` is passed through ``as_array`` first.
    """
    minuend = as_array(x1)
    return Sub()(minuend, x0)

# ``x - other`` dispatches to ``sub``; the reflected form ``other - x``
# dispatches to ``rsub``, which swaps the operands.
Variable.__sub__ = sub
Variable.__rsub__ = rsub

In [46]:
x = Variable(np.array(2.0))
y1 = 2.0 - x
y2 = x - 1.0
print(y1)  # variable(0.0)
print(y2)  # variable(1.0)

variable(0.0)
variable(1.0)


In [47]:
class Div(Function):
    """Division of two inputs: ``y = x0 / x1``."""

    def forward(self, x0, x1):
        """Return the first raw input divided by the second."""
        return x0 / x1

    def backward(self, gy):
        """Return the gradients w.r.t. numerator and denominator.

        d(x0 / x1)/dx0 = 1 / x1 and d(x0 / x1)/dx1 = -x0 / x1 ** 2.
        """
        numerator = self.inputs[0].data
        denominator = self.inputs[1].data
        grad_numerator = gy / denominator
        grad_denominator = gy * (-numerator / denominator ** 2)
        return grad_numerator, grad_denominator
    
def div(x0, x1):
    """Compute ``x0 / x1`` using a fresh ``Div`` function instance.

    ``x1`` is passed through ``as_array`` before being handed to ``Div``.
    """
    divisor = as_array(x1)
    return Div()(x0, divisor)

def rdiv(x0, x1):
    """Reflected division: compute ``x1 / x0``.

    Division is not commutative, so the operands are swapped before they are
    handed to ``Div``. ``x1`` is passed through ``as_array`` first.
    """
    dividend = as_array(x1)
    return Div()(dividend, x0)

# ``x / other`` dispatches to ``div``; the reflected form ``other / x``
# dispatches to ``rdiv``, which swaps the operands.
Variable.__truediv__ = div
Variable.__rtruediv__ = rdiv

In [49]:
x = Variable(np.array(2.0))
y1 = 2.0 / x
y2 = x / 2.0

print(y1)  # variable(1.0)
print(y2)  # variable(1.0)

variable(1.0)
variable(1.0)


In [48]:
class Pow(Function):
    """Power with a fixed exponent: ``y = x ** c``.

    The exponent ``c`` is stored on the function rather than passed as an
    input, so ``backward`` returns a gradient for ``x`` only.
    """

    def __init__(self, c):
        """Store the exponent ``c``."""
        self.c = c

    def forward(self, x):
        """Return the raw input raised to the stored exponent."""
        return x ** self.c

    def backward(self, gy):
        """Return the gradient w.r.t. ``x``: ``c * x ** (c - 1) * gy``."""
        exponent = self.c
        base = self.inputs[0].data
        return exponent * base ** (exponent - 1) * gy

def pow(x, c):
    """Raise ``x`` to the power ``c`` using a fresh ``Pow`` function instance.

    NOTE: this module-level definition shadows the builtin ``pow``.
    """
    power = Pow(c)
    return power(x)

# ``x ** c`` on a Variable dispatches to ``pow``.
Variable.__pow__ = pow

In [50]:
x = Variable(np.array(2.0))
y = x ** 3
print(y)  # variable(8.0)

variable(8.0)


In [51]:
# Bind the augmented-assignment operators to the same functions as the plain
# binary operators. These functions return a new Variable, so ``x += 2.0``
# rebinds ``x`` to that new Variable instead of modifying the original object.
Variable.__iadd__ = add
Variable.__isub__ = sub
Variable.__imul__ = mul
Variable.__itruediv__ = div

In [52]:
x = Variable(np.array(1.0))
x += 2.0
print(x)  # variable(3.0)
x *= 2.0
print(x)  # variable(6.0)
x /= 2.0
print(x)  # variable(3.0)
x -= 2.0
print(x)  # variable(1.0)

variable(3.0)
variable(6.0)
variable(3.0)
variable(1.0)


## Step 23: Package DeZero

把项目打包成一个包，使得可以在其他地方使用DeZero

In [53]:
from dezero.core_simple import Variable

x = Variable(np.array(1.0))
print(x)  # variable(1.0)

variable(1.0)


In [2]:
from dezero import Variable

x = Variable(np.array(1.0))
y = (x + 3) ** 2
y.backward()

print(y)  # variable(16.0)
print(x.grad)  # 8.0

variable(16.0)
8.0


## Step 24: Derivatives of complex functions

1. Sphere函数
2. Matyas函数
3. Goldstein-Price函数

In [2]:
from dezero import Variable

In [3]:
def sphere(x, y):
    """Sphere function: return ``x ** 2 + y ** 2``."""
    return x ** 2 + y ** 2

x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = sphere(x, y)
z.backward()
print(x.grad, y.grad)  # 2.0 2.0

2.0 2.0


In [4]:
def matyas(x, y):
    """Matyas function: return ``0.26 * (x^2 + y^2) - 0.48 * x * y``."""
    quadratic_term = 0.26 * (x ** 2 + y ** 2)
    cross_term = 0.48 * x * y
    return quadratic_term - cross_term

x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = matyas(x, y)
z.backward()
print(x.grad, y.grad)  # 0.04 0.04

0.040000000000000036 0.040000000000000036


In [5]:
def goldstein(x, y):
    """Goldstein-Price function of ``x`` and ``y``.

    The value is the product of two factors, each of the form
    ``constant + squared_linear_term * polynomial``.
    """
    first_factor = 1 + (x + y + 1) ** 2 * (
        19 - 14 * x + 3 * x ** 2 - 14 * y + 6 * x * y + 3 * y ** 2
    )
    second_factor = 30 + (2 * x - 3 * y) ** 2 * (
        18 - 32 * x + 12 * x ** 2 + 48 * y - 36 * x * y + 27 * y ** 2
    )
    return first_factor * second_factor

x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = goldstein(x, y)
z.backward()
print(x.grad, y.grad)  # -5376.0 8064.0

-5376.0 8064.0
