## Gradient Descent

### $\theta_{W_t} = \theta_{W_{t-1}} - \eta \frac{\partial L}{\partial W_{t-1}} = \theta_{W_{t-1}} - \eta {\nabla}W_{t-1}$  

## Momentum
### $v_t = \gamma v_{t-1} +  \nabla W_{t-1} $
### $ W_t = W_{t-1} - \eta v_t$

### Fully-connected Layer(Dense Layer)
## $H(x) = Wx + b$
### - x = [1,2,3,4,5]
### - y = [1,2,3,4,5]

In [113]:
import numpy as np
import torch
import torch.nn as nn

In [114]:
import random
# One seed shared by every random number generator used in this notebook.
random_seed = 2

# Python and NumPy generators.
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators: CPU, the current GPU, and every GPU (multi-GPU setups).
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [115]:
class model(nn.Module):
    """Single-feature linear regressor computing H(x) = Wx + b."""

    def __init__(self):
        super().__init__()
        # nn.Linear takes (number of input features, number of output features).
        # Both are 1 here, so the layer holds one weight W and one bias b.
        self.linear = nn.Linear(in_features=1, out_features=1, bias=True)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear(x)
        return prediction
    

In [116]:
# Two independently initialised networks: model1 is trained with plain SGD,
# model2 with SGD + momentum. Construction order (model1 first) is preserved.
model1, model2 = model(), model()

In [117]:
# Toy dataset for the identity mapping y = x, stored as 5x1 column tensors.
# Both tensors are float32: previously y_train had no dtype and became int64,
# so the loss only worked through implicit type promotion and would fail
# with losses that require matching dtypes. Values are unchanged.
x_train = torch.tensor([[1], [2], [3], [4], [5]], dtype=torch.float)
y_train = torch.tensor([[1], [2], [3], [4], [5]], dtype=torch.float)

In [118]:
class mean_squred_error(nn.Module):
    """Mean squared error between two tensors, averaged over all elements."""

    def forward(self, x, y):
        """Return the mean of the element-wise squared difference ``x - y``."""
        residual = x - y
        return torch.square(residual).mean()

In [119]:
# Loss function shared by both training runs.
cost = mean_squred_error()

In [120]:
# Same learning rate for both runs, so only the momentum term differs.
learning_rate = 0.1
# model1: plain gradient descent (no momentum).
optim1 = torch.optim.SGD(model1.parameters(),lr=learning_rate)
# model2: SGD with a momentum coefficient of 0.9.
optim2 = torch.optim.SGD(model2.parameters(),lr=learning_rate,momentum=0.9)

In [121]:
# Show the freshly initialised weight and bias of each model.
print(model1.linear.weight,model1.linear.bias)
print(model2.linear.weight,model2.linear.bias)

Parameter containing:
tensor([[0.2294]], requires_grad=True) Parameter containing:
tensor([-0.2380], requires_grad=True)
Parameter containing:
tensor([[0.2742]], requires_grad=True) Parameter containing:
tensor([-0.0511], requires_grad=True)


In [122]:
# Copy model1's weight and bias into model2 so both runs start from the same parameters.
model2.load_state_dict(model1.state_dict())

<All keys matched successfully>

In [123]:
# Check that both models now hold the same weight and bias.
print(model1.linear.weight,model1.linear.bias)
print(model2.linear.weight,model2.linear.bias)

Parameter containing:
tensor([[0.2294]], requires_grad=True) Parameter containing:
tensor([-0.2380], requires_grad=True)
Parameter containing:
tensor([[0.2294]], requires_grad=True) Parameter containing:
tensor([-0.2380], requires_grad=True)


### Initialization Weight and Bias
#### - W = 0.2294
#### - B = -0.2380

In [124]:
# First update of model1 with plain SGD.
# Clear any previously accumulated gradients.
optim1.zero_grad()

# Forward pass on the training inputs.
pred = model1(x_train)

# Mean squared error between predictions and targets.
loss = cost(pred,y_train)

# Backpropagate to fill .grad, then apply the parameter update.
loss.backward()
optim1.step()

In [125]:
# First update of model2 with SGD + momentum.
# Clear any previously accumulated gradients.
optim2.zero_grad()

# Forward pass on the training inputs.
pred = model2(x_train)

# Mean squared error, backpropagation, then the parameter update.
loss = cost(pred,y_train)
loss.backward()
optim2.step()

### Print Gradient of Weights and Bias

In [128]:
# Gradients of the loss w.r.t. model1's weight and bias from the last backward pass.
print(model1.linear.weight.grad,model1.linear.bias.grad)

tensor([[-18.3813]]) tensor([-5.0996])


In [129]:
# Gradients of the loss w.r.t. model2's weight and bias from the last backward pass.
print(model2.linear.weight.grad,model2.linear.bias.grad)

tensor([[-18.3813]]) tensor([-5.0996])


## Gradient Descent

### $\theta_{W_t} = \theta_{W_{t-1}} - \eta \frac{\partial L}{\partial W_{t-1}} = \theta_{W_{t-1}} - \eta {\nabla}W_{t-1}$  

## Momentum
### $v_t = \gamma v_{t-1} +  \nabla W_{t-1} $
### $ W_t = W_{t-1} - \eta v_t$

In [131]:
# Parameters of both models after the first optimizer step.
print(model1.linear.weight,model1.linear.bias)
print(model2.linear.weight,model2.linear.bias)

Parameter containing:
tensor([[2.0675]], requires_grad=True) Parameter containing:
tensor([0.2720], requires_grad=True)
Parameter containing:
tensor([[2.0675]], requires_grad=True) Parameter containing:
tensor([0.2720], requires_grad=True)


### $ v_{t-1} = 0$
### $v_t = 0.9 \times 0 + (-18.3813)  = -18.3813$

In [132]:
# Second update of model1 with plain SGD.
# Reset gradients so the first step's values are not accumulated.
optim1.zero_grad()

# Forward pass on the training inputs.
pred = model1(x_train)

# Mean squared error, backpropagation, then the parameter update.
loss = cost(pred,y_train)
loss.backward()
optim1.step()

In [133]:
# Second update of model2 with SGD + momentum.
# Reset gradients so the first step's values are not accumulated.
optim2.zero_grad()

# Forward pass on the training inputs.
pred = model2(x_train)

# Mean squared error, backpropagation, then the parameter update.
loss = cost(pred,y_train)
loss.backward()
optim2.step()

In [134]:
# Gradients of both models from the second backward pass.
print(model1.linear.weight.grad,model1.linear.bias.grad)
print(model2.linear.weight.grad,model2.linear.bias.grad)

tensor([[25.1173]]) tensor([6.9491])
tensor([[25.1173]]) tensor([6.9491])


## GD

### $ W_t = W_{t-1} - (0.1 \times 25.1173)$
### $ W_t = 2.0675 - (0.1 \times 25.1173) = 2.0675 - 2.51173 = -0.44423$

## Momentum

### $ v_{t-1} = -18.3813$
### $v_t = 0.9 \times -18.3813 + 25.1173  = -16.54317 + 25.1173 = 8.57413$
### $ W_t = W_{t-1} - (0.1 \times 8.57413)$
### $ W_t = 2.0675 - (0.1 \times 8.57413) = 2.0675 - 0.857413 = 1.210087$

In [135]:
# Parameters after the second optimizer step, for comparing plain SGD (model1) with momentum (model2).
print(model1.linear.weight,model1.linear.bias)
print(model2.linear.weight,model2.linear.bias)

Parameter containing:
tensor([[-0.4442]], requires_grad=True) Parameter containing:
tensor([-0.4229], requires_grad=True)
Parameter containing:
tensor([[1.2101]], requires_grad=True) Parameter containing:
tensor([0.0360], requires_grad=True)
