# Lab 09-2. Weight initialization

In [14]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

**MNIST_nn_Xavier**

In [15]:
# nn Layer
linear1 = torch.nn.Linear(784, 256, bias=True)
linear2 = torch.nn.Linear(256, 256, bias=True)
linear3 = torch.nn.Linear(256,  10, bias=True)
relu    = torch.nn.ReLU()

In [16]:
torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)

Parameter containing:
tensor([[-0.0111,  0.0615, -0.0678,  ...,  0.1342,  0.0351,  0.0437],
        [-0.0995,  0.0277, -0.1097,  ...,  0.1391,  0.0154, -0.1141],
        [-0.0492,  0.0552,  0.0854,  ...,  0.0138,  0.1426,  0.0364],
        ...,
        [ 0.0177,  0.1155,  0.0362,  ...,  0.0865,  0.1241, -0.1477],
        [-0.1026, -0.0978,  0.0849,  ..., -0.0411, -0.0823, -0.0441],
        [ 0.1100,  0.1408, -0.0384,  ...,  0.0024, -0.0269, -0.0854]],
       requires_grad=True)

In [17]:
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

In [18]:
# params
learning_rate = 1e-3
training_epochs = 15
batch_size = 100

**Data Loader**

In [19]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [20]:
mnist_train = datasets.MNIST(root='MNIST_data/',
                             train=True,
                             transform=transforms.ToTensor(),
                             download=True)

mnist_test  = datasets.MNIST(root='MNIST_data/',
                             train=True,
                             transform=transforms.ToTensor(),
                             download=True)

In [21]:
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)

**Training**

In [22]:
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
total_batch = len(data_loader)

for epoch in range(training_epochs):
    
    avg_cost = 0

    for X, Y in data_loader:
        X = X.view(-1, 28*28).to(device)
        Y = Y.to(device)

        optimizer.zero_grad()
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        cost.backward()
        optimizer.step()

        avg_cost += cost / total_batch

    print(f'Epoch: {epoch+1:4d}, Cost: {avg_cost:.9f}')

Epoch:    1, Cost: 0.245310947
Epoch:    2, Cost: 0.090420090
Epoch:    3, Cost: 0.061163917
Epoch:    4, Cost: 0.042206649
Epoch:    5, Cost: 0.032298882
Epoch:    6, Cost: 0.024326237
Epoch:    7, Cost: 0.021227337
Epoch:    8, Cost: 0.017240057
Epoch:    9, Cost: 0.015605732
Epoch:   10, Cost: 0.015432404
Epoch:   11, Cost: 0.011563168
Epoch:   12, Cost: 0.012802530
Epoch:   13, Cost: 0.012358668
Epoch:   14, Cost: 0.007978997
Epoch:   15, Cost: 0.010234555


**Test**

In [24]:
import warnings
warnings.filterwarnings(action='ignore')

In [25]:
with torch.no_grad():
    X_test = mnist_test.test_data.view(-1, 28*28).float().to(device)
    Y_test = mnist_test.test_labels.to(device)

    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print(f"Accuracy: {accuracy.item()*100:.4f}%")

Accuracy: 99.6517%
