In [None]:
from torch import nn
import torch

In [None]:
# Number of samples per mini-batch; the same value is used for the train and test DataLoaders.
batch_size=256

In [None]:
import torchvision

In [None]:
# ToTensor is the only transform; it is handed to both FashionMNIST dataset objects.
my_transforms = torchvision.transforms.ToTensor()

In [None]:
# download=True lets the notebook run from a fresh checkout: torchvision fetches
# the files into ./data when they are missing and reuses them when they are
# already present.
train_dataset = torchvision.datasets.FashionMNIST(
    root='./data', download=True, train=True, transform=my_transforms)
test_dataset = torchvision.datasets.FashionMNIST(
    root='./data', download=True, train=False, transform=my_transforms)

In [None]:
# Mini-batch loaders over the two splits. Only the training split is shuffled;
# both use the shared batch size and four worker processes.
train_iter = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
test_iter = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

In [None]:
import matplotlib.pyplot as plt

# Sanity check of the input pipeline: take one training batch, print the label
# of its first sample and draw that sample.
X, y = next(iter(train_iter))
print(y[0])
pic = X[0]
# Move the leading axis of the image to the end before handing it to imshow.
plt.imshow(pic.permute(1, 2, 0))

# data loaded
    

In [None]:
def get_fashion_mnist_labels(labels):
    """Translate numeric class indices into their Fashion-MNIST text names.

    Every element of ``labels`` is used directly as an index into the list of
    ten class names below; the names are returned as a list in the same order
    as the input.
    """
    class_names = [
        't-shirt', 'trouser', 'pullover', 'dress', 'coat',
        'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot',
    ]
    return [class_names[index] for index in labels]

In [None]:
# Layer sizes: net flattens every sample to num_inputs values, maps them to
# num_hidden hidden units and then to num_outputs class scores.
num_inputs = 784
num_outputs = 10
num_hidden = 256

# Weights start as small Gaussian noise (std 0.01) rather than std 1.0. With
# unit-variance weights and 784 inputs the initial scores are very large, which
# saturates the softmax inside the cross-entropy loss and destabilises SGD.
# This also matches the 0.01 scaling used when the parameters are re-created
# for the hidden-size exercise. Biases start at zero.
W1 = torch.normal(0, 0.01, (num_inputs, num_hidden), requires_grad=True)
b1 = torch.zeros(num_hidden, requires_grad=True)

W2 = torch.normal(0, 0.01, (num_hidden, num_outputs), requires_grad=True)
b2 = torch.zeros(num_outputs, requires_grad=True)

In [None]:
# All trainable tensors of the network, collected in one list for the optimizer.
params = [W1, b1, W2, b2]

In [None]:
def relu(X):
    """Rectified linear unit: the element-wise maximum of ``X`` and zero.

    The comparison is made against an all-zero tensor with the same shape,
    dtype and device as ``X``, and the result is returned as a new tensor.
    """
    return torch.max(X, torch.zeros_like(X))

In [None]:
def net(X):
    """Forward pass of the one-hidden-layer perceptron.

    ``X`` is reshaped into rows of ``num_inputs`` values, passed through the
    affine map ``(W1, b1)`` followed by ``relu``, and then through the affine
    output map ``(W2, b2)``. The raw output scores are returned; no softmax is
    applied here.
    """
    flat = X.reshape(-1, num_inputs)
    hidden = relu(flat @ W1 + b1)
    scores = hidden @ W2 + b2
    return scores

In [None]:
# Cross-entropy criterion with default settings; the training loop calls it as loss(y_hat, y)
# on the raw scores returned by net.
loss = nn.CrossEntropyLoss()

In [None]:
# Plain SGD over the tensors in params with learning rate 0.1. The optimizer keeps
# references to exactly these tensor objects.
updater = torch.optim.SGD(params, lr=0.1)

In [None]:
def accuracy(y_hat, y):
    """Count the rows of ``y_hat`` whose largest score sits at the index given in ``y``.

    The result is the number of correct predictions (not a ratio), returned as
    a 0-dim tensor.
    """
    predicted = y_hat.argmax(1)
    hits = predicted == y
    return hits.sum()

In [None]:
# Training

num_epochs = 10
train_acc_array = []
train_loss_array = []

for epoch in range(num_epochs):
    # Per-epoch running totals, kept as plain Python numbers so that no
    # autograd history is carried from one batch to the next.
    train_loss = 0.0
    total_n = 0
    train_acc = 0

    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        updater.zero_grad()
        l.backward()
        updater.step()

        # The criterion returns the mean over the batch, so weight it by the
        # batch size before summing; dividing by total_n afterwards then gives
        # a true per-sample average even when the last batch is smaller.
        train_loss += l.item() * len(y)
        total_n += len(y)
        train_acc += accuracy(y_hat, y).item()

    avg_acc = train_acc / total_n
    avg_loss = train_loss / total_n

    print(f"for epoch {epoch} avg_loss {avg_loss}")

    train_acc_array.append(avg_acc)
    train_loss_array.append(avg_loss)


# Both histories hold plain floats, so they can be plotted directly.
plt.plot(range(len(train_acc_array)), train_acc_array, label="train acc")
plt.legend()
plt.grid(True)
plt.show()
plt.plot(range(len(train_loss_array)), train_loss_array, label="train loss")
plt.legend()
plt.grid(True)
plt.show()
    

In [None]:
def prediction_ch3(net, test_iter, n=6):
    """Show the first ``n`` images of the first ``test_iter`` batch with predicted labels.

    Args:
        net: callable that maps a batch of images to per-class scores.
        test_iter: iterable yielding ``(X, y)`` batches; only the first batch is used.
        n: number of leading samples of that batch to display.

    The predicted class names become the subplot titles (via ``show_images``)
    and the true class names are printed afterwards.

    Raises:
        ValueError: if ``test_iter`` yields no batch at all.
    """
    first_batch = next(iter(test_iter), None)
    if first_batch is None:
        raise ValueError("test_iter yielded no batches")
    X, y = first_batch

    images = X[:n]
    actual_labels = y[:n]

    # Prediction only needs the forward pass, so skip building an autograd graph.
    with torch.no_grad():
        predicted_labels = net(images).argmax(dim=1)

    # Size the grid from the number of images actually shown (three per line)
    # instead of a fixed 2x3, so that n > 6 does not overflow the subplot grid.
    # For the default n=6 this passes (2, 3), exactly as before.
    per_line = 3
    num_lines = max(1, -(-len(images) // per_line))

    show_images(images, num_lines, per_line, title=get_fashion_mnist_labels(predicted_labels))
    print(get_fashion_mnist_labels(actual_labels))

In [None]:
def show_images(imgs, num_cols, num_rows, title=None, scale=0.5):
    """Draw every image in ``imgs`` into a grid of subplots on the current figure.

    Args:
        imgs: sequence of image tensors; each one is permuted with ``(1, 2, 0)``
            before being passed to ``imshow``.
        num_cols, num_rows: grid dimensions. They are forwarded to
            ``plt.subplot`` in this order, i.e. ``num_cols`` is the first grid
            argument and ``num_rows`` the second; the existing layout is kept.
        title: optional sequence holding one caption per image. When ``None``
            the images are drawn without captions.
        scale: accepted for backward compatibility; it is not applied to the
            figure size.
    """
    plt.subplots_adjust(hspace=0.8, wspace=0.2)
    for i, img in enumerate(imgs):
        plt.subplot(num_cols, num_rows, i + 1)
        plt.imshow(img.permute(1, 2, 0))
        if title is not None:
            plt.title(title[i], fontsize=9)
        plt.axis('off')
        # Explicitly switch the grid off. The former ``b=`` keyword is no
        # longer accepted by current Matplotlib releases.
        plt.grid(False)

In [None]:
# Visual spot check: show six test images titled with the network's predictions
# and print the true labels.
prediction_ch3(net, test_iter, n=6)

### Exercises
1. Change the value of the hyperparameter `num_hidden` and see how it influences your results. Determine the best value of this hyperparameter, keeping all others
constant.
2. Try adding an additional hidden layer to see how it affects the results.
3. How does changing the learning rate alter your results? Fixing the model architecture and
other hyperparameters (including number of epochs), what learning rate gives you the best
results?
4. What is the best result you can get by optimizing over all the hyperparameters (learning rate,
number of epochs, number of hidden layers, number of hidden units per layer) jointly?
5. Describe why it is much more challenging to deal with multiple hyperparameters.
6. What is the smartest strategy you can think of for structuring a search over multiple hyperparameters?

In [None]:
# 1

hidden_num_array = range(20,200,20)
num_epochs = 10

# Final training loss reached for each hidden size, used to pick the best one.
final_loss_by_hidden_num = {}

for hidden_num in hidden_num_array:
    # Fresh parameters for this hidden width. net reads W1, b1, W2 and b2 as
    # globals, so rebinding them here changes the network being trained.
    W1 = nn.Parameter(torch.randn(num_inputs, hidden_num) * 0.01)
    b1 = nn.Parameter(torch.zeros(hidden_num))
    W2 = nn.Parameter(torch.randn(hidden_num, num_outputs) * 0.01)
    b2 = nn.Parameter(torch.zeros(num_outputs))

    params = [W1, b1, W2, b2]

    # The optimizer has to be rebuilt too: an optimizer created earlier keeps
    # updating the tensors it was constructed with, so the new parameters would
    # never be trained and their gradients would never be reset.
    updater = torch.optim.SGD(params, lr=0.1)

    train_acc_array = []
    train_loss_array = []

    for epoch in range(num_epochs):
        # Plain Python totals, so no autograd history is kept across batches.
        train_loss = 0.0
        total_n = 0
        train_acc = 0

        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y)
            updater.zero_grad()
            l.backward()
            updater.step()

            # The criterion returns a batch mean; weight it by the batch size
            # so that train_loss / total_n is a per-sample average.
            train_loss += l.item() * len(y)
            total_n += len(y)
            train_acc += accuracy(y_hat, y).item()

        avg_acc = train_acc / total_n
        avg_loss = train_loss / total_n

        print(f"for epoch {epoch} avg_loss {avg_loss}")

        train_acc_array.append(avg_acc)
        train_loss_array.append(avg_loss)

    plt.plot(range(len(train_acc_array)), train_acc_array, label="train acc")
    plt.title(f"hidden_num = {hidden_num}")
    plt.legend()
    plt.grid(True)
    plt.show()
    plt.plot(range(len(train_loss_array)), train_loss_array, label="train loss")
    plt.title(f"hidden_num = {hidden_num}")
    plt.legend()
    plt.grid(True)
    plt.show()

    final_loss_by_hidden_num[hidden_num] = train_loss_array[-1]
    print(f"hidden_num {hidden_num} : final avg_loss {train_loss_array[-1]}")

# Report the winner once, after every setting has been tried. The ranking uses
# the final *training* loss, as the per-setting report did before.
best_hidden_num = min(final_loss_by_hidden_num, key=final_loss_by_hidden_num.get)
print(f"Best value for hidden_num:  {best_hidden_num} : {final_loss_by_hidden_num[best_hidden_num]}")

In [None]:
#2


