## Loading the Data

In [1]:
import torch
import torchvision
from torch.utils import data
from torchvision import transforms

In [2]:
data_transform = transforms.ToTensor() # Obtaining data to tensor converter

In [3]:
mnist_train = torchvision.datasets.FashionMNIST(root = "../data", train = True, transform = data_transform, download= True)  # Defining fashion MNIST train from torch datasets

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [4]:
mnist_train.train_labels, mnist_train # Get back a class that contains training data and labels



(tensor([9, 0, 0,  ..., 3, 0, 5]),
 Dataset FashionMNIST
     Number of datapoints: 60000
     Root location: ../data
     Split: Train
     StandardTransform
 Transform: ToTensor())

In [5]:
mnist_test = torchvision.datasets.FashionMNIST(root = "../data", train = False, transform = data_transform, download = True)

In [6]:
mnist_test # Test set Tensor transformed display

Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: ../data
    Split: Test
    StandardTransform
Transform: ToTensor()

## Defining a Data Iterator

In [7]:
batch_size = 128
# Defining iterator to iterate through training set
train_data_loader = data.DataLoader(mnist_train, batch_size, shuffle = True, num_workers = 4)

In [8]:
# Defining identical data loader fo test set
test_data_loader = data.DataLoader(mnist_test, batch_size, shuffle = True, num_workers = 4)

In [9]:
# Converting to function for future use, default num_workers is 4 bc CPU threads
def load_fashion_mnist(batch_size: int = 512, num_workers: int = 4):
    data_transform = transforms.ToTensor() # Obtaining data to tensor converter
    
    # Downloading data
    mnist_train = torchvision.datasets.FashionMNIST(root = "../data", train = True, transform = data_transform, download= True)  # Defining fashion MNIST train from torch datasets
    mnist_test = torchvision.datasets.FashionMNIST(root = "../data", train = False, transform = data_transform, download = True)
    
    # Loading data onto an iterator
    train_data_loader = data.DataLoader(mnist_train, batch_size, shuffle = True, num_workers = 4)
    test_data_loader = data.DataLoader(mnist_test, batch_size, shuffle = True, num_workers = 4)
    
    # Returning iterator
    return train_data_loader, test_data_loader 
    

## Softmax Regression Implementation

In [10]:
train_iter, test_iter = load_fashion_mnist(128, 4) # Loading train and test iterators for softmax implementation

In [11]:
# Softmax regression: map from an input to label probabilities (class confidences) in continuous space
# Regress using gradient towards a solution which minimizes error.

# Will flatten input image
input_img_size = 784
output_space = 10

# need to use weights to map from input space (784) to output space (each of 10 cols weights 784 pixels in a featurous way so as to produce output)
w = torch.normal(0, 0.1, (input_img_size, output_space), requires_grad = True) # Initializing around 0 (with a small SD so not exactly)
b = torch.zeros(output_space, requires_grad = True) # want each neuron to have a linear bias shifer

In [12]:
# Quick summing review: remember that largest dim = innermost/ most nested dim
X = torch.Tensor([[1.0, 2.0, 3.0], [7.0, 8.0, 9.0], [27.0, 26.2, 1.1]])
X.sum(0, keepdims = True) # Allows maintanence of nested dimension even though it has collapsed (there is no need for it, it is 1)

tensor([[35.0000, 36.2000, 13.1000]])

In [13]:
X.sum((0,1), keepdims = True), X.sum((0,1)), X.sum() # Collapse dimension removes uneccessary 
# dim = 1, no collapse => remains nested, otherwise total sum will yield a scalar

(tensor([[84.3000]]), tensor(84.3000), tensor(84.3000))

**Quick conceptual understanding of sums**:

If the largest dimension is the innermost, that can be thought of as a row/record in which the values belong in 1 dimensional data, in 2 dimensional data this is the last 2 dimensions. The representation of a record can be summed across all records which is the next most nested dimension, which could then be summed across all tables (3rd most inner dimension). Hence summing across the final 2 dims in 1D data is summing across the whole table. Keep dimension simply groups all summed attributes in a single attribute and does not delete it due to uselessness.

In [14]:
def softmax(X: torch.Tensor):
    # Mapped to the positive space with the magnitudinal differences of the exponential
    exponentiated_activations = torch.exp(X)
    sum_exponentiated_activations = exponentiated_activations.sum(1, keepdims = True)
    
    # Note: put under complex variable names for understanding
    mapped_probabilities = exponentiated_activations/sum_exponentiated_activations
    return mapped_probabilities

In [15]:
softmax(torch.Tensor([[0.2, 1.2, 2.4]])) # Exponential differences in confidence visible (+ 1 =occupies 2.7x more of exponentiated sum)
# Benefits of softmax - maps to 0,1 space but assigns tiny probabilities to negative activations relative to positive if positive exist, order of magnitude less
# Maps relative to other confidences
# Assigns non-negligible probabilities to every event but shows initial confidence in high activations

tensor([[0.0785, 0.2133, 0.7082]])

In [16]:
# Accuracy = sum(predictions == lables) / # of labels = rate of correctness


## Neural Network Implementation

In [32]:
# All layer names are in torch.nn and are capitalized
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(input_img_size, output_space), torch.nn.Softmax()) # Autoflatten data to 1D layer

In [37]:
# Defining a trainer
trainer = torch.optim.AdamW(model.parameters(), lr = 0.03)
loss = torch.nn.CrossEntropyLoss() #losses in torch.nn, just as layers are, also capitalized

In [38]:
# Initializing weights randomly and applying to Neural Network
# PyTorch apply works on a per-layer basis
def init_weights(layer: torch.nn):
    if isinstance(layer, torch.nn.Linear): # Note: weight used, not weights in plural in pytorch
        # init.normal initializes any torch layer parameter with normally distributed values
        torch.nn.init.normal_(layer.weight, mean = 0, std = 0.1) # Initializing normal weights by default, bias is 0   

model.apply(init_weights) # Autoinitialize any linear weights with normal inputs
        

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=10, bias=True)
  (2): Softmax(dim=None)
)

In [39]:
model[1].weight.data # Proof of autoinitialized weights (flatten layer has no weights, naturally)

tensor([[ 0.1425,  0.1607,  0.0053,  ..., -0.2733,  0.1425, -0.0251],
        [-0.1442, -0.0463, -0.0375,  ...,  0.0690,  0.0218,  0.1156],
        [ 0.0027,  0.1169,  0.0535,  ...,  0.1093,  0.0006,  0.0294],
        ...,
        [ 0.0833, -0.0157,  0.1100,  ..., -0.0693,  0.0212, -0.1627],
        [-0.1634, -0.0866, -0.0770,  ...,  0.1341, -0.0088, -0.2158],
        [-0.1885,  0.1428, -0.0240,  ..., -0.1264,  0.0177,  0.1476]])

**Need to avoid two phenomena with Softmax**:

* Overflow could be avoided by subtracting max, which does not change magnitudal exponentiation relation between values of softmax. Subtracting a constant makes no difference.
* Underflow, or a value being rounded to 0 and then being used in an operation where 0 is an exclusive lower bound (ex: log, division). This could be avoided by the fact that we will use cross entropy as a loss function to measure how far away the probabilities are from the true distribution, so we will be taking the log of the exponential anyway, which cancel out. We can avoid this operation by taking the log in advance and then not having to take the log of a number that has been approximated as 0, leading to NaN's
    * Note to self: revisit cross entropy to actually understand it


In [40]:
# Training
num_epochs = 10
for epoch in range(num_epochs):
    for data, label in train_iter: # Traversing data loader
        trainer.zero_grad() # Starting by resetting gradient of trainer
        cost = loss(model(data), label) # Computing cost
        print("cost: ", cost)
        cost.sum().backward() # Backwards propagating
        trainer.step() # performing step

cost:  tensor(2.3461, grad_fn=<NllLossBackward>)
cost:  tensor(2.2244, grad_fn=<NllLossBackward>)
cost:  tensor(2.1470, grad_fn=<NllLossBackward>)
cost:  tensor(2.0445, grad_fn=<NllLossBackward>)
cost:  tensor(2.1250, grad_fn=<NllLossBackward>)
cost:  tensor(1.9709, grad_fn=<NllLossBackward>)
cost:  tensor(1.9371, grad_fn=<NllLossBackward>)
cost:  tensor(1.9221, grad_fn=<NllLossBackward>)
cost:  tensor(1.9895, grad_fn=<NllLossBackward>)
cost:  tensor(2.0488, grad_fn=<NllLossBackward>)
cost:  tensor(2.0462, grad_fn=<NllLossBackward>)
cost:  tensor(1.9634, grad_fn=<NllLossBackward>)
cost:  tensor(2.0063, grad_fn=<NllLossBackward>)
cost:  tensor(1.9860, grad_fn=<NllLossBackward>)
cost:  tensor(1.7888, grad_fn=<NllLossBackward>)
cost:  tensor(1.8641, grad_fn=<NllLossBackward>)
cost:  tensor(1.8923, grad_fn=<NllLossBackward>)
cost:  tensor(1.9107, grad_fn=<NllLossBackward>)
cost:  tensor(1.8734, grad_fn=<NllLossBackward>)
cost:  tensor(1.8538, grad_fn=<NllLossBackward>)
cost:  tensor(1.9644

cost:  tensor(1.8143, grad_fn=<NllLossBackward>)
cost:  tensor(1.7965, grad_fn=<NllLossBackward>)
cost:  tensor(1.8528, grad_fn=<NllLossBackward>)
cost:  tensor(1.8447, grad_fn=<NllLossBackward>)
cost:  tensor(1.8019, grad_fn=<NllLossBackward>)
cost:  tensor(1.8363, grad_fn=<NllLossBackward>)
cost:  tensor(1.8517, grad_fn=<NllLossBackward>)
cost:  tensor(1.7745, grad_fn=<NllLossBackward>)
cost:  tensor(1.7815, grad_fn=<NllLossBackward>)
cost:  tensor(1.8099, grad_fn=<NllLossBackward>)
cost:  tensor(1.8159, grad_fn=<NllLossBackward>)
cost:  tensor(1.8480, grad_fn=<NllLossBackward>)
cost:  tensor(1.8558, grad_fn=<NllLossBackward>)
cost:  tensor(1.8057, grad_fn=<NllLossBackward>)
cost:  tensor(1.8349, grad_fn=<NllLossBackward>)
cost:  tensor(1.8090, grad_fn=<NllLossBackward>)
cost:  tensor(1.8438, grad_fn=<NllLossBackward>)
cost:  tensor(1.8259, grad_fn=<NllLossBackward>)
cost:  tensor(1.9170, grad_fn=<NllLossBackward>)
cost:  tensor(1.8247, grad_fn=<NllLossBackward>)
cost:  tensor(1.7715

cost:  tensor(1.7521, grad_fn=<NllLossBackward>)
cost:  tensor(1.8170, grad_fn=<NllLossBackward>)
cost:  tensor(1.8160, grad_fn=<NllLossBackward>)
cost:  tensor(1.8011, grad_fn=<NllLossBackward>)
cost:  tensor(1.8022, grad_fn=<NllLossBackward>)
cost:  tensor(1.8842, grad_fn=<NllLossBackward>)
cost:  tensor(1.8026, grad_fn=<NllLossBackward>)
cost:  tensor(1.8409, grad_fn=<NllLossBackward>)
cost:  tensor(1.7198, grad_fn=<NllLossBackward>)
cost:  tensor(1.8347, grad_fn=<NllLossBackward>)
cost:  tensor(1.8170, grad_fn=<NllLossBackward>)
cost:  tensor(1.8242, grad_fn=<NllLossBackward>)
cost:  tensor(1.7589, grad_fn=<NllLossBackward>)
cost:  tensor(1.8157, grad_fn=<NllLossBackward>)
cost:  tensor(1.7829, grad_fn=<NllLossBackward>)
cost:  tensor(1.8679, grad_fn=<NllLossBackward>)
cost:  tensor(1.7960, grad_fn=<NllLossBackward>)
cost:  tensor(1.8116, grad_fn=<NllLossBackward>)
cost:  tensor(1.8016, grad_fn=<NllLossBackward>)
cost:  tensor(1.8570, grad_fn=<NllLossBackward>)
cost:  tensor(1.7953

cost:  tensor(1.8151, grad_fn=<NllLossBackward>)
cost:  tensor(1.8753, grad_fn=<NllLossBackward>)
cost:  tensor(1.8076, grad_fn=<NllLossBackward>)
cost:  tensor(1.8148, grad_fn=<NllLossBackward>)
cost:  tensor(1.7649, grad_fn=<NllLossBackward>)
cost:  tensor(1.8100, grad_fn=<NllLossBackward>)
cost:  tensor(1.7472, grad_fn=<NllLossBackward>)
cost:  tensor(1.7887, grad_fn=<NllLossBackward>)
cost:  tensor(1.7826, grad_fn=<NllLossBackward>)
cost:  tensor(1.7993, grad_fn=<NllLossBackward>)
cost:  tensor(1.8511, grad_fn=<NllLossBackward>)
cost:  tensor(1.8320, grad_fn=<NllLossBackward>)
cost:  tensor(1.7891, grad_fn=<NllLossBackward>)
cost:  tensor(1.8542, grad_fn=<NllLossBackward>)
cost:  tensor(1.7677, grad_fn=<NllLossBackward>)
cost:  tensor(1.8624, grad_fn=<NllLossBackward>)
cost:  tensor(1.8158, grad_fn=<NllLossBackward>)
cost:  tensor(1.8401, grad_fn=<NllLossBackward>)
cost:  tensor(1.8089, grad_fn=<NllLossBackward>)
cost:  tensor(1.8704, grad_fn=<NllLossBackward>)
cost:  tensor(1.8495

cost:  tensor(1.8182, grad_fn=<NllLossBackward>)
cost:  tensor(1.8006, grad_fn=<NllLossBackward>)
cost:  tensor(1.8230, grad_fn=<NllLossBackward>)
cost:  tensor(1.7827, grad_fn=<NllLossBackward>)
cost:  tensor(1.7935, grad_fn=<NllLossBackward>)
cost:  tensor(1.8524, grad_fn=<NllLossBackward>)
cost:  tensor(1.8641, grad_fn=<NllLossBackward>)
cost:  tensor(1.8081, grad_fn=<NllLossBackward>)
cost:  tensor(1.7559, grad_fn=<NllLossBackward>)
cost:  tensor(1.8427, grad_fn=<NllLossBackward>)
cost:  tensor(1.7785, grad_fn=<NllLossBackward>)
cost:  tensor(1.8092, grad_fn=<NllLossBackward>)
cost:  tensor(1.7773, grad_fn=<NllLossBackward>)
cost:  tensor(1.7461, grad_fn=<NllLossBackward>)
cost:  tensor(1.7774, grad_fn=<NllLossBackward>)
cost:  tensor(1.7898, grad_fn=<NllLossBackward>)
cost:  tensor(1.8632, grad_fn=<NllLossBackward>)
cost:  tensor(1.8573, grad_fn=<NllLossBackward>)
cost:  tensor(1.8701, grad_fn=<NllLossBackward>)
cost:  tensor(1.8169, grad_fn=<NllLossBackward>)
cost:  tensor(1.8125

cost:  tensor(1.8378, grad_fn=<NllLossBackward>)
cost:  tensor(1.7978, grad_fn=<NllLossBackward>)
cost:  tensor(1.8478, grad_fn=<NllLossBackward>)
cost:  tensor(1.7868, grad_fn=<NllLossBackward>)
cost:  tensor(1.7702, grad_fn=<NllLossBackward>)
cost:  tensor(1.7453, grad_fn=<NllLossBackward>)
cost:  tensor(1.7943, grad_fn=<NllLossBackward>)
cost:  tensor(1.7892, grad_fn=<NllLossBackward>)
cost:  tensor(1.8178, grad_fn=<NllLossBackward>)
cost:  tensor(1.7296, grad_fn=<NllLossBackward>)
cost:  tensor(1.7676, grad_fn=<NllLossBackward>)
cost:  tensor(1.8091, grad_fn=<NllLossBackward>)
cost:  tensor(1.8634, grad_fn=<NllLossBackward>)
cost:  tensor(1.8393, grad_fn=<NllLossBackward>)
cost:  tensor(1.8241, grad_fn=<NllLossBackward>)
cost:  tensor(1.9340, grad_fn=<NllLossBackward>)
cost:  tensor(1.8790, grad_fn=<NllLossBackward>)
cost:  tensor(1.7407, grad_fn=<NllLossBackward>)
cost:  tensor(1.8026, grad_fn=<NllLossBackward>)
cost:  tensor(1.7998, grad_fn=<NllLossBackward>)
cost:  tensor(1.7645

cost:  tensor(1.7806, grad_fn=<NllLossBackward>)
cost:  tensor(1.8133, grad_fn=<NllLossBackward>)
cost:  tensor(1.8870, grad_fn=<NllLossBackward>)
cost:  tensor(1.8024, grad_fn=<NllLossBackward>)
cost:  tensor(1.7995, grad_fn=<NllLossBackward>)
cost:  tensor(1.8131, grad_fn=<NllLossBackward>)
cost:  tensor(1.8850, grad_fn=<NllLossBackward>)
cost:  tensor(1.7752, grad_fn=<NllLossBackward>)
cost:  tensor(1.7456, grad_fn=<NllLossBackward>)
cost:  tensor(1.7668, grad_fn=<NllLossBackward>)
cost:  tensor(1.7890, grad_fn=<NllLossBackward>)
cost:  tensor(1.8572, grad_fn=<NllLossBackward>)
cost:  tensor(1.8078, grad_fn=<NllLossBackward>)
cost:  tensor(1.7786, grad_fn=<NllLossBackward>)
cost:  tensor(1.7948, grad_fn=<NllLossBackward>)
cost:  tensor(1.8312, grad_fn=<NllLossBackward>)
cost:  tensor(1.8466, grad_fn=<NllLossBackward>)
cost:  tensor(1.8108, grad_fn=<NllLossBackward>)
cost:  tensor(1.7736, grad_fn=<NllLossBackward>)
cost:  tensor(1.7819, grad_fn=<NllLossBackward>)
cost:  tensor(1.8267

cost:  tensor(1.8516, grad_fn=<NllLossBackward>)
cost:  tensor(1.8159, grad_fn=<NllLossBackward>)
cost:  tensor(1.7790, grad_fn=<NllLossBackward>)
cost:  tensor(1.8006, grad_fn=<NllLossBackward>)
cost:  tensor(1.7952, grad_fn=<NllLossBackward>)
cost:  tensor(1.8298, grad_fn=<NllLossBackward>)
cost:  tensor(1.8009, grad_fn=<NllLossBackward>)
cost:  tensor(1.7571, grad_fn=<NllLossBackward>)
cost:  tensor(1.7483, grad_fn=<NllLossBackward>)
cost:  tensor(1.8570, grad_fn=<NllLossBackward>)
cost:  tensor(1.7815, grad_fn=<NllLossBackward>)
cost:  tensor(1.7565, grad_fn=<NllLossBackward>)
cost:  tensor(1.8552, grad_fn=<NllLossBackward>)
cost:  tensor(1.8279, grad_fn=<NllLossBackward>)
cost:  tensor(1.8148, grad_fn=<NllLossBackward>)
cost:  tensor(1.8030, grad_fn=<NllLossBackward>)
cost:  tensor(1.8066, grad_fn=<NllLossBackward>)
cost:  tensor(1.7625, grad_fn=<NllLossBackward>)
cost:  tensor(1.8291, grad_fn=<NllLossBackward>)
cost:  tensor(1.7981, grad_fn=<NllLossBackward>)
cost:  tensor(1.7864

cost:  tensor(1.7100, grad_fn=<NllLossBackward>)
cost:  tensor(1.7943, grad_fn=<NllLossBackward>)
cost:  tensor(1.8319, grad_fn=<NllLossBackward>)
cost:  tensor(1.8340, grad_fn=<NllLossBackward>)
cost:  tensor(1.8397, grad_fn=<NllLossBackward>)
cost:  tensor(1.8815, grad_fn=<NllLossBackward>)
cost:  tensor(1.8140, grad_fn=<NllLossBackward>)
cost:  tensor(1.8995, grad_fn=<NllLossBackward>)
cost:  tensor(1.8037, grad_fn=<NllLossBackward>)
cost:  tensor(1.8071, grad_fn=<NllLossBackward>)
cost:  tensor(1.8384, grad_fn=<NllLossBackward>)
cost:  tensor(1.8103, grad_fn=<NllLossBackward>)
cost:  tensor(1.7546, grad_fn=<NllLossBackward>)
cost:  tensor(1.8007, grad_fn=<NllLossBackward>)
cost:  tensor(1.8092, grad_fn=<NllLossBackward>)
cost:  tensor(1.9100, grad_fn=<NllLossBackward>)
cost:  tensor(1.7697, grad_fn=<NllLossBackward>)
cost:  tensor(1.7415, grad_fn=<NllLossBackward>)
cost:  tensor(1.7887, grad_fn=<NllLossBackward>)
cost:  tensor(1.8249, grad_fn=<NllLossBackward>)
cost:  tensor(1.8017

cost:  tensor(1.8497, grad_fn=<NllLossBackward>)
cost:  tensor(1.8580, grad_fn=<NllLossBackward>)
cost:  tensor(1.8011, grad_fn=<NllLossBackward>)
cost:  tensor(1.7811, grad_fn=<NllLossBackward>)
cost:  tensor(1.8229, grad_fn=<NllLossBackward>)
cost:  tensor(1.7865, grad_fn=<NllLossBackward>)
cost:  tensor(1.8715, grad_fn=<NllLossBackward>)
cost:  tensor(1.8063, grad_fn=<NllLossBackward>)
cost:  tensor(1.7815, grad_fn=<NllLossBackward>)
cost:  tensor(1.8075, grad_fn=<NllLossBackward>)
cost:  tensor(1.7680, grad_fn=<NllLossBackward>)
cost:  tensor(1.8014, grad_fn=<NllLossBackward>)
cost:  tensor(1.8452, grad_fn=<NllLossBackward>)
cost:  tensor(1.7481, grad_fn=<NllLossBackward>)
cost:  tensor(1.7889, grad_fn=<NllLossBackward>)
cost:  tensor(1.8342, grad_fn=<NllLossBackward>)
cost:  tensor(1.8563, grad_fn=<NllLossBackward>)
cost:  tensor(1.7989, grad_fn=<NllLossBackward>)
cost:  tensor(1.8449, grad_fn=<NllLossBackward>)
cost:  tensor(1.8425, grad_fn=<NllLossBackward>)
cost:  tensor(1.8335

cost:  tensor(1.8177, grad_fn=<NllLossBackward>)
cost:  tensor(1.7702, grad_fn=<NllLossBackward>)
cost:  tensor(1.8950, grad_fn=<NllLossBackward>)
cost:  tensor(1.7996, grad_fn=<NllLossBackward>)
cost:  tensor(1.8096, grad_fn=<NllLossBackward>)
cost:  tensor(1.7742, grad_fn=<NllLossBackward>)
cost:  tensor(1.8755, grad_fn=<NllLossBackward>)
cost:  tensor(1.7780, grad_fn=<NllLossBackward>)
cost:  tensor(1.8023, grad_fn=<NllLossBackward>)
cost:  tensor(1.8330, grad_fn=<NllLossBackward>)
cost:  tensor(1.8606, grad_fn=<NllLossBackward>)
cost:  tensor(1.7499, grad_fn=<NllLossBackward>)
cost:  tensor(1.8417, grad_fn=<NllLossBackward>)
cost:  tensor(1.8010, grad_fn=<NllLossBackward>)
cost:  tensor(1.8462, grad_fn=<NllLossBackward>)
cost:  tensor(1.7819, grad_fn=<NllLossBackward>)
cost:  tensor(1.7958, grad_fn=<NllLossBackward>)
cost:  tensor(1.8117, grad_fn=<NllLossBackward>)
cost:  tensor(1.8075, grad_fn=<NllLossBackward>)
cost:  tensor(1.7895, grad_fn=<NllLossBackward>)
cost:  tensor(1.8429

cost:  tensor(1.7807, grad_fn=<NllLossBackward>)
cost:  tensor(1.7788, grad_fn=<NllLossBackward>)
cost:  tensor(1.7163, grad_fn=<NllLossBackward>)
cost:  tensor(1.6665, grad_fn=<NllLossBackward>)
cost:  tensor(1.6727, grad_fn=<NllLossBackward>)
cost:  tensor(1.6861, grad_fn=<NllLossBackward>)
cost:  tensor(1.7854, grad_fn=<NllLossBackward>)
cost:  tensor(1.6955, grad_fn=<NllLossBackward>)
cost:  tensor(1.7019, grad_fn=<NllLossBackward>)
cost:  tensor(1.7213, grad_fn=<NllLossBackward>)
cost:  tensor(1.6727, grad_fn=<NllLossBackward>)
cost:  tensor(1.7310, grad_fn=<NllLossBackward>)
cost:  tensor(1.6858, grad_fn=<NllLossBackward>)
cost:  tensor(1.7761, grad_fn=<NllLossBackward>)
cost:  tensor(1.7201, grad_fn=<NllLossBackward>)
cost:  tensor(1.6809, grad_fn=<NllLossBackward>)
cost:  tensor(1.6943, grad_fn=<NllLossBackward>)
cost:  tensor(1.6979, grad_fn=<NllLossBackward>)
cost:  tensor(1.7316, grad_fn=<NllLossBackward>)
cost:  tensor(1.7010, grad_fn=<NllLossBackward>)
cost:  tensor(1.7288

cost:  tensor(1.6921, grad_fn=<NllLossBackward>)
cost:  tensor(1.6921, grad_fn=<NllLossBackward>)
cost:  tensor(1.7177, grad_fn=<NllLossBackward>)
cost:  tensor(1.6758, grad_fn=<NllLossBackward>)
cost:  tensor(1.7316, grad_fn=<NllLossBackward>)
cost:  tensor(1.7316, grad_fn=<NllLossBackward>)
cost:  tensor(1.6566, grad_fn=<NllLossBackward>)
cost:  tensor(1.7100, grad_fn=<NllLossBackward>)
cost:  tensor(1.7956, grad_fn=<NllLossBackward>)
cost:  tensor(1.7791, grad_fn=<NllLossBackward>)
cost:  tensor(1.6798, grad_fn=<NllLossBackward>)
cost:  tensor(1.7061, grad_fn=<NllLossBackward>)
cost:  tensor(1.7378, grad_fn=<NllLossBackward>)
cost:  tensor(1.7157, grad_fn=<NllLossBackward>)
cost:  tensor(1.7499, grad_fn=<NllLossBackward>)
cost:  tensor(1.7165, grad_fn=<NllLossBackward>)
cost:  tensor(1.6726, grad_fn=<NllLossBackward>)
cost:  tensor(1.7049, grad_fn=<NllLossBackward>)
cost:  tensor(1.6865, grad_fn=<NllLossBackward>)
cost:  tensor(1.6940, grad_fn=<NllLossBackward>)
cost:  tensor(1.7690

cost:  tensor(1.6762, grad_fn=<NllLossBackward>)
cost:  tensor(1.7058, grad_fn=<NllLossBackward>)
cost:  tensor(1.7428, grad_fn=<NllLossBackward>)
cost:  tensor(1.6931, grad_fn=<NllLossBackward>)
cost:  tensor(1.7274, grad_fn=<NllLossBackward>)
cost:  tensor(1.7030, grad_fn=<NllLossBackward>)
cost:  tensor(1.7489, grad_fn=<NllLossBackward>)
cost:  tensor(1.7499, grad_fn=<NllLossBackward>)
cost:  tensor(1.7546, grad_fn=<NllLossBackward>)
cost:  tensor(1.7395, grad_fn=<NllLossBackward>)
cost:  tensor(1.7776, grad_fn=<NllLossBackward>)
cost:  tensor(1.7253, grad_fn=<NllLossBackward>)
cost:  tensor(1.7796, grad_fn=<NllLossBackward>)
cost:  tensor(1.7260, grad_fn=<NllLossBackward>)
cost:  tensor(1.6984, grad_fn=<NllLossBackward>)
cost:  tensor(1.6866, grad_fn=<NllLossBackward>)
cost:  tensor(1.7157, grad_fn=<NllLossBackward>)
cost:  tensor(1.7405, grad_fn=<NllLossBackward>)
cost:  tensor(1.7227, grad_fn=<NllLossBackward>)
cost:  tensor(1.7053, grad_fn=<NllLossBackward>)
cost:  tensor(1.7504

cost:  tensor(1.7560, grad_fn=<NllLossBackward>)
cost:  tensor(1.6941, grad_fn=<NllLossBackward>)
cost:  tensor(1.6761, grad_fn=<NllLossBackward>)
cost:  tensor(1.7648, grad_fn=<NllLossBackward>)
cost:  tensor(1.6854, grad_fn=<NllLossBackward>)
cost:  tensor(1.7133, grad_fn=<NllLossBackward>)
cost:  tensor(1.7333, grad_fn=<NllLossBackward>)
cost:  tensor(1.7654, grad_fn=<NllLossBackward>)
cost:  tensor(1.6564, grad_fn=<NllLossBackward>)
cost:  tensor(1.6337, grad_fn=<NllLossBackward>)
cost:  tensor(1.7311, grad_fn=<NllLossBackward>)
cost:  tensor(1.7356, grad_fn=<NllLossBackward>)
cost:  tensor(1.7215, grad_fn=<NllLossBackward>)
cost:  tensor(1.6776, grad_fn=<NllLossBackward>)
cost:  tensor(1.7492, grad_fn=<NllLossBackward>)
cost:  tensor(1.7293, grad_fn=<NllLossBackward>)
cost:  tensor(1.6884, grad_fn=<NllLossBackward>)
cost:  tensor(1.7105, grad_fn=<NllLossBackward>)
cost:  tensor(1.7037, grad_fn=<NllLossBackward>)
cost:  tensor(1.6988, grad_fn=<NllLossBackward>)
cost:  tensor(1.7364

cost:  tensor(1.7642, grad_fn=<NllLossBackward>)
cost:  tensor(1.6751, grad_fn=<NllLossBackward>)
cost:  tensor(1.7281, grad_fn=<NllLossBackward>)
cost:  tensor(1.7260, grad_fn=<NllLossBackward>)
cost:  tensor(1.7013, grad_fn=<NllLossBackward>)
cost:  tensor(1.6847, grad_fn=<NllLossBackward>)
cost:  tensor(1.7359, grad_fn=<NllLossBackward>)
cost:  tensor(1.6692, grad_fn=<NllLossBackward>)
cost:  tensor(1.7495, grad_fn=<NllLossBackward>)
cost:  tensor(1.7431, grad_fn=<NllLossBackward>)
cost:  tensor(1.6517, grad_fn=<NllLossBackward>)
cost:  tensor(1.7060, grad_fn=<NllLossBackward>)
cost:  tensor(1.7051, grad_fn=<NllLossBackward>)
cost:  tensor(1.7177, grad_fn=<NllLossBackward>)
cost:  tensor(1.6880, grad_fn=<NllLossBackward>)
cost:  tensor(1.7553, grad_fn=<NllLossBackward>)
cost:  tensor(1.7160, grad_fn=<NllLossBackward>)
cost:  tensor(1.7647, grad_fn=<NllLossBackward>)
cost:  tensor(1.7422, grad_fn=<NllLossBackward>)
cost:  tensor(1.7356, grad_fn=<NllLossBackward>)
cost:  tensor(1.6585

cost:  tensor(1.6888, grad_fn=<NllLossBackward>)
cost:  tensor(1.6805, grad_fn=<NllLossBackward>)
cost:  tensor(1.6712, grad_fn=<NllLossBackward>)
cost:  tensor(1.7180, grad_fn=<NllLossBackward>)
cost:  tensor(1.6455, grad_fn=<NllLossBackward>)
cost:  tensor(1.7122, grad_fn=<NllLossBackward>)
cost:  tensor(1.6207, grad_fn=<NllLossBackward>)
cost:  tensor(1.7452, grad_fn=<NllLossBackward>)
cost:  tensor(1.6695, grad_fn=<NllLossBackward>)
cost:  tensor(1.7419, grad_fn=<NllLossBackward>)
cost:  tensor(1.7257, grad_fn=<NllLossBackward>)
cost:  tensor(1.6979, grad_fn=<NllLossBackward>)
cost:  tensor(1.7123, grad_fn=<NllLossBackward>)
cost:  tensor(1.6878, grad_fn=<NllLossBackward>)
cost:  tensor(1.6802, grad_fn=<NllLossBackward>)
cost:  tensor(1.6572, grad_fn=<NllLossBackward>)
cost:  tensor(1.7649, grad_fn=<NllLossBackward>)
cost:  tensor(1.7317, grad_fn=<NllLossBackward>)
cost:  tensor(1.7074, grad_fn=<NllLossBackward>)
cost:  tensor(1.7572, grad_fn=<NllLossBackward>)
cost:  tensor(1.7178

cost:  tensor(1.7099, grad_fn=<NllLossBackward>)
cost:  tensor(1.7088, grad_fn=<NllLossBackward>)
cost:  tensor(1.7344, grad_fn=<NllLossBackward>)
cost:  tensor(1.6944, grad_fn=<NllLossBackward>)
cost:  tensor(1.7465, grad_fn=<NllLossBackward>)
cost:  tensor(1.7359, grad_fn=<NllLossBackward>)
cost:  tensor(1.6616, grad_fn=<NllLossBackward>)
cost:  tensor(1.6698, grad_fn=<NllLossBackward>)
cost:  tensor(1.7509, grad_fn=<NllLossBackward>)
cost:  tensor(1.7626, grad_fn=<NllLossBackward>)
cost:  tensor(1.6813, grad_fn=<NllLossBackward>)
cost:  tensor(1.6610, grad_fn=<NllLossBackward>)
cost:  tensor(1.6918, grad_fn=<NllLossBackward>)
cost:  tensor(1.6419, grad_fn=<NllLossBackward>)
cost:  tensor(1.7250, grad_fn=<NllLossBackward>)
cost:  tensor(1.6950, grad_fn=<NllLossBackward>)
cost:  tensor(1.7471, grad_fn=<NllLossBackward>)
cost:  tensor(1.7340, grad_fn=<NllLossBackward>)
cost:  tensor(1.6605, grad_fn=<NllLossBackward>)
cost:  tensor(1.6731, grad_fn=<NllLossBackward>)
cost:  tensor(1.6703

cost:  tensor(1.7881, grad_fn=<NllLossBackward>)
cost:  tensor(1.6974, grad_fn=<NllLossBackward>)
cost:  tensor(1.7683, grad_fn=<NllLossBackward>)
cost:  tensor(1.6479, grad_fn=<NllLossBackward>)
cost:  tensor(1.7043, grad_fn=<NllLossBackward>)
cost:  tensor(1.8136, grad_fn=<NllLossBackward>)
cost:  tensor(1.6722, grad_fn=<NllLossBackward>)
cost:  tensor(1.7369, grad_fn=<NllLossBackward>)
cost:  tensor(1.6878, grad_fn=<NllLossBackward>)
cost:  tensor(1.7948, grad_fn=<NllLossBackward>)
cost:  tensor(1.7637, grad_fn=<NllLossBackward>)
cost:  tensor(1.7894, grad_fn=<NllLossBackward>)
cost:  tensor(1.7632, grad_fn=<NllLossBackward>)
cost:  tensor(1.7073, grad_fn=<NllLossBackward>)
cost:  tensor(1.7365, grad_fn=<NllLossBackward>)
cost:  tensor(1.6922, grad_fn=<NllLossBackward>)
cost:  tensor(1.7699, grad_fn=<NllLossBackward>)
cost:  tensor(1.6731, grad_fn=<NllLossBackward>)
cost:  tensor(1.6823, grad_fn=<NllLossBackward>)
cost:  tensor(1.6424, grad_fn=<NllLossBackward>)
cost:  tensor(1.7382

cost:  tensor(1.7116, grad_fn=<NllLossBackward>)
cost:  tensor(1.7042, grad_fn=<NllLossBackward>)
cost:  tensor(1.6985, grad_fn=<NllLossBackward>)
cost:  tensor(1.7850, grad_fn=<NllLossBackward>)
cost:  tensor(1.6621, grad_fn=<NllLossBackward>)
cost:  tensor(1.7319, grad_fn=<NllLossBackward>)
cost:  tensor(1.7795, grad_fn=<NllLossBackward>)
cost:  tensor(1.7646, grad_fn=<NllLossBackward>)
cost:  tensor(1.6743, grad_fn=<NllLossBackward>)
cost:  tensor(1.7242, grad_fn=<NllLossBackward>)
cost:  tensor(1.7047, grad_fn=<NllLossBackward>)
cost:  tensor(1.7249, grad_fn=<NllLossBackward>)
cost:  tensor(1.6880, grad_fn=<NllLossBackward>)
cost:  tensor(1.7147, grad_fn=<NllLossBackward>)
cost:  tensor(1.7829, grad_fn=<NllLossBackward>)
cost:  tensor(1.6731, grad_fn=<NllLossBackward>)
cost:  tensor(1.7422, grad_fn=<NllLossBackward>)
cost:  tensor(1.6874, grad_fn=<NllLossBackward>)
cost:  tensor(1.7146, grad_fn=<NllLossBackward>)
cost:  tensor(1.6640, grad_fn=<NllLossBackward>)
cost:  tensor(1.6900

cost:  tensor(1.7290, grad_fn=<NllLossBackward>)
cost:  tensor(1.7010, grad_fn=<NllLossBackward>)
cost:  tensor(1.7221, grad_fn=<NllLossBackward>)
cost:  tensor(1.6318, grad_fn=<NllLossBackward>)
cost:  tensor(1.7131, grad_fn=<NllLossBackward>)
cost:  tensor(1.6885, grad_fn=<NllLossBackward>)
cost:  tensor(1.7870, grad_fn=<NllLossBackward>)
cost:  tensor(1.7422, grad_fn=<NllLossBackward>)
cost:  tensor(1.7509, grad_fn=<NllLossBackward>)
cost:  tensor(1.7359, grad_fn=<NllLossBackward>)
cost:  tensor(1.7061, grad_fn=<NllLossBackward>)
cost:  tensor(1.6924, grad_fn=<NllLossBackward>)
cost:  tensor(1.7228, grad_fn=<NllLossBackward>)
cost:  tensor(1.6987, grad_fn=<NllLossBackward>)
cost:  tensor(1.6837, grad_fn=<NllLossBackward>)
cost:  tensor(1.6721, grad_fn=<NllLossBackward>)
cost:  tensor(1.7045, grad_fn=<NllLossBackward>)
cost:  tensor(1.7137, grad_fn=<NllLossBackward>)
cost:  tensor(1.7535, grad_fn=<NllLossBackward>)
cost:  tensor(1.7290, grad_fn=<NllLossBackward>)
cost:  tensor(1.7031

cost:  tensor(1.7086, grad_fn=<NllLossBackward>)
cost:  tensor(1.6827, grad_fn=<NllLossBackward>)
cost:  tensor(1.7246, grad_fn=<NllLossBackward>)
cost:  tensor(1.7290, grad_fn=<NllLossBackward>)
cost:  tensor(1.7258, grad_fn=<NllLossBackward>)
cost:  tensor(1.6937, grad_fn=<NllLossBackward>)
cost:  tensor(1.7033, grad_fn=<NllLossBackward>)
cost:  tensor(1.7229, grad_fn=<NllLossBackward>)
cost:  tensor(1.7442, grad_fn=<NllLossBackward>)
cost:  tensor(1.6769, grad_fn=<NllLossBackward>)
cost:  tensor(1.7624, grad_fn=<NllLossBackward>)
cost:  tensor(1.7225, grad_fn=<NllLossBackward>)
cost:  tensor(1.7609, grad_fn=<NllLossBackward>)
cost:  tensor(1.7326, grad_fn=<NllLossBackward>)
cost:  tensor(1.7743, grad_fn=<NllLossBackward>)
cost:  tensor(1.7149, grad_fn=<NllLossBackward>)
cost:  tensor(1.7620, grad_fn=<NllLossBackward>)
cost:  tensor(1.7267, grad_fn=<NllLossBackward>)
cost:  tensor(1.7432, grad_fn=<NllLossBackward>)
cost:  tensor(1.6863, grad_fn=<NllLossBackward>)
cost:  tensor(1.6951

cost:  tensor(1.7747, grad_fn=<NllLossBackward>)
cost:  tensor(1.7020, grad_fn=<NllLossBackward>)
cost:  tensor(1.7144, grad_fn=<NllLossBackward>)
cost:  tensor(1.7164, grad_fn=<NllLossBackward>)
cost:  tensor(1.6923, grad_fn=<NllLossBackward>)
cost:  tensor(1.7496, grad_fn=<NllLossBackward>)
cost:  tensor(1.7389, grad_fn=<NllLossBackward>)
cost:  tensor(1.6565, grad_fn=<NllLossBackward>)
cost:  tensor(1.6565, grad_fn=<NllLossBackward>)
cost:  tensor(1.7653, grad_fn=<NllLossBackward>)
cost:  tensor(1.7350, grad_fn=<NllLossBackward>)
cost:  tensor(1.7266, grad_fn=<NllLossBackward>)
cost:  tensor(1.6725, grad_fn=<NllLossBackward>)
cost:  tensor(1.7097, grad_fn=<NllLossBackward>)
cost:  tensor(1.6927, grad_fn=<NllLossBackward>)
cost:  tensor(1.7010, grad_fn=<NllLossBackward>)
cost:  tensor(1.7087, grad_fn=<NllLossBackward>)
cost:  tensor(1.7234, grad_fn=<NllLossBackward>)
cost:  tensor(1.7514, grad_fn=<NllLossBackward>)
cost:  tensor(1.7334, grad_fn=<NllLossBackward>)
cost:  tensor(1.7482

In [42]:
# Test set prediction
sum_cost = 0
for test_data, test_label in test_iter:
    sum_cost += loss(model(test_data), test_label)
    print("cost so far: ", sum_cost)
sum_cost / len(test_iter)

cost so far:  tensor(1.6259, grad_fn=<AddBackward0>)
cost so far:  tensor(3.4068, grad_fn=<AddBackward0>)
cost so far:  tensor(5.1399, grad_fn=<AddBackward0>)
cost so far:  tensor(6.8742, grad_fn=<AddBackward0>)
cost so far:  tensor(8.6006, grad_fn=<AddBackward0>)
cost so far:  tensor(10.2977, grad_fn=<AddBackward0>)
cost so far:  tensor(11.9965, grad_fn=<AddBackward0>)
cost so far:  tensor(13.6860, grad_fn=<AddBackward0>)
cost so far:  tensor(15.4217, grad_fn=<AddBackward0>)
cost so far:  tensor(17.1069, grad_fn=<AddBackward0>)
cost so far:  tensor(18.8489, grad_fn=<AddBackward0>)
cost so far:  tensor(20.5515, grad_fn=<AddBackward0>)
cost so far:  tensor(22.2843, grad_fn=<AddBackward0>)
cost so far:  tensor(24.0331, grad_fn=<AddBackward0>)
cost so far:  tensor(25.7913, grad_fn=<AddBackward0>)
cost so far:  tensor(27.5064, grad_fn=<AddBackward0>)
cost so far:  tensor(29.3112, grad_fn=<AddBackward0>)
cost so far:  tensor(31.0304, grad_fn=<AddBackward0>)
cost so far:  tensor(32.7429, gra

tensor(1.7227, grad_fn=<DivBackward0>)