In [1]:
import numpy as np
from init import he_normal
from Actfunc import Relu
from Loss import*
from FFNN import FFNN, Layer 
from Data_loading import FASHION_MNIST, train_val_split


## Here we will show a simple usage of our FFNN class and compare it to a pytorch network with the same parameters when trained on Fashion-MNIST. For details on the FFNN class, see the file FFNN.py

### We set up helper functions 

In [2]:

def to_one_hot(y, num_classes=10):
    # returns (num_classes, batch_size)
    oh = np.zeros((num_classes, y.size))
    oh[y, np.arange(y.size)] = 1.0
    return oh


def accuracy(logits, y_true):
    preds = np.argmax(logits, axis=0)
    return np.mean(preds == y_true)

dataset="Fashion-MNIST"

### Set up parameters for the network and training (here we use Adam as our optimizer). The parameters used are not important, we just want to show that our implementation will give the same results as pytorch 

In [3]:
hidden_sizes=[200,200]
init_fn=he_normal
act_fn=Relu
beta=0.6
gamma=0.4
epochs = 3
batch_size = 1000
lr = 0.0001


### Load the data (note that we are treating every observation as a column-vector)

In [4]:
(Xtr, ytr), (Xte, yte) = FASHION_MNIST(flatten=True, one_hot=False)

In [5]:
X_train, y_train, X_val, y_val = train_val_split(Xtr, ytr, val_size=5000, seed=42)

In [6]:
input_size = X_train.shape[0]
output_size = 10
N_train = X_train.shape[1]

### Create the network

In [7]:
net = FFNN(input_size, hidden_sizes, output_size, init_fn, act_fn, beta, gamma,lambda_=0.0)

### We now train the network

In [8]:
for epoch in range(epochs):

    print(f"epoch number {epoch+1}")



    epoch_train_loss = 0.0
    num_batches = 0
    perm = np.random.permutation(N_train)
    y_train_oh = to_one_hot(y_train, output_size)
    y_val_oh   = to_one_hot(y_val, output_size)
    y_test_oh  = to_one_hot(yte, output_size)


    for start in range(0, N_train, batch_size):
        idx = perm[start:start+batch_size]
        X_batch = X_train[:, idx]
        y_batch = y_train[idx]

        y_batch_oh = y_train_oh[:, idx]

        logits, A, Z = net.forward(X_batch)
        loss = cross_entropy_batch(y_batch_oh, logits)

        

        epoch_train_loss += loss
        num_batches += 1

        grads_w, grads_b = net.full_gradient(A, Z, y_batch_oh, X_batch,lambda_=0.0)
        net.update_wb(grads_w, grads_b, learning_rate=lr, Adam=True)
        
        

    epoch_train_loss /= num_batches

    # ---- Eval on train + val ----
    train_logits, _, _ = net.forward(X_train)
    val_logits, _, _   = net.forward(X_val)

    train_oh = to_one_hot(y_train, output_size)
    val_oh   = to_one_hot(y_val,   output_size)

    train_loss = cross_entropy_batch(train_oh, train_logits)
    val_loss   = cross_entropy_batch(val_oh,   val_logits)

    train_acc = accuracy(train_logits, y_train)
    val_acc   = accuracy(val_logits, y_val)



    print(f"Epoch {epoch+1}/{epochs}  "
            f"Train acc: {train_acc:.4f}  Val acc: {val_acc:.4f}")

# ---- Final test evaluation (only once, after training) ----
test_logits, _, _ = net.forward(Xte)
test_oh = to_one_hot(yte, output_size)

test_loss = cross_entropy_batch(test_oh, test_logits)
test_acc  = accuracy(test_logits, yte)

 

print(f"FINAL TEST  -  loss: {test_loss:.4f}  acc: {test_acc:.4f}")

epoch number 1
Epoch 1/3  Train acc: 0.6728  Val acc: 0.6646
epoch number 2
Epoch 2/3  Train acc: 0.7622  Val acc: 0.7570
epoch number 3
Epoch 3/3  Train acc: 0.7956  Val acc: 0.7942
FINAL TEST  -  loss: 0.6232  acc: 0.7863


# Pytorch with same parameters (copied from exercise sessions)

In [9]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
import numpy as np



from torch.nn.parameter import Parameter

In [10]:
import numpy as np
import urllib.request
import tarfile
import io
import gzip
def FASHION_MNIST(flatten=True, one_hot=False):
    base = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
    get  = lambda name: gzip.decompress(urllib.request.urlopen(base+name).read())

    Xtr = np.frombuffer(get("train-images-idx3-ubyte.gz"), dtype=np.uint8, offset=16).reshape(-1, 28, 28) / 255.0
    ytr = np.frombuffer(get("train-labels-idx1-ubyte.gz"), dtype=np.uint8, offset=8)

    Xte = np.frombuffer(get("t10k-images-idx3-ubyte.gz"), dtype=np.uint8, offset=16).reshape(-1, 28, 28) / 255.0
    yte = np.frombuffer(get("t10k-labels-idx1-ubyte.gz"), dtype=np.uint8, offset=8)

    if flatten:
        Xtr = Xtr.reshape(len(Xtr), -1)
        Xte = Xte.reshape(len(Xte), -1)

    if one_hot:
        Ytr = np.zeros((ytr.size, 10))
        Yte = np.zeros((yte.size, 10))
        Ytr[np.arange(ytr.size), ytr] = 1
        Yte[np.arange(yte.size), yte] = 1
        return (Xtr, Ytr), (Xte, Yte)

    return (Xtr, ytr), (Xte, yte)

In [12]:
from sklearn.model_selection import train_test_split

(Xtr, ytr), (x_test, y_test) = FASHION_MNIST(flatten=True, one_hot=False)
x_train, x_valid, y_train, y_valid = train_test_split(Xtr, ytr, test_size=5000, random_state=42, shuffle=True)

#x_train, y_train, x_valid, y_valid = train_val_split(Xtr, ytr, val_size=5000, seed=42)

x_train = torch.from_numpy(x_train.copy())
y_train = torch.from_numpy(y_train.copy())

x_valid = torch.from_numpy(x_valid.copy())
y_valid = torch.from_numpy(y_valid.copy())

x_test = torch.from_numpy(x_test.copy())
y_test = torch.from_numpy(y_test.copy())

x_train = x_train.float()
x_valid = x_valid.float()
x_test  = x_test.float()

y_train = y_train.long()
y_valid = y_valid.long()
y_test  = y_test.long()

In [13]:
num_classes = 10
num_l1 = hidden_sizes[0]
num_l2 = hidden_sizes[1]
num_features = x_train.shape[1]

class FNNP(nn.Module):
    def __init__(self,num_features,num_hidden_1,num_hidden_2,num_output):
        super(FNNP,self).__init__()
        self.W_1 = Parameter(init.kaiming_normal_(torch.Tensor(num_hidden_1, num_features)))
        self.b_1 = Parameter(torch.reshape(init.kaiming_normal_(torch.Tensor(num_hidden_1,1)),(-1,))) 
        # hidden layer 1
        self.W_2 = Parameter(init.kaiming_normal_(torch.Tensor(num_hidden_2, num_hidden_1)))
        self.b_2 = Parameter(torch.reshape(init.kaiming_normal_(torch.Tensor(num_hidden_2,1)),(-1,))) 

        # hidden layer 2
        self.W_3 = Parameter(init.kaiming_normal_(torch.Tensor(num_output, num_hidden_2)))
        self.b_3 = Parameter(torch.reshape(init.kaiming_normal_(torch.Tensor(num_output,1)),(-1,))) 


        self.activation = torch.nn.ReLU()
    
    def forward(self,x):
        x = F.linear(x, self.W_1, self.b_1)
        x = self.activation(x)
        x = F.linear(x,self.W_2,self.b_2)
        x = self.activation(x)
        x = F.linear(x,self.W_3,self.b_3)
        return x
network = FNNP(num_features, num_l1,num_l2, num_classes)
network = network.float()

In [14]:
optimizer = optim.Adam(network.parameters(), lr=lr,betas=(beta,gamma))
criterion = nn.CrossEntropyLoss()

In [15]:
# we could have done this ourselves,
# but we should be aware of sklearn and its tools
from sklearn.metrics import accuracy_score

# setting hyperparameters and gettings epoch sizes
batch_size = batch_size
num_epochs = epochs
num_samples_train = x_train.shape[0]
num_batches_train = num_samples_train // batch_size
num_samples_valid = x_valid.shape[0]
num_batches_valid = num_samples_valid // batch_size

# setting up lists for handling loss/accuracy
train_acc, train_loss = [], []
valid_acc, valid_loss = [], []
test_acc, test_loss = [], []
cur_loss = 0
losses = []

get_slice = lambda i, size: range(i * size, (i + 1) * size)

for epoch in range(num_epochs):
    # Forward -> Backprob -> Update params
    ## Train
    cur_loss = 0
    network.train()
    for i in range(num_batches_train):
        optimizer.zero_grad()
        slce = get_slice(i, batch_size)
        output = network(x_train[slce])
        
        # compute gradients given loss
        target_batch = y_train[slce]
        batch_loss = criterion(output, target_batch)
        batch_loss.backward()
        optimizer.step()
        
        cur_loss += batch_loss   
    losses.append(cur_loss / batch_size)

    network.eval()
    ### Evaluate training
    train_preds, train_targs = [], []
    for i in range(num_batches_train):
        slce = get_slice(i, batch_size)
        output = network(x_train[slce])
        
        preds = torch.max(output, 1)[1]
        
        train_targs += list(y_train[slce].numpy())
        train_preds += list(preds.data.numpy())
    
    ### Evaluate validation
    val_preds, val_targs = [], []
    for i in range(num_batches_valid):
        slce = get_slice(i, batch_size)
        
        output = network(x_valid[slce])
        preds = torch.max(output, 1)[1]
        val_targs += list(y_valid[slce].numpy())
        val_preds += list(preds.data.numpy())
        

    train_acc_cur = accuracy_score(train_targs, train_preds)
    valid_acc_cur = accuracy_score(val_targs, val_preds)
    
    train_acc.append(train_acc_cur)
    valid_acc.append(valid_acc_cur)
    
    if epoch % 1 == 0:
        print("Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f" % (
                epoch+1, losses[-1], train_acc_cur, valid_acc_cur))

epoch = np.arange(len(train_acc))

network.eval()
with torch.no_grad():
    output = network(x_test)
    preds = torch.max(output, 1)[1]
    test_acc_final = accuracy_score(y_test.numpy(), preds.numpy())

print("\nFinal Test Accuracy: {:.4f}".format(test_acc_final))


Epoch  1 : Train Loss 0.122161 , Train acc 0.631745, Valid acc 0.632600
Epoch  2 : Train Loss 0.050827 , Train acc 0.750764, Valid acc 0.753800
Epoch  3 : Train Loss 0.036697 , Train acc 0.788873, Valid acc 0.788000

Final Test Accuracy: 0.7797


### As can be seen, the accuracies are very similar, indicating that our implementation is very close to the results of pytorch. feel free to vary beta, gamma and the learning rate and try again. Note that the pytorch part assumes only 2 hidden layers. The size of each of these two layers can be varied with the hidden_sizes parameter