In [1]:
#libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
import pandas as pd
import scipy.stats
import random

In [111]:
#load MNIST data

train_data = pd.read_csv('./mnist_train.csv', sep=',', header=None)
train_labels = train_data[0]
train_data = train_data.drop(0, axis=1)

test_data = pd.read_csv('./mnist_test.csv', sep=',', header=None)
test_labels = test_data[0]
test_data = test_data.drop(0, axis=1)

#separate data for generating graphs
graph_data = train_data.sample(n = 10000, random_state=100)
graph_labels = train_labels.sample(n = 10000, random_state=100)
train_data = train_data.drop(graph_data.index)
train_labels = train_labels.drop(graph_data.index)

#convert data to pytorch tensors
train_data = torch.FloatTensor(train_data.to_numpy())
train_labels = torch.LongTensor(train_labels.to_numpy())
test_data = torch.FloatTensor(test_data.to_numpy())
graph_data = torch.FloatTensor(graph_data.to_numpy())

In [3]:
input_size = 784
output_size = 10

In [4]:
#(300, 100) Vanilla FCN

class Vanilla_Net(nn.Module):
    def __init__(self):
        super(Vanilla_Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 300)
        self.fc2 = nn.Linear(300, 100)
        self.fc3 = nn.Linear(100, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x)

In [38]:
def SGD(net, optimizer, loss, epochs, train_data, train_labels, batch_size):
    for i in range(epochs):
        for j in range(0, train_data.shape[0], batch_size):
            data_minibatch = Variable(train_data[j : j+batch_size])
            label_minibatch = Variable(train_labels[j: j+batch_size])
            optimizer.zero_grad()
            net_out = net(data_minibatch)
            net_loss = loss(net_out, label_minibatch)
            net_loss.backward()
            optimizer.step()

In [27]:
def test_accuracy(net, test_data, test_labels):
    net_out = net(test_data)
    test_out = torch.max(net_out.data, 1)[1].numpy()
    return np.count_nonzero(test_out==test_labels) / len(test_labels)

In [31]:
#determine optimal # of epochs for SGD

epochs = 1
batch_size = 20 #typical value
learning_rate = 0.001 #default value
mmt = 0.9 #typical value
cur_accuracy = 0
prev_accuracy = 0
while True:
    prev_accuracy = cur_accuracy
    my_net = Vanilla_Net()
    optimizer = torch.optim.SGD(my_net.parameters(), lr=learning_rate, momentum=mmt)
    loss = nn.CrossEntropyLoss()
    SGD(my_net, optimizer, loss, epochs, train_data, train_labels, batch_size)
    cur_accuracy = test_accuracy(my_net, test_data, test_labels)
    print(cur_accuracy)
    if (cur_accuracy <= prev_accuracy-0.005):
        break
    epochs += 1
epochs -= 1
print(epochs)

  return F.log_softmax(x)


0.9178
0.9513
0.9497
0.956
0.963
0.9673
0.9586
6


After running the above code a few times, it seems like the network typically achieves maximum accuracy after around 5 epochs. 

In [34]:
# join train and test data

train_test_data = torch.cat((train_data, test_data))
train_test_labels = torch.cat((train_labels, torch.LongTensor(test_labels.to_numpy())))

In [39]:
# train vanilla network with train+test data

vanilla_net = Vanilla_Net()
optimizer = torch.optim.SGD(vanilla_net.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()
SGD(vanilla_net, optimizer, loss, epochs=5, train_data=train_test_data, train_labels=train_test_labels, batch_size=20)

  return F.log_softmax(x)


In [83]:
#(300, 100) FCN trained with dropout

p = 0.5 #same as paper

class Dropout_Net(nn.Module):
    def __init__(self):
        super(Dropout_Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 300)
        self.fc2 = nn.Linear(300, 100)
        self.fc3 = nn.Linear(100, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p)
        x = F.relu(self.fc2(x))
        #x = F.dropout(x, p)
        x = self.fc3(x)
        return F.log_softmax(x)

In [84]:
#determine optimal # of epochs for dropout_net

epochs = 1
batch_size = 20 #typical value
learning_rate = 0.001 #default value
mmt = 0.9 #typical value
cur_accuracy = 0
prev_accuracy = 0
while True:
    prev_accuracy = cur_accuracy
    my_net = Dropout_Net()
    optimizer = torch.optim.SGD(my_net.parameters(), lr=learning_rate, momentum=mmt)
    loss = nn.CrossEntropyLoss()
    SGD(my_net, optimizer, loss, epochs, train_data, train_labels, batch_size)
    cur_accuracy = test_accuracy(my_net, test_data, test_labels)
    print(cur_accuracy)
    if (cur_accuracy <= prev_accuracy-0.01):
        break
    epochs += 1
epochs -= 1
print(epochs)

  return F.log_softmax(x)


0.8501
0.8736
0.8971
0.9055
0.9127
0.9146
0.9201
0.9263
0.9291
0.9344
0.9218
10


It takes around 10 epochs to train the dropout network. 

In [85]:
# train dropout network

dropout_net = Dropout_Net()
optimizer = torch.optim.SGD(dropout_net.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()
SGD(dropout_net, optimizer, loss, epochs=10, train_data=train_test_data, train_labels=train_test_labels, batch_size=20)

  return F.log_softmax(x)


In [93]:
# (300, 100) FCN trained with batch norm

class BN_Net(nn.Module):
    def __init__(self):
        super(BN_Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 300)
        self.bn1 = nn.BatchNorm1d(300)
        self.fc2 = nn.Linear(300, 100)
        self.bn2 = nn.BatchNorm1d(100)
        self.fc3 = nn.Linear(100, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return F.log_softmax(x)

In [95]:
#determine optimal # of epochs for dropout_net

epochs = 1
batch_size = 20 #typical value
learning_rate = 0.001 #default value
mmt = 0.9 #typical value
cur_accuracy = 0
prev_accuracy = 0
while True:
    prev_accuracy = cur_accuracy
    my_net = BN_Net()
    optimizer = torch.optim.SGD(my_net.parameters(), lr=learning_rate, momentum=mmt)
    loss = nn.CrossEntropyLoss()
    SGD(my_net, optimizer, loss, epochs, train_data, train_labels, batch_size)
    cur_accuracy = test_accuracy(my_net, test_data, test_labels)
    print(cur_accuracy)
    if (cur_accuracy <= prev_accuracy-0.01):
        break
    epochs += 1
epochs -= 1
print(epochs)

  return F.log_softmax(x)


0.9587
0.9702
0.9726
0.9743
0.9769
0.9767


KeyboardInterrupt: 

The batch norm network achieves high accuracy after 5 epochs. 

In [96]:
# train batch norm network

batch_norm_net = BN_Net()
optimizer = torch.optim.SGD(batch_norm_net.parameters(), lr=0.001, momentum=0.9)
loss = nn.CrossEntropyLoss()
SGD(batch_norm_net, optimizer, loss, epochs=10, train_data=train_test_data, train_labels=train_test_labels, batch_size=20)

  return F.log_softmax(x)


In [106]:
#split graph data into 10

graph_data_subsets = np.array_split(graph_data, 10)

In [108]:
def neuron_values(net, data):
    activations = []
    def get_activation():
        def hook(model, input, output):
            activations.append(output.detach())
        return hook
    
    net.fc1.register_forward_hook(get_activation())
    net.fc2.register_forward_hook(get_activation())
    net(data)

    activations[0] = ((abs(activations[0]) + activations[0])/2).numpy()
    activations[1] = ((abs(activations[1]) + activations[1])/2).numpy()
    neurons = np.concatenate((activations[0].T, activations[1].T))
    return neurons

In [115]:

vanilla_neurons = neuron_values(vanilla_net, graph_data)
dropout_neurons = neuron_values(dropout_net, graph_data)
batch_norm_neurons = neuron_values(batch_norm_net, graph_data)

  return F.log_softmax(x)
  return F.log_softmax(x)
  return F.log_softmax(x)


In [133]:
def correlation_graph(neurons):
    n = len(neurons)
    adj_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            adj_matrix[i][j] = abs(scipy.stats.pearsonr(neurons[i], neurons[j])[0])
    return neurons

In [121]:
#split neuron data into 10 subsets to construct 10 graphs
vanilla_subsets = np.array_split(vanilla_neurons, 10, 1)
dropout_subsets = np.array_split(dropout_neurons, 10, 1)
batch_norm_subsets = np.array_split(batch_norm_neurons, 10, 1)

In [134]:
network_graphs = []

for i in range(10):
    network_graphs.append(correlation_graph(vanilla_subsets[i]))
    network_graphs.append(correlation_graph(dropout_subsets[i]))
    network_graphs.append(correlation_graph(batch_norm_subsets[i]))

  adj_matrix[i][j] = abs(scipy.stats.pearsonr(neurons[i], neurons[j])[0])


[[  0.        0.        0.      ...   0.        0.        0.     ]
 [  0.        0.        0.      ... 316.6242    0.        0.     ]
 [ 66.8235    0.        0.      ...   0.        0.      199.77635]
 ...
 [  0.        0.        0.      ...   0.        0.        0.     ]
 [  0.        0.        0.      ...   8.08379   0.        0.     ]
 [  0.        0.        0.      ...   0.        0.        0.     ]]


In [128]:
print(network_graphs[1])

None
