In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

# for loading MNIST data
from torchvision import transforms, datasets

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

In [3]:
# if cuda device is available then run model on gpu
if torch.cuda.is_available():
    cuda_flag=True
else:
    cuda_flag=False
    
torch.manual_seed(3120)

<torch._C.Generator at 0x7f0f8cabb410>

### Setting up data loader

In [4]:
batch_size=64

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])])

train_loader = torch.utils.data.DataLoader(
                datasets.MNIST('./dataset/', train=True, download=True,transform=transform),
                batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
                datasets.MNIST('./dataset/', train=False, download=True, transform=transform),
                 batch_size=batch_size)

### Class definition for encoder decoder networks

In [5]:
class encoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(encoder, self).__init__()        
        self.fc1 = nn.Linear(in_features=input_dim, out_features=encoding_dim*4)
        self.fc2 = nn.Linear(in_features=encoding_dim*4, out_features=encoding_dim*2)
        self.fc3 = nn.Linear(in_features=encoding_dim*2, out_features=encoding_dim)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

class decoder(nn.Module):
    def __init__(self, output_dim, encoding_dim):
        super(decoder, self).__init__()        
        self.fc1 = nn.Linear(in_features=encoding_dim, out_features=encoding_dim*2)
        self.fc2 = nn.Linear(in_features=encoding_dim*2, out_features=encoding_dim*4)
        self.fc3 = nn.Linear(in_features=encoding_dim*4, out_features=output_dim)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

In [6]:
# instantiate the models
enc = encoder(input_dim=784, encoding_dim=100)
dec = decoder(output_dim=784, encoding_dim=100)

if cuda_flag:
    enc,dec = enc.cuda(), dec.cuda()

In [7]:
# define optimizer to update the model parameters for both encoder and decoder
opt = optim.Adam(list(enc.parameters())+list(dec.parameters()), lr=1e-4)

# define loss function
crit = nn.MSELoss()

In [8]:
epochs = 10
loss_history = []

In [9]:
for epoch in tqdm(range(epochs)):
    train_loss=0
    
    # iterate over dataset
    for x,_ in train_loader:
        x = x.view(len(x),-1)
        # move data to gpu if cuda_flag is set
        if cuda_flag:
            x = x.cuda()
        
        # zero_grad to ensure no unaccounted calculation creeps in while calculating gradients
        opt.zero_grad()
        
        # forward propogation and loss computation
        x_gen = dec(enc(x))
        loss = crit(x_gen,x)
        train_loss+=loss.item()
        
        # backpropogate gradients
        loss.backward()
        
        # update weights
        opt.step()
    train_loss/=len(train_loader)

    print ("Epoch:{}\t Train Loss:{:.6}\t".format(epoch,train_loss))
    loss_history.append(train_loss)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Epoch:0	 Train Loss:0.22455	
Epoch:1	 Train Loss:0.103036	
Epoch:2	 Train Loss:0.0753213	
Epoch:3	 Train Loss:0.0630564	
Epoch:4	 Train Loss:0.0562923	
Epoch:5	 Train Loss:0.0510956	
Epoch:6	 Train Loss:0.0466395	
Epoch:7	 Train Loss:0.0432341	
Epoch:8	 Train Loss:0.0402964	
Epoch:9	 Train Loss:0.0379133	



In [10]:
test_loss = 0
with torch.no_grad():
    for x,_ in test_loader:
        x = x.view(len(x),-1)
        if cuda_flag:
            x = x.cuda()
        
        x_gen = dec(enc(x))
        loss = crit(x_gen,x)
        test_loss+=loss.item()
test_loss/=len(test_loader)
print("Loss on test set: ", test_loss)

Loss on test set:  0.03588985968499806


### How good are our low dimensional embeddings?

Let's see what good our low dimensional embeddings are at downstream classification task.

In [11]:
# building compressed dataset with encoder

compressed_train_data = []
train_labels = []
with torch.no_grad():
    for x,y in train_loader:
        x = x.view(len(x),-1)
        if cuda_flag:
            x = x.cuda()

        x_comp = enc(x).cpu()
        compressed_train_data.append(x_comp)
        train_labels.append(y)

    compressed_train_data = torch.cat(compressed_train_data)
    train_labels = torch.cat(train_labels)
    compressed_train_loader = torch.utils.data.DataLoader(
                                torch.utils.data.TensorDataset(compressed_train_data, train_labels),
                                batch_size=batch_size, shuffle=True)

compressed_test_data = []
test_labels = []
with torch.no_grad():
    for x,y in test_loader:
        x = x.view(len(x),-1)
        if cuda_flag:
            x = x.cuda()

        x_comp = enc(x).cpu()
        compressed_test_data.append(x_comp)
        test_labels.append(y)

    compressed_test_data = torch.cat(compressed_test_data)
    test_labels = torch.cat(test_labels)
    compressed_test_loader = torch.utils.data.DataLoader(
                                torch.utils.data.TensorDataset(compressed_test_data, test_labels),
                                batch_size=batch_size, shuffle=True)    

In [12]:
class classifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(classifier, self).__init__()        
        self.fc1 = nn.Linear(in_features=input_dim, out_features=input_dim//2)
        self.fc2 = nn.Linear(in_features=self.fc1.out_features, out_features=self.fc1.out_features//2)
        self.fc3 = nn.Linear(in_features=self.fc2.out_features, out_features=num_classes)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x),1)
        return x

In [13]:
def trainer(model, train_loader, test_loader, train_data_fraction=1):
    if cuda_flag:
        model = model.cuda()

    opt = optim.Adam(model.parameters())
    crit = nn.CrossEntropyLoss()
    epochs = 10
    
    model.train()
    for epoch in range(epochs):
        train_loss=0
        accuracy = 0

        for i,(x,y) in enumerate(train_loader):
            if ((i+1)/len(train_loader)>=train_data_fraction):
                break
            opt.zero_grad()

            x = x.view(len(x),-1)
            if cuda_flag:
                x,y = x.cuda(), y.cuda()

            y_pred = model(x)
            loss = crit(y_pred,y)
            train_loss+=loss.item()

            loss.backward()
            opt.step()

            _, predicted = torch.max(y_pred, 1)
            accuracy += (predicted == y).sum().item()

        train_loss/=len(train_loader)
        train_accuracy = accuracy*100/(batch_size*len(train_loader)*train_data_fraction)
        print ("Epoch:{}\t Train Loss:{:.6}\t Train Accuracy:{:.4}".format(epoch,train_loss,train_accuracy))

    model.eval()
    accuracy = 0
    num_examples = 0
    with torch.no_grad():
        for x,y in (test_loader):
            x = x.view(len(x),-1)
            if cuda_flag:
                x,y = x.cuda(), y.cuda()
            _, predicted = torch.max(model(x).detach(), 1) # detach to avoid accidental gradient backpropogation
            num_examples+=len(predicted)
            correct = (predicted == y).sum()
            accuracy += correct.item() 

    print ("Test Accuracy:",accuracy/num_examples)
    return train_accuracy, accuracy/num_examples

#### Training on Original Dataset

In [14]:
model = classifier(784,10)
trainer(model, train_loader, test_loader)

Epoch:0	 Train Loss:1.61197	 Train Accuracy:85.4
Epoch:1	 Train Loss:1.53135	 Train Accuracy:92.89
Epoch:2	 Train Loss:1.51854	 Train Accuracy:94.09
Epoch:3	 Train Loss:1.51208	 Train Accuracy:94.69
Epoch:4	 Train Loss:1.50588	 Train Accuracy:95.27
Epoch:5	 Train Loss:1.5036	 Train Accuracy:95.5
Epoch:6	 Train Loss:1.5006	 Train Accuracy:95.8
Epoch:7	 Train Loss:1.4989	 Train Accuracy:95.96
Epoch:8	 Train Loss:1.49608	 Train Accuracy:96.23
Epoch:9	 Train Loss:1.49507	 Train Accuracy:96.35
Test Accuracy: 0.9661


(96.35027985074628, 0.9661)

#### Training on Encoded Dataset

In [15]:
model = classifier(100,10)
trainer(model, compressed_train_loader, compressed_test_loader)

Epoch:0	 Train Loss:1.75749	 Train Accuracy:72.81
Epoch:1	 Train Loss:1.62884	 Train Accuracy:83.57
Epoch:2	 Train Loss:1.6031	 Train Accuracy:86.1
Epoch:3	 Train Loss:1.54172	 Train Accuracy:92.4
Epoch:4	 Train Loss:1.52721	 Train Accuracy:93.65
Epoch:5	 Train Loss:1.51895	 Train Accuracy:94.42
Epoch:6	 Train Loss:1.51313	 Train Accuracy:94.94
Epoch:7	 Train Loss:1.5083	 Train Accuracy:95.39
Epoch:8	 Train Loss:1.50498	 Train Accuracy:95.69
Epoch:9	 Train Loss:1.50175	 Train Accuracy:95.98
Test Accuracy: 0.9583


(95.98380863539445, 0.9583)

As evident from the above training and test accuracies, the model trained on lower dimensional encoding dataset has __test accuracy at par__ with model trained on original dataset despite having significantly __less parameters__. Another advantage of dimension reduction is the reduced requirement of labelled data owing to less parameters in the supervised task model, thus improving data efficiency.