# Week 7 Exercise

## Exercise 1

Use an LSTM network to classify the MNIST digits.

(1) Try single-layer, stacked, and bi-directional LSTMs  
(2) Print and compare the test results

In [None]:
from torchvision.datasets import MNIST
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

In [None]:
# The only preprocessing step is conversion of each image to a tensor;
# no normalization or augmentation is applied.
transform = transforms.Compose([transforms.ToTensor()])

# Training split, stored under ./mnist and downloaded if it is not there yet.
trainset = torchvision.datasets.MNIST(
    root='./mnist',
    train=True,
    download=True,
    transform=transform,
)
# Test split, read from the same root directory (no download requested here).
testset = torchvision.datasets.MNIST(
    root='./mnist',
    train=False,
    transform=transform,
)

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 1000
# Number of DataLoader worker processes passed to both loaders.
num_workers = 0

# Training batches are reshuffled every epoch.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
# Test batches keep the dataset order.
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)

In [None]:
# Each image is later reshaped to (seq_length, input_size) before it is fed
# to the LSTM models.
seq_length, input_size = 28, 28
# Width of the LSTM hidden state.
hidden_size = 50
# Number of output classes of the final linear layer.
num_classes = 10

In [None]:
class SingleLSTM(nn.Module):
    """Single-layer, unidirectional LSTM classifier.

    The sequence is run through one LSTM layer and the output at the last
    time step is mapped to class scores by a linear layer.

    Args:
        batch_size: Stored on the instance; not used by ``forward``, which
            takes the batch size from the input tensor.
        seq_length: Stored on the instance; not used by ``forward``.
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, batch_size, seq_length, input_size, hidden_size, num_classes):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.num_layers = 1
        # Layer count comes from self.num_layers so the LSTM and the initial
        # states built in forward() cannot drift apart.
        self.lstm = nn.LSTM(input_size, hidden_size, self.num_layers,
                            batch_first=True, dropout=0, bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        Args:
            x: Tensor of shape (batch, seq_length, input_size); the LSTM is
                built with ``batch_first=True``.
        """
        # Zero initial states are created from the input tensor, so they
        # share its device and dtype. This removes the dependency on a
        # module-level ``device`` variable that may not exist yet or may not
        # match where the model actually lives.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        out, _ = self.lstm(x, (h0, c0))

        # Classify from the LSTM output at the final time step.
        return self.fc(out[:, -1, :])

In [None]:
class StackedLSTM(nn.Module):
    """Four-layer, unidirectional (stacked) LSTM classifier.

    The sequence is run through a 4-layer LSTM and the top layer's output at
    the last time step is mapped to class scores by a linear layer.

    Args:
        batch_size: Stored on the instance; not used by ``forward``, which
            takes the batch size from the input tensor.
        seq_length: Stored on the instance; not used by ``forward``.
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, batch_size, seq_length, input_size, hidden_size, num_classes):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.num_layers = 4
        # Layer count comes from self.num_layers so the LSTM and the initial
        # states built in forward() cannot drift apart.
        self.lstm = nn.LSTM(input_size, hidden_size, self.num_layers,
                            batch_first=True, dropout=0, bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        Args:
            x: Tensor of shape (batch, seq_length, input_size); the LSTM is
                built with ``batch_first=True``.
        """
        # Zero initial states (one per layer) are created from the input
        # tensor, so they share its device and dtype. This removes the
        # dependency on a module-level ``device`` variable.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        out, _ = self.lstm(x, (h0, c0))

        # Classify from the LSTM output at the final time step.
        return self.fc(out[:, -1, :])

In [None]:
class BiLSTM(nn.Module):
    """Four-layer, bi-directional LSTM classifier.

    The sequence is run through a 4-layer bidirectional LSTM. The output at
    the last time step (forward and backward features concatenated, hence
    ``hidden_size * 2``) is mapped to class scores by a linear layer.

    Args:
        batch_size: Stored on the instance; not used by ``forward``, which
            takes the batch size from the input tensor.
        seq_length: Stored on the instance; not used by ``forward``.
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state per direction.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, batch_size, seq_length, input_size, hidden_size, num_classes):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.num_layers = 4
        # Layer count comes from self.num_layers so the LSTM and the initial
        # states built in forward() cannot drift apart.
        self.lstm = nn.LSTM(input_size, hidden_size, self.num_layers,
                            batch_first=True, dropout=0, bidirectional=True)
        # Both directions are concatenated along the feature axis.
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        Args:
            x: Tensor of shape (batch, seq_length, input_size); the LSTM is
                built with ``batch_first=True``.
        """
        # One zero state per layer and direction, created from the input
        # tensor so it shares its device and dtype. This removes the
        # dependency on a module-level ``device`` variable.
        h0 = x.new_zeros(self.num_layers*2, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers*2, x.size(0), self.hidden_size)

        out, _ = self.lstm(x, (h0, c0))

        # NOTE(review): at the last time step the backward direction of the
        # top layer has processed only one step of its input. A common
        # alternative is to combine out[:, -1, :hidden_size] with
        # out[:, 0, hidden_size:]. The original readout is kept so that
        # results stay comparable; confirm whether the change is wanted.
        return self.fc(out[:, -1, :])

In [None]:
# Placeholders for the final test metrics of each model; every pair is
# overwritten by the corresponding evaluation cell below.
single_acc = single_loss = 0
stacked_acc = stacked_loss = 0
bi_acc = bi_loss = 0

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Single-layer LSTM classifier, moved to the selected device.
model = SingleLSTM(
    batch_size, seq_length, input_size, hidden_size, num_classes
).to(device)
# Cross-entropy loss over the class scores, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 10

# Training mode is set once; nothing inside the loop switches it back.
model.train()
for epoch in range(epochs):
    # Per-epoch accumulators: summed batch losses and correct predictions.
    train_loss = 0
    train_correct = 0

    for x, y in trainloader:
        # Present each image as seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class is the index of the largest score in each row.
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == y).sum().item()

    # Average of the per-batch losses, and accuracy over the whole training set.
    train_loss /= len(trainloader)
    train_acc = train_correct / len(trainset)

    print('[%2d] TRAIN loss: %.4f, acc: %.4f' % (epoch + 1, train_loss, train_acc))

[ 1] TRAIN loss: 2.1659, acc: 0.2592
[ 2] TRAIN loss: 1.1616, acc: 0.6555
[ 3] TRAIN loss: 0.6373, acc: 0.8136
[ 4] TRAIN loss: 0.4531, acc: 0.8729
[ 5] TRAIN loss: 0.3443, acc: 0.9045
[ 6] TRAIN loss: 0.2843, acc: 0.9218
[ 7] TRAIN loss: 0.2506, acc: 0.9297
[ 8] TRAIN loss: 0.2232, acc: 0.9369
[ 9] TRAIN loss: 0.2036, acc: 0.9423
[10] TRAIN loss: 0.1859, acc: 0.9474


In [None]:
# Accumulators for the test pass; test_preds collects every predicted label.
test_loss = 0
test_correct = 0
test_preds = []

# Evaluation mode, with gradient tracking disabled for the whole pass.
model.eval()
with torch.no_grad():
    for x, y in testloader:
        # Same reshaping as in training: seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        outputs = model(x)
        loss = criterion(outputs, y)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == y).sum().item()

        test_preds += predicted.tolist()

# Accuracy over the whole test set and average of the per-batch losses.
single_acc = test_correct / len(testset)
single_loss = test_loss / len(testloader)

print('TEST loss: %.4f, acc: %.4f' % (single_loss, single_acc))

TEST loss: 0.1868, acc: 0.9437


In [None]:
# Stacked LSTM classifier on the selected device, with a fresh loss and a
# fresh Adam optimizer bound to this model's parameters.
model1 = StackedLSTM(
    batch_size, seq_length, input_size, hidden_size, num_classes
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

In [None]:
epochs = 10

# Training mode is set once; nothing inside the loop switches it back.
model1.train()
for epoch in range(epochs):
    # Per-epoch accumulators: summed batch losses and correct predictions.
    train_loss = 0
    train_correct = 0

    for x, y in trainloader:
        # Present each image as seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        optimizer.zero_grad()
        outputs = model1(x)
        loss = criterion(outputs, y)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class is the index of the largest score in each row.
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == y).sum().item()

    # Average of the per-batch losses, and accuracy over the whole training set.
    train_loss /= len(trainloader)
    train_acc = train_correct / len(trainset)

    print('[%2d] TRAIN loss: %.4f, acc: %.4f' % (epoch + 1, train_loss, train_acc))

[ 1] TRAIN loss: 2.0932, acc: 0.2332
[ 2] TRAIN loss: 0.9770, acc: 0.6916
[ 3] TRAIN loss: 0.5056, acc: 0.8484
[ 4] TRAIN loss: 0.3228, acc: 0.9075
[ 5] TRAIN loss: 0.2526, acc: 0.9275
[ 6] TRAIN loss: 0.2003, acc: 0.9429
[ 7] TRAIN loss: 0.1660, acc: 0.9516
[ 8] TRAIN loss: 0.1494, acc: 0.9564
[ 9] TRAIN loss: 0.1254, acc: 0.9635
[10] TRAIN loss: 0.1155, acc: 0.9662


In [None]:
# Accumulators for the test pass; test_preds collects every predicted label.
test_loss = 0
test_correct = 0
test_preds = []

# Evaluation mode, with gradient tracking disabled for the whole pass.
model1.eval()
with torch.no_grad():
    for x, y in testloader:
        # Same reshaping as in training: seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        outputs = model1(x)
        loss = criterion(outputs, y)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == y).sum().item()

        test_preds += predicted.tolist()

# Accuracy over the whole test set and average of the per-batch losses.
stacked_acc = test_correct / len(testset)
stacked_loss = test_loss / len(testloader)
print('TEST loss: %.4f, acc: %.4f' % (stacked_loss, stacked_acc))

TEST loss: 0.1190, acc: 0.9658


In [None]:
# Bi-directional LSTM classifier on the selected device, with a fresh loss
# and a fresh Adam optimizer bound to this model's parameters.
model2 = BiLSTM(
    batch_size, seq_length, input_size, hidden_size, num_classes
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

In [None]:
epochs = 10

# Training mode is set once; nothing inside the loop switches it back.
model2.train()
for epoch in range(epochs):
    # Per-epoch accumulators: summed batch losses and correct predictions.
    train_loss = 0
    train_correct = 0

    for x, y in trainloader:
        # Present each image as seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        optimizer.zero_grad()
        outputs = model2(x)
        loss = criterion(outputs, y)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class is the index of the largest score in each row.
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == y).sum().item()

    # Average of the per-batch losses, and accuracy over the whole training set.
    train_loss /= len(trainloader)
    train_acc = train_correct / len(trainset)

    print('[%2d] TRAIN loss: %.4f, acc: %.4f' % (epoch + 1, train_loss, train_acc))

[ 1] TRAIN loss: 1.9172, acc: 0.2969
[ 2] TRAIN loss: 0.7999, acc: 0.7359
[ 3] TRAIN loss: 0.4478, acc: 0.8616
[ 4] TRAIN loss: 0.2895, acc: 0.9141
[ 5] TRAIN loss: 0.2093, acc: 0.9385
[ 6] TRAIN loss: 0.1755, acc: 0.9479
[ 7] TRAIN loss: 0.1446, acc: 0.9572
[ 8] TRAIN loss: 0.1241, acc: 0.9634
[ 9] TRAIN loss: 0.1096, acc: 0.9678
[10] TRAIN loss: 0.0933, acc: 0.9728


In [None]:
# Accumulators for the test pass; test_preds collects every predicted label.
test_loss = 0
test_correct = 0
test_preds = []

# Evaluation mode, with gradient tracking disabled for the whole pass.
model2.eval()
with torch.no_grad():
    for x, y in testloader:
        # Same reshaping as in training: seq_length steps of input_size features.
        x = x.reshape(-1, seq_length, input_size).to(device)
        y = y.to(device)

        outputs = model2(x)
        loss = criterion(outputs, y)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == y).sum().item()

        test_preds += predicted.tolist()

# Accuracy over the whole test set and average of the per-batch losses.
bi_acc = test_correct / len(testset)
bi_loss = test_loss / len(testloader)
print('TEST loss: %.4f, acc: %.4f' % (bi_loss, bi_acc))

TEST loss: 0.0987, acc: 0.9708


In [None]:
# Side-by-side summary of the test metrics stored by the three evaluation cells.
print("--------Result--------")
print(
    'Single LSTM TEST loss: %.4f, acc: %.4f'
    % (single_loss, single_acc)
)
print(
    'Stacked LSTM TEST loss: %.4f, acc: %.4f'
    % (stacked_loss, stacked_acc)
)
print(
    'Bi-directional LSTM TEST loss: %.4f, acc: %.4f'
    % (bi_loss, bi_acc)
)

--------Result--------
Single LSTM TEST loss: 0.1868, acc: 0.9437
Stacked LSTM TEST loss: 0.1190, acc: 0.9658
Bi-directional LSTM TEST loss: 0.0987, acc: 0.9708
