# Requirement

You are an ML engineer. You need to implement a model that processes natural language in the IMDB dataset. Most of the code has already been implemented, but some parts are still missing: the RNN model needs to be completed. If the RNN model you implement is perfect, you might not need hyperparameter tuning, but you can try it if necessary.

# Constraints
You are not allowed to add or delete cells, and you must not change the `DO NOT CHANGE` cells; only `CHANGEABLE` cells may be changed. Of course, you may temporarily add or delete cells during the competition, but they must be restored to their proper condition before the end of the competition. You may lose points if you change cells that you are not allowed to change or solve the task using an illegal method.

In [None]:
# Cell 1 - Install packages
# DO NOT CHANGE

!pip install torch==1.8.0 torchtext==0.9.0

In [None]:
# Cell 2 - Import packages
# DO NOT CHANGE

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy import datasets, data
import random

In [None]:
# Cell 3 - Hyperparameters
# CHANGEABLE

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Batches and the model are moved with `.to(device)`, so a device string is enough.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64        # passed to BucketIterator.splits as the mini-batch size
learning_rate = 0.001  # passed to torch.optim.Adam as `lr`
epochs = 5             # number of train/evaluate rounds in the training loop
dropout_ratio = 0.2    # drop probability handed to the model constructor
n_layer = 1            # number of stacked recurrent layers handed to the model
hidden_dim = 32        # hidden-state size handed to the model
embed_dim = 128        # token-embedding size handed to the model

In [None]:
# Cell 4 - Import dataset, trainset and testset
# DO NOT CHANGE

# Field describing the review text: treated as a sequence, lower-cased, and
# batched with the batch dimension first.
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
# Field describing the label: a single, non-sequential value per example.
LABEL = data.Field(sequential=False, batch_first=True)

# Create the IMDB train and test splits, processed with the two fields above.
# NOTE(review): presumably downloads/caches the dataset on first use -- confirm.
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Cell 5 - Build Vocabulary
# DO NOT CHANGE

# Build the token vocabulary from the training split only, with min_freq=5
# (presumably tokens seen fewer than 5 times are left out of the vocabulary
# -- confirm against the torchtext docs).
TEXT.build_vocab(trainset, min_freq=5)
# Build the label vocabulary from the training split.
LABEL.build_vocab(trainset)

# Number of entries in the token vocabulary; used as the embedding table size.
vocab_size = len(TEXT.vocab)
# Hard-coded number of output classes (not derived from LABEL.vocab).
# NOTE(review): train()/evaluate() subtract 1 from every label index before the
# loss, which suggests LABEL.vocab reserves index 0 for a special token -- confirm.
n_classes = 2

In [None]:
# Cell 6 - Split validset from trainset and Create data loaders
# DO NOT CHANGE

# Carve a validation set out of the training data (split_ratio=0.8; presumably
# 80% stays in trainset and 20% becomes valset -- confirm).
trainset, valset = trainset.split(split_ratio=0.8)
# One iterator per split, all sharing the same batch_size, shuffled and
# non-repeating. The order of the returned iterators follows the tuple order.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits((trainset, valset, testset), 
                                                                         shuffle=True, repeat=False, batch_size=batch_size)

In [None]:
# Cell 7 - Print data information
# CHANGEABLE

# Summary of the prepared data.
print('Voca size : {}'.format(vocab_size))
print('Number of class : {}'.format(n_classes))
# len() of an iterator is the number of mini-batches it yields per epoch,
# not the size of a single mini-batch, so the labels say so explicitly.
print('Number of mini-batches in trainset : {}'.format(len(train_iterator)))
print('Number of mini-batches in testset : {}'.format(len(test_iterator)))
print('Number of mini-batches in validset : {}'.format(len(val_iterator)))

In [None]:
# Cell 8 - RNN Model
# CHANGEABLE

class GRU(nn.Module):
    """GRU sequence classifier: embedding -> GRU -> dropout -> linear layer.

    Args:
        n_layers: Number of stacked GRU layers.
        hidden_dim: Size of the GRU hidden state (and of the classifier input).
        n_vocab: Number of rows in the embedding table (vocabulary size).
        embed_dim: Size of each token embedding (the GRU input size).
        n_classes: Number of logits produced per example.
        dropout_ratio: Drop probability applied to the last GRU output before
            the output layer.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_ratio):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_ratio)

        self.embed = nn.Embedding(n_vocab, embed_dim)
        # nn.GRU cannot be built without its input and hidden sizes.
        # batch_first=True matches forward(), which treats dim 0 as the batch
        # and dim 1 as time (x.size(0) and outputs[:, -1, :]).
        self.gru = nn.GRU(
            embed_dim,
            self.hidden_dim,
            num_layers=self.n_layers,
            batch_first=True,
        )
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, n_classes) with unnormalised logits.
        """
        embedded = self.embed(x)
        h_0 = self.init_hidden(embedded.size(0))
        outputs, _ = self.gru(embedded, h_0)
        # Use the top layer's output at the last time step as the summary.
        # NOTE(review): if batches are right-padded, this step may be a padding
        # position for shorter sequences -- confirm against the data pipeline.
        h_t = outputs[:, -1, :]
        # nn.Dropout is not in-place: its result must be kept, otherwise no
        # dropout is applied at all.
        h_t = self.dropout(h_t)
        return self.out(h_t)

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state.

        The tensor has shape (n_layers, batch_size, hidden_dim) and shares the
        dtype and device of the module's first parameter.
        """
        weight = next(self.parameters()).data
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)

In [None]:
# Cell 9 - Define functions
# DO NOT CHANGE

def train(model, optimizer, train_iterator):
    """Run one pass over train_iterator, updating model after every batch.

    Args:
        model: Module called as model(x); its output is fed to F.cross_entropy.
        optimizer: Optimizer whose zero_grad() and step() are called per batch.
        train_iterator: Iterable of batches exposing `.text` and `.label`.

    Returns:
        None. The model's parameters are updated in place via the optimizer.
    """
    # Switch the model to training mode.
    model.train()
    # NOTE(review): the batch index `b` is never used.
    for b, batch in enumerate(train_iterator):
        # `device` is the module-level setting from the hyperparameter cell.
        x, y = batch.text.to(device), batch.label.to(device)
        # Shift the label indices down by one, in place (presumably mapping
        # label-vocabulary indices 1/2 to class ids 0/1 -- confirm).
        y.data.sub_(1)
        
        optimizer.zero_grad()
        output = model(x)
        loss = F.cross_entropy(output, y)
        loss.backward()
        optimizer.step()
        
def evaluate(model, val_iterator):
    """Compute the average loss and accuracy of model over val_iterator.

    Args:
        model: Module called as model(x); its output is fed to F.cross_entropy.
        val_iterator: Iterable of batches exposing `.text` and `.label`, with a
            `.dataset` attribute whose length is the number of examples.

    Returns:
        Tuple (avg_loss, avg_accuracy): the summed cross-entropy loss and the
        number of correct predictions, each divided by len(val_iterator.dataset).
        avg_accuracy is a fraction, not a percentage.
    """
    total_correct, total_loss = 0, 0
    
    # Switch the model to evaluation mode.
    # NOTE(review): the forward passes below are not wrapped in torch.no_grad(),
    # so autograd bookkeeping still happens during evaluation.
    model.eval()
    for batch in val_iterator:
        x, y = batch.text.to(device), batch.label.to(device)
        # Same in-place label shift as in train().
        y.data.sub_(1)
        
        prediction = model(x)
        # reduction='sum' so that dividing by the dataset size below gives a
        # per-example average even when batches differ in size.
        loss = F.cross_entropy(prediction, y, reduction='sum')
        total_loss += loss.item()
        # Index of the largest logit along dim 1 is the predicted class.
        # The sum is a tensor, so total_correct becomes a tensor as well.
        total_correct += (prediction.max(1)[1].view(y.size()).data == y.data).sum()
        
    val_len = len(val_iterator.dataset)
    avg_loss = total_loss / val_len
    avg_accuracy = total_correct / val_len
    
    return avg_loss, avg_accuracy

In [None]:
# Cell 10 - Train model
# DO NOT CHANGE

# Build the model from the hyperparameters and move it to the chosen device.
model = GRU(n_layer, hidden_dim, vocab_size, embed_dim, n_classes, dropout_ratio).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Best validation accuracy seen so far; a checkpoint is only written when
# an epoch strictly beats it.
best_accuracy = 0.0
val_accuracy = 0.0
# Directory that receives the checkpoint file.
if not os.path.isdir("wsi_nlp"):
    os.makedirs("wsi_nlp")
        
for epoch in range(epochs):
    # One training pass followed by one evaluation on the validation split.
    train(model, optimizer, train_iterator)
    val_loss, val_accuracy = evaluate(model, val_iterator)

    print('[epoch: {}] loss = {}, accuracy: {} '.format(epoch + 1, val_loss, val_accuracy))
    
    # Keep only the weights of the epoch with the best validation accuracy.
    if val_accuracy > best_accuracy:
        torch.save(model.state_dict(), './wsi_nlp/nlp_model.pt')
        best_accuracy = val_accuracy

In [None]:
# Cell 11 - Show the accuracy on testset
# DO NOT CHANGE

# Reload the checkpoint written by the training loop (the weights from the
# epoch with the best validation accuracy) before scoring on the test split.
# NOTE(review): the file only exists if at least one epoch reached a validation
# accuracy above 0.0.
model.load_state_dict(torch.load('./wsi_nlp/nlp_model.pt'))
test_loss, test_accuracy = evaluate(model, test_iterator)
# evaluate() returns accuracy as correct / total, so this prints a fraction.
print('accuracy: %5.2f' % (test_accuracy))