In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import collections, math, random, sys

In [None]:
import torch 
# Make the CPU the default device for tensors created without an explicit device.
torch.set_default_device('cpu')
import torch.nn as nn
import torch.optim as optim

In [None]:
# Add the dataset directory to the import path so the `utils` module imported
# below can be found (presumably utils.py lives in this directory — confirm).
sys.path.append('/kaggle/input/nus-sms-corpus/')
print(sys.path)
# NOTE(review): the star import hides where names come from. This notebook uses
# `read_mono` and `Vocab`, which presumably come from utils — confirm, then
# consider importing them explicitly.
from utils import *
print(sys.path)

In [None]:
# Load the training ('small'), development and test splits with read_mono.
# NOTE(review): delim='' presumably yields character-level tokens (the training
# loop's comments talk about predicting the next char) — confirm in read_mono.
traindata = read_mono('/kaggle/input/nus-sms-corpus/data/small', delim='')
devdata = read_mono('/kaggle/input/nus-sms-corpus/data/dev', delim='')
testdata = read_mono('/kaggle/input/nus-sms-corpus/data/test', delim='')

# Fold every training line into the vocabulary with Vocab's `|=` operator
# (presumably a set-style update — confirm). Only the training split is used here.
vocab = Vocab()
for words in traindata:
    vocab |= words

In [None]:
# # create model that numberizes each word in the dataset
# class CustomDataset(Dataset):
#     def __init__(self, data, vocab):
#         self.data = data
#         self.vocab = vocab
        
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         return [self.vocab.numberize(word) for word in self.data[idx]]

In [None]:
# Define the LSTM Model
class LSTMModel(nn.Module):
    """One LSTMCell step followed by a linear projection and log-softmax.

    Attributes:
        voc_dim: width of each input row and of each output row.
        hidden_dim: size of the LSTM hidden and cell state vectors.
        lstm: nn.LSTMCell taking voc_dim-wide inputs to hidden_dim-wide states.
        linear: nn.Linear projecting the hidden state back to voc_dim scores.
    """

    def __init__(self, voc_dim, hidden_dim):
        super().__init__()
        self.voc_dim, self.hidden_dim = voc_dim, hidden_dim
        # Submodules are built in the order cell -> projection.
        self.lstm = nn.LSTMCell(voc_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, voc_dim)

    def forward(self, x, hc):
        """Run a single LSTMCell step and score every vocabulary entry.

        Args:
            x: input rows, each voc_dim wide (one row per batch element).
            hc: tuple (h, c) holding the previous hidden and cell states.

        Returns:
            (y, (h, c)) where y contains log-probabilities normalised along
            dim 1 and (h, c) are the states returned by the cell.
        """
        new_h, new_c = self.lstm(x, hc)
        scores = self.linear(new_h)
        return scores.log_softmax(dim=1), (new_h, new_c)
    
# Size of the LSTM hidden/cell state; passed to LSTMModel when the model is built.
hidden_dim = 128

        
    
        

In [None]:
# train that jawn
def train(model, traindata, vocab, optimizer, criterion, num_epochs=10, dev_data=None):
    """Train `model` for next-token prediction, one line per optimizer step.

    After every epoch the model is scored with `evaluate` on the dev data. If
    dev accuracy fails to improve on the previous epoch, the learning rate of
    every parameter group is halved; training stops early once it drops below
    1e-6. The parameters and vocabulary are saved to 'model.small' at the end.

    Args:
        model: module called as model(inputs, (h, c)) that exposes `hidden_dim`.
        traindata: sequence of token lines; it is copied, not shuffled in place.
        vocab: object providing `numberize(token)` and `len()`.
        optimizer: torch optimizer over the model's parameters.
        criterion: loss called as criterion(output, targets).
        num_epochs: maximum number of passes over the training data.
        dev_data: lines used for the per-epoch accuracy check. Defaults to the
            notebook-level `devdata` so existing calls keep working.

    Returns:
        The same `model` object, trained in place.

    NOTE(review): each line is passed to the model in ONE call, with every
    position as a separate row and zero-initialised states. With LSTMModel
    (a single nn.LSTMCell step) the rows are independent batch elements, so no
    hidden state flows from one position to the next. Making it recurrent
    requires stepping through positions here and in evaluate() together.
    """
    if dev_data is None:
        dev_data = devdata  # fall back to the notebook's global dev split

    # Shuffle a private copy so the caller's list keeps its original order.
    examples = list(traindata)
    prev_dev_acc = None
    for epoch in range(num_epochs):
        random.shuffle(examples)
        train_loss = 0.0
        num_trained = 0
        for line in examples:
            # A line needs at least one (input, target) pair.
            if len(line) < 2:
                continue
            optimizer.zero_grad()
            # Inputs are tokens 0..n-2, targets are tokens 1..n-1 (the "next" token).
            input_ids = torch.tensor([vocab.numberize(word) for word in line[:-1]])
            # One-hot rows stand in for an embedding layer.
            inputs = torch.nn.functional.one_hot(input_ids, len(vocab)).float()
            targets = torch.tensor([vocab.numberize(word) for word in line[1:]])
            # One zero hidden/cell state row per input position.
            steps = len(line) - 1
            h = torch.zeros(steps, model.hidden_dim)
            c = torch.zeros(steps, model.hidden_dim)
            output, _ = model(inputs, (h, c))
            loss = criterion(output.view(-1, len(vocab)), targets)

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss += loss.item()
            num_trained += 1

        dev_acc = evaluate(model, dev_data, vocab)
        # Report the epoch's mean per-line loss rather than the last line's loss.
        mean_loss = train_loss / max(num_trained, 1)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}, Dev Acc: {dev_acc:.4f}')

        # If dev accuracy did not improve, halve the learning rate.
        if prev_dev_acc is not None and dev_acc <= prev_dev_acc:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.5
            print(f"lr={optimizer.param_groups[0]['lr']}")

            # Stop once the learning rate falls below the threshold.
            if optimizer.param_groups[0]['lr'] < .000001:
                break

        # Must run every epoch; otherwise the comparison above never triggers.
        prev_dev_acc = dev_acc

    # Save the trained parameters together with the vocabulary.
    torch.save((list(model.parameters()), vocab), 'model.small')

    return model

In [None]:
# Evaluate function
def evaluate(model, data, vocab):
    """Return next-token prediction accuracy of `model` over `data`.

    For every line, tokens 0..n-2 are one-hot encoded as inputs and tokens
    1..n-1 are the targets; a prediction is the highest-scoring index of each
    output row. Gradients are disabled for the whole pass.

    Args:
        model: module called as model(inputs, (h, c)) that exposes `hidden_dim`.
        data: iterable of token lines.
        vocab: object providing `numberize(token)` and `len()`.

    Returns:
        Fraction of targets predicted correctly, or 0.0 when there is nothing
        to score (empty data, or only lines with fewer than two tokens).

    NOTE(review): each line is passed to the model in ONE call with
    zero-initialised states per position. With LSTMModel (a single
    nn.LSTMCell step) the positions are independent batch rows, so no state
    is carried along the sequence. Changing that must be mirrored in train().
    """
    num_correct = 0
    total = 0
    with torch.no_grad():
        for line in data:
            # Lines without an (input, target) pair cannot be scored; an empty
            # index tensor would also be rejected by one_hot.
            if len(line) < 2:
                continue
            input_ids = torch.tensor([vocab.numberize(word) for word in line[:-1]])
            inputs = torch.nn.functional.one_hot(input_ids, len(vocab)).float()
            targets = torch.tensor([vocab.numberize(word) for word in line[1:]])

            # One zero hidden/cell state row per input position.
            steps = len(line) - 1
            h = torch.zeros(steps, model.hidden_dim)
            c = torch.zeros(steps, model.hidden_dim)
            output, _ = model(inputs, (h, c))
            # max(1) returns (values, indices); only the indices are needed.
            _, predicted = output.max(1)

            num_correct += (predicted == targets).sum().item()
            total += len(targets)

    # Avoid dividing by zero when no target was scored.
    return num_correct / total if total else 0.0
            

In [None]:
# creating the model
# Seed Python's and PyTorch's RNGs first so that weight initialisation and the
# per-epoch shuffling done by train() are repeatable on Restart & Run All.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)
# Input/output width is the vocabulary size; state size comes from hidden_dim.
model = LSTMModel(len(vocab), hidden_dim)

In [None]:
# define loss function and optimizer
# NOTE(review): LSTMModel.forward already returns log_softmax output, and
# CrossEntropyLoss applies log_softmax again internally. The second pass leaves
# already-normalised log-probabilities unchanged, so the loss is still correct,
# but nn.NLLLoss would be the direct match for log-probability inputs.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, starting learning rate 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train with the default number of epochs; train() returns the model object it was given.
trained = train(model, traindata, vocab, optimizer, criterion)
# Score the trained model once more on the dev split and report it.
acc = evaluate(trained, devdata, vocab)
print(f"Final Dev Accuracy: {acc:.4f}")