# LSTM with Moby Dick

In [1]:
import torch
import torch.nn as nn

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

import random
import numpy as np

In [2]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/cillian/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')
print(raw[21945:23000])
raw = raw[21945:200000]

CHAPTER 1

Loomings.


Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
take to the ship.  There is nothing

In [5]:
raw

'CHAPTER 1\r\n\r\nLoomings.\r\n\r\n\r\nCall me Ishmael.  Some years ago--never mind how long\r\nprecisely--having little or no money in my purse, and nothing\r\nparticular to interest me on shore, I thought I would sail about a\r\nlittle and see the watery part of the world.  It is a way I have of\r\ndriving off the spleen and regulating the circulation.  Whenever I\r\nfind myself growing grim about the mouth; whenever it is a damp,\r\ndrizzly November in my soul; whenever I find myself involuntarily\r\npausing before coffin warehouses, and bringing up the rear of every\r\nfuneral I meet; and especially whenever my hypos get such an upper\r\nhand of me, that it requires a strong moral principle to prevent me\r\nfrom deliberately stepping into the street, and methodically knocking\r\npeople\'s hats off--then, I account it high time to get to sea as soon\r\nas I can.  This is my substitute for pistol and ball.  With a\r\nphilosophical flourish Cato throws himself upon his sword; I quietl

# BoW

In [6]:
tokens = word_tokenize(raw)
print(tokens[:50])

['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', '--', 'never', 'mind', 'how', 'long', 'precisely', '--', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of']


In [7]:
cnv = CountVectorizer(analyzer = 'word', tokenizer = lambda x: x.split(', '),
                       preprocessor=None, stop_words=None, ngram_range=(1, 1), lowercase=False)
data = cnv.fit_transform(tokens).toarray()

In [8]:
data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
len(data)

36931

In [10]:
cnv.inverse_transform(data)

[array(['CHAPTER'], dtype='<U23'),
 array(['1'], dtype='<U23'),
 array(['Loomings'], dtype='<U23'),
 array(['.'], dtype='<U23'),
 array(['Call'], dtype='<U23'),
 array(['me'], dtype='<U23'),
 array(['Ishmael'], dtype='<U23'),
 array(['.'], dtype='<U23'),
 array(['Some'], dtype='<U23'),
 array(['years'], dtype='<U23'),
 array(['ago'], dtype='<U23'),
 array(['--'], dtype='<U23'),
 array(['never'], dtype='<U23'),
 array(['mind'], dtype='<U23'),
 array(['how'], dtype='<U23'),
 array(['long'], dtype='<U23'),
 array(['precisely'], dtype='<U23'),
 array(['--'], dtype='<U23'),
 array(['having'], dtype='<U23'),
 array(['little'], dtype='<U23'),
 array(['or'], dtype='<U23'),
 array(['no'], dtype='<U23'),
 array(['money'], dtype='<U23'),
 array(['in'], dtype='<U23'),
 array(['my'], dtype='<U23'),
 array(['purse'], dtype='<U23'),
 array([','], dtype='<U23'),
 array(['and'], dtype='<U23'),
 array(['nothing'], dtype='<U23'),
 array(['particular'], dtype='<U23'),
 array(['to'], dtype='<U23'),
 array(

In [11]:
dim = len(cnv.vocabulary_)
print('Dimension of input & output: ', dim)

Dimension of input & output:  6069


# Define Model

In [12]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden, cell):
        out, (hidden, cell) = self.lstm(input, (hidden, cell))
        fc_out = self.fc(out)
        return fc_out, hidden, cell
    
    def init_hidden_cell(self):
        hidden = torch.zeros(self.num_layers, 1, self.hidden_size)
        # if cuda -> hidden = torch.zeros(self.num_layers, 1, self.hidden_size).cuda()
        cell = torch.zeros(self.num_layers, 1, self.hidden_size)
        # if cuda -> cell = torch.zeros(self.num_layers, 1, self.hidden_size).cuda()
        return hidden, cell

In [13]:
model = LSTM(dim, 1000, dim, 1)
# cuda => model = LSTM(dim, 1000, dim, 1).cuda()

# Train Model

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

In [15]:
seq_len = 20
num_epochs = 5

In [16]:
for epoch in range(num_epochs):
    sp = list(range(0, len(data) - 2 * seq_len, seq_len))
    sp = np.add(sp, random.randint(0, seq_len))
    random.shuffle(sp)
    
    for i in range(len(sp)):
        (hidden, cell) = model.init_hidden_cell()
        
        X = torch.from_numpy(data[sp[i] : sp[i] + seq_len].astype(float)).type(torch.FloatTensor).reshape(seq_len, 1, dim)
        # cuda -> X = torch.from_numpy(data[sp[i] : sp[i] + seq_len].astype(float)).type(torch.FloatTensor).cuda().reshape(seq_len, 1, dim)
        y = torch.from_numpy(data[sp[i] + 1 : sp[i] + seq_len + 1].astype(float))
        # cuda -> y = torch.from_numpy(data[sp[i] : sp[i] + seq_len + 1].astype(float)).cuda()
        
        _, y = y.max(dim=1)
        pre, hidden, cell = model(X, hidden, cell)
        cost = loss(pre.reshape(seq_len, dim), y.reshape(seq_len))
        # cuda -> cost = loss(pre.reshape(seq_len, dim), y.reshape(seq_len).cuda())
        
        optimizer.zero_grad()
        cost.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        
        optimizer.step()
        
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                     %(epoch+1, num_epochs, i + 1, len(sp), cost.item()))

Epoch [1/5], Iter [100/1845] Loss: 7.7937
Epoch [1/5], Iter [200/1845] Loss: 6.9152
Epoch [1/5], Iter [300/1845] Loss: 6.8927
Epoch [1/5], Iter [400/1845] Loss: 7.3932
Epoch [1/5], Iter [500/1845] Loss: 5.9576
Epoch [1/5], Iter [600/1845] Loss: 6.5322
Epoch [1/5], Iter [700/1845] Loss: 6.9642
Epoch [1/5], Iter [800/1845] Loss: 7.1908
Epoch [1/5], Iter [900/1845] Loss: 6.6079


KeyboardInterrupt: 

#  Test Model

In [22]:
start_num = 0
text = ""

for item in cnv.inverse_transform(data[start_num:start_num + seq_len]) :
    text += " " + item[0]
    
text += " <Generated>"

model.eval()
hidden, cell = model.init_hidden_cell()

X_test = torch.from_numpy(data[start_num:start_num + seq_len].astype(float)).type(torch.FloatTensor).reshape(seq_len, 1, dim)
# cuda -> X_test = torch.from_numpy(data[start_num:start_num + seq_len].astype(float)).type(torch.FloatTensor).cuda().reshape(seq_len, 1, dim)

for pos in range(10) :
    
    pre, hidden, cell = model(X_test, hidden, cell)
    
    m = torch.nn.Softmax(dim = 0)
    pre = m(pre.reshape(seq_len, -1))
    pre = torch.multinomial(pre, 1)
    
    temp = np.zeros((20, dim))
        
    for i, item in enumerate(pre) :
                
        temp[i][item] = 1 
    
        text += " " + cnv.inverse_transform(temp[i])[0][0]
    
    X_test = torch.from_numpy(temp.astype(float)).type(torch.FloatTensor).reshape(seq_len, 1, dim)
    # cuda -> X_test = torch.from_numpy(temp.astype(float)).type(torch.FloatTensor).cuda().reshape(seq_len, 1, dim)
    
print("* Generated Text : \n", text)

* Generated Text : 
  CHAPTER 1 Loomings . Call me Ishmael . Some years ago -- never mind how long precisely -- having little <Generated> scolding roused from monotonous highest coats caper swung vital half suppose entering gave poor UNITED boggy unearthly alone bride With else deliciousness uncheered Harry woollen begged sinner bedroom blend solitary spilled thistles butterfly plentifully air state-room Entering Wrapping sea-storm cups call tell order infant comprehending Call distinct condensed dead subsequent rows scoria whale-hunters HARDY creaking compliment singularly sundry dividends shan't wash-stand song trying identical herds engulphed forgotten imperfectly befriending relatives frogs lord with enjoyed sublime Him junks engaged law colds Its thrown third imposed Johnny condemning pocket city casement canoes prolonged thawed sloop supplicating ruins Suppose boundless thing worldly shipped countenance obvious emigrant Scandinavian shooting yourselves lands natives long-drawn si