In [151]:
import os

import pandas as pd
import torch

from torch.nn import functional as F
from torch.autograd import Variable
from torch import nn, optim

from tqdm import tqdm_notebook
from cached_property import cached_property
from textblob import TextBlob
from boltons.iterutils import windowed, chunked_iter

In [90]:
# Read the Chicago novel corpus metadata CSV into a DataFrame.
chicago = pd.read_csv(
    filepath_or_buffer='../data/CHICAGO_CORPUS/CHICAGO_NOVEL_CORPUS_METADATA/CHICAGO_CORPUS_NOVELS2.csv',
)

In [91]:
# FILENAME values of the 10 rows with the largest PUBL_DATE
# (presumably the most recently published novels -- confirm the column's meaning).
fnames = (
    chicago
    .sort_values(by='PUBL_DATE', ascending=False)['FILENAME']
    .head(n=10)
)

In [92]:
# Join every selected filename onto the directory that holds the novel texts.
paths = [
    os.path.join('../data/CHICAGO_CORPUS/CHICAGO_NOVEL_CORPUS/', fn)
    for fn in fnames
]

In [93]:
class Text(TextBlob):
    """A TextBlob with helpers for loading from disk and listing its vocabulary."""

    @classmethod
    def from_file(cls, path, encoding=None):
        """Build an instance from the full contents of a text file.

        Args:
            path: Location of the file to read.
            encoding: Text encoding handed to ``open``. The default ``None``
                keeps the previous behaviour (the platform/locale default);
                pass e.g. ``'utf-8'`` so decoding does not depend on the
                machine the notebook runs on.

        Returns:
            A new ``cls`` instance wrapping the file's text.
        """
        with open(path, encoding=encoding) as fh:
            return cls(fh.read())

    def vocab(self):
        """Return the set of distinct lowercased words in the text."""
        return set(self.words.lower())

In [107]:
class Corpus:
    """A set of text files exposed as a vocabulary and a stream of n-grams.

    Files are re-read on every pass over ``texts()``; only the vocabulary and
    the word-to-index mapping are cached on the instance.
    """

    def __init__(self, paths):
        """Remember the file paths that make up the corpus.

        Args:
            paths: Paths to the text files. The collection is iterated once
                per pass over the corpus, so it must be re-iterable (a list,
                a pandas Series, ...), not a one-shot iterator.
        """
        self.paths = paths

    def texts(self):
        """Yield one ``Text`` per path, loading a single file at a time."""
        for path in self.paths:
            yield Text.from_file(path)

    @cached_property
    def vocab(self):
        """Sorted list of every distinct lowercased word in the corpus.

        The list is sorted so that word indices are identical across kernel
        restarts (plain set iteration order varies with string hash
        randomisation). An empty corpus yields an empty list.
        """
        words = set()
        # Accumulate incrementally: only one text's vocabulary is alive at a
        # time, and zero texts is handled without a special case.
        for text in self.texts():
            words.update(text.vocab())
        return sorted(words)

    @cached_property
    def word_to_index(self):
        """Mapping from each vocabulary word to its position in ``vocab``."""
        return {w: i for i, w in enumerate(self.vocab)}

    def ngrams(self, n=3):
        """Yield every within-sentence n-gram as a list of word indices.

        Args:
            n: Number of words per n-gram.

        Yields:
            Lists of ``n`` integer indices into ``vocab``, one per window of
            ``n`` consecutive lowercased words inside a sentence.
        """
        index = self.word_to_index  # resolve the cached mapping once
        for text in self.texts():
            for sent in text.sentences:
                words = list(sent.words.lower())
                for ngram in windowed(words, n):
                    yield [index[w] for w in ngram]

In [108]:
# Corpus over the selected novel files; constructing it only stores the paths,
# the files themselves are read when the corpus is iterated.
corpus = Corpus(paths)

In [143]:
class NgramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their embeddings concatenated, passed
    through one ReLU hidden layer and projected to log-probabilities over the
    vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of distinct words (embedding rows and outputs).
            embedding_dim: Size of each word embedding.
            context_size: Number of context words fed to the model per example.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next word for each context.

        Args:
            inputs: LongTensor of word indices, either 1-D ``(context_size,)``
                for a single example or 2-D ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-probabilities;
            ``batch`` is 1 for a 1-D input.
        """
        # A lone 1-D context is treated as a batch of one.
        batch_size = inputs.size(0) if inputs.dim() > 1 else 1
        # (batch, context_size, embedding_dim) -> (batch, context_size * embedding_dim)
        embeds = self.embeddings(inputs).view(batch_size, -1)
        hidden = F.relu(self.linear1(embeds))
        logits = self.linear2(hidden)
        # Normalise over the vocabulary axis explicitly; the implicit-dim form
        # is deprecated.
        return F.log_softmax(logits, dim=1)

In [None]:
# Words per n-gram: the first NGRAM_ORDER-1 words form the model's context and
# the last word is the prediction target.
NGRAM_ORDER = 3

In [144]:
# Negative log-likelihood loss; pairs with the log-probabilities returned by
# NgramLanguageModeler.forward.
loss_function = nn.NLLLoss()

In [152]:
# The context is every word of an n-gram except the last one (the target).
model = NgramLanguageModeler(
    vocab_size=len(corpus.vocab),
    embedding_dim=100,
    context_size=NGRAM_ORDER - 1,
    hidden_dim=128,
)

In [153]:
# Stochastic gradient descent over all of the model's parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.001)

In [155]:
# Train one n-gram at a time, in chunks of 1000 n-grams, printing the summed
# loss of each chunk as a progress signal.
for chunk in chunked_iter(corpus.ngrams(NGRAM_ORDER), 1000):

    total_loss = 0

    for ngram in chunk:

        # All words but the last are the context; the last is the target.
        x = torch.tensor(ngram[:-1], dtype=torch.long)
        y = torch.tensor([ngram[-1]], dtype=torch.long)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        y_pred = model(x)

        loss = loss_function(y_pred, y)

        loss.backward()
        optimizer.step()

        # .item() extracts the Python float from the 0-dim loss tensor;
        # indexing it with [0] raises on PyTorch >= 0.4.
        total_loss += loss.item()

    print(total_loss)

10146.040800094604
10061.55030632019
9969.23020362854
9853.376891613007
9808.436485767365
9704.317622184753
9529.344534873962
9321.834131002426
9282.774372339249
9197.706761479378
9070.579119741917
8829.402950003743
8739.677726745605
8772.877785384655
8789.959976315498
8617.163014173508
8248.349059402943


KeyboardInterrupt: 