In [20]:
import torch
import json
import torch.nn as nn
import numpy as np

from random import randrange

EMBEDDING_FILE = 'glove.840B.300d.txt'
EMBEDDING_DIM = 64

class WordLanguageModel(nn.Module):
    def __init__(self, num_unique_words, hidden_state_dim=512):
        super(WordLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(num_unique_words, EMBEDDING_DIM)
                            
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=hidden_state_dim, num_layers=3)

        self.linear_layer_1 = nn.Linear(hidden_state_dim, 64)
        self.linear_layer_2 = nn.Linear(64, num_unique_words)

        self.relu = nn.LeakyReLU()

    
    def forward(self, word_indices_tensor):
        out = self.embeddings(word_indices_tensor)

        out, h = self.lstm(out)
        
        out = self.linear_layer_1(out)
        out = self.relu(out)
        
        out = self.linear_layer_2(out)
        return out
    
class WordLanguageModelWrapper:
    def __init__(self, context_window=300, hidden_state_dim=512):
        self.context_window = context_window
        self.hidden_state_dim = hidden_state_dim
        
    def load_data(self, file_path):
        with open(file_path, 'r') as file:
            self.training_data = [v for v in file.read().replace('.', ' .').replace(',', ' ,').replace('!', ' !').replace('?', ' ?').replace(':', ' :').replace(';', ' ;').replace("'", " ' ").replace("(", " ) ").replace("-", " - ").split() if v != '']
        
        print('Num Words training data', len(self.training_data))
        
    def load_word_embeddings(self):
        self.embeddings_dict = {}
        with open(EMBEDDING_FILE, 'r') as f:
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    vector = np.asarray(values[1:], "float32")
                    self.embeddings_dict[word] = vector
                except:
                    pass

    def load_unique_words(self, min_word_count=0): 
        word_counts = {}
        for w in self.training_data:
            if w not in word_counts:
                word_counts[w] = 0
            word_counts[w] += 1
        
        self.unique_words = {w: i for i, w in enumerate([w for w, count in word_counts.items() if count > min_word_count])}
        self.num_unique_words = len(self.unique_words) + 1
        print('Num Words:', self.num_unique_words)

    def create_model(self):
        has_cuda = torch.cuda.is_available()
        print('Has CUDA', has_cuda)
        self.device = torch.device('cuda' if has_cuda else 'cpu')
        self.model = WordLanguageModel(self.num_unique_words, self.hidden_state_dim).to(self.device)
    
    def train_model(self, epochs=1, lr=0.0001, skip_factor=1):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        
        for epoch in range(epochs):
            offset = randrange(self.context_window * skip_factor)
            total_loss = 0

            while offset + self.context_window + 1 < len(self.training_data):
                optimizer.zero_grad()
                
                text_in = self.training_data[offset:offset + self.context_window + 1]
                #print(text_in)
                #text_in_tensor = self._text_to_embedding_tensor(text_in)
                text_out_tensor = self._text_to_index_tensor(text_in)
                prediction = self.model.forward(torch.LongTensor(np.array(text_out_tensor[:-1])).to(self.device))
                loss = nn.functional.cross_entropy(prediction, torch.LongTensor(text_out_tensor[1:]).to(self.device))
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            
                offset += self.context_window * skip_factor
            print(f'Epoch {epoch + 1}, Train Loss {total_loss}')
    
    def predict(self, text, length=200):
        for i in range(length):
            text_in_tensor = self._text_to_index_tensor(text)
            prediction = self.model.forward(torch.LongTensor(text_in_tensor).to(self.device))
            
            probs = nn.functional.softmax(prediction[-1], dim=0).detach().cpu().numpy()
            res_index = np.random.choice(range(len(probs) - 1), p=probs[:-1]/sum(probs[:-1]))
            res_word = list(self.unique_words.keys())[res_index]
            
            text.append(res_word)
        
        return text
    
    def store_model(self, name):
        torch.save(self.model.state_dict(), f'models/{name}.pt')
        
        with open(f'models/{name}.json', 'w') as f:
            json.dump({'unique_words': self.unique_words, 'num_unique_words': self.num_unique_words}, f)
    
    def load_model(self, name):
        with open(f'models/{name}.json') as f:
            d = json.load(f)
            self.unique_words = d['unique_words']
            self.num_unique_words = d['num_unique_words']
        
        self.create_model()

        self.model.load_state_dict(torch.load(f'models/{name}.pt'))
        self.model.eval()
    
    def _text_to_embedding_tensor(self, text_in):
        return [self._get_word_embedding(word) for word in text_in]
    
    def _text_to_index_tensor(self, text_in):
        text_in_tensor = []
        for word in text_in:
            text_in_tensor.append(self._get_word_index(word))
        return text_in_tensor
    
    def _get_word_embedding(self, word):
        if word in self.embeddings_dict and word != '<unk>':
            return self.embeddings_dict[word]
        else:
            return np.zeros(EMBEDDING_DIM)
        
    def _get_word_index(self, word):
        return self.unique_words[word] if word in self.unique_words else self.num_unique_words - 1

In [21]:
# Model training
def train():
    model = WordLanguageModelWrapper(hidden_state_dim=512)
    #model.load_word_embeddings()
    #model.load_data('wiki.train.tokens')
    model.load_data('1984.txt')
    model.load_unique_words()
    model.create_model()

    for i in range(100000):
        model.train_model(epochs=10)
        print(model.predict(['None']))

        if i % 10 == 0:
            model.store_model(f'epoch_{i + 1}')
train()

Num Words training data 124260
Num Words: 9769
Has CUDA True
Epoch 1, Train Loss 3071.3514699935913
Epoch 2, Train Loss 2685.243432998657
Epoch 3, Train Loss 2664.2964062690735
Epoch 4, Train Loss 2649.4608402252197
Epoch 5, Train Loss 2646.533802509308
Epoch 6, Train Loss 2644.2896852493286
Epoch 7, Train Loss 2643.7853541374207
Epoch 8, Train Loss 2643.2550024986267
Epoch 9, Train Loss 2642.8167390823364
Epoch 10, Train Loss 2642.0050134658813
['None', 'dead', 'have', 'O', ',', 'Hundreds', "'", 'up', 'know', 'looked', 'lump', 'of', 'sitting', ',', 'If', 'one', 'A', 'it', 'of', ',', 'powerful', 'her', 'said', 'six', 'aeroplane', '.', 'it', ',', ',', 'questions', 'renting', 'that', 'need', ',', 'came', ',"', 'the', 'saucepan', 'the', 'said', 'some', 'the', 'only', '.', 'physical', 'But', 'the', 'dare', ',', ',', 'and', 'I', ',', 'astronomy', 'had', 'treachery', 'to', 'still', 'might', 'In', 'suddenly', 'estimates', ',', 'die', 'to', 'huge', 'with', 'at', 'right', 'When', 'earth', 'paci

KeyboardInterrupt: 

In [2]:
model = WordLanguageModelWrapper(hidden_state_dim=512)
model.load_word_embeddings()
model.load_model('epoch_91')
print(model.predict(['Patrik']))

Has CUDA True


  prediction = self.model.forward(torch.FloatTensor(text_in_tensor).to(self.device))


['Patrik', 'scorn', 'allowed', 'the', 'eyes', '.', 'New', 'actresses', 'addressed', 'the', 'enemies', 'pay', 'series', 'ft', 'year', 'for', 'the', 'city', '.', '=', '=', '=', '120', 'at', 'Remaining', 'de', '@,@', ')', '=', 'and', 'number', 'game', 'was', '"', 'Lindisfarne', 'hours', '<unk>', '89', ',', '1947', '@-@', "'s", 'decision', ',', 'or', 'Tell', 'March', 'Francisco', 'Club', 'weapons', 'had', 'allied', '.', 'In', 'a', 'Time', 'and', 'court', 'throw', 'the', '2015', 'Convoy', '(', 'Queensland', 'published', '<unk>', 'rail', '110', 'caused', 'he', 'came', 'greatly', 'got', 'shot', '.', 'An', 'May', 'Cowell', 'was', 'fire', 'economy', '.', 'many', 'was', 'about', 'his', 'other', 'National', 'key', 'storyline', '"', ',', 'with', 'birth', 'greater', 'way', 'to', 'Andrew', 'favored', 'him', 'in', '2008', 'home', ',', 'Aquitaine', 'under', 'them', ',', 'the', '4', 'clubs', '@-@', 'such', 'in', 'the', 'Official', 'teaching', 'jurisdiction', ',', 'as', 'they', 'as', 'the', 'way', 'of',