In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import optim
from IPython.display import clear_output
import os

import sys
sys.path.append('utils/')
import myutils

In [2]:
# Load the IMDB data through the project helper in utils/myutils.py.
# It returns five objects, unpacked here as the train/validation loaders, the
# vocabulary, and (presumably) the word->id and id->word lookups — confirm
# against myutils.prepare_imdb_data.
trainloader, validloader, vocab, word2id, id2word = myutils.prepare_imdb_data()

Data has been successfully loaded


In [4]:
class LanguageModel(nn.Module):
    """LSTM language model: embedding -> (stacked) LSTM -> MLP -> log-probabilities.

    Args:
        hidden_dim: size of the LSTM hidden/cell state.
        vocab_size: number of tokens; sizes the embedding table and the output layer.
        embedding_dim: size of each token embedding.
        linear_dim: width of the two hidden layers of the output MLP.
        n_layers: number of stacked LSTM layers.
        train_on_gpu: stored on the instance for backward compatibility.
            ``init_hidden`` allocates on the parameters' device, so this flag
            no longer decides where the hidden state lives.
        dropout: dropout probability applied between LSTM layers. Only used
            when ``n_layers > 1`` (a single-layer LSTM has no inter-layer output).
    """

    def __init__(self, hidden_dim, vocab_size, embedding_dim,
                 linear_dim=128, n_layers=3, train_on_gpu=True, dropout=0.5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.train_on_gpu = train_on_gpu

        # Embeddings
        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)

        # LSTM
        # A probability of 1 would zero every inter-layer activation, so the
        # rate is configurable and defaults to a usable value.
        lstm_dropout = dropout if n_layers > 1 else 0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            batch_first=True, dropout=lstm_dropout)

        # Fully-connected layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, linear_dim),
            nn.ReLU(),
            nn.Linear(linear_dim, linear_dim),
            nn.ReLU(),
            nn.Linear(linear_dim, vocab_size)
        )

    def forward(self, x, h):
        """Run a forward pass.

        Args:
            x: integer token ids, batch first (the LSTM is built with
                ``batch_first=True``).
            h: ``(hidden, cell)`` state tuple, e.g. from ``init_hidden``.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities with
            one row per (batch, time step) position and ``vocab_size`` columns,
            and ``hidden`` is the updated LSTM state.
        """
        embedded = self.embeddings(x)

        output, hidden = self.lstm(embedded, h)
        # Flatten batch and time so the MLP scores every position at once.
        output = output.contiguous().view(-1, self.hidden_dim)
        output = self.fc(output)
        # Normalise over the vocabulary axis explicitly.
        output = F.log_softmax(output, dim=1)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(hidden, cell)`` states for a batch.

        Each tensor has shape ``(n_layers, batch_size, hidden_dim)`` and is
        created with the dtype and device of the model parameters, so it
        matches the model wherever the model currently lives.
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [5]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Detect the available accelerator once.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyper-parameters for the language model and the data pipeline.
batch_size, n_layers = 64, 1
hidden_dim = linear_dim = 256
embedding_dim = 110
vocab_size = len(vocab)

# Build the model; keyword arguments keep the argument order explicit.
model = LanguageModel(
    hidden_dim=hidden_dim,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    linear_dim=linear_dim,
    n_layers=n_layers,
    train_on_gpu=False,
)

In [None]:
def eval_epoch(model, eval_loader, eval_on_gpu=True):
    """Evaluate ``model`` for one pass over ``eval_loader``.

    Each batch is indexed as ``sequence[0][:, :-1]`` (inputs) and
    ``sequence[0][:, 1:]`` (next-token targets), so the first element of every
    batch must be a 2-D integer tensor.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and returning
            ``(log_probs, hidden)`` when called as ``model(X, h)``.
        eval_loader: iterable of batches.
        eval_on_gpu: move each batch to CUDA before the forward pass.

    Returns:
        List with one NLL loss value (float) per batch.
    """
    criterion = nn.NLLLoss()
    loss_log = []
    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for sequence in eval_loader:
            tokens = sequence[0]
            if eval_on_gpu:
                tokens = tokens.cuda()
            X = tokens[:, :-1]
            y = tokens[:, 1:]

            # Size the state from the actual batch (the last batch may be
            # smaller) and keep it on the same device as the inputs.
            h = tuple(state.detach().to(X.device)
                      for state in model.init_hidden(X.size(0)))

            output, _ = model(X, h)
            loss = criterion(output, y.contiguous().view(-1))
            loss_log.append(loss.item())
    return loss_log

def train_epoch(model, optimizer, train_loader, train_on_gpu=True):
    """Train ``model`` for one pass over ``train_loader``.

    Each batch is indexed as ``sequence[0][:, :-1]`` (inputs) and
    ``sequence[0][:, 1:]`` (next-token targets), so the first element of every
    batch must be a 2-D integer tensor.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and returning
            ``(log_probs, hidden)`` when called as ``model(X, h)``.
        optimizer: optimizer over the model parameters; stepped once per batch.
        train_loader: iterable of batches.
        train_on_gpu: move each batch to CUDA before the forward pass.

    Returns:
        List with one NLL loss value (float) per batch.
    """
    criterion = nn.NLLLoss()
    loss_log = []
    model.train()
    for sequence in train_loader:
        optimizer.zero_grad()

        tokens = sequence[0]
        if train_on_gpu:
            tokens = tokens.cuda()
        X = tokens[:, :-1]
        y = tokens[:, 1:]

        # A fresh state per batch, sized from the actual batch (the last one
        # may be smaller) and placed on the same device as the inputs.
        h = tuple(state.detach().to(X.device)
                  for state in model.init_hidden(X.size(0)))

        output, _ = model(X, h)
        loss = criterion(output, y.contiguous().view(-1))
        loss.backward()
        optimizer.step()
        loss_log.append(loss.item())
    return loss_log

def plot_history(train_history, title='loss', label='train', xlabel='train steps'):
    """Plot a sequence of per-step values as a single line chart.

    Args:
        train_history: sequence of values to plot, one per step.
        title: figure title.
        label: legend entry for the curve (default keeps the original 'train').
        xlabel: x-axis label (default keeps the original 'train steps').
    """
    # Imported here so the function works regardless of cell order: the
    # notebook only imports pyplot at module level in a much later cell.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label=label, zorder=1)
    plt.xlabel(xlabel)
    plt.legend(loc='best')
    plt.grid()
    plt.show()
    
def train(model, opt, n_epochs, train_loader, train_on_gpu=True, save_to_disk=True):
    """Train ``model`` for ``n_epochs`` epochs, plotting the loss after each one.

    Args:
        model: model to train; moved to CUDA when ``train_on_gpu`` is true.
        opt: optimizer passed through to ``train_epoch``.
        n_epochs: number of passes over ``train_loader``.
        train_loader: iterable of training batches.
        train_on_gpu: run on CUDA; forwarded to ``train_epoch`` so that the
            batches end up on the same device as the model.
        save_to_disk: when true, pickle the whole model to 'model.pt' after
            the last epoch.

    Returns:
        List of per-batch training losses accumulated over all epochs.
    """
    train_log = []

    if train_on_gpu:
        model.cuda()
    for epoch in range(n_epochs):
        # Forward the device flag; otherwise train_epoch falls back to its
        # own default and calls .cuda() even for CPU training.
        train_loss = train_epoch(model, opt, train_loader, train_on_gpu=train_on_gpu)
        train_log.extend(train_loss)

        clear_output()
        # Report the mean over the most recent 100 steps.
        print('Epoch [{}/{}], Loss: {:.4f}'
              .format(epoch + 1, n_epochs, np.mean(train_log[-100:])))
        plot_history(train_log)

    if save_to_disk:
        torch.save(model, 'model.pt')

    return train_log
        
def eval_model(model, eval_loader, eval_on_gpu=True):
    """Evaluate ``model`` on ``eval_loader`` and plot the per-batch losses.

    Args:
        model: model to evaluate; moved to CUDA when ``eval_on_gpu`` is true.
        eval_loader: iterable of evaluation batches.
        eval_on_gpu: run on CUDA; forwarded to ``eval_epoch`` so that the
            batches end up on the same device as the model.

    Returns:
        List of per-batch evaluation losses.
    """
    if eval_on_gpu:
        model.cuda()

    # Forward the device flag; otherwise eval_epoch falls back to its own
    # default and calls .cuda() even for CPU evaluation.
    eval_log = list(eval_epoch(model, eval_loader, eval_on_gpu=eval_on_gpu))

    clear_output()
    plot_history(eval_log)
    return eval_log

In [12]:
from gensim.models.wrappers import FastText

# Load the fastText model stored under the 'cc.en.300' path prefix.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# LanguageModel instance; after this cell `model` refers to the fastText
# wrapper (the cells below use `model.wv[...]`).
model = FastText.load_fasttext_format('cc.en.300')

In [36]:
# Build a word -> vector table for the IMDB vocabulary.
# For every word the lookups are attempted in order: the word itself, its
# lower-cased form, then the lower-cased form with the last one / two
# characters removed. Variants are built lazily inside the `try` so that a
# failure while deriving a variant is handled like a failed lookup.
# If every attempt raises, a random 300-d Gaussian vector (scale 0.6) is used.
imdb_embeddings = {}
spelling_variants = (
    lambda token: token,
    lambda token: token.lower(),
    lambda token: token.lower()[:-1],
    lambda token: token.lower()[:-2],
)
for w in tqdm(vocab):
    for make_variant in spelling_variants:
        try:
            imdb_embeddings[w] = model.wv[make_variant(w)]
        except Exception:
            continue
        break
    else:
        imdb_embeddings[w] = np.random.normal(scale=0.6, size=(300, ))



  0%|          | 0/80389 [00:00<?, ?it/s][A[A

 17%|█▋        | 14045/80389 [00:00<00:00, 139545.85it/s][A[A

 37%|███▋      | 29688/80389 [00:00<00:00, 147842.68it/s][A[A

 55%|█████▍    | 44184/80389 [00:00<00:00, 146879.52it/s][A[A

 75%|███████▍  | 60021/80389 [00:00<00:00, 149747.83it/s][A[A

 93%|█████████▎| 74860/80389 [00:00<00:00, 149484.71it/s][A[A

100%|██████████| 80389/80389 [00:00<00:00, 148805.98it/s][A[A

In [58]:
import pickle

# Persist the PTB word -> vector table to 'fasttext_ptb.pkl' in the working directory.
with open('fasttext_ptb.pkl', 'wb') as out_file:
    pickle.dump(ptb_embeddings, out_file)

In [48]:
# Rebinds `imdb_embeddings` to an empty list, discarding any previously built
# table. NOTE(review): the lookup cell builds this name as a dict, so the
# type changes here.
imdb_embeddings = []

In [46]:
# Sanity check: the embedding table has as many entries as the vocabulary.
len(imdb_embeddings) == len(vocab)

True

In [None]:
# Direct lookup of every vocabulary word with no fallback: any exception
# raised by `model.wv[w]` propagates and aborts the whole comprehension.
imdb_embeddings = {w: model.wv[w] for w in vocab}

In [35]:
# Probe the shape of a single looked-up vector (token 'xn').
model.wv['xn'].shape

(300,)

In [49]:
from __future__ import print_function
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
 
import chainer
from __future__ import print_function
from tqdm import tqdm
import numpy as np
import chainer
import os
import re
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [50]:
# Load the three Penn Treebank splits provided by chainer.
# NOTE(review): this rebinds `train`, shadowing the `train(...)` training
# function defined in an earlier cell; re-run that cell before calling it.
train, val, test = chainer.datasets.get_ptb_words()

In [51]:
# Word -> integer id mapping for the PTB vocabulary (see the output of the
# next cell for its contents).
ptb_dict = chainer.datasets.get_ptb_words_vocabulary()

In [52]:
# Display the full mapping. NOTE(review): this prints every entry; consider
# showing only a small slice to keep the notebook output short.
ptb_dict

{'aer': 0,
 'banknote': 1,
 'berlitz': 2,
 'calloway': 3,
 'centrust': 4,
 'cluett': 5,
 'fromstein': 6,
 'gitano': 7,
 'guterman': 8,
 'hydro-quebec': 9,
 'ipo': 10,
 'kia': 11,
 'memotec': 12,
 'mlx': 13,
 'nahb': 14,
 'punts': 15,
 'rake': 16,
 'regatta': 17,
 'rubens': 18,
 'sim': 19,
 'snack-food': 20,
 'ssangyong': 21,
 'swapo': 22,
 'wachter': 23,
 '<eos>': 24,
 'pierre': 25,
 '<unk>': 26,
 'N': 27,
 'years': 28,
 'old': 29,
 'will': 30,
 'join': 31,
 'the': 32,
 'board': 33,
 'as': 34,
 'a': 35,
 'nonexecutive': 36,
 'director': 37,
 'nov.': 38,
 'mr.': 39,
 'is': 40,
 'chairman': 41,
 'of': 42,
 'n.v.': 43,
 'dutch': 44,
 'publishing': 45,
 'group': 46,
 'rudolph': 47,
 'and': 48,
 'former': 49,
 'consolidated': 50,
 'gold': 51,
 'fields': 52,
 'plc': 53,
 'was': 54,
 'named': 55,
 'this': 56,
 'british': 57,
 'industrial': 58,
 'conglomerate': 59,
 'form': 60,
 'asbestos': 61,
 'once': 62,
 'used': 63,
 'to': 64,
 'make': 65,
 'kent': 66,
 'cigarette': 67,
 'filters': 68,
 'h

In [53]:
# Build a word -> vector table for the PTB vocabulary.
# Lookups are attempted in order: the word as-is, lower-cased, then
# lower-cased with the trailing one / two characters dropped. Each key is
# derived inside the `try`, so an error while deriving it counts as a miss.
# Words for which every attempt raises get a random 300-d Gaussian vector
# (scale 0.6).
ptb_embeddings = {}
key_builders = (
    lambda word: word,
    lambda word: word.lower(),
    lambda word: word.lower()[:-1],
    lambda word: word.lower()[:-2],
)
for w in tqdm(ptb_dict):
    for build_key in key_builders:
        try:
            ptb_embeddings[w] = model.wv[build_key(w)]
        except Exception:
            continue
        break
    else:
        ptb_embeddings[w] = np.random.normal(scale=0.6, size=(300, ))



  0%|          | 0/10000 [00:00<?, ?it/s][A[A

100%|██████████| 10000/10000 [00:00<00:00, 123992.79it/s][A[A