In [1]:
import os
import collections
import matplotlib.pyplot as plt
import numpy as np
from argparse import Namespace
import pandas as pd
import re
import torch
import torchtext
import torch.nn as nn
from pathlib import Path
from gensim.models import Word2Vec
import time
import copy

# Plan A: Self-trained Word2Vec + Homemade LSTM

In [2]:
# Set Numpy and PyTorch seeds
def set_seeds(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
        
# Creating directories
def create_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [3]:
# Arguments
args = Namespace(
    seed=1234,
    cuda=True,
    path="data",
    w2vmodel_path="language.w2v.model",
    batch_size=16,
    num_workers=4
)
# Set seeds
set_seeds(seed=args.seed, cuda=args.cuda)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


## Preparing the data

Use panda to conveniently import three `TSV` data files as `DataFrame`s.

In [4]:
path =  Path(args.path)
training_df = pd.read_csv(path / 'training_set.ss', sep='\t')
test_df = pd.read_csv(path / 'test_set.ss', sep='\t')
validation_df = pd.read_csv(path / 'validation_set.ss', sep='\t')

In [6]:
training_df

Unnamed: 0,user_id,product_id,rating,review_content
0,ur0116181/,\tt0185937,1,sometimes popular opinion really sucks about a...
1,ur0116181/,\tt0169547,10,this is an amazing piece of work and it probab...
2,ur0116181/,\tt0478304,5,this movie does not belong in a cinema . <ssss...
3,ur0116181/,\tt0195685,10,the first nominees for next year 's oscars in ...
4,ur0116181/,\tt0217869,10,my expectations for writer/director m. night s...
5,ur0116181/,\tt0181689,10,let me warn you that a.i. was my favorite film...
6,ur0116181/,\tt0062622,10,this was the movie that made me fall in love w...
7,ur0116181/,\tt0155267,9,"i do n't want to jinx this movie , but this is..."
8,ur0116181/,\tt0140352,10,this film is unavoidably being compared to dir...
9,ur0116181/,\tt0887912,10,without a doubt this is the best war film sinc...


## Load in Word2Vec model

We want to use word2vec as our embedding layer. First, we load the w2vmodel that we trained(26.89M).

See [how](./word2vec.ipynb) the w2vmodel is trained.

In [5]:
w2vmodel = Word2Vec.load(args.w2vmodel_path)

In [8]:
w2vmodel.wv.vectors[1487]

array([ 0.0832856 ,  0.17038353,  0.13446064,  0.20960386, -0.27410474,
        0.04419966, -0.02441939, -0.2884749 ,  0.08365629, -0.3235816 ,
       -0.14133385, -0.05964658,  0.07084928,  0.10685515, -0.595156  ,
        0.1006216 ,  0.08503133, -0.20746365,  0.37630245,  0.35833853,
       -0.06807327,  0.3119023 , -0.29489398,  0.20061803, -0.10797408,
       -0.14101546, -0.38280725,  0.054291  , -0.44207925, -0.232832  ,
       -0.486326  ,  0.05088602, -0.08845872, -0.21135303, -0.24881552,
       -0.1047746 , -0.309808  ,  0.02793502,  0.12140413,  0.08667814,
       -0.04950768, -0.05872349,  0.00774467,  0.23892054, -0.32583383,
       -0.19085103, -0.08125331, -0.01782029, -0.00889523,  0.09103105,
       -0.37290302,  0.08022729, -0.24092077,  0.2355133 , -0.16579767,
       -0.07698391, -0.01896097, -0.12648211, -0.12753847, -0.02031306,
       -0.03370816,  0.11911461,  0.21125719, -0.13139032,  0.0097007 ,
       -0.52950764,  0.07178081, -0.06581753,  0.1629323 ,  0.15

## Vocabulary & Vectorizer

We write a class `Vectorizer` to help us tokenize text and get word vectors/id from w2vmodel and convert w2vmodel to `nn.Embedding` model straight away.

Notice that we have process some common English stop words. `<sssss>`, `--rrb--` and some other tokens are included, which are not much meaningful but commonly seen in the dataset.

In [6]:
stop_words = ['and', ',', '.', 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 
              'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 
              'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him',
              'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'me', 'were', 'her',
              'himself', 'this', 'should', 'our', 'their', 'above', 'both','to', 'ours', 'had', 'she',  'when', 'at', 
              'them','been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'so', 
              'can', 'did', 'now', 'he', 'you', 'herself', 'has', 'myself', 'which', 'those', 'i', 'after', 'whom',
              'theirs', 'my', 'a', 'by', 'doing', 'it', 'was', '<sssss>', '-rrb-', '-lrb-']

class Vectorizer(object):
    def __init__(self, model):
        self.word_list = model.wv.index2word
        self.vector_list = w2vmodel.wv.vectors
    
    def getVector(self, word):
        try:
            i = self.word_list.index(word)
            return self.vector_list[i]
        except:
            i = self.word_list.index('unknown')
            return self.vector_list[i]
        
    def getVectorsFromText(self, text):
        vectors = []
        for word in text.split(" "):
            if word in stop_words:
                continue
            
            vectors.append(self.getVector(word))
        
        return vectors
    
    # Get id of a word
    def getId(self, word):
        try:
            i = self.word_list.index(word)
            return i
        except:
            i = self.word_list.index('unknown')
            return i
    
    # Tokenize the sentences and replace every token with its id
    def getIdsFromText(self, text):
        ids = []
        for word in text.split(" "):
            if word in stop_words:
                continue
            
            ids.append(self.getId(word))
        
        return ids
    
    def getVectorById(self, id):
        return self.vector_list[id];
    
    # Generate nn.Embedding from trained w2vmodel
    def getEmbedding(self):
        weights = torch.FloatTensor(self.vector_list)
        return torch.nn.Embedding.from_pretrained(weights)
            


In [7]:
voca = Vectorizer(w2vmodel)

In [8]:
voca.getIdsFromText("hello my name is derek")

[3848, 436, 8560]

## Customize Dataset

To extend `torch.utils.data.Dataset`, we can build a Dataset class which with `DataLoader` will help us load data in batches.

In `__getitem__`, we use our `Vectorizer` to process the text into list of id.

In [12]:
class MovieReviewDataset(torch.utils.data.Dataset):
    def __init__(self, df, vectorizer: Vectorizer, text_col = 'review_content', rating_col = 'rating'):
        self.df = df
        self.text_col = text_col
        self.rating_col = rating_col
        self.vectorizer = vectorizer
    
    def __len__(self):
        return len(self.df.index)
    
    def __getitem__(self, idx):
        line = self.df.iloc[idx]
        text = torch.tensor(self.vectorizer.getIdsFromText(line[self.text_col]))
        rating = torch.tensor(line[self.rating_col])
        
        return {'text': text, 'rating': rating}

In [13]:
# Create both training and validation datasets

dataframes = {'training': training_df, 'validation': validation_df}

datasets = {x: MovieReviewDataset(dataframes[x], voca)
              for x in ['training', 'validation']}

dataset_sizes = {x: len(datasets[x]) for x in ['training', 'validation']}

In [14]:
datasets['training'].__getitem__(4)['text'][1]

tensor(1487)

## Build DataLoader

As mentioned, `DataLoader` helps us to load data in batches while training, which will make our training process more gpu-efficient thus faster. 

In [15]:
# Costumize `DataLoader` batch format
def variable_size_collate(batch):
    ratings = []
    longest_len = 0
    texts = []
    for item in batch:
        thislen = len(item['text'])
        longest_len = thislen if thislen > longest_len else longest_len
        ratings.append(item['rating'])
    
    for i in range(longest_len):
        pos =  []
        for item in batch:
            text = item['text']
            if i < len(text):
                pos.append(text[i])
            else:
                pos.append(torch.tensor(0))
        pos = torch.stack(pos)
        texts.append(pos)
        
    ratings = torch.FloatTensor(ratings)
    texts = torch.stack(texts)
    return {'text': texts, 'rating': ratings}

dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=variable_size_collate)
              for x in ['training', 'validation']}

## Get Embedding Layer

In [11]:
embedding = voca.getEmbedding()
embedding

Embedding(21750, 100)

In [17]:
embedding(torch.LongTensor(voca.getIdsFromText("hello my name is derek trust issue hey what the fuck is that")))

tensor([[ 0.0325,  0.1499,  0.2310,  0.0769,  0.0743,  0.1067,  0.2858, -0.5892,
          0.4588, -0.4283,  0.2836, -0.2005,  0.2104,  0.1039, -0.3846,  0.0301,
         -0.1920, -0.1838,  0.1904,  0.2130,  0.2405, -0.0829,  0.1903,  0.4234,
         -0.1527, -0.2340, -0.2299,  0.0274,  0.0333,  0.2278, -0.0440, -0.1399,
         -0.1714, -0.4231, -0.4074, -0.3462,  0.2565, -0.0600,  0.0391, -0.2285,
         -0.2423,  0.0792,  0.4543,  0.4119,  0.1438,  0.2681,  0.0897,  0.5281,
         -0.2478,  0.1927, -0.3498,  0.0594,  0.0254,  0.2006, -0.3440, -0.0786,
          0.0687,  0.3086, -0.0354,  0.0525,  0.0537, -0.1680, -0.0263, -0.0323,
         -0.1826, -0.1126, -0.0501, -0.1170,  0.0148, -0.4299,  0.2880,  0.0525,
         -0.1989, -0.0569,  0.1596,  0.0896,  0.2335, -0.1160, -0.5452, -0.3574,
          0.2767,  0.0259, -0.1858, -0.0766,  0.1818, -0.2650, -0.0564,  0.1181,
         -0.0819, -0.1206, -0.2344, -0.2920,  0.4595, -0.2293,  0.0395, -0.4779,
         -0.0076, -0.4342, -

## Define LSTM Structure

We get to design our own LSTM model by writing class extending `nn.Module`. I have this structure in mind: Embedding layer + LSTM(bidirectional) + Linear. 

To see the introduction of LSTM, check out the other [notebook](./rating_model_fastai.ipynb#LSTM).

In [51]:
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, bidirectional = False):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=bidirectional)
        
        self.fc = nn.Linear(hidden_dim *  (2 if bidirectional else 1) , output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        return self.fc(output[-1,:,:])

In [52]:
INPUT_DIM = embedding.num_embeddings
EMBEDDING_DIM = embedding.embedding_dim
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [53]:
model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, bidirectional=True)

In [54]:
model.to(args.device)
# Replace embedding layer with our trained w2v embedding layer
model.embedding = embedding

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 733,697 trainable parameters


## Init Optimizer & Criterion

In [76]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [77]:
# Here we use MSELoss, in the training process down below, we altered this into RMSE as required
criterion = nn.MSELoss()

In [78]:
model = model.to(args.device)
criterion = criterion.to(args.device)

## Train Model

In [83]:
def train_model(model: nn.Module, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float("inf")
    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        for phase in ['training', 'validation']:
            if phase == 'training':
                scheduler.step()
                model.train()
            else:
                model.eval()
            
            running_loss = 0.0
            # running_corrects = 0
            
            for data in dataloaders[phase]:
                texts = data['text'].to(args.device)
                ratings = data['rating'].to(args.device)
                
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'training'):
                    preds = model(texts)
                    
                    # Alter the MSE to RMSE by adding the sqrt computation
                    loss = torch.sqrt(criterion(preds, ratings))
                    
                    if phase == 'training':
                        loss.backward()
                        optimizer.step()
                
                running_loss += loss.item() * texts.size(1)
                # running_corrects += torch.sum(torch.round(preds) == ratings)
                
            epoch_loss = running_loss / dataset_sizes[phase]
            # epoch_acc = running_corrects.double() / dataset_sizes[phase]
            
            print('{} Loss: {:.4f}'.format(
                phase, epoch_loss))

            # deep copy the model
            if phase == 'validation' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                
        print()
    
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
    print('Best val Loss: {:4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model  

In [84]:
model = train_model(model, criterion, optimizer, lr_scheduler,
                       num_epochs=10)

Epoch 0/9
----------
training Loss: 3.6331
validation Loss: 2.2946

Epoch 1/9
----------
training Loss: 2.3042
validation Loss: 2.2925

Epoch 2/9
----------
training Loss: 2.2967
validation Loss: 2.2900

Epoch 3/9
----------
training Loss: 2.2918
validation Loss: 2.2824

Epoch 4/9
----------
training Loss: 2.2914
validation Loss: 2.2759

Epoch 5/9
----------
training Loss: 2.2872
validation Loss: 2.2746

Epoch 6/9
----------
training Loss: 2.2881
validation Loss: 2.2749

Epoch 7/9
----------
training Loss: 2.2886
validation Loss: 2.2784

Epoch 8/9
----------
training Loss: 2.2874
validation Loss: 2.2795

Epoch 9/9
----------
training Loss: 2.2848
validation Loss: 2.2755

Training complete in 24m 52s
Best val Loss: 2.274637


## Predict Helper Function

In [85]:
def predict(text):
    intens = torch.tensor([[i] for i in voca.getIdsFromText(text)]).to(args.device)
    outtens = model(intens)
    rating = round(outtens[0].item());
    if rating < 1:
        rating = 1
    elif rating > 10:
        rating = 10
    return rating
    

In [90]:
predict("haha good bad so")

5

In [91]:
torch.save(model, './trained/stage-1.pkl')

  "type " + obj.__name__ + ". It won't be checked "


In [92]:
model2 = torch.load('./trained/stage-1.pkl')

In [93]:
def predict2(text):
    intens = torch.tensor([[i] for i in voca.getIdsFromText(text)]).to(args.device)
    outtens = model2(intens)
    rating = round(outtens[0].item());
    if rating < 1:
        rating = 1
    elif rating > 10:
        rating = 10
    return rating
    

In [98]:
predict2('not worth it')

3

## Predict on Test Set

In [102]:
ratings = []
for line in test_df['review_content']:
    ratings.append(predict2(line))

In [104]:
output_data = {'review_content': test_df['review_content'], 'rating': ratings}

In [106]:
output_df = pd.DataFrame(output_data)
output_df.to_csv('senti_output.ss', sep='\t')

## Conclusion

**Sizes:**

Language Model Size(word2vec):       26.89M

Classifier Model Size:               11.38M

Total Size:                          38.27M

**Loss:**

RMSE:                              2.274637

**Test Set Output:**

File:            senti_output.ss (discarded)

**Comment:**

This model is more like a "built from scratch" attempt. The Word2Vec model was trained from a very limited corpus and the LSTM model structure itself is rather simple and not very much thought out. So this model expectedly performs much worse than the `AWD_LSTM` model that I trained using fastai in another [notebook](./rating_model_fastai.ipynb) as "Plan B".

- the language model I trained is pretty tiny with the size of only 27M, compared to 191.32M of "Plan B". As a result, the language model as the first layer of the network performs poorly right in front of the head, making it barely possible to train a good classifier.
- the LSTM model I designed is way too simple with the size of only 11.38M, compared to 487.05M of "Plan B".

But even with a small size of language model and LSTM model like that, it still take a machine equiped with an M40 graphic card a relatively long time to train. I'm guessing the main reason is that the model is trained starting from 'zero_state'. In terms of "Plan B", transfer learning comes with an advantage of preloaded already-trained weights, which makes the training process shorter.