In [25]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [26]:
from __future__ import unicode_literals, print_function, division

import unicodedata
import string
import operator
import concurrent
import time
import re
import random
import spacy
import pickle
import torch 
import nltk
import sys

import torch.nn as nn
import pandas as pd
import numpy as np

from io import open
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from pathlib import Path
from tqdm import tqdm

base = Path('../aclImdb')

# The Data

## Extracting

In [295]:
# Directories that hold the negative and positive training reviews
train_neg = base/'train/neg'
train_pos = base/'train/pos'

In [296]:
# Index the negative training reviews: dataset-relative path plus the star
# rating encoded in each file name ([id]_[rating].txt). Label 0 = negative.
neg_files = list(train_neg.iterdir())
neg_exs = ['train/neg/' + review.name for review in neg_files]
neg_ratings = [int(review.name.split('_')[-1].split('.')[0]) for review in neg_files]
neg_labels = [0] * len(neg_exs)

In [58]:
# Index the positive training reviews: dataset-relative path plus the star
# rating encoded in each file name ([id]_[rating].txt). Label 1 = positive.
pos_files = list(train_pos.iterdir())
pos_exs = ['train/pos/' + review.name for review in pos_files]
pos_ratings = [int(review.name.split('_')[-1].split('.')[0]) for review in pos_files]
pos_labels = [1] * len(pos_exs)

In [59]:
# One row per review: relative file path, binary sentiment target
# (0 = negative, 1 = positive) and the star rating parsed from the file name.
df = pd.DataFrame(
    data={
        'path': neg_exs + pos_exs,
        'target': neg_labels + pos_labels,
        'review_rating': neg_ratings + pos_ratings,
    }
)

Each review file is named `[id]_[rating].txt`
- where `[id]` is a unique id and `[rating]` is the star **rating for that review** on a 1-10 scale

In [60]:
# Shuffle all rows, then display the first five
df.sample(frac=1).head()

Unnamed: 0,path,target,review_rating
228,train/neg/1573_1.txt,0,1
4261,train/neg/6298_2.txt,0,2
12304,train/neg/4670_4.txt,0,4
4455,train/neg/5154_1.txt,0,1
8327,train/neg/12485_1.txt,0,1


In [61]:
# Persist the path/target/rating index (without the row index column)
df.to_csv('train.csv', index=False)

## Cleaning 

In [12]:
# Reload the saved review index and split it by sentiment label
df = pd.read_csv('train.csv')
neg_mask = df['target'].eq(0)
pos_mask = df['target'].eq(1)
neg_df, pos_df = df[neg_mask], df[pos_mask]

In [13]:
# Shuffle all rows of the reloaded frame, then display the first five
df.sample(frac=1.).head()

Unnamed: 0,path,target,review_rating
16634,train/pos/3602_7.txt,1,7
12573,train/pos/5708_10.txt,1,10
21552,train/pos/11873_8.txt,1,8
7495,train/neg/5024_4.txt,0,4
9868,train/neg/3804_2.txt,0,2


In [14]:
# Use the row labelled 1 as a worked example for the cleaning steps below
ex = df.loc[1]
path = ex.path
target = ex.target

In [15]:
# Read the raw review text. The context manager closes the file handle as
# soon as the read finishes. No whitespace is trimmed here; normalizeString
# strips the text later.
with open(str(base/path), encoding='utf-8') as fp:
    file = fp.read()

In [16]:
# Display the raw review text
file

'Well...tremors I, the original started off in 1990 and i found the movie quite enjoyable to watch. however, they proceeded to make tremors II and III. Trust me, those movies started going downhill right after they finished the first one, i mean, ass blasters??? Now, only God himself is capable of answering the question "why in Gods name would they create another one of these dumpster dives of a movie?" Tremors IV cannot be considered a bad movie, in fact it cannot be even considered an epitome of a bad movie, for it lives up to more than that. As i attempted to sit though it, i noticed that my eyes started to bleed, and i hoped profusely that the little girl from the ring would crawl through the TV and kill me. did they really think that dressing the people who had stared in the other movies up as though they we\'re from the wild west would make the movie (with the exact same occurrences) any better? honestly, i would never suggest buying this movie, i mean, there are cheaper ways to 

In [17]:
def strip_html_tags(s):
    """Return the text content of ``s`` with HTML markup removed.

    The string is parsed with BeautifulSoup using the ``html.parser`` backend
    and the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(s, "html.parser").get_text()

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip diacritics from ``s``.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped. All other
    characters are kept unchanged, so a character without a decomposition
    passes through as-is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Expand contractions (it's = it is), thanks to 
# https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and drop any remaining apostrophes.

    Args:
        text: String to process.
        contraction_mapping: Dict mapping a contraction (e.g. ``"it's"``) to
            its expansion (e.g. ``"it is"``). Matching is case-insensitive.

    Returns:
        ``text`` with every matched contraction replaced by its expansion and
        with all ``'`` characters removed. The first character of the matched
        text is kept, so the original capitalisation of that character is
        preserved.
    """
    # Lower-cased lookup used when the matched text is not an exact key.
    # Keys that are already lower-case take precedence over case-folded ones.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}
    lowered_mapping.update(
        (key, value) for key, value in contraction_mapping.items()
        if key == key.lower()
    )

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion for this match: leave the text untouched
            # instead of failing on a missing value.
            return match
        return match[0] + expanded_contraction[1:]

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key such as "i'd" must not be tried before a longer
    # key that starts with it, such as "i'd've". Keys are escaped so regex
    # metacharacters inside a key are matched literally.
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    expanded_text = text
    if keys:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)
        expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# again, thanks to 
# https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split with the module-level ``tokenizer``; each token is
    stripped of surrounding whitespace, tokens present in ``stop_words`` are
    dropped, and the remaining tokens are joined with single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def normalizeString(s, stopwords=True, contractions=False):
    """Clean one raw review into lower-case, space-separated tokens.

    Steps, in order: lower-case and trim; strip HTML tags; drop combining
    accent marks; surround ``.``, ``!`` and ``?`` with spaces; optionally
    expand contractions; replace every run of characters other than ASCII
    letters and ``. ! ?`` with a single space; optionally remove stop words.

    Both flags describe what is *kept* in the output:

    Args:
        s: Raw review text.
        stopwords: When True (default) stop words are kept; when False they
            are removed with ``remove_stopwords``.
        contractions: When False (default) contractions are expanded with
            ``expand_contractions``; when True they are not expanded.

    Returns:
        The cleaned string.
    """
    lowered = s.lower().strip()
    # HTML removal first, then accent stripping
    text = unicodeToAscii(strip_html_tags(lowered))
    # Detach sentence punctuation so each mark becomes its own token
    text = re.sub(r"([.!?])", r" \1 ", text)
    if not contractions:
        text = expand_contractions(text)
    # Anything that is not a letter or . ! ? collapses to one space
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text).strip()
    return text if stopwords else remove_stopwords(text)

In [18]:
# With these flags normalizeString keeps stop words and expands contractions
clean_file = normalizeString(file, stopwords=True, contractions=False)
clean_file

'well . . . tremors i the original started off in and i found the movie quite enjoyable to watch . however they proceeded to make tremors ii and iii . trust me those movies started going downhill right after they finished the first one i mean ass blasters ? ? ? now only god himself is capable of answering the question why in gods name would they create another one of these dumpster dives of a movie ? tremors iv cannot be considered a bad movie in fact it cannot be even considered an epitome of a bad movie for it lives up to more than that . as i attempted to sit though it i noticed that my eyes started to bleed and i hoped profusely that the little girl from the ring would crawl through the tv and kill me . did they really think that dressing the people who had stared in the other movies up as though they we are from the wild west would make the movie with the exact same occurrences any better ? honestly i would never suggest buying this movie i mean there are cheaper ways to find thin

## Structuring 

In [19]:
# Thanks to, 
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# stanford nlp a4
class Lang:
    """Vocabulary that maps words to integer ids and tracks word counts.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<s>``, ``</s>``
    and ``<unk>``; every other word receives the next free id the first time
    it is added.

    Attributes:
        word2id: dict, word -> integer id (includes the special tokens).
        id2word: dict, integer id -> word (inverse of ``word2id``).
        word2count: dict, word -> number of times it was passed to ``addWord``.
        fixed_vocab: set of the four special tokens.
        n_words: number of entries in the vocabulary (special tokens included).
    """

    def __init__(self):
        self.word2id = dict()
        self.word2count = dict()
        self.id2word = dict()
        self.word2id['<pad>'] = 0   # Pad Token
        self.word2id['<s>'] = 1     # Start Token
        self.word2id['</s>'] = 2    # End Token
        self.word2id['<unk>'] = 3   # Unknown Token
        self.fixed_vocab = {'<pad>', '<s>' , '</s>', '<unk>'}

        self.id2word = {v: k for k, v in self.word2id.items()}
        self.n_words = len(self.id2word)

    def addSentence(self, sentence):
        """Add every token of ``sentence`` (split on single spaces)."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``: assign a new id on first sight, bump its count otherwise."""
        if word not in self.word2id:
            self.word2id[word] = self.n_words
            self.word2count[word] = 1
            self.id2word[self.n_words] = word
            self.n_words += 1
        else:
            # The special tokens are in word2id but start without a count
            # entry, so use .get to avoid a KeyError when one is added.
            self.word2count[word] = self.word2count.get(word, 0) + 1

    def top_n_words_model(self, n):
        """Return a new ``Lang`` holding only the ``n`` most frequent words.

        Words are taken from this instance's ``word2count`` in descending
        count order; their original counts are carried over to the new
        vocabulary, and ids are reassigned starting after the special tokens.
        """
        top_lang = Lang()
        # Rank this instance's own counts (not a module-level variable)
        ordered_words = sorted(self.word2count.items(), key=operator.itemgetter(1), reverse=True)
        for w, f in ordered_words[:n]:
            top_lang.addWord(w)
            top_lang.word2count[w] = f
        return top_lang

    def get_id(self, word):
        """Return the id of ``word``, or the ``<unk>`` id when it is unknown."""
        return self.word2id.get(word, self.word2id['<unk>'])
            
def dump_model(lang, name='imdb_language_class'):
    """Pickle ``lang`` to the file ``'<name>.pkl'``.

    Args:
        lang: Object to serialize.
        name: File path without the ``.pkl`` extension.
    """
    # The context manager guarantees the file is flushed and closed
    with open('{}.pkl'.format(name), 'wb') as fp:
        pickle.dump(lang, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
def load_model(name='imdb_language_class'):
    """Load and return the object pickled in ``'<name>.pkl'``.

    Args:
        name: File path without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally (e.g. by dump_model), never untrusted input.
    with open('{}.pkl'.format(name), 'rb') as fp:
        lang = pickle.load(fp)
    return lang

In [20]:
def normalize_and_track(lang, path):
    """Read one review, normalize it and add its tokens to ``lang``.

    Args:
        lang: Vocabulary object; every token is passed to ``lang.addWord``.
        path: Review path relative to ``base``.

    Returns:
        True once the review has been processed.
    """
    # Close the file handle as soon as the text has been read
    with open(str(base/path), encoding='utf-8') as fp:
        raw_text = fp.read()
    # normalize (keeps stop words, expands contractions)
    clean_text = normalizeString(raw_text, stopwords=True, contractions=False)
    # track words into model
    for w in clean_text.split(' '):
        lang.addWord(w)
    return True

def populate_language(lang, df):
    """Feed every review listed in ``df['path']`` into ``lang``.

    Reviews are processed one after another (serially) with
    ``normalize_and_track``; the elapsed wall-clock time is printed, and an
    assertion checks that every call reported success.
    """
    started_at = time.time()
    outcomes = []
    for review_path in df['path'].values:
        outcomes.append(normalize_and_track(lang, review_path))
    elapsed = time.time() - started_at
    print("Normalized and Tracked in {} seconds".format(elapsed))
    # Ensure success on all path values
    assert all(outcomes)

In [21]:
# Build the vocabulary from every review listed in df
lang = Lang()
populate_language(lang, df)

Normalized and Tracked in 82.86931920051575 seconds


In [349]:
# Save the vocabulary under dump_model's default name (imdb_language_class.pkl)
dump_model(lang)

In [350]:
# The three numbers should agree; +4 accounts for the reserved tokens
# (<pad>, <s>, </s>, <unk>), which have ids but no entry in word2count
len(lang.word2id.items()), lang.n_words, len(lang.word2count.keys())+4

(78360, 78360, 78360)

In [23]:
# Reload the pickled vocabulary (only unpickle files you created yourself)
lang = load_model()

In [353]:
# Encode the cleaned example as vocabulary ids and show them space-separated
int_file = [lang.get_id(token) for token in clean_file.split(' ')]
' '.join(str(word_id) for word_id in int_file)

'41 27 27 27 42 43 8 44 45 46 47 34 43 48 8 49 50 51 15 52 27 53 54 55 15 56 42 57 34 58 27 59 60 61 62 45 63 64 65 66 54 67 8 68 6 43 69 70 71 72 72 72 73 74 75 76 19 77 7 78 8 79 80 47 81 82 83 54 84 85 6 7 86 87 88 7 24 49 72 42 89 90 16 91 24 92 49 47 93 18 90 16 94 91 95 96 7 24 92 49 97 18 98 99 15 100 101 102 27 103 43 104 15 105 106 18 43 107 102 108 109 45 15 110 34 43 111 112 102 8 113 114 30 8 115 83 116 117 8 118 34 119 60 27 120 54 121 122 102 123 8 124 125 126 127 47 8 128 62 99 103 106 54 129 130 30 8 131 132 83 56 8 49 5 8 133 134 135 136 137 72 138 43 83 139 140 141 12 49 43 69 35 130 142 143 15 144 145 102 146 41 27'

In [356]:
# Sanity check word indexing: decoding the ids must reproduce the cleaned text
decoded_words = [lang.id2word[word_id] for word_id in int_file]
assert " ".join(decoded_words) == clean_file

# Training Helpers

In [282]:
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [365]:
def open_and_clean(lang, path):
    """Read the review at ``base/path`` and return its normalized text.

    Args:
        lang: Unused here; kept so the signature matches the other helpers.
        path: Review path relative to ``base``.

    Returns:
        The cleaned review string produced by ``normalizeString`` (stop words
        kept, contractions expanded).
    """
    # Close the file handle as soon as the text has been read
    with open(str(base/path), encoding='utf-8') as fp:
        raw_text = fp.read()
    return normalizeString(raw_text, stopwords=True, contractions=False)

def prepare_df(lang, df):
    """Return a list of ``(clean_text, target)`` pairs, one per row of ``df``.

    Each path in ``df['path']`` is loaded and cleaned with ``open_and_clean``
    and paired with the value at the same position in ``df['target']``.
    """
    pairs = []
    for review_path, label in zip(df['path'].values, df['target'].values):
        pairs.append((open_and_clean(lang, review_path), label))
    return pairs

def batch_iter(lang, data, batch_size, shuffle=False):
    """Yield ``(sents, targets)`` batches built from the rows of ``data``.

    Args:
        lang: Passed through to ``prepare_df``.
        data: DataFrame with ``path`` and ``target`` columns.
        batch_size: Maximum number of rows per batch.
        shuffle: When True, rows are shuffled once before batching.

    Yields:
        sents: list of token lists (reviews split on single spaces), ordered
            from the longest review to the shortest.
        targets: float32 tensor with the matching targets, in the same order.
    """
    n_batches = math.ceil(len(data) / batch_size)

    if shuffle:
        data = data.sample(frac=1.)

    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        stop = min(start + batch_size, len(data))
        batch_df = data[start:stop]

        # Load and clean this slice, then order it longest review first
        pairs = prepare_df(lang, batch_df)
        pairs.sort(key=lambda pair: len(pair[0].split(' ')), reverse=True)
        sents = [text.split(' ') for text, _ in pairs]
        targets = [label for _, label in pairs]

        yield sents, torch.tensor(targets, dtype=torch.float32)

In [366]:
def indexesFromSentence(lang, sentence):
    """Convert a token sequence to ids wrapped in start/end markers.

    Returns ``[<s> id] + [id of each word in sentence] + [</s> id]``, where
    word ids come from ``lang.get_id``.
    """
    ids = [lang.word2id['<s>']]
    ids.extend(lang.get_id(word) for word in sentence)
    ids.append(lang.word2id['</s>'])
    return ids

def to_input_tensor(lang, sents):
    """Turn a batch of token lists into a padded id tensor plus lengths.

    Args:
        lang: Vocabulary used to look up ids and the ``<pad>`` id.
        sents: List of token lists.

    Returns:
        A tuple ``(tensor, lengths)``: ``tensor`` is a long tensor on
        ``device`` of shape (max_len, batch) -- the padded (batch, max_len)
        matrix transposed -- and ``lengths`` holds each sequence's length
        including the start and end tokens.
    """
    id_seqs = [indexesFromSentence(lang, sent) for sent in sents]
    lengths = list(map(len, id_seqs))
    padded = pad_sents(id_seqs, lang.word2id['<pad>'])
    batch_major = torch.tensor(padded, dtype=torch.long, device=device)
    return batch_major.t(), lengths

In [367]:
def pad_sents(sents, pad_token):
    """Right-pad every sequence in ``sents`` to the length of the longest one.

    Args:
        sents: List of lists (e.g. lists of token ids).
        pad_token: Value appended to the shorter sequences.

    Returns:
        A new list of new lists, all of equal length; the inputs are not
        modified. An empty ``sents`` yields an empty list.
    """
    # max() raises on an empty sequence, so handle the empty batch explicitly
    if not sents:
        return []
    max_seq = max(map(len, sents))
    return [s + [pad_token] * (max_seq - len(s)) for s in sents]

# Model 

In [371]:
class SentModel(nn.Module):
    """GRU sentiment classifier: embedding -> GRU -> mean over time -> linear -> sigmoid.

    Args:
        embed_size: Dimensionality of the word embeddings.
        hidden_size: Number of GRU hidden units.
        lang: Vocabulary used to size the embedding table and to convert
            tokens to ids inside ``forward``.
    """

    def __init__(self, embed_size, hidden_size, lang):
        super(SentModel, self).__init__()
        self.lang = lang
        self.embed = nn.Embedding(len(lang.word2id), embed_size, padding_idx=lang.word2id['<pad>'])
        self.gru = nn.GRU(embed_size, hidden_size, bias=True)
        self.l1 = nn.Linear(hidden_size, 1)

    def forward(self, sents):
        """Return one probability per review.

        Args:
            sents: List of token lists. ``pack_padded_sequence`` is called
                with its default settings, which expect the batch ordered
                from longest to shortest sequence.

        Returns:
            Tensor of shape (batch_size,) with values in (0, 1).
        """
        # seq_len, batch_size
        s_tensor, lengths = to_input_tensor(self.lang, sents)
        emb = self.embed(s_tensor)

        # pack + rnn sequence + unpack
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths)
        output, _ = self.gru(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(output)

        # batch_size, seq_len, hidden_size
        output_batch = output.transpose(0, 1)

        # batch_size, hidden_size
        # Average over the real time steps only: padded positions are zero
        # after unpacking, so dividing the sum by each sequence's true length
        # gives a mean that does not grow with review length.
        seq_lengths = torch.tensor(lengths, dtype=output_batch.dtype,
                                   device=output_batch.device).unsqueeze(1)
        out_avg = output_batch.sum(dim=1) / seq_lengths

        # batch_size, 1
        linear_out = self.l1(out_avg)
        out = torch.sigmoid(linear_out).squeeze(-1)

        return out

# Training

In [369]:
# Model hyper-parameters
hidden_size = 20
embed_size = 300
model = SentModel(embed_size, hidden_size, lang)
model = model.to(device)

# Optimisation settings
lr = 1e-3
clip_grad = 5.   # max gradient norm used for clipping in the training loop
# Pass the configured learning rate explicitly so changing `lr` takes effect
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fcn = nn.BCELoss()

In [372]:
train_batch_size = 32
epochs = 2
# Cap on batches per epoch for quick experiments; set to None to use every batch
max_batches_per_epoch = 22

model.train()
for e in range(epochs):
    epoch_loss = 0
    n_batches = 0
    begin_time = time.time()

    for sents, targets in batch_iter(lang, df, train_batch_size, shuffle=True):
        # batch_iter builds the targets on the CPU, while the model and its
        # predictions live on `device`; the loss needs both on the same device.
        targets = targets.to(device)
        optimizer.zero_grad()

        preds = model(sents)
        loss = loss_fcn(preds, targets)
        epoch_loss += loss.item()
        n_batches += 1

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()

        if max_batches_per_epoch is not None and n_batches >= max_batches_per_epoch:
            break

    # Average over the number of batches actually processed; max() guards
    # against an empty dataset.
    print('epoch %d, avg. loss %.2f, time elapsed %.2f sec' % (e,
                                                                epoch_loss / max(n_batches, 1),
                                                                time.time() - begin_time), file=sys.stderr)

KeyboardInterrupt: 