In [21]:
from __future__ import unicode_literals, print_function, division

import concurrent
import concurrent.futures
import operator
import pickle
import random
import re
import string
import threading
import time
import unicodedata
from io import open
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

from contractions import CONTRACTION_MAP

tokenizer = ToktokTokenizer()
stop_words = set(stopwords.words('english'))

# Root directory for the review files; normalize_and_track opens `base / path`.
base = Path('../aclImdb')

In [2]:
# Load the training table, then split it on the `target` column.
df = pd.read_csv('train.csv')
# Rows with target == 0 feed the "negative" analysis below, target == 1 the "positive" one.
neg_df = df[df['target'] == 0]
pos_df = df[df['target'] == 1]

In [22]:
def strip_html_tags(s):
    """Return the text content of ``s`` with its HTML markup removed.

    The string is parsed with BeautifulSoup's built-in "html.parser" backend
    and only the text nodes are kept.
    """
    # NOTE(review): get_text() is called without a separator, so text on both
    # sides of a tag such as <br /> is presumably joined with nothing in
    # between -- confirm this is acceptable for the review corpus.
    return BeautifulSoup(s, "html.parser").get_text()

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from ``s``.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' (non-spacing mark) is dropped. Characters that
    have no such decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Expand contractions (it's = it is), thanks to 
# https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
def expand_contractions(text, contraction_mapping=None):
    """Replace contractions in ``text`` by their expansions, then drop apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. "can't") to its
            expansion (e.g. "cannot"). Keys are treated as literal text and
            matched case-insensitively. When omitted (or None) the
            module-level CONTRACTION_MAP is used; it is looked up at call time.

    Returns:
        ``text`` with every matched contraction expanded and every remaining
        apostrophe removed. The case of the first character of each matched
        contraction is carried over to its expansion ("Can't" -> "Cannot").
    """
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "he'd" listed before "he'd've" would otherwise win and leave
    # a dangling "'ve" behind. Empty keys are skipped because they would match
    # at every position.
    keys = sorted((k for k in contraction_mapping if k), key=len, reverse=True)

    if keys:
        # Fallback lookup for keys stored with capitals (e.g. "I'm") when the
        # text contains a differently-cased variant.
        lowered = {}
        for key in keys:
            lowered.setdefault(key.lower(), contraction_mapping[key])

        # Keys are escaped so characters such as "." are matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(k) for k in keys)),
            flags=re.IGNORECASE)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = (contraction_mapping.get(match)
                        or contraction_mapping.get(match.lower())
                        or lowered.get(match.lower()))
            if not expanded:
                # No usable expansion: leave the text as it was.
                return match
            # Transfer only the *case* of the first matched character. Copying
            # the character itself breaks expansions that start with another
            # letter ("ain't" -> "is not", "'cause" -> "because").
            head = expanded[0]
            if match[0].isupper():
                head = head.upper()
            elif match[0].islower():
                head = head.lower()
            return head + expanded[1:]

        text = contractions_pattern.sub(expand_match, text)

    return re.sub("'", "", text)

# again, thanks to 
# https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split with the module-level ``tokenizer``; each token is
    stripped of surrounding whitespace, and the surviving tokens are joined
    back together with single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

def normalizeString(s, stopwords=True, contractions=False):
    """Normalise one raw review string.

    Steps, in order: lowercase and trim; strip HTML tags; remove accents;
    surround ``.``, ``!`` and ``?`` with spaces; optionally expand
    contractions; replace every run of characters other than letters and
    ``. ! ?`` by a single space; optionally remove stop words.

    Both flags describe what is *kept* in the output:

    Args:
        s: Raw text.
        stopwords: Truthy keeps stop words; falsy removes them. (Inside this
            function the name shadows the ``nltk.corpus.stopwords`` import.)
        contractions: Truthy skips contraction expansion; falsy expands them.

    Returns:
        The normalised string.
    """
    lowered = s.lower().strip()
    text = unicodeToAscii(strip_html_tags(lowered))

    # Pad sentence punctuation so it becomes a token of its own.
    text = re.sub(r"([.!?])", r" \1 ", text)

    if not contractions:
        text = expand_contractions(text)

    # Everything that is not a letter or . ! ? collapses into one space.
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text).strip()

    return text if stopwords else remove_stopwords(text)

In [23]:
# Thanks to, 
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
class Lang:
    """Vocabulary with word<->index lookups and per-word occurrence counts."""

    def __init__(self):
        # word -> integer index (the SOS/EOS markers are not registered here)
        self.word2index = {}
        # word -> number of times addWord has seen it
        self.word2count = {}
        # integer index -> word; slots 0 and 1 are taken by the two markers
        self.index2word = {0: "SOS", 1: "EOS"}
        # next free index; starts at 2 because of the two markers
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every piece of ``sentence`` obtained by splitting on ' '."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            
def dump_model(lang, name='imdb_language_class'):
    """Pickle ``lang`` to the file ``<name>.pkl``.

    Args:
        lang: Object to serialise (any picklable object).
        name: Path of the output file without the ``.pkl`` suffix.

    The file is written with ``pickle.HIGHEST_PROTOCOL`` and is closed before
    the function returns, so the data is flushed to disk.
    """
    with open('{}.pkl'.format(name), 'wb') as fp:
        pickle.dump(lang, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
def load_model(name='imdb_language_class'):
    """Load and return the object pickled in ``<name>.pkl``.

    Args:
        name: Path of the pickle file without the ``.pkl`` suffix; the same
            value that was passed to ``dump_model``.

    Returns:
        The unpickled object.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    with open('{}.pkl'.format(name), 'rb') as fp:
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were produced by a trusted source.
        lang = pickle.load(fp)
    return lang

In [3]:
# Histogram of the `review_rating` column.
# NOTE(review): `.iplot` is not a pandas method and none of the imports in the
# first cell provides it -- presumably cufflinks/plotly was set up in a cell
# that is not shown; confirm so that Restart & Run All works.
df['review_rating'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

In [24]:
# populate_language hands the same Lang to several worker threads, and
# Lang.addWord is a check-then-update on plain dicts and a counter, so
# vocabulary updates must be serialised. The lock lives at module level rather
# than on the Lang instance so that Lang objects stay picklable.
_TRACK_LOCK = threading.Lock()


def normalize_and_track(lang, path):
    """Read one review file, normalise it and add its words to ``lang``.

    Args:
        lang: Vocabulary object exposing ``addWord(word)``.
        path: File path relative to the module-level ``base`` directory.

    Returns:
        True once the file has been processed. Errors from reading or
        normalising the file propagate to the caller.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(str(base/path), encoding='utf-8') as fp:
        raw_text = fp.read()
    # normalize (stop words kept, contractions expanded)
    clean_text = normalizeString(raw_text, stopwords=True, contractions=False)
    # split() instead of split(' '): an empty review yields no tokens at all
    # rather than a single '' entry in the vocabulary.
    tokens = clean_text.split()
    # track words into model; normalisation above runs outside the lock so
    # only the cheap bookkeeping is serialised.
    with _TRACK_LOCK:
        for w in tokens:
            lang.addWord(w)
    return True

def populate_language(lang, df, max_workers=10):
    """Normalise every file listed in ``df['path']`` and track its words in ``lang``.

    Args:
        lang: Vocabulary object that is passed to ``normalize_and_track``.
        df: DataFrame with a ``path`` column; one file path per row.
        max_workers: Size of the thread pool (default 10, the previous
            hard-coded value).

    Raises:
        AssertionError: If any file did not report success.
        Exception: Whatever a worker raised; it is re-raised here before the
            timing line is printed.
    """
    paths = df['path'].values
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy result iterator inside the pool, so a worker
        # exception surfaces here instead of after the success message.
        results = list(executor.map(lambda p: normalize_and_track(lang, p), paths))
    duration = time.time() - start_time
    print("Normalized and Tracked in {} seconds".format(duration))
    # Ensure success on all path values. An explicit raise is used because
    # bare assert statements are removed when Python runs with -O.
    failed = [p for p, ok in zip(paths, results) if ok is not True]
    if failed:
        raise AssertionError(
            "normalize_and_track failed for {} path(s), first: {}".format(
                len(failed), failed[0]))

In [30]:
def top_n_words(lang, n, include_stops=True, exclude=()):
    """Return the ``n`` most frequent words tracked by ``lang``.

    Args:
        lang: Object with a ``word2count`` dict (word -> count).
        n: Maximum number of entries to return.
        include_stops: When falsy, words in the module-level ``stop_words``
            set and the tokens '.', '?', '!' are left out.
        exclude: Iterable of additional words to leave out. It is applied
            regardless of ``include_stops``.

    Returns:
        List of ``(word, count)`` tuples, highest count first; ties keep the
        order in which the words appear in ``lang.word2count``.
    """
    excluded = frozenset(exclude)
    ordered_words = sorted(lang.word2count.items(),
                           key=operator.itemgetter(1), reverse=True)
    if not include_stops:
        punctuation = ('.', '?', '!')
        ordered_words = [(w, f) for (w, f) in ordered_words
                         if w not in stop_words and w not in punctuation]
    if excluded:
        ordered_words = [(w, f) for (w, f) in ordered_words if w not in excluded]
    return ordered_words[:n]

def plot_top_words(top_w, n, title=None):
    """Draw a bar chart of word counts.

    Args:
        top_w: Sequence of ``(word, count)`` pairs, e.g. from ``top_n_words``.
        n: Number of words shown; only used to build the default title.
        title: Chart title. When omitted, it defaults to
            'Top <n> words in review after removing stop words'.

    The default title is built at call time from the ``n`` argument. Putting
    the ``.format(n)`` call in the signature would evaluate it once, when the
    function is defined, against whatever global ``n`` exists at that moment
    (or fail if there is none).
    """
    if title is None:
        title = 'Top {} words in review after removing stop words'.format(n)
    topdf = pd.DataFrame(top_w, columns=['Text', 'Count'])
    counts = topdf.groupby('Text').sum()['Count'].sort_values(ascending=False)
    counts.iplot(kind='bar', yTitle='Count', linecolor='black', title=title)

In [27]:
# Build one vocabulary from every file listed in the training table.
lang = Lang()
populate_language(lang, df)

Normalized and Tracked in 67.40042328834534 seconds


In [32]:
# Most frequent words overall; top_n_words keeps stop words by default.
n = 20
top_w = top_n_words(lang, n)
plot_top_words(top_w, n,title='Top {} words in review before removing stop words'.format(n))

In [33]:
# Same ranking with stop words and the . ? ! tokens filtered out.
# No title is passed, so plot_top_words falls back to its default title.
n = 20
top_w = top_n_words(lang, n, include_stops=False)
plot_top_words(top_w, n)

In [34]:
# Separate vocabulary built only from the target == 0 rows (neg_df).
neg_lang = Lang()
populate_language(neg_lang, neg_df)

Normalized and Tracked in 34.45558404922485 seconds


In [35]:
# Most frequent non-stop words in the negative-review vocabulary.
n = 20
top_w = top_n_words(neg_lang, n, include_stops=False)
plot_top_words(top_w, n, title='Top {} words in negative reviews'.format(n))

In [36]:
# Separate vocabulary built only from the target == 1 rows (pos_df).
pos_lang = Lang()
populate_language(pos_lang, pos_df)

Normalized and Tracked in 36.10590386390686 seconds


In [37]:
# Most frequent non-stop words in the positive-review vocabulary.
# The result is kept in top_w_pos because the next cell reuses it.
n = 20
top_w_pos = top_n_words(pos_lang, n, include_stops=False)
plot_top_words(top_w_pos, n, title='Top {} words in positive reviews'.format(n))

In [38]:
# Negative-review ranking again, this time skipping every word that appears in
# the positive top-n list computed in the previous cell.
n = 20
# Keep only the word from each (word, count) pair.
topw_pos = list(map(lambda x: x[0], top_w_pos))
top_w = top_n_words(neg_lang, n, include_stops=False, exclude=topw_pos)
plot_top_words(top_w, n, title='Top {} exclusive words in negative reviews'.format(n))