In [35]:
import numpy as np
import pandas as pd
import scipy as sp

from scipy import sparse

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import string
import re
import glob

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, FeatureHasher

import keras
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential

import matplotlib.pyplot as plt

print('Keras version: %s' % keras.__version__)

PATH = "data/aclImdb"

Keras version: 2.1.5


In [2]:
# or use nltk or spacy
htmltag = re.compile(r'<.*?>')
numbers = re.compile(r'[0-9]')
quotes = re.compile(r'\"|`')
punctuation = re.compile(r'([%s])'% string.punctuation)
english_stopwords =set(stopwords.words('english'))
stemmer = PorterStemmer()

In [3]:
# read files in the given tree, using subfolders as the target classes
def read_files(folder, subfolders):
    corpus, labels = [], []
    for index, label in enumerate(subfolders):
        path = '/'.join([folder, label, '*.txt'])
        for filename in glob.glob(path):
            corpus.append(open(filename, 'r').read())
            labels.append(index)
    return corpus, np.array(labels).astype(np.int)

# pre-processor
def preprocess(s):
    # lowercase
    s = s.lower()
    # remove html tags
    s = htmltag.sub(' ', s)
    # remove numbers
    s = numbers.sub(' ', s)
    # remove quotes
    s = quotes.sub(' ', s)
    # replace puctuation
    s = punctuation.sub(' ', s)
    return s
    
# tokenization
def tokenize(s):
    # use a serious tokenizer
    tokens = nltk.word_tokenize(s)
    # remove stopwords
    tokens = filter(lambda w: not w in english_stopwords, tokens)
    # stem words
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [4]:
#coprus_train_pos = [open(filename, 'r').read() for filename in glob.glob(PATH + '/train/pos/*.txt')]
#coprus_train_neg = [open(filename, 'r').read() for filename in glob.glob(PATH + '/train/neg/*.txt')]
corpus_train, y_train = read_files(PATH + '/train', ['neg', 'pos'])
corpus_test, y_test = read_files(PATH + '/test', ['neg', 'pos'])

In [5]:
len(corpus_train), len(y_train), corpus_train[0], y_train[0], corpus_train[24999], y_train[24999]

(25000,
 25000,
 'Hi, Everyone, If you saw "Singing in the Rain," you remember the scene of Gene Kelly dancing in the rain. You also remember the dance number of Donald O\'Connor, "Make \'em Laugh." If you saw "Royal Wedding," you will remember Fred Astaire dancing on the ceiling. If you saw "Jailhouse Rock," you will even remember the title dance number choreographed by The King himself.<br /><br />That is what is missing here. There could have been some blockbuster dance numbers in this presentation. The closest was Chuck McGowan\'s "I Can Do That." the mere fact that you have some talented people on stage moving together does not make a great dance film. Richard Attenborough was to blame for this failure. He pointed the camera at the stage and thought that would be a good thing.<br /><br />Yelling at people auditioning for a part in a Broadway production is not entertainment. Michael Douglas would be just as badly cast if he were in a Western or a comedy. He is OK when he is in a Mi

In [6]:
len(corpus_test), len(y_test), corpus_test[0], y_test[0]

(25000,
 25000,
 'Yes, in this movie you are treated to multiple little snowmen on the attack in apparently a very warm climate so yes this movie is definitely not to be taken seriously. It is in fact a much worse movie than the original as at least with that one the whole production looked like it cost more than a couple of bucks and a video camera to make. It has its funny moments, but really playing off the cheapness of your movie and making that be your intended laughs is kind of weak film making if you ask me. You can not come up with a good story, your effects are going to really be bad, hey let us just make the movie look as bad as possible with horrible one liners and we have our movie. The first one at least had a somewhat credible story as the snowman in that one attacked during the winter and not what amounts to a resort. It also had better effects too, this one is just a step or two ahead of "Hobgoblins" as far as the monsters are concerned and you really want to be more th

In [7]:
vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenize)
term_doc_train = vectorizer.fit_transform(corpus_train)
term_doc_test = vectorizer.transform(corpus_test)

In [8]:
vocab = vectorizer.get_feature_names()
vocab[100:102]

[u'abilityof', u'abishai']

In [9]:
vocab_size = len(vocab)
h = FeatureHasher(n_features=10, input_type='string')
f = h.fit_transform(['q', 'w'])
f.shape, f.toarray()

((2, 10), array([[-1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]))

In [10]:
term_doc_train[0]

<1x50440 sparse matrix of type '<type 'numpy.int64'>'
	with 85 stored elements in Compressed Sparse Row format>

In [11]:
term_doc_train[100].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [12]:
vectorizer.vocabulary_['cool']

9133

In [13]:
# Multinomial Naive Bayes
alpha = 0.1 # smoothing parameter
class MultinomialNaiveBayes():
    """
    Arguments:
        alpha: smoothing parameter
    """
    def __init__(self, alpha=0.1):
        self.b = 0
        self.r = 0
        self.alpha = alpha

    def fit(self, X, y):
        # bias
        N_pos = (y==1).shape[0]
        N_neg = (y==0).shape[0]
        self.b = np.log(N_pos / N_neg)
        # count of occurences for every token in vocabulary as they appear in positive samples
        p = alpha + X[y==1].sum(axis=0)
        p_l1 = np.linalg.norm(p, ord=1) # L1 norm
        # count of occurences for every token in vocabulary as they appear in negative samples
        q = alpha + X[y==0].sum(axis=0)
        q_l1 = np.linalg.norm(q, ord=1) # L1 norm
        # log count ratio
        self.r = np.log((p/p_l1) / (q/q_l1))
        #self.r = sp.sparse.csr_matrix(self.r.T)
        return self.r, self.b

    def predict(self, X):
        y_pred = np.sign(sp.sparse.csr_matrix.dot(X, self.r.T) + self.b)
        y_pred[y_pred==-1] = 0
        return y_pred
    
    def score(self, X, y):
        y_predict = self.predict(X)
        y_reshaped = np.reshape(y, y_predict.shape)
        return (y_reshaped == y_predict).mean()

In [14]:
model = MultinomialNaiveBayes()
r, b = model.fit(term_doc_train, y_train)
b, r.shape, term_doc_train.shape

(0.0, (1, 50440), (25000, 50440))

In [15]:
term_doc_train.shape, r.shape, term_doc_train[0], r

((25000, 50440),
 (1, 50440),
 <1x50440 sparse matrix of type '<type 'numpy.int64'>'
 	with 85 stored elements in Compressed Sparse Row format>,
 matrix([[ 2.53558105, -2.26020949, -0.69100689, ...,  2.53558105,
           0.13768578, -4.1249941 ]]))

In [16]:
# accuracy on training set
y_pred = model.predict(term_doc_train)
#y_train = np.reshape(y_train, (25000, 1))
(np.reshape(y_train, (25000, 1)) == y_pred).mean()

0.78408

In [17]:
# accuracy on validation set
y_pred2 = model.predict(term_doc_test)
#y_test = np.reshape(y_test, (25000, 1))
(np.reshape(y_test, (25000, 1)) == y_pred2).mean()

0.68644000000000005

In [18]:
# now let's binary term document
term_doc_train = term_doc_train.sign() # turn everything into 1 or 0
term_doc_test = term_doc_test.sign() # turn everything into 1 or 0
term_doc_train.shape, term_doc_test.shape

((25000, 50440), (25000, 50440))

In [19]:
model = MultinomialNaiveBayes()
model.fit(term_doc_train, y_train)
accuracy_train = model.score(term_doc_train, y_train)
accuracy_test = model.score(term_doc_test, y_test)
accuracy_train, accuracy_test

(0.76848000000000005, 0.67620000000000002)

In [20]:
term_doc_train.shape, y_train.shape, term_doc_train[y_train==0].sum(axis=0).shape, term_doc_train[y_train==1].sum(axis=0).shape

((25000, 50440), (25000,), (1, 50440), (1, 50440))

In [21]:
(y_train==0).shape, (y_train==1).shape, y_pred.shape

((25000,), (25000,), (25000, 1))

In [22]:
# now with plain logistic regression
model = LogisticRegression()
model.fit(term_doc_train, y_train)
# accuracy on training
y_pred = model.predict(term_doc_train)
accuracy_train = (y_train == y_pred).mean()
# accuracy on validation
y_pred = model.predict(term_doc_test)
accuracy_test = (y_test == y_pred).mean()
accuracy_train, accuracy_test

(0.99187999999999998, 0.85855999999999999)

In [23]:
# now with regularized logistic regression
model = LogisticRegression(C=0.01, dual=True)
model.fit(term_doc_train, y_train)
# accuracy on training
y_pred = model.predict(term_doc_train)
accuracy_train = (y_train == y_pred).mean()
# accuracy on validation
y_pred = model.predict(term_doc_test)
accuracy_test = (y_test == y_pred).mean()
accuracy_train, accuracy_test

(0.90251999999999999, 0.87387999999999999)

In [24]:
# now combining Naive Base and Logistic Regression
"""
class NBLR(keras.Model):
    def __init__(self):
        super(NBLR, self).__init__(name='NBLR')
        self.softmax = keras.layers.Activation('softmax')

    def call(self, inputs):
        out = self.softmax(inputs)
        return out

model = NBLR()
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
losses = model.fit(x=term_doc_train, y=y_train)
"""

"\nclass NBLR(keras.Model):\n    def __init__(self):\n        super(NBLR, self).__init__(name='NBLR')\n        self.softmax = keras.layers.Activation('softmax')\n\n    def call(self, inputs):\n        out = self.softmax(inputs)\n        return out\n\nmodel = NBLR()\nmodel.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])\nlosses = model.fit(x=term_doc_train, y=y_train)\n"