# Classify Movies as Comedy or Drama
### Dan Finkel
### September 7 2018

In [59]:
# standard imports
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re
import sys
import os
import glob
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime

# keras imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding, Dropout
from keras.layers import Dense, Input, Flatten, ZeroPadding1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, Activation
from keras.models import Model, Sequential

from keras.engine import Layer, InputSpec
import tensorflow as tf

# sklearn imports
from sklearn import metrics
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# allows for inline plotting
%matplotlib inline

# pretty plots
plt.style.use("bmh")

# Globals
MAX_SEQUENCE_LENGTH = 150
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2



In [None]:
"""Prepare Training data"""

def get_reviews(labels_fname, genre, dirpath):

    # read data
    df_neg = pd.read_csv(labels_fname + '_neg_genres.csv', encoding='utf-8')
    df_pos = pd.read_csv(labels_fname + '_pos_genres.csv', encoding='utf-8')
    
    # pull genre
    genre_neg_idx = df_neg.loc[df_neg[genre] == 1]['ref'].values
    genre_pos_idx = df_pos.loc[df_pos[genre] == 1]['ref'].values

    # get the review filenames
    def get_fname(search_string):
        return glob.glob(search_string)[0].split('/')[-1]

    neg_dirpath = dirpath + "neg/"
    genre_neg_fnames = [get_fname(neg_dirpath + str(ii) + '_*.txt') for ii in genre_neg_idx]

    pos_dirpath = dirpath + "pos/"
    genre_pos_fnames = [get_fname(pos_dirpath + str(ii) + '_*.txt') for ii in genre_pos_idx]

    # pull the text of the reviews
    def get_reviews(dirpath, fnames):
        reviews = []
        for fname in fnames:
            with open(dirpath + fname, 'r') as myfile:
                data=myfile.read()
            reviews.append(data)
        return reviews

    genre_neg_reviews = get_reviews(neg_dirpath, genre_neg_fnames)
    genre_pos_reviews = get_reviews(pos_dirpath, genre_pos_fnames)
    
    df_out = pd.DataFrame(data=genre_neg_reviews + genre_pos_reviews)
    df_out.columns = ['text']
    df_out.title = df_neg['title'].values + df_pos['title'].values
    return df_out

In [None]:
# Pull drama reviews
df_train_drama = get_reviews("stanford_train",
                             "Drama",
                             "/Users/dfinkel/Downloads/aclImdb/train/")
df_train_drama['class'] = 0

# Pull comedy reviews
df_train_comedy = get_reviews("stanford_train",
                              "Comedy",
                              "/Users/dfinkel/Downloads/aclImdb/train/")
df_train_comedy['class'] = 1

# build training frame
df_train = pd.concat([df_train_drama, df_train_comedy])

# shuffle rows
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_train.head()

In [49]:
print len(df_train)
df_train = df_train.drop_duplicates('text')
print len(df_train)

19833
17448


## Prep for Classification

In [50]:
"""
Tokenize the comments and 
convert to numerical sequences
"""

# Tokenize comments
texts = df_train['text'].values

# Initialize tokenizer
# nb_words tells tokenizer to only keep MAX_NB_WORDS
# when texts_to_sequences is applied
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)

# encoding text tokens to sequences
tokenizer.fit_on_texts(texts)

# convert text to numerical arrays
sequences = tokenizer.texts_to_sequences(texts)

In [51]:
# count unique tokens in corpus
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 75170 unique tokens.


In [52]:
"""
Bookkeeping
"""
# convert comment labels to categorical tensor
labels = df_train['class']
labels = to_categorical(np.asarray(labels))

# zero-pad comments
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

('Shape of data tensor:', (17448, 150))
('Shape of label tensor:', (17448, 2))


In [53]:
# randomly sort data into train/test sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set ')
print y_train.sum(axis=0)
print y_val.sum(axis=0)

Number of positive and negative reviews in traing and validation set 
[8643. 5316.]
[2180. 1309.]


In [54]:
# import GLOVE word embeddings
# see: https://nlp.stanford.edu/projects/glove/
GLOVE_DIR = "/Users/dfinkel/proto_dev/data_science/glove_data/glove_files/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [55]:
"""
Create embedding layer 
using glove data
"""

# embedding matrix maps words onto vectors
# initialize with uniform random numbers (?)
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index 
        # will be uniform random numbers
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

Use model from https://arxiv.org/pdf/1404.2188.pdf

In [56]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """
    def __init__(self, k=1, axis=1, **kwargs):
        super(KMaxPooling, self).__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

        assert axis in [1,2],  'expected dimensions (samples, filters, convolved_values),\
                   cannot fold along samples dimension or axis not in list [1,2]'
        self.axis = axis

        # need to switch the axis with the last elemnet
        # to perform transpose for tok k elements since top_k works in last axis
        self.transpose_perm = [0,1,2] #default
        self.transpose_perm[self.axis] = 2
        self.transpose_perm[2] = self.axis

    def compute_output_shape(self, input_shape):
        input_shape_list = list(input_shape)
        input_shape_list[self.axis] = self.k
        return tuple(input_shape_list)

    def call(self, x):
        # swap sequence dimension to get top k elements along axis=1
        transposed_for_topk = tf.transpose(x, perm=self.transpose_perm)

        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(transposed_for_topk, k=self.k, sorted=True, name=None)[0]

        # return back to normal dimension but now sequence dimension has only k elements
        # performing another transpose will get the tensor back to its original shape
        # but will have k as its axis_1 size
        transposed_back = tf.transpose(top_k, perm=self.transpose_perm)

        return transposed_back


class Folding(Layer):

    def __init__(self, **kwargs):
        super(Folding, self).__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], int(input_shape[2]/2))

    def call(self, x):
        input_shape = x.get_shape().as_list()

        # split the tensor along dimension 2 into dimension_axis_size/2
        # which will give us 2 tensors
        splits = tf.split(x, num_or_size_splits=int(input_shape[2]/2), axis=2)

        # reduce sums of the pair of rows we have split onto
        reduce_sums = [tf.reduce_sum(split, axis=2) for split in splits]

        # stack them up along the same axis we have reduced
        row_reduced = tf.stack(reduce_sums, axis=2)
        return row_reduced

In [57]:
"""
model based on: https://arxiv.org/pdf/1404.2188.pdf
"""

# Initialize model
model_1 = Sequential()

# Add an embedding layer
model_1.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))

# zero bad 49 zeros to 
# both sides of comment
model_1.add(ZeroPadding1D((49,49)))

# 1D convolution
# 64 channels in output
# 50 values in convolution window
# zero pad output to perserve size
model_1.add(Conv1D(64, 50, padding="same"))

# Kmax pool
# Return k max values
# perserve order
model_1.add(KMaxPooling(k=5, axis=1))

# Activate w relu
model_1.add(Activation("relu"))

# zero pad 24 zeros to
# both sides of representation
model_1.add(ZeroPadding1D((24,24)))

# 1D convolution
# 64 channels in output
# 25 values in convolution window
# zero pad output to perserve size
model_1.add(Conv1D(64, 25, padding="same"))

# Fold output
#   - sum 2 channels together
#   - halves the channels
model_1.add(Folding())

# Kmax pool
# Return k max values
# perserve order
model_1.add(KMaxPooling(k=5, axis=1))

# Activate w relu
model_1.add(Activation("relu"))

# Flatten the data
model_1.add(Flatten())

# Connect the neural net
model_1.add(Dense(2, activation="softmax"))

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 100)          7517100   
_________________________________________________________________
zero_padding1d_3 (ZeroPaddin (None, 248, 100)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 248, 64)           320064    
_________________________________________________________________
k_max_pooling_3 (KMaxPooling (None, 5, 64)             0         
_________________________________________________________________
activation_3 (Activation)    (None, 5, 64)             0         
_________________________________________________________________
zero_padding1d_4 (ZeroPaddin (None, 53, 64)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 53, 64)            102464    
__________

In [58]:
model_1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model_1.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=128)

Train on 13959 samples, validate on 3489 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x116fc8c50>

In [71]:
X_ = df_train['text'].values
Y_ = df_train['class'].values
# train classifier
# non tf-idf pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', LogisticRegression(C=0.39810717055349731, class_weight=None)),])
pipeline.fit(X_, Y_)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        st...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [62]:
def most_informative_feature_for_binary_classification(pipeline, n=10):
    class_labels = pipeline.named_steps['clf'].classes_
    if 'feats' in pipeline.named_steps.keys():
        feature_names = pipeline.named_steps['feats'].get_feature_names()
    elif 'vect' in pipeline.named_steps.keys():
        feature_names = pipeline.named_steps['vect'].get_feature_names()
    topn_class1 = sorted(zip(pipeline.named_steps['clf'].coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(pipeline.named_steps['clf'].coef_[0], feature_names))[-n:]

#     for coef, feat in topn_class1:
#         print class_labels[0], coef, feat

#     print

#     for coef, feat in reversed(topn_class2):
#         print class_labels[1], coef, feat
    return topn_class1, topn_class2

def extract_features(pipeline):
    """
    extract the informative features
    from the classifier
    """
    # vect = pipeline.steps[0][1]
    # clf = pipeline.steps[2][1]
    topn_class1, topn_class2 = most_informative_feature_for_binary_classification(pipeline, n=200000)

    class1 = pd.DataFrame(data=topn_class1, columns=(('weight', 'word')))
    class2 = pd.DataFrame(data=topn_class2, columns=(('weight', 'word')))

    class1['ngram_size'] = class1['word'].apply(meas_length)
    class2['ngram_size'] = class2['word'].apply(meas_length)

    # name columns
    class1.columns = ['Weight', 'word', 'ngram_size']
    class2.columns = ['Weight', 'word', 'ngram_size']

    # break out unigrams, bigrams and trigrams
    unigrams = pd.concat([class1.loc[class1['ngram_size'] == 1], class2.loc[class2['ngram_size'] == 1]])
    bigrams = pd.concat([class1.loc[class1['ngram_size'] == 2], class2.loc[class2['ngram_size'] == 2]])
    trigrams = pd.concat([class1.loc[class1['ngram_size'] == 3], class2.loc[class2['ngram_size'] == 3]])
    return unigrams, bigrams, trigrams

# Extract the unigrams, bigrams and trigrams
# used by the trained classifier
unigrams, bigrams, trigrams = extract_features(pipeline)

TypeError: 'NoneType' object is not iterable

In [91]:
pipeline.predict(['bruce', 'cusack'])

array([0, 0])

In [79]:
print len(Y_), np.sum(Y_), len(Y_) - np.sum(Y_)

17448 6625 10823


In [82]:
class_labels = pipeline.named_steps['clf'].classes_
if 'feats' in pipeline.named_steps.keys():
    feature_names = pipeline.named_steps['feats'].get_feature_names()
elif 'vect' in pipeline.named_steps.keys():
    feature_names = pipeline.named_steps['vect'].get_feature_names()


In [84]:
n = 100
topn_class1 = sorted(zip(pipeline.named_steps['clf'].coef_[0], feature_names))[:n]
topn_class2 = sorted(zip(pipeline.named_steps['clf'].coef_[0], feature_names))[-n:]

In [86]:
topn_class1

[(-0.39825643274804623, u'cool'),
 (-0.39021804290406786, u'bruce'),
 (-0.3844036176860867, u'drama'),
 (-0.3609975417327202, u'score'),
 (-0.3605436219098733, u'there were'),
 (-0.3594451751334926, u'jerry'),
 (-0.3520874980464845, u'negative'),
 (-0.32477966303041717, u'johnny'),
 (-0.31719365946169253, u'william'),
 (-0.31371268870647956, u'camp'),
 (-0.3065744449726966, u'carrey'),
 (-0.3002654278718152, u'apart'),
 (-0.30005766001178674, u'sandler'),
 (-0.2954877416410513, u'70'),
 (-0.29204944550893064, u'minor'),
 (-0.28890971956636874, u'what he'),
 (-0.2885285049002655, u'very good'),
 (-0.2879559253586889, u'murders'),
 (-0.28771482200647164, u'editing'),
 (-0.28734658415890824, u'language'),
 (-0.28513352402933645, u'yourself'),
 (-0.2837746062065589, u'is one of'),
 (-0.28258459481739306, u'human'),
 (-0.282387640244754, u'it can'),
 (-0.28180395918058765, u'not worth'),
 (-0.2795779830834692, u'fantastic'),
 (-0.2782520972184833, u'comedies'),
 (-0.2743665040128819, u'bros

In [87]:
topn_class2

[(0.23910153207981566, u'decided'),
 (0.2393177980793017, u'her in'),
 (0.23993820465989013, u'top notch'),
 (0.23997217514598002, u'reed'),
 (0.2406227094961414, u'see this'),
 (0.2416905939566745, u'she has'),
 (0.24233253257266898, u'miyazaki'),
 (0.24285631064043098, u'really great'),
 (0.2432265892297749, u'where they'),
 (0.24366884113405304, u'blah'),
 (0.2442749522910675, u'zizek'),
 (0.24454422018457087, u'kiss'),
 (0.2445999719138599, u'summer'),
 (0.24476688538101588, u'experience'),
 (0.24484202407764408, u'particularly'),
 (0.24527655956522282, u'gielgud'),
 (0.2453002223911645, u'pure'),
 (0.2453594455310116, u'robert'),
 (0.24558650961868195, u'marie'),
 (0.24566471797539957, u'it bad'),
 (0.24704094942018745, u'but it'),
 (0.24744296789893255, u'fbi'),
 (0.24922163723206053, u'but this was'),
 (0.24939849032141448, u'cute'),
 (0.2502926632068763, u'the direction'),
 (0.2511370064504894, u'darius'),
 (0.2513535693741552, u'din'),
 (0.25156971212101426, u'underground'),
 

In [93]:
df_train.head().values

array([["This is without a shadow of a doubt the absolute worst movie Steven Seagal has ever made. And that says a lot. Don't get fooled by the rating, it's way too good. This abomination hadn't even been worthy of a 0/10 rating, if such a thing existed. <br /><br />- Absolutely no plot <br /><br />- Worst action scenes ever, and there aren't too many of them either <br /><br />- Seagal doesn't do anything himself, including the fighting, talking (lots of dubbing), and so on. As always. <br /><br />- Seagal is fat, lazy and couldn't care less about this movie. Something which is very obvious all the way through<br /><br />Take all the other garbage DTV movies Seagal has made, multiply them with each other, multiply this with a thousand billions, and all the badness you then get won't even describe 1 % of this absolute crapfest.",
        0],
       ["I was very displeased with this move. Everything was terrible from the start. The comedy was unhumorous, the action overdone, the songs u