# Import Packages

In [1]:
# For Data Preparation
import tensorflow as tf
import numpy as np
import pandas as pd
import re # regular expressions


# To clean up texts
import nltk.data
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('punkt')
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')


# For Word Embedding
from collections import Counter
import gensim
import gensim.models as g
from gensim.models import Word2Vec
from gensim.models import Phrases

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt


# For the Model
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.layers import LSTM, Bidirectional,Dropout, Input, SpatialDropout1D, CuDNNLSTM, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences



from mlxtend.plotting import plot_learning_curves
import matplotlib.pyplot as plt
from mlxtend.data import iris_data
from mlxtend.preprocessing import shuffle_arrays_unison

import logging

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


import tensorflowjs as tfjs


  from ._conv import register_converters as _register_converters


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eunbeejang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eunbeejang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


# Data Preparation

In [28]:
# Load data
IMDB_train = pd.read_csv('./IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[:][1]
IMDB_valid = pd.read_csv('./IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[:][1]
IMDB_test = pd.read_csv('./IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[:][1]
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print("Data loaded.")

Data loaded.


In [29]:
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
IMDB_train = pd.concat(frames)
IMDB_train_y = pd.concat(frames_y)

In [8]:
#IMDB_train = IMDB_train[:][0]
#IMDB_test = IMDB_test[:][0]

In [47]:
IMDB_train[:10]

Unnamed: 0,0,1
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
5,I saw the movie with two grown children. Altho...,1
6,You're using the IMDb.<br /><br />You've given...,1
7,This was a good film with a powerful message o...,1
8,"Made after QUARTET was, TRIO continued the qua...",1
9,"For a mature man, to admit that he shed a tear...",1


In [30]:
def preprocessing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        #clean = re.compile('<.*?>')
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data

In [31]:
IMDB_train = preprocessing(IMDB_train)
IMDB_test = preprocessing(IMDB_test)

In [32]:
# Convert a sentence into a list of words
def sentence_to_wordlist(sentence, remove_stopwords=False):
    # Convert words to lower case and split them
    words = sentence.lower().split()
    # Lemmatizing
    #words = [lemmatizer.lemmatize(word) for word in words]
    # 6. Return a list of words
    return(words)

In [33]:
# whole data into a list of sentences where each sentence is a list of word items
def list_of_sentences(data):
    sentences = []
    for i in data:
        sentences.append(sentence_to_wordlist(i))
    return sentences

In [34]:
train_x = list_of_sentences(IMDB_train)
train_y = IMDB_train_y.tolist()

In [9]:
len(train_x)

25000

# Word Embedding

In [35]:
# Create Word Vectors

wv_model = Word2Vec(size=128, window=5, min_count=4, workers=4)

wv_model.build_vocab(train_x) 
wv_model.train(train_x, total_examples=wv_model.corpus_count, epochs=wv_model.iter)
word_vectors = wv_model.wv
words = list(wv_model.wv.vocab)

# Calling init_sims will make the model will be better for memory
# if we don't want to train the model over and over again
wv_model.init_sims(replace=True)

#n_words = print(len(words))

print("Number of word vectors: {}".format(len(word_vectors.vocab)))

# save model
wv_model.wv.save_word2vec_format('model.txt', binary=False)

# load model
#new_model = Word2Vec.load('model.bin')



  


Number of word vectors: 35674


In [36]:
import gensim
# Need the interactive Tools for Matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [37]:
from gensim.models.keyedvectors import KeyedVectors

new_model = KeyedVectors.load_word2vec_format('model.txt')
#model.save_word2vec_format('model.txt', binary=False)

In [55]:
#from gensim.scripts.glove2word2vec import glove2word2vec
#glove2word2vec(glove_input_file="model.txt", word2vec_output_file="model.word2vec.txt")
#glove2word2vec(glove_input_file="out_vec.txt", word2vec_output_file="retrofittedglove.word2vec.txt")

(35675, 1)

In [57]:
original_model = KeyedVectors.load_word2vec_format('model.txt')
retrofitted_model = KeyedVectors.load_word2vec_format('out_vec.txt')

In [81]:
new_words = list(retrofitted_model.wv.vocab)

  """Entry point for launching an IPython kernel.


In [82]:
# Build dictionary & inv_vocab

def create_vocab(data_collect, max_vocab):
    # Get raw data
    x_list = data_collect
    sample_count = sum([len(x) for x in x_list])
    words = []
    for data in x_list:
        words.extend([data])
    count = Counter(words) # word count
    inv_vocab = [x[0] for x in count.most_common(max_vocab)]
    vocab = {x: i for i, x in enumerate(inv_vocab, 1)}
    return vocab, inv_vocab

In [83]:
vocab, inv_vocab = create_vocab(words, len(words))
ret_vocab, ret_inv_vocab = create_vocab(new_words, len(new_words))

In [84]:
len(inv_vocab)

35674

In [85]:
# Find the max length sentence
def find_max_length_sentence(sentence):
    max_length = 0
    for i in sentence:
        length = len(sentence_to_wordlist(i))
        if max_length < length:
            max_length = length
    return max_length

In [86]:
seq_length = find_max_length_sentence(IMDB_train)
print(seq_length)

2450


In [87]:
# Map each word to corresponding vector
def map_to_vec(word):
    vec = wv_model[word]
    return vec

#map_to_vec('care')

In [88]:
# Embedding Matrix
def make_emb_matrix(inv_vocab):
    emb_matrix = []
    for word in inv_vocab:
        emb_matrix.append(map_to_vec(word))
    return emb_matrix

In [89]:
embedding = np.asarray(make_emb_matrix(inv_vocab))
ret_embedding = np.asarray(make_emb_matrix(ret_inv_vocab))


  This is separate from the ipykernel package so we can avoid doing imports until


# Initialize Word Embeddings in Keras

In [96]:
wv_dim = 100
num_words = len(word_vectors.vocab)
vocab = Counter(words)
ret_vocab = Counter(new_words)

In [91]:

word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(num_words-1))}

train_sequences = [[word_index.get(t, 0) for t in sentence]
             for sentence in IMDB_train[:len(IMDB_train)]]

test_sequences = [[word_index.get(t, 0)
                   for t in sentence] for sentence in IMDB_test[:len(IMDB_test)]]

# Pad zeros to match the size of matrix
train_data = pad_sequences(train_sequences, maxlen=seq_length, padding="post", truncating="post")
test_data = pad_sequences(test_sequences, maxlen=seq_length, padding="post", truncating="post")


In [92]:
# Initialize the matrix with random numbers
wv_matrix = (np.random.rand(num_words, wv_dim) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= num_words:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass


In [97]:

ret_word_index = {t[0]: i+1 for i,t in enumerate(ret_vocab.most_common(num_words-1))}

ret_train_sequences = [[ret_word_index.get(t, 0) for t in sentence]
             for sentence in IMDB_train[:len(IMDB_train)]]

ret_test_sequences = [[ret_word_index.get(t, 0)
                   for t in sentence] for sentence in IMDB_test[:len(IMDB_test)]]

# Pad zeros to match the size of matrix
ret_train_data = pad_sequences(ret_train_sequences, maxlen=seq_length, padding="post", truncating="post")
ret_test_data = pad_sequences(ret_test_sequences, maxlen=seq_length, padding="post", truncating="post")


In [98]:
# Initialize the matrix with random numbers
ret_wv_matrix = (np.random.rand(num_words, wv_dim) - 0.5) / 5.0
for word, i in ret_word_index.items():
    if i >= num_words:
        continue
    try:
        ret_embedding_vector = ret_word_vectors[word]
        # words not found in embedding index will be all-zeros.
        ret_wv_matrix[i] = ret_embedding_vector
    except:
        pass


#  Model

In [None]:
# Embedding
wv_layer = Embedding(num_words,
                     wv_dim,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=seq_length,
                     trainable=False)

ret_wv_layer = Embedding(num_words,
                     wv_dim,
                     mask_zero=False,
                     weights=[ret_wv_matrix],
                     input_length=seq_length,
                     trainable=False)

# Inputs
comment_input = Input(shape=(seq_length,), dtype='int64')
embedded_sequences = wv_layer(comment_input) # regular word2vec
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)


#ret_embedded_sequences = ret_wv_layer(comment_input)# retrofitted word2vec


# LSTM
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

# Output
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(1, activation='sigmoid')(x)

# build the model
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',   #binary_crossentropy
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=['accuracy'])

print(model.summary())

hist = model.fit(train_data, IMDB_train_y, validation_data=(test_data, IMDB_test_y), epochs=15, batch_size=32)