In [19]:
import tensorflow as tf

In [30]:
import pandas as pd
import seaborn as sns
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split

## 1. load raw data

In [119]:
# df = pd.read_csv('data/moviereviews2.tsv',sep='\t')


In [121]:
# load from source
df = pd.read_csv('data/moviereviews2.tsv',sep='\t')
df.dropna(inplace=True)

In [122]:
# split train and test 
train_X, test_X, train_Y, test_Y = train_test_split(df['review'].tolist(),df['label'].tolist(), test_size=0.3,
                                                    random_state=42)

## 2. Preprocessing
drop stop word, invalid symbol, lowert characters, ...

In [123]:
import nltk 
import re
# nltk.down/lo/ad('stopwords')
#def normalize_corpus(doc):
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    try:
        doc = re.sub(r'[^a-zA-Z\s]','', doc, re.I|re.A)
    except Exception as e:
        print(doc)
        raise e
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc


normalize_corpus = np.vectorize(normalize_document) 
train_norm_corpus = normalize_corpus(train_X)   
test_norm_corpus = normalize_corpus(test_X)

## 3. Generate tensorflow dataset for training

In [124]:
from tensorflow.keras.preprocessing import text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(train_norm_corpus)

In [125]:
word2id = tokenizer.word_index
id2word = {v:k for k,v in word2id.items()}
vocab_size = len(word2id)
print('Vocabulary size:', vocab_size)

train_wids = [[word2id[word] for word in text.text_to_word_sequence(doc)] for doc in train_norm_corpus]
test_wids = [[word2id[word] for word in text.text_to_word_sequence(doc) if word in word2id.keys()] 
             for doc in test_norm_corpus]

Vocabulary size: 36184


In [126]:
def gen_train():
    i = 0
    curr_wids = train_wids
    curr_label = train_Y
    while i < len(wids):
        yield curr_wids[i], labels[i]
        i += 1

def gen_test():
    i = 0
    curr_wids = test_wids
    curr_label = test_Y
    while i < len(wids):
        yield curr_wids[i], labels[i]
        i += 1
        
train_dataset = tf.data.Dataset.from_generator(
    gen_train, 
    output_types=(tf.int32, tf.string), 
    output_shapes=( (None,), ())
)

test_dataset = tf.data.Dataset.from_generator(
    gen_test,
    output_types = (tf.int32, tf.string),
    output_shapes = ( (None,), ())
)

In [127]:
# convert label to 0 and 1 
def convert_label(x,y):
    if y == 'neg':
        return x, 0
    return x, 1
    
train_dataset = train_dataset.map(convert_label)
test_dataset = test_dataset.map(convert_label)
# set up batch size for generator 
padded_shapes = ([None],())
train_batches = train_dataset.shuffle(100).padded_batch(32, padded_shapes = padded_shapes)
test_batches = test_dataset.shuffle(100).padded_batch(32, padded_shapes = padded_shapes)

## 4. Build model
### 4.1 Build neural network (Basic DNN)

In [128]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras import layers
embedding_dim=8

model = keras.Sequential([
  layers.Embedding(vocab_size, embedding_dim),
  layers.GlobalAveragePooling1D(),
  layers.Dense(5, activation='relu'),
  layers.Dropout(0.8),
  layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 8)           289472    
_________________________________________________________________
global_average_pooling1d_6 ( (None, 8)                 0         
_________________________________________________________________
dense_24 (Dense)             (None, 5)                 45        
_________________________________________________________________
dropout_6 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 6         
Total params: 289,523
Trainable params: 289,523
Non-trainable params: 0
_________________________________________________________________


In [129]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit_generator(
    train_batches,
    epochs=50,
    validation_data=test_batches, 
    validation_steps=2
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### 4.2 Build Neural Network (RNN)

In [130]:

embedding_dim=64

model = keras.Sequential([
  layers.Embedding(vocab_size, embedding_dim),
  layers.Bidirectional(layers.LSTM(64)),
  layers.Dense(16, activation='relu'),
  layers.Dropout(0.5),
  layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit_generator(
    train_batches,
    epochs=50,
    validation_data=test_batches, 
    validation_steps=2
)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 64)          2315776   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_26 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_7 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 17        
Total params: 2,383,905
Trainable params: 2,383,905
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/5