# Text Sentiment Classification

In [None]:
import numpy as np
from tensorflow import keras
import os
import tarfile
from d2l.tensorflow.data import Vocab, batch_iter

##  Reading Data

In [None]:
# Directory the archive is unpacked into, and the path of the archive itself.
data_dir = '../data/'
fname = '../data/aclImdb_v1.tar.gz'

# Extract every member of the archive under `data_dir`; the context manager
# closes the archive once extraction has finished.
with tarfile.open(name=fname, mode='r') as archive:
    archive.extractall(path=data_dir)

In [None]:
def read_imdb(folder='train'):
    """Load the review texts and labels stored under one dataset folder.

    Reads every file in ``<data_dir>/aclImdb/<folder>/pos`` and then in
    ``<data_dir>/aclImdb/<folder>/neg``. Each file is decoded as UTF-8 and
    has its newline characters removed.

    Args:
        folder: Name of the sub-directory of ``aclImdb`` to read.

    Returns:
        A ``(data, labels)`` pair of parallel lists: the review strings and
        their labels, 1 for files under ``pos`` and 0 for files under ``neg``.
    """
    data = []
    labels = []
    # Each sub-directory name is paired with the integer label of its files.
    for target, label in ((1, 'pos'), (0, 'neg')):
        folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
        for entry in os.listdir(folder_name):
            path = os.path.join(folder_name, entry)
            with open(path, 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8')
            data.append(text.replace('\n', ''))
            labels.append(target)
    return data, labels

# Load both splits; each result is a (reviews, labels) pair of parallel lists.
train_data = read_imdb('train')
test_data = read_imdb('test')
print('# trainings:', len(train_data[0]), '\n# tests:', len(test_data[0]))

# Preview the label and the first 60 characters of the first three
# training reviews.
for review, label in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', label, 'review:', review[0:60])

## Tokenization and Vocabulary 



In [None]:
def tokenize(sentences):
    """Split every sentence into tokens on single space characters.

    Consecutive spaces yield empty-string tokens, because the split is on
    the literal ``' '`` separator rather than on arbitrary whitespace.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list with one list of tokens per input sentence, in input order.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(sentence.split(' '))
    return tokenized

# Tokenize the review texts of both splits.
train_tokens, test_tokens = tokenize(train_data[0]), tokenize(test_data[0])

# The vocabulary is built from the flattened training tokens only.
# NOTE(review): min_freq=5 presumably drops tokens seen fewer than five
# times -- confirm against the Vocab implementation.
vocab = Vocab(
    [token for review_tokens in train_tokens for token in review_tokens],
    min_freq=5,
)

## Padding to the Same Length


In [None]:
# Every sequence is truncated or padded to exactly this many items.
max_len = 500


def pad(x):
    """Return ``x`` cut or extended to exactly ``max_len`` items.

    Args:
        x: List of items (token indices in this notebook).

    Returns:
        The first ``max_len`` items when ``x`` is longer than ``max_len``;
        otherwise ``x`` followed by enough copies of ``vocab.unk`` to reach
        ``max_len`` items.
    """
    shortfall = max_len - len(x)
    if shortfall < 0:
        return x[:max_len]
    return x + [vocab.unk] * shortfall
    
# Convert each tokenized review to vocabulary indices, fix its length with
# `pad`, and stack the resulting rows into one array per split.
train_features = np.array(
    list(map(pad, (vocab[tokens] for tokens in train_tokens)))
)
test_features = np.array(
    list(map(pad, (vocab[tokens] for tokens in test_tokens)))
)

### Create Data Iterator

In [None]:
# Number of examples per mini-batch.
batch_size = 64

# Batch iterators over (features, labels) for each split.
train_iter, test_iter = (
    batch_iter(train_features, train_data[1], batch_size),
    batch_iter(test_features, test_data[1], batch_size),
)

# Count of whole batches in each split; floor division leaves out any
# trailing partial batch.
train_steps, test_steps = (
    len(train_features) // batch_size,
    len(test_features) // batch_size,
)

In [None]:
# Pull a single batch from the training iterator and report its shapes;
# the loop stops after the first batch.
for features, targets in train_iter:
    print('X', features.shape, 'y', targets.shape)
    break

## Use a Bidirectional LSTM Model

In [None]:
# Model hyperparameters.
vocab_size = len(vocab)  # number of rows in the embedding table
embed_size = 50          # width of each embedding vector
num_hiddens = 100        # units argument passed to every LSTM
num_layers = 2           # number of stacked bidirectional LSTM layers

In [None]:
# Layers are created in stacking order: an embedding, `num_layers`
# bidirectional LSTMs, and a 2-way softmax output.
layers = [keras.layers.Embedding(vocab_size, embed_size)]

# All LSTM layers except the last return the full sequence so that the next
# recurrent layer receives one input per time step.
layers.extend(
    keras.layers.Bidirectional(
        keras.layers.LSTM(num_hiddens, return_sequences=True)
    )
    for _ in range(num_layers - 1)
)
layers.append(keras.layers.Bidirectional(keras.layers.LSTM(num_hiddens)))
layers.append(keras.layers.Dense(2, activation='softmax'))

model = keras.models.Sequential()
for layer in layers:
    model.add(layer)

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

## Load Pre-trained Word Vectors


In [None]:
# Parse the GloVe text file into a {word: vector} mapping. Each line holds a
# word followed by its space-separated coefficients.
embedding_vector = {}

# The encoding is passed explicitly because GloVe files are distributed as
# UTF-8 text; relying on the platform default encoding can fail or
# mis-decode words on systems whose default is not UTF-8.
with open('../data/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Drop the line terminator so the last coefficient is a clean number.
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype=np.float32)
        embedding_vector[word] = coef

# One row per vocabulary index. Rows for words that have no GloVe entry are
# left as zeros.
embedding_matrix = np.zeros((vocab_size, embed_size))

for word, i in vocab.token_to_idx.items():
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

In [None]:
# Display the matrix dimensions; it was allocated as (vocab_size, embed_size).
embedding_matrix.shape

In [None]:
# Display the first two rows of the first weight of layer 0 (the Embedding
# layer, which was added to the model first).
model.layers[0].weights[0][0:2]

In [None]:
# Overwrite the weights of layer 0 (the Embedding layer) with embedding_matrix.
model.layers[0].set_weights([embedding_matrix])

In [None]:
# Display the first two rows of layer 0's first weight again, now that
# set_weights has been called with embedding_matrix.
model.layers[0].weights[0][0:2]

## Train and Evaluate the Model



In [None]:
# Number of training epochs.
num_epochs = 5

# Train on the training iterator and validate on the test iterator. The
# step counts say how many batches to draw from each iterator per epoch.
# `epochs` uses `num_epochs` rather than a separate literal, so changing the
# variable above actually changes the training length.
model.fit(
    train_iter, steps_per_epoch=train_steps,
    validation_data=test_iter, validation_steps=test_steps,
    epochs=num_epochs,
)

## Predict Sentences

In [None]:
def predict_sentiment(net, vocab, sentence):
    """Classify the sentiment of a single sentence.

    Args:
        net: Callable that takes a ``(1, num_tokens)`` array of token
            indices and returns per-class scores with the classes on axis 1.
        vocab: Object that maps a list of tokens to a list of indices when
            indexed.
        sentence: Text to classify; it is split on whitespace.

    Returns:
        ``'positive'`` when the highest-scoring class is 1, otherwise
        ``'negative'``.
    """
    token_ids = np.array(vocab[sentence.split()])
    # Add a leading batch dimension of size one.
    batch = token_ids.reshape((1, -1))
    scores = net(batch)
    label = np.argmax(scores, axis=1)
    if label == 1:
        return 'positive'
    return 'negative'

In [None]:
# The trained network is bound to `model`; no `net` is defined in this notebook.
predict_sentiment(model, vocab, 'this movie is so great')

In [None]:
# The trained network is bound to `model`; no `net` is defined in this notebook.
predict_sentiment(model, vocab, 'this movie is so bad')