# Sentiment Classification

## Import Dependencies

In [0]:
import re

In [0]:
import numpy as np
import pandas as pd

In [0]:
import tensorflow as tf

In [0]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
from keras.layers import Input, Dense, Embedding, GRU, Bidirectional, Layer, TimeDistributed
from keras.models import Model
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [0]:
from keras.utils import plot_model

## Pretrained Embeddings (GloVe)

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-03-17 03:02:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-03-17 03:02:34--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-03-17 03:04:13 (8.42 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [0]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
# Load the 300-dimensional GloVe vectors into a word -> float32 vector lookup.
embeddings_index = {}
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("glove.6B.300d.txt", "r", encoding="utf-8") as embeddings_file:
    for embedding_line in embeddings_file:
        # Each line is "<word> <v1> <v2> ..."; drop the trailing newline so the
        # last component is a clean number.
        embedding = embedding_line.rstrip("\n").split(" ")
        word = embedding[0]
        embeddings_index[word] = np.asarray(embedding[1:], dtype="float32")

## Main Function

## Hyperparameters

In [0]:
# Tunable settings shared by preprocessing, model construction and training.
max_words = 20000  # vocabulary cap passed to the Keras Tokenizer (num_words)
max_words_per_sentence = 20  # every sentence is padded/truncated to this many word ids
max_sentences_per_doc = 20  # every document is padded/truncated to this many sentences
embedding_size = 300  # width of the embedding matrix; matches the glove.6B.300d.txt vectors
batch_size = 128  # used for fit, evaluate and predict
learning_rate = 0.001  # passed to the Adam optimizer
total_epoch = 5  # upper bound on training epochs; early stopping may end training sooner

## Preprocess Data

In [0]:
# English stop words from NLTK, stored as a set for fast membership tests in parse_text.
stop_words = set(stopwords.words("english"))

In [0]:
def _clean_document(doc):
    """Normalise one raw document and drop stop words, returning a single string."""
    doc = doc.lower().strip()
    # Surround sentence punctuation with spaces so it becomes its own token.
    doc = re.sub(r"([?.!,¿])", r" \1 ", doc)
    # Collapse runs of spaces (and double quotes) into a single space.
    doc = re.sub(r'[" "]+', " ", doc)
    # Replace anything that is not a letter or kept punctuation with a space.
    doc = re.sub(r"[^a-zA-Z?.!,¿]+", " ", doc)
    doc = doc.strip()
    return " ".join(word for word in doc.split(" ") if word not in stop_words)


def parse_text(data, max_words, max_words_per_sentence, max_sentences_per_doc, tokenizer=None):
    """Turn raw documents into a fixed-size 3-D array of word ids.

    Args:
        data: pandas Series of raw text documents (strings).
        max_words: vocabulary cap used when a new tokenizer is created.
        max_words_per_sentence: sentences are padded/truncated (at the end)
            to this many word ids.
        max_sentences_per_doc: documents are padded/truncated (at the end)
            to this many sentences.
        tokenizer: an already fitted Keras ``Tokenizer`` to reuse. When
            ``None``, a new one is created and fitted on ``data``.

    Returns:
        A tuple ``(words, tokenizer)``. ``words`` is an int32 array of shape
        ``(len(data), max_sentences_per_doc, max_words_per_sentence)`` where
        0 marks padding. ``tokenizer`` is the tokenizer that was used.
    """
    words = data.apply(_clean_document)

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_words, oov_token="<UNK>")
        tokenizer.fit_on_texts(words)

    # Fill a preallocated array document by document. Routing the ragged
    # output of texts_to_sequences through np.apply_along_axis depends on
    # NumPy building an object array from uneven-length lists; that fails on
    # recent NumPy versions and whenever the first column's sequences all
    # share one length (always true for a single document).
    encoded = np.zeros(
        (len(words), max_sentences_per_doc, max_words_per_sentence),
        dtype="int32"
    )
    for doc_index, doc in enumerate(words):
        # Keep only the leading sentences (equivalent to truncating="post").
        sentences = sent_tokenize(doc)[:max_sentences_per_doc]
        if not sentences:
            # Empty document: the row stays all padding.
            continue
        sequences = tokenizer.texts_to_sequences(sentences)
        encoded[doc_index, :len(sentences)] = pad_sequences(
            sequences,
            maxlen=max_words_per_sentence,
            padding="post",
            truncating="post"
        )

    return encoded, tokenizer

## Load the Data

In [0]:
# Read the train/validation/test splits from CSV files under data/.
# Later cells read the "text" column from all three, "stars" from train and
# validation, and "review_id" from test.
train_data = pd.read_csv("data/train.csv")
valid_data = pd.read_csv("data/valid.csv")
test_data = pd.read_csv("data/test.csv")

In [0]:
# Shift star ratings down by one to get zero-based class ids, then one-hot
# encode them into 5 classes.
# NOTE(review): presumably "stars" ranges over 1-5 — confirm against the data.
train_stars = train_data["stars"].apply(int) - 1
train_label = to_categorical(train_stars, num_classes=5)
valid_stars = valid_data["stars"].apply(int) - 1
valid_label = to_categorical(valid_stars, num_classes=5)

In [0]:
# The first call creates and fits the tokenizer on the training text only; the
# validation and test splits reuse it so all three share the same word ids.
train_words, tokenizer = parse_text(train_data["text"], max_words, max_words_per_sentence, max_sentences_per_doc)
valid_words, _ = parse_text(valid_data["text"], max_words, max_words_per_sentence, max_sentences_per_doc, tokenizer=tokenizer)
test_words, _ = parse_text(test_data["text"], max_words, max_words_per_sentence, max_sentences_per_doc, tokenizer=tokenizer)

In [0]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Rows for words missing from GloVe (and the extra row 0,
# presumably the padding id) stay all zeros.
embedding_weights = np.zeros((len(tokenizer.word_index) + 1, embedding_size))
for word, word_id in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        continue
    embedding_weights[word_id] = glove_vector

## Attention Layer

In [0]:
class Attention(Layer):
    """Attention pooling over axis 1 using a learned context vector.

    The layer is called on a pair ``[gru, hidden_hidden]``:

    * ``hidden_hidden`` is scored against the context vector, and the scores
      are normalised with a softmax over axis 1.
    * ``gru`` is summed over axis 1, weighted by those normalised scores.

    The output has shape ``(batch, gru.shape[2])``.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, trainable, ...). Without
        # this the layer cannot be rebuilt from its config, because
        # from_config calls cls(**config); that breaks reloading a saved model.
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is [gru_shape, hidden_hidden_shape]; the context vector
        # matches the last dimension of hidden_hidden.
        self.context_vector = self.add_weight(
            name="context_vector",
            shape=(input_shape[1][2], 1),
            initializer="uniform",
            trainable=True
        )
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        gru, hidden_hidden = inputs

        # One score per position along axis 1: (batch, steps, 1).
        attention = K.dot(hidden_hidden, self.context_vector)

        # Normalise across positions so the weights of each sample sum to 1.
        attention = K.softmax(attention, axis=1)

        # Broadcast the weights over the feature axis and pool over positions.
        return K.sum(attention * gru, axis=1)

    def compute_output_shape(self, input_shape):
        # (batch, features of the gru input); the axis-1 dimension is pooled away.
        return (input_shape[0][0], input_shape[0][2])

## Model

In [0]:
# Document-level input: max_sentences_per_doc rows of max_words_per_sentence word ids each.
inputs = Input(shape=(max_sentences_per_doc, max_words_per_sentence))

In [0]:
# Sentence-level input: a single sentence as max_words_per_sentence word ids.
words_input = Input(shape=(max_words_per_sentence, ))

In [0]:
# Embedding lookup initialised from embedding_weights; trainable=False keeps
# those vectors fixed during training.
embeddings = Embedding(
    len(tokenizer.word_index) + 1,
    embedding_size,
    input_length=max_words_per_sentence,
    weights=[embedding_weights],
    trainable=False
)(words_input)

Instructions for updating:
Colocations handled automatically by placer.


In [0]:
# Bidirectional GRU (50 units per direction) over the word embeddings.
# return_sequences=True keeps one output per word for the attention layer.
gru = Bidirectional(GRU(
    50,
    activation="tanh",
    recurrent_activation="sigmoid",
    return_sequences=True
))(embeddings)

In [0]:
# Per-word tanh projection of the GRU outputs; the attention layer scores it
# against its context vector.
hidden_hidden = Dense(100, activation="tanh")(gru)

In [0]:
# Word-level attention layer instance.
attention_layer = Attention()

In [0]:
# Pool the per-word GRU outputs into one vector per sentence, weighted by
# attention scores computed from hidden_hidden.
attention_output = attention_layer([gru, hidden_hidden])

In [0]:
# Sentence encoder: maps one sentence of word ids to a single sentence vector.
words_attention_model = Model(inputs=words_input, outputs=attention_output)

In [0]:
# Apply the sentence encoder to every sentence of the document input.
sentence_embeddings = TimeDistributed(words_attention_model)(inputs)

In [0]:
# Bidirectional GRU (50 units per direction) over the sentence vectors.
# return_sequences=True keeps one output per sentence for the attention layer.
sentence_gru = Bidirectional(GRU(
    50,
    activation="tanh",
    recurrent_activation="sigmoid",
    return_sequences=True
))(sentence_embeddings)

In [0]:
# Per-sentence tanh projection of the GRU outputs, used for attention scoring.
sentence_hidden_hidden = Dense(100, activation="tanh")(sentence_gru)

In [0]:
# Sentence-level attention layer instance (separate weights from the word-level one).
sentence_attention_layer = Attention()

In [0]:
# Pool the per-sentence GRU outputs into a single document vector.
sentence_attention_output = sentence_attention_layer([sentence_gru, sentence_hidden_hidden])

In [0]:
# 5-way softmax over the document vector, one unit per one-hot label class.
sentiment = Dense(5, activation="softmax")(sentence_attention_output)

In [0]:
# Full document classifier: padded word ids in, class probabilities out.
model = Model(inputs=inputs, outputs=sentiment)

In [0]:
# Layer-by-layer overview of the sentence encoder.
words_attention_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 300)      23076000    input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20, 100)      105300      embedding_1[0][0]                
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 20, 100)      10100       bidirectional_1[0][0]            
__________________________________________________________________________________________________
attention_

In [0]:
# Render the sentence encoder graph. The original call plotted the full
# document model into this file, duplicating model.png.
plot_model(words_attention_model, "words_attention_model.png", show_shapes=True)

In [0]:
# Layer-by-layer overview of the full document model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20, 20)       0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 20, 100)      23191500    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 20, 100)      45300       time_distributed_1[0][0]         
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 20, 100)      10100       bidirectional_2[0][0]            
__________________________________________________________________________________________________
attention_

In [0]:
# Render the full document model graph, including tensor shapes, to model.png.
plot_model(model, "model.png", show_shapes=True)

In [0]:
# Adam optimizer using the learning rate from the hyperparameter cell.
optimizer = Adam(lr=learning_rate)

In [0]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as a metric.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

## Train the Model

In [0]:
# Train for at most total_epoch epochs. Early stopping monitors the validation
# loss with patience=1 and restores the best weights seen.
model.fit(
    train_words,
    train_label,
    epochs=total_epoch,
    batch_size=batch_size,
    callbacks=[EarlyStopping(monitor="val_loss", patience=1, restore_best_weights=True)],
    validation_data=(valid_words, valid_label)
)

Instructions for updating:
Use tf.cast instead.
Train on 100000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe69c37fc18>

## Save the Model

In [0]:
# Destination file for the trained model (HDF5, relative to the working directory).
filepath = "model.h5"

In [0]:
# Persist the trained model to filepath.
model.save(filepath)

## Evaluate the Model

In [0]:
# Loss and accuracy of the trained model on the training split.
train_score = model.evaluate(train_words, train_label, batch_size=batch_size)
print('Training Loss: {}\n Training Accuracy: {}\n'.format(*train_score[:2]))

Training Loss: 0.6621654364776611
 Training Accuracy: 0.72103



In [0]:
# Loss and accuracy of the trained model on the validation split.
valid_score = model.evaluate(valid_words, valid_label, batch_size=batch_size)
print('Validation Loss: {}\n Validation Accuracy: {}\n'.format(*valid_score[:2]))

Validation Loss: 0.7651836580276489
 Validation Accuracy: 0.6838



## Predict and Save the Result

In [0]:
# Predict a class per test review. Class ids are zero-based, so add 1 to map
# them back to star ratings, then write review_id/prediction pairs to pre.csv.
class_probabilities = model.predict(test_words, batch_size=batch_size)
test_pre = class_probabilities.argmax(axis=-1) + 1
sub_df = pd.DataFrame({"review_id": test_data["review_id"], "pre": test_pre})
sub_df.to_csv("pre.csv", index=False)