In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# GET TRAIN, VAL AND TEST DATA #

def build_data(train_data, test_data, phrase_col=2, label_col=3):
    """Extract phrase and label arrays from the raw train/test DataFrames.

    Columns are selected by position, not by name.

    Args:
        train_data: DataFrame holding the training rows; must contain the
            columns at positions ``phrase_col`` and ``label_col``.
        test_data: DataFrame holding the test rows; must contain the column
            at position ``phrase_col``.
        phrase_col: Positional index of the phrase text column (default 2).
        label_col: Positional index of the label column in ``train_data``
            (default 3).

    Returns:
        Tuple ``(train_phrases, train_labels, test_phrases)`` of 1-D numpy
        arrays, one entry per input row.

    Side effects:
        Prints the shapes of the returned arrays.
    """
    # Select whole columns at once instead of copying every row in a Python
    # loop. This also keeps working for frames with zero rows, where the
    # row-by-row version produced a 1-D empty array and then failed on [:, 2].
    train_phrases = train_data.iloc[:, phrase_col].to_numpy()
    train_labels = train_data.iloc[:, label_col].to_numpy()
    test_phrases = test_data.iloc[:, phrase_col].to_numpy()

    print(train_phrases.shape, train_labels.shape)
    print(test_phrases.shape)

    return train_phrases, train_labels, test_phrases
     


def get_label_distribution(train_labels):
    """Print and plot how many training examples carry each label.

    Args:
        train_labels: Iterable of labels whose ``str()`` form is one of
            ``'0'`` .. ``'4'``.

    Raises:
        KeyError: If a label's string form is not one of ``'0'`` .. ``'4'``.

    Side effects:
        Prints the number of labels and the per-class counts, and draws a
        bar chart of the counts on a new 12x8 matplotlib figure.
    """
    sns.set(color_codes=True)

    print(len(train_labels))

    # Seed every class with zero so classes without examples still get a bar.
    dist_dict = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
    for label in train_labels:
        dist_dict[str(label)] += 1

    print(dist_dict)

    x = list(dist_dict.keys())
    y = list(dist_dict.values())

    fig, ax = plt.subplots(figsize=(12, 8))
    # x/y must be passed by keyword: seaborn >= 0.12 rejects them as
    # positional arguments. Passing ax draws on the figure created above.
    sns.barplot(x=x, y=y, ax=ax)
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    

# PRE-PROCESSING TEXT #

def get_vocab(train_phrases):
    """Map every distinct lower-cased token in the phrases to an integer index.

    Phrases are split on whitespace. Tokens that exactly match one of the
    punctuation strings below are dropped (the check runs before
    lower-casing). A single-space entry ``' '`` is always added. Indices
    follow the sorted order of the unique tokens.

    Args:
        train_phrases: Iterable of phrase strings.

    Returns:
        dict mapping token -> index.
    """
    ignored = [".", ",", ";", "!", "?", "#", "&", "$", "''", "'", "!?"]  # found after seeing vocab words

    tokens = {
        token.lower()
        for phrase in train_phrases
        for token in phrase.split()
        if token not in ignored
    }
    tokens.add(' ')

    # Sorting makes the token -> index assignment deterministic.
    return {token: position for position, token in enumerate(sorted(tokens))}



def word2index(string, vocab):
    """Encode a phrase as the array of vocabulary indices of its tokens.

    The phrase is split on whitespace. Tokens that exactly match one of the
    punctuation strings below are skipped; every other token is lower-cased
    and looked up in ``vocab``.

    Args:
        string: Phrase to encode.
        vocab: dict mapping lower-cased token -> index.

    Returns:
        numpy array with one index per kept token (empty if none are kept).

    Raises:
        KeyError: If a kept token is missing from ``vocab``.
    """
    skipped = [".", ",", ";", "!", "?", "#", "&", "$", "''", "'", "!?"]  # found after seeing vocab words
    indices = [vocab[token.lower()] for token in string.split() if token not in skipped]
    return np.asarray(indices)


def pad_phrase(phrase, max_len):
    """Left-pad an encoded phrase with zeros so it has ``max_len`` entries.

    Args:
        phrase: 1-D sequence of token indices.
        max_len: Target length; must be at least ``len(phrase)``, otherwise
            ``np.zeros`` raises ``ValueError`` for the negative size.

    Returns:
        numpy array of length ``max_len``: the zero padding followed by the
        entries of ``phrase``.
    """
    padding = np.zeros(max_len - len(phrase))
    return np.concatenate((padding, phrase))



# Read the zipped, tab-separated train and test tables.
train_path = '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip'
test_path = '../input/sentiment-analysis-on-movie-reviews/test.tsv.zip'
train_data, test_data = [pd.read_csv(path, sep="\t") for path in (train_path, test_path)]

train_phrases, train_labels, test_phrases = build_data(train_data, test_data)

get_label_distribution(train_labels)

# The vocabulary is built from train AND test phrases, so every test token
# has an index when it is encoded later.
all_phrases = [*train_phrases, *test_phrases]

vocab = get_vocab(all_phrases)
print(f"\n{len(vocab)} unique words in vocabulary!\n")

# Encode every training phrase as a fixed-length vector of vocabulary indices,
# left-padded with zeros up to max_len.
train_encoded = []
max_len = 55
n_truncated = 0

for phrase in train_phrases:
    indices = word2index(phrase, vocab)  # encode once and reuse
    if len(indices) > max_len:
        # Never drop a row: train_labels keeps one label per phrase, so a
        # skipped phrase would leave features and labels out of step.
        # Keep the last max_len tokens instead.
        indices = indices[-max_len:]
        n_truncated += 1
    train_encoded.append(pad_phrase(indices, max_len))

if n_truncated:
    print('PHRASE TOO BIG!', n_truncated, 'phrase(s) truncated to', max_len, 'tokens')

train_encoded = np.asarray(train_encoded)
assert len(train_encoded) == len(train_labels), "features and labels must stay aligned"
print(train_phrases.shape, train_encoded.shape)
print(train_phrases[0], train_encoded[0], train_labels[0])

print()

# Show a small sample only; printing the whole vocabulary floods the output.
print(dict(list(vocab.items())[:20]))



In [None]:
# Import from tensorflow.keras, matching the `from tensorflow import keras`
# and `from tensorflow.keras import layers` imports used elsewhere in this
# notebook; mixing in the standalone `keras` package can yield incompatible
# optimizer/model objects.
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical

# Model / training hyper-parameters.
vocab_size = len(vocab)
embedding_dim = 100
rnn_units = 32
batch_size = 32

# NOTE(review): build_model is not defined in any cell visible in this
# notebook, so a fresh "Restart & Run All" fails here with NameError unless
# it is defined or imported elsewhere - TODO confirm and restore its definition.
model = build_model(batch_size, vocab_size, embedding_dim, rnn_units)
model.summary()

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opt = SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])

# One-hot encode the integer labels to match the categorical cross-entropy
# loss; the last 20% of the rows are held out for validation.
history = model.fit(
    train_encoded,
    to_categorical(train_labels),
    batch_size=batch_size,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Plot training loss (red) against validation loss (blue), one point per epoch.
losses = history.history
epoch_count = range(1, len(losses['loss']) + 1)

for metric, line_style in (('loss', 'r-'), ('val_loss', 'b-')):
    plt.plot(epoch_count, losses[metric], line_style)

plt.legend(['Training Loss', 'Validation Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Encode the test phrases as fixed-length, left-padded index vectors.
test_encoded = []
max_len = 55
n_truncated = 0

for phrase in test_phrases:
    indices = word2index(phrase, vocab)  # encode once and reuse
    if len(indices) > max_len:
        # Never drop a row: the submission needs exactly one prediction per
        # test phrase. Keep the last max_len tokens instead.
        indices = indices[-max_len:]
        n_truncated += 1
    test_encoded.append(pad_phrase(indices, max_len))

if n_truncated:
    print('PHRASE TOO BIG!', n_truncated, 'phrase(s) truncated to', max_len, 'tokens')

test_encoded = np.asarray(test_encoded)
print(test_phrases.shape, test_encoded.shape)


print()

# Sequential.predict_classes was removed in TensorFlow 2.6; taking the argmax
# over the predicted class scores is the documented replacement for
# multi-class models.
y_pred = np.argmax(model.predict(test_encoded), axis=-1)
print(y_pred.shape)
sub_file = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv', sep=',')
assert len(sub_file) == len(y_pred), "need exactly one prediction per submission row"
# Assign by column key: attribute assignment only works when the column
# already exists and otherwise silently creates a plain Python attribute.
sub_file['Sentiment'] = y_pred
sub_file.to_csv('Submission.csv', index=False)