In [85]:
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import keras

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn import random_projection
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

from keras.layers import Input ,Dense, Dropout, Activation, LSTM
from keras.layers import Conv1D, Convolution2D, MaxPooling2D, Flatten, Reshape, BatchNormalization, Concatenate
from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import Model
from keras import metrics

stop_words = set(stopwords.words('english') + list(string.punctuation))

import warnings
warnings.filterwarnings('ignore')

stop_words = set(stopwords.words('english') + list(string.punctuation))


# -------------- Helper Functions --------------
def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output: ['it', 'is', 'a', 'nice', 'day', 'i', 'am', 'happy']
    '''
    tokens = []
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens


def get_sequence(data, seq_length, vocab_dict):
    '''
    :param data: a list of words, type: list
    :param seq_length: the length of sequences,, type: int
    :param vocab_dict: a dict from words to indices, type: dict
    return a dense sequence matrix whose elements are indices of words,
    '''
    data_matrix = np.zeros((len(data), seq_length), dtype=int)
    for i, doc in enumerate(data):
        for j, word in enumerate(doc):
            # YOUR CODE HERE
            if j == seq_length:
                break
            word_idx = vocab_dict.get(word, 1) # 1 means the unknown word
            data_matrix[i, j] = word_idx
    return data_matrix


def read_data(file_name, input_length, vocab=None):
    """
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)
    df['words'] = df['text'].apply(tokenize)

    if vocab is None:
        vocab = set()
        for i in range(len(df)):
            for word in df.iloc[i]['words']:
                vocab.add(word)
    vocab_dict = dict()
    vocab_dict['<pad>'] = 0 # 0 means the padding signal
    vocab_dict['<unk>'] = 1 # 1 means the unknown word
    vocab_size = 2
    for v in vocab:
        vocab_dict[v] = vocab_size
        vocab_size += 1

    data_matrix = get_sequence(df['words'], input_length, vocab_dict)
    stars = df['stars'].apply(int) - 1
    return df['review_id'], stars, data_matrix, vocab
# ----------------- End of Helper Functions-----------------


def load_data(input_length):
     # Load training data and vocab
    train_id_list, train_data_label, train_data_matrix, vocab = read_data("data/train.csv", input_length)
    K = max(train_data_label)+1  # labels begin with 0

    # Load testing data
    test_id_list, _, test_data_matrix, _ = read_data("data/test.csv", input_length, vocab=vocab)
    test_data_label = pd.read_csv("data/valid.csv")['stars'] - 1
    
    print("Vocabulary Size:", len(vocab))
    print("Training Set Size:", len(train_id_list))
    print("Test Set Size:", len(test_id_list))
    print("Training Set Shape:", train_data_matrix.shape)
    print("Testing Set Shape:", test_data_matrix.shape)

    # Converts a class vector to binary class matrix.
    # https://keras.io/utils/#to_categorical
    train_data_label = keras.utils.to_categorical(train_data_label, num_classes=K)
    test_data_label = keras.utils.to_categorical(test_data_label, num_classes=K)
    return train_data_matrix, train_data_label, test_data_matrix, test_data_label, vocab

In [2]:
test_data = pd.read_csv("data/valid.csv")
test_data_label = pd.read_csv("data/valid.csv")['stars'] - 1

In [25]:
Train = pd.read_csv("data/train.csv")
Test = pd.read_csv("data/test.csv")
Valid = pd.read_csv("data/valid.csv")

#################################################################
Train = Train.iloc[0:10000,:]
#################################################################
# Label
Train_y = Train['stars']
Test_y = Valid['stars']
df = pd.concat([Train, Valid])

In [83]:
# Hyperparameters

input_length = 30
embedding_size = 100
hidden_size = 100
batch_size = 100
dropout_rate = 0.5
filters = 100
kernel_sizes = [3, 4, 5]
padding = 'valid'
activation = 'relu'
strides = 1
pool_size = 2
learning_rate = 0.1
total_epoch = 10

In [4]:
train_data_matrix, train_data_label, test_data_matrix, test_data_label, vocab = load_data(input_length)

Vocabulary Size: 114655
Training Set Size: 100000
Test Set Size: 10000
Training Set Shape: (100000, 30)
Testing Set Shape: (10000, 30)


In [78]:
#################################################################
train_data_matrix = train_data_matrix[0:10000]
train_data_label = train_data_label[1:10000]
#################################################################

In [11]:
# Data shape
N = train_data_matrix.shape[0]
K = train_data_label.shape[1]

input_size = len(vocab) + 2
output_size = K



## CNN

In [86]:
# New model
# YOUR CODE HERE
x = Input(shape=(input_length, ))

# embedding layer and dropout
# YOUR CODE HERE
e = Embedding(input_dim=input_size, output_dim=embedding_size, input_length=input_length)(x)
e_d = Dropout(dropout_rate)(e)

# construct the sequence tensor for CNN
# YOUR CODE HERE
e_d = Reshape((input_length, embedding_size, 1))(e_d)

# CNN layers
conv_blocks = []
for kernel_size in kernel_sizes:
    # YOUR CODE HERE
    conv = Conv2D(filters=filters, kernel_size=(kernel_size, embedding_size), padding=padding, activation=activation, strides=(strides, strides))(e_d)
    maxpooling = MaxPool2D(pool_size=((input_length-kernel_size)//strides+1, 1))(conv)
    faltten = Flatten()(maxpooling)
    conv_blocks.append(faltten)

# concatenate CNN results
# YOUR CODE HERE
c = Concatenate()(conv_blocks) if len(kernel_sizes) > 1 else conv_blocks[0]
c_d = Dropout(dropout_rate)(c)

# dense layer
# YOUR CODE HERE
d = Dense(hidden_size, activation=activation)(c_d)

# output layer
# YOUR CODE HERE
y = Dense(output_size, activation='softmax')(d)

# build your own model
# YOUR CODE HERE
model_CNN = Model(x, y)

# SGD optimizer with momentum
optimizer = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)

# compile model
model_CNN.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])



In [87]:
# training
model_CNN.fit(train_data_matrix, train_data_label, epochs=total_epoch, batch_size=batch_size)
# testing
train_score = model_CNN.evaluate(train_data_matrix, train_data_label, batch_size=batch_size)
test_score = model_CNN.evaluate(test_data_matrix, test_data_label, batch_size=batch_size)

print('Training Loss: {}\n Training Accuracy: {}\n'
      'Testng Loss: {}\n Testing accuracy: {}'.format(
          train_score[0], train_score[1],
          test_score[0], test_score[1]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Loss: 0.8139531481167048
 Training Accuracy: 0.6515954822149828
Testng Loss: 2.32424614071846
 Testing accuracy: 0.31980000153183935


## LSTM

In [52]:
# New model_LSTM
model_LSTM = Sequential()

# embedding layer and dropout
# YOUR CODE HERE
model_LSTM.add(Embedding(input_dim=input_size,
                         output_dim=embedding_size, input_length=input_length))
model_LSTM.add(Dropout(dropout_rate))

# LSTM layer
# YOUR CODE HERE
model_LSTM.add(LSTM(units=hidden_size))

# output layer
# YOUR CODE HERE
model_LSTM.add(Dense(K, activation='softmax'))

# SGD optimizer with momentum
optimizer = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)

# compile model_LSTM
model_LSTM.compile(loss='categorical_crossentropy',
                   optimizer=optimizer, metrics=['accuracy'])

In [81]:
# training
model_LSTM.fit(train_data_matrix, train_data_label, epochs=total_epoch, batch_size=batch_size)
# testing
train_score = model_LSTM.evaluate(train_data_matrix, train_data_label, batch_size=batch_size)
test_score = model_LSTM.evaluate(test_data_matrix, test_data_label, batch_size=batch_size)

print('Training Loss: {}\n Training Accuracy: {}\n'
      'Testng Loss: {}\n Testing accuracy: {}'.format(
          train_score[0], train_score[1],
          test_score[0], test_score[1]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Loss: 0.8509425757956097
 Training Accuracy: 0.6583975199295113
Testng Loss: 2.266466431617737
 Testing accuracy: 0.2720000001788139


In [None]:
# predicting
test_pre = model.predict(test_data_matrix, batch_size=batch_size)
sub_df = pd.DataFrame()
sub_df["review_id"] = test_id_list
sub_df["pre"] = test_pre
sub_df.to_csv("pre.csv", index=False)