In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
import time

%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False

# (case1) dataset : output_reviews_PA_FL (same as 1st model) & 3 convolution layers with tanh & fully cnnected layer with relu

In [None]:
review = pd.read_csv('output_reviews_PA_FL.csv')

del review['review_id']
del review['user_id']
del review['business_id']
del review['useful']
del review['funny']
del review['cool']
del review['date']
del review['state']
del review['Unnamed: 0.1']
del review['Unnamed: 0']

review = review[:1000]
review['split'] = 'train'
review['split'][800:1000] = 'test'


from collections import Counter
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers.legacy import SGD

stopwords = stopwords.words('english')
stemmer = PorterStemmer()

#########################
# Define the vocabulary #
#########################
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


def add_doc_to_vocab(docs, vocab):
    # input: docs: a list of sentences / vocab: a vocabulary dictionary

    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab # updated vocabulary


def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line


def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines


#########################
# embedding             #
#########################
# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)

    if mode == 'count':
        scaler=MinMaxScaler()
        scaler.fit(Xtrain)
        Xtrain=scaler.transform(Xtrain)
        scaler.fit(Xtest)
        Xtest=scaler.transform(Xtest)

    return Xtrain, Xtest


#########################
# CNN                   #
#########################
class CNN(tf.keras.Model):
    def __init__(self, f):
        super(CNN, self).__init__()

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='tanh', input_shape=(f,1))
        self.pool1 = tf.keras.layers.MaxPool1D(2)
        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='tanh')
        self.pool2 = tf.keras.layers.MaxPool1D(2)
        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='tanh')
        self.pool3 = tf.keras.layers.MaxPool1D(2)

        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.fullyconn = tf.keras.layers.Dense(1024, activation='relu')

        self.out = tf.keras.layers.Dense(units=10, activation=None)

    def call(self, x):
        o_conv1 = self.conv1(x)
        o_pool1 = self.pool1(o_conv1)
        o_conv2 = self.conv2(o_pool1)
        o_pool2 = self.pool2(o_conv2)
        o_conv3 = self.conv3(o_pool2)
        o_pool3 = self.pool3(o_conv3)
        o_flat = self.flatten(o_pool3)
        o_dropout = self.dropout(o_flat)
        o_fc = self.fullyconn(o_dropout)
        logits = self.out(o_fc)
        y_pred = tf.nn.softmax(logits)

        return y_pred, logits


# accuracy
def compute_accuracy(y_pred, y):
    corr_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
    acc = tf.reduce_mean(tf.cast(corr_pred, tf.float32))
    return acc


# loss function
def cross_entropy_loss(logits, y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))


# optimisation
optimiser = tf.keras.optimizers.legacy.Adam(1e-4)
def tune_param(model, x, y):
    with tf.GradientTape() as tape:
        y_pred, logits = model(x)
        loss = cross_entropy_loss(logits, y)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimiser.apply_gradients(zip(gradients, model.trainable_variables))


# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)


# apply one-hot encoding to a target value
train_y, test_y = tf.one_hot(train_y, depth=10), tf.one_hot(test_y, depth=10)


# Run Experiment of 4 different modes
modes =  ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()


for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using different mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train data reconstruction (shuffle -> batch form)
    train_data = tf.data.Dataset.from_tensor_slices((Xtrain, train_y))
    train_data = train_data.repeat().shuffle(100).batch(50)
    train_data_iter = iter(train_data)

    CNN_model = CNN(Xtrain.shape[1])
    # optimise 200 epoch , but try optimise 10000 epoch in HPC
    for i in range(201):
        # assign each 50 data
        batch_x, batch_y = next(train_data_iter)

        if i % 100 == 0:
            train_accuracy = compute_accuracy(CNN_model(batch_x)[0], batch_y)
            print("%d epoch : accuracy %f" % (i, train_accuracy))

        # parameter update
        tune_param(CNN_model, batch_x, batch_y)


    test_acc = compute_accuracy(CNN_model(Xtest)[0], test_y)
    print("accuracy : %f" %test_acc)
    results[mode] = [test_acc*100]


print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


mode:  binary
0 epoch : accuracy 0.100000
100 epoch : accuracy 0.860000
200 epoch : accuracy 0.980000
accuracy : 0.375000
mode:  count
0 epoch : accuracy 0.020000
100 epoch : accuracy 0.880000
200 epoch : accuracy 1.000000
accuracy : 0.300000
mode:  tfidf
0 epoch : accuracy 0.020000
100 epoch : accuracy 1.000000
200 epoch : accuracy 1.000000
accuracy : 0.335000
mode:  freq
0 epoch : accuracy 0.020000
100 epoch : accuracy 0.500000
200 epoch : accuracy 0.740000
accuracy : 0.260000

                                     binary  \
0  tf.Tensor(37.5, shape=(), dtype=float32)   

                                           count  \
0  tf.Tensor(30.000002, shape=(), dtype=float32)   

                                      tfidf  \
0  tf.Tensor(33.5, shape=(), dtype=float32)   

                                       freq  
0  tf.Tensor(26.0, shape=(), dtype=float32)  


# (case2) dataset : output_reviews_PA_FL (same as 1st model) & 3 convolution layers with relu & fully cnnected layer with tanh

In [None]:
from collections import Counter
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler


stopwords = stopwords.words('english')
stemmer = PorterStemmer()

review = pd.read_csv('output_reviews_PA_FL.csv')

del review['review_id']
del review['user_id']
del review['business_id']
del review['useful']
del review['funny']
del review['cool']
del review['date']
del review['state']
del review['Unnamed: 0.1']
del review['Unnamed: 0']

review = review[:1000]
review['split'] = 'train'
review['split'][800:1000] = 'test'



#########################
# Define the vocabulary #
#########################
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

def add_doc_to_vocab(docs, vocab):

    # input: docs: a list of sentences / vocab: a vocabulary dictionary
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab  # updated vocabulary


def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line


def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines


#########################
# embedding             #
#########################
# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)

    if mode == 'count':
        scaler=MinMaxScaler()
        scaler.fit(Xtrain)
        Xtrain=scaler.transform(Xtrain)
        scaler.fit(Xtest)
        Xtest=scaler.transform(Xtest)

    return Xtrain, Xtest


#########################
# CNN                   #
#########################
class CNN(tf.keras.Model):
    def __init__(self):
        super(CNN, self).__init__()

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu')
        self.pool1 = tf.keras.layers.MaxPool1D(2)
        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu')
        self.pool2 = tf.keras.layers.MaxPool1D(2)
        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu')
        self.pool3 = tf.keras.layers.MaxPool1D(2)

        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.fullyconn = tf.keras.layers.Dense(1024, activation='tanh')

        self.out = tf.keras.layers.Dense(units=5, activation=None)

    def call(self, x):
        o_conv1 = self.conv1(x)
        o_pool1 = self.pool1(o_conv1)
        o_conv2 = self.conv2(o_pool1)
        o_pool2 = self.pool2(o_conv2)
        o_conv3 = self.conv3(o_pool2)
        o_pool3 = self.pool3(o_conv3)
        o_flat = self.flatten(o_pool3)
        o_dropout = self.dropout(o_flat)
        o_fc = self.fullyconn(o_dropout)
        logits = self.out(o_fc)
        y_pred = tf.nn.softmax(logits)

        return y_pred, logits


# accuracy
def compute_accuracy(y_pred, y):
    corr_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
    acc = tf.reduce_mean(tf.cast(corr_pred, tf.float32))

    return acc


# loss function
def cross_entropy_loss(logits, y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))


#optimisation
optimiser = tf.keras.optimizers.legacy.Adam(1e-4)
def tune_param(model, x, y):
    with tf.GradientTape() as tape:
        y_pred, logits = model(x)
        loss = cross_entropy_loss(logits, y)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimiser.apply_gradients(zip(gradients, model.trainable_variables))


# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)


# apply one-hot encoding to a target value
train_y, test_y = tf.one_hot(train_y, depth=5), tf.one_hot(test_y, depth=5)


# Run Experiment of 4 different modes
modes =  ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()


for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using freq mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train data reconstruction (shuffle -> batch form)
    train_data = tf.data.Dataset.from_tensor_slices((Xtrain, train_y))
    train_data = train_data.repeat().shuffle(60000).batch(50)
    train_data_iter = iter(train_data)

    CNN_model = CNN()
    # optimise 200 epoch , but try optimise 10000 epoch in HPC
    for i in range(201):
        # assign each 50 data
        batch_x, batch_y = next(train_data_iter)

        if i % 100 == 0:
            train_accuracy = compute_accuracy(CNN_model(batch_x)[0], batch_y)
            print("%d epoch : accuracy %f" % (i, train_accuracy))

        # parameter update
        tune_param(CNN_model, batch_x, batch_y)


    test_acc = compute_accuracy(CNN_model(Xtest)[0], test_y)
    print("accuracy : %f" %test_acc)
    results[mode] = [test_acc*100]


print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


mode:  binary
0 epoch : accuracy 0.260000
100 epoch : accuracy 0.300000
200 epoch : accuracy 0.440000
accuracy : 0.275000
mode:  count
0 epoch : accuracy 0.100000
100 epoch : accuracy 0.380000
200 epoch : accuracy 0.320000
accuracy : 0.230000
mode:  tfidf
0 epoch : accuracy 0.240000
100 epoch : accuracy 0.320000
200 epoch : accuracy 0.480000
accuracy : 0.285000
mode:  freq
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.380000
200 epoch : accuracy 0.240000
accuracy : 0.225000

                                     binary  \
0  tf.Tensor(27.5, shape=(), dtype=float32)   

                                      count  \
0  tf.Tensor(23.0, shape=(), dtype=float32)   

                                      tfidf  \
0  tf.Tensor(28.5, shape=(), dtype=float32)   

                                       freq  
0  tf.Tensor(22.5, shape=(), dtype=float32)  


# (case3) dataset : review_test.csv (sample from original yelp data) & 3 convolution layers with tanh & fully cnnected layer with relu

In [None]:

review = pd.read_csv('review_test.csv')
review['split'] = 'train'
review['split'][800:1000] = 'test'

del review['Unnamed: 0']


from collections import Counter
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler


stopwords = stopwords.words('english')
stemmer = PorterStemmer()


#########################
# Define the vocabulary #
#########################
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


def add_doc_to_vocab(docs, vocab):
    # docs: a list of sentences (docs)
    # vocab: a vocabulary dictionary
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab # updated vocabulary


def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line


def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines


#########################
# embedding             #
#########################
# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)

    if mode == 'count':
        scaler=MinMaxScaler()
        scaler.fit(Xtrain)
        Xtrain=scaler.transform(Xtrain)
        scaler.fit(Xtest)
        Xtest=scaler.transform(Xtest)

    return Xtrain, Xtest


#########################
# CNN                   #
#########################
class CNN(tf.keras.Model):
    def __init__(self):
        super(CNN, self).__init__()

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='tanh')
        self.pool1 = tf.keras.layers.MaxPool1D(2)
        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='tanh')
        self.pool2 = tf.keras.layers.MaxPool1D(2)
        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='tanh')
        self.pool3 = tf.keras.layers.MaxPool1D(2)

        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.fullyconn = tf.keras.layers.Dense(1024, activation='relu')

        self.out = tf.keras.layers.Dense(units=10, activation=None)

    def call(self, x):
        o_conv1 = self.conv1(x)
        o_pool1 = self.pool1(o_conv1)
        o_conv2 = self.conv2(o_pool1)
        o_pool2 = self.pool2(o_conv2)
        o_conv3 = self.conv3(o_pool2)
        o_pool3 = self.pool3(o_conv3)
        o_flat = self.flatten(o_pool3)
        o_dropout = self.dropout(o_flat)
        #o_fc = self.fullyconn(o_dropout)
        logits = self.out(o_dropout)
        y_pred = tf.nn.softmax(logits)

        return y_pred, logits


# accuracy
def compute_accuracy(y_pred, y):
    corr_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
    acc = tf.reduce_mean(tf.cast(corr_pred, tf.float32))
    return acc


# loss function
def cross_entropy_loss(logits, y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))


#optimisation
optimiser = tf.keras.optimizers.legacy.Adam(1e-4)
def tune_param(model, x, y):
    with tf.GradientTape() as tape:
        y_pred, logits = model(x)
        loss = cross_entropy_loss(logits, y)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimiser.apply_gradients(zip(gradients, model.trainable_variables))


# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)


# apply one-hot encoding to a target value
train_y, test_y = tf.one_hot(train_y, depth=10), tf.one_hot(test_y, depth=10)


# Run Experiment of 4 different modes
modes =  ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()


for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using different mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train data reconstruction (shuffle -> batch form)
    train_data = tf.data.Dataset.from_tensor_slices((Xtrain, train_y))
    train_data = train_data.repeat().shuffle(100).batch(50)
    train_data_iter = iter(train_data)

    CNN_model = CNN()
    # optimise 200 epoch , but try optimise 10000 epoch in HPC
    for i in range(201):
        # assign each 50 data
        batch_x, batch_y = next(train_data_iter)

        if i % 100 == 0:
            train_accuracy = compute_accuracy(CNN_model(batch_x)[0], batch_y)
            print("%d epoch : accuracy %f" % (i, train_accuracy))

        # parameter update
        tune_param(CNN_model, batch_x, batch_y)

    test_acc = compute_accuracy(CNN_model(Xtest)[0], test_y)
    print("accuracy : %f" %test_acc)
    results[mode] = [test_acc*100]


print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


mode:  binary
0 epoch : accuracy 0.100000
100 epoch : accuracy 0.420000
200 epoch : accuracy 0.680000
accuracy : 0.572864
mode:  count
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.720000
200 epoch : accuracy 0.940000
accuracy : 0.497487
mode:  tfidf
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.900000
200 epoch : accuracy 0.980000
accuracy : 0.562814
mode:  freq
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.480000
200 epoch : accuracy 0.620000
accuracy : 0.517588

                                         binary  \
0  tf.Tensor(57.28643, shape=(), dtype=float32)   

                                          count  \
0  tf.Tensor(49.74874, shape=(), dtype=float32)   

                                           tfidf  \
0  tf.Tensor(56.281406, shape=(), dtype=float32)   

                                            freq  
0  tf.Tensor(51.758797, shape=(), dtype=float32)  


# (case4) dataset : review_test.csv (sample from original yelp data) & 3 convolution layers with relu & fully cnnected layer with tanh

In [None]:
#########################
# Define the vocabulary #
#########################
review = pd.read_csv('review_test.csv')
review['split'] = 'train'
review['split'][800:1000] = 'test'

del review['Unnamed: 0']


from collections import Counter
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler

stopwords = stopwords.words('english')
stemmer = PorterStemmer()


def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


def add_doc_to_vocab(docs, vocab):

    # docs: a list of sentences (docs)
    # vocab: a vocabulary dictionary
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab # updated vocabulary


def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line


def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines


#########################
# embedding             #
#########################
# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)

    if mode == 'count':
        scaler=MinMaxScaler()
        scaler.fit(Xtrain)
        Xtrain=scaler.transform(Xtrain)
        scaler.fit(Xtest)
        Xtest=scaler.transform(Xtest)

    return Xtrain, Xtest


#########################
# CNN                   #
#########################
class CNN(tf.keras.Model):
    def __init__(self):
        super(CNN, self).__init__()

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu')
        self.pool1 = tf.keras.layers.MaxPool1D(2)
        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu')
        self.pool2 = tf.keras.layers.MaxPool1D(2)
        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu')
        self.pool3 = tf.keras.layers.MaxPool1D(2)

        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.fullyconn = tf.keras.layers.Dense(1024, activation='tanh')

        self.out = tf.keras.layers.Dense(units=10, activation=None)

    def call(self, x):
        o_conv1 = self.conv1(x)
        o_pool1 = self.pool1(o_conv1)
        o_conv2 = self.conv2(o_pool1)
        o_pool2 = self.pool2(o_conv2)
        o_conv3 = self.conv3(o_pool2)
        o_pool3 = self.pool3(o_conv3)
        o_flat = self.flatten(o_pool3)
        o_dropout = self.dropout(o_flat)
        #o_fc = self.fullyconn(o_dropout)
        logits = self.out(o_dropout)
        y_pred = tf.nn.softmax(logits)

        return y_pred, logits


# accuracy
def compute_accuracy(y_pred, y):
    corr_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
    acc = tf.reduce_mean(tf.cast(corr_pred, tf.float32))
    return acc


# loss function
def cross_entropy_loss(logits, y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))


# optimisation
optimiser = tf.keras.optimizers.legacy.Adam(1e-4)
def tune_param(model, x, y):
    with tf.GradientTape() as tape:
        y_pred, logits = model(x)
        loss = cross_entropy_loss(logits, y)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimiser.apply_gradients(zip(gradients, model.trainable_variables))


# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)


# apply one-hot encoding to a target value
train_y, test_y = tf.one_hot(train_y, depth=10), tf.one_hot(test_y, depth=10)


# Run Experiment of 4 different modes
modes =  ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()


for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using different mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train data reconstruction (shuffle -> batch form)
    train_data = tf.data.Dataset.from_tensor_slices((Xtrain, train_y))
    train_data = train_data.repeat().shuffle(100).batch(50)
    train_data_iter = iter(train_data)

    CNN_model = CNN()
    # optimise 200 epoch , but try optimise 10000 epoch in HPC
    for i in range(201):
        # assign each 50 data
        batch_x, batch_y = next(train_data_iter)

        if i % 100 == 0:
            train_accuracy = compute_accuracy(CNN_model(batch_x)[0], batch_y)
            print("%d epoch : accuracy %f" % (i, train_accuracy))

        # parameter update
        tune_param(CNN_model, batch_x, batch_y)

    test_acc = compute_accuracy(CNN_model(Xtest)[0], test_y)
    print("accuracy : %f" %test_acc)
    results[mode] = [test_acc*100]


print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


mode:  binary
0 epoch : accuracy 0.100000
100 epoch : accuracy 0.540000
200 epoch : accuracy 0.740000
accuracy : 0.547739
mode:  count
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.580000
200 epoch : accuracy 0.880000
accuracy : 0.462312
mode:  tfidf
0 epoch : accuracy 0.060000
100 epoch : accuracy 0.880000
200 epoch : accuracy 0.960000
accuracy : 0.542714
mode:  freq
0 epoch : accuracy 0.060000
100 epoch : accuracy 0.460000
200 epoch : accuracy 0.580000
accuracy : 0.517588

                                         binary  \
0  tf.Tensor(54.77387, shape=(), dtype=float32)   

                                           count  \
0  tf.Tensor(46.231155, shape=(), dtype=float32)   

                                          tfidf  \
0  tf.Tensor(54.27136, shape=(), dtype=float32)   

                                            freq  
0  tf.Tensor(51.758797, shape=(), dtype=float32)  


# additional exp - dataset : review_test & using stride and padding

In [None]:
#########################
# Define the vocabulary #
#########################
review = pd.read_csv('review_test.csv')
review['split'] = 'train'
review['split'][800:1000] = 'test'

#del review['review_id']
#del review['user_id']
#del review['business_id']
#del review['useful']
#del review['funny']
#del review['cool']
#del review['date']
#del review['state']
#del review['Unnamed: 0.1']
del review['Unnamed: 0']



from collections import Counter
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler

stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

def add_doc_to_vocab(docs, vocab):

    # docs: a list of sentences (docs)
    # vocab: a vocabulary dictionary
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab # updated vocabulary

def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line

def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines

# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)

    if mode == 'count':
        scaler=MinMaxScaler()
        scaler.fit(Xtrain)
        Xtrain=scaler.transform(Xtrain)
        scaler.fit(Xtest)
        Xtest=scaler.transform(Xtest)

    return Xtrain, Xtest


class CNN(tf.keras.Model):
    def __init__(self):
        super(CNN, self).__init__()

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='tanh', strides=1, padding='same')
        self.pool1 = tf.keras.layers.MaxPool1D(2)
        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='tanh', strides=1, padding='same')
        self.pool2 = tf.keras.layers.MaxPool1D(2)
        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='tanh', strides=1, padding='same')
        self.pool3 = tf.keras.layers.MaxPool1D(2)

        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.fullyconn = tf.keras.layers.Dense(1024, activation='relu')

        self.out = tf.keras.layers.Dense(units=10, activation=None)

    def call(self, x):
        #x = tf.reshape(x, [-1,28,28,1])
        o_conv1 = self.conv1(x)
        o_pool1 = self.pool1(o_conv1)
        o_conv2 = self.conv2(o_pool1)
        o_pool2 = self.pool2(o_conv2)
        o_conv3 = self.conv3(o_pool2)
        o_pool3 = self.pool3(o_conv3)
        o_flat = self.flatten(o_pool3)
        o_dropout = self.dropout(o_flat)
        #o_fc = self.fullyconn(o_dropout)
        logits = self.out(o_dropout)
        y_pred = tf.nn.softmax(logits)

        return y_pred, logits



def compute_accuracy(y_pred, y):
    corr_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
    acc = tf.reduce_mean(tf.cast(corr_pred, tf.float32))

    return acc


def cross_entropy_loss(logits, y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))


#optimiser = tf.keras.optimizers.Adam(1e-4)
optimiser = tf.keras.optimizers.legacy.Adam(1e-4)
def tune_param(model, x, y):
    with tf.GradientTape() as tape:
        y_pred, logits = model(x)
        loss = cross_entropy_loss(logits, y)
    gradients = tape.gradient(loss, model.trainable_variables)
    #tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
    optimiser.apply_gradients(zip(gradients, model.trainable_variables))




# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)


test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)

# apply one-hot encoding to a target value
train_y, test_y = tf.one_hot(train_y, depth=10), tf.one_hot(test_y, depth=10)


# Run Experiment of 4 different modes
# !!!!(Nina) 'count', 'tfidf', 'freq' -> need to normalise
modes =  ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()

for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using freq mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train data reconstruction (shuffle -> batch form)
    train_data = tf.data.Dataset.from_tensor_slices((Xtrain, train_y))
    train_data = train_data.repeat().shuffle(100).batch(50)
    train_data_iter = iter(train_data)

    CNN_model = CNN()
    # optimise 10000 epoch
    for i in range(201):
        # assign each 50 data
        batch_x, batch_y = next(train_data_iter)


        if i % 100 == 0:
            train_accuracy = compute_accuracy(CNN_model(batch_x)[0], batch_y)
            print("%d epoch : accuracy %f" % (i, train_accuracy))

        # parameter update
        tune_param(CNN_model, batch_x, batch_y)

    test_acc = compute_accuracy(CNN_model(Xtest)[0], test_y)
    print("accuracy : %f" %test_acc)
    results[mode] = [test_acc*100]

print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


mode:  binary
0 epoch : accuracy 0.060000
100 epoch : accuracy 0.440000
200 epoch : accuracy 0.860000
accuracy : 0.557789
mode:  count
0 epoch : accuracy 0.080000
100 epoch : accuracy 0.680000
200 epoch : accuracy 0.900000
accuracy : 0.517588
mode:  tfidf
0 epoch : accuracy 0.040000
100 epoch : accuracy 0.940000
200 epoch : accuracy 0.980000
accuracy : 0.587940
mode:  freq
0 epoch : accuracy 0.020000
100 epoch : accuracy 0.480000
200 epoch : accuracy 0.580000
accuracy : 0.517588

                                          binary  \
0  tf.Tensor(55.778896, shape=(), dtype=float32)   

                                           count  \
0  tf.Tensor(51.758797, shape=(), dtype=float32)   

                                          tfidf  \
0  tf.Tensor(58.79397, shape=(), dtype=float32)   

                                            freq  
0  tf.Tensor(51.758797, shape=(), dtype=float32)  


# additional exp - dataset : review_test (same as 1st model) & only 1 convolution

In [None]:
#########################
# Define the vocabulary #
#########################
review = pd.read_csv('review_test.csv')
review['split'] = 'train'
review['split'][800:1000] = 'test'

#del review['review_id']
#del review['user_id']
#del review['business_id']
#del review['useful']
#del review['funny']
#del review['cool']
#del review['date']
#del review['state']
#del review['Unnamed: 0.1']
del review['Unnamed: 0']

from collections import Counter
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

def add_doc_to_vocab(docs, vocab):
    '''
    input:
        docs: a list of sentences (docs)
        vocab: a vocabulary dictionary
    output:
        return an updated vocabulary
    '''
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab

def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line

def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines

# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest

def train_cnn(train_x, train_y, batch_size = 50, epochs = 10, verbose =2):

    n_words = train_x.shape[1]

    # n_words : the number of vocab
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(filters=100, kernel_size=5, activation='relu', input_shape=(n_words,1)),
        tf.keras.layers.MaxPool1D(2),

        # !!!! (Nina) more kernel layers!!!
        #tf.keras.layers.Conv1D(filters=100, kernel_size=5, activation='relu', input_shape=(n_words,1)),
        # tf.keras.layers.MaxPool1D(2),

        tf.keras.layers.Flatten(),

        tf.keras.layers.Dropout(0.5),

        # !!!! (Nina) 3 convolution layers & 3 dense layers
        tf.keras.layers.Dense( units=6, activation='softmax')
    ])

    model.compile( loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.fit(train_x, train_y, batch_size, epochs, verbose)
    return model

# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)
print('train_x size: ', len(train_x))
print('train_y size: ', len(train_y))

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)
print('test_x size: ', len(test_x))
print('test_y size: ', len(test_y))

# Run Experiment of 4 different modes
# !!!!(Nina) 'count', 'tfidf', 'freq' -> need to normalise
modes = ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()

for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using freq mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train the model
    model = train_cnn(Xtrain, train_y)

    # evaluate the model
    loss, acc = model.evaluate(Xtest, test_y, verbose=0)
    print('Test Accuracy: {}'.format(acc*100))
    results[mode] = [acc*100]

print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


train_x size:  800
train_y size:  800
test_x size:  199
test_y size:  199
mode:  binary
Epoch 1/10
16/16 - 10s - loss: 1.3761 - accuracy: 0.5163 - 10s/epoch - 649ms/step
Epoch 2/10
16/16 - 7s - loss: 0.9059 - accuracy: 0.6500 - 7s/epoch - 460ms/step
Epoch 3/10
16/16 - 8s - loss: 0.5882 - accuracy: 0.8050 - 8s/epoch - 471ms/step
Epoch 4/10
16/16 - 7s - loss: 0.3765 - accuracy: 0.8938 - 7s/epoch - 447ms/step
Epoch 5/10
16/16 - 7s - loss: 0.2314 - accuracy: 0.9513 - 7s/epoch - 463ms/step
Epoch 6/10
16/16 - 8s - loss: 0.1509 - accuracy: 0.9775 - 8s/epoch - 473ms/step
Epoch 7/10
16/16 - 7s - loss: 0.0987 - accuracy: 0.9875 - 7s/epoch - 444ms/step
Epoch 8/10
16/16 - 7s - loss: 0.0702 - accuracy: 0.9925 - 7s/epoch - 448ms/step
Epoch 9/10
16/16 - 8s - loss: 0.0499 - accuracy: 0.9975 - 8s/epoch - 469ms/step
Epoch 10/10
16/16 - 7s - loss: 0.0356 - accuracy: 0.9987 - 7s/epoch - 443ms/step
Test Accuracy: 65.32663106918335
mode:  count
Epoch 1/10
16/16 - 9s - loss: 1.3892 - accuracy: 0.5400 - 9s/ep

# additional exp - dataset : output_reviews_PA_FL (same as 1st model) & only 1 convolution

In [None]:
#########################
# Define the vocabulary #
#########################
review = pd.read_csv('output_reviews_PA_FL.csv')

del review['review_id']
del review['user_id']
del review['business_id']
del review['useful']
del review['funny']
del review['cool']
del review['date']
del review['state']
del review['Unnamed: 0.1']
del review['Unnamed: 0']

review = review[:1000]
review['split'] = 'train'
review['split'][800:1000] = 'test'

from collections import Counter
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

def add_doc_to_vocab(docs, vocab):
    '''
    input:
        docs: a list of sentences (docs)
        vocab: a vocabulary dictionary
    output:
        return an updated vocabulary
    '''
    for doc in docs:
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab

def doc_to_line(doc, vocab):
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [token for token in tokens if token in vocab]
    line = ' '.join(tokens)
    return line

def clean_docs(docs, vocab):
    lines = []
    for doc in docs:
        line = doc_to_line(doc, vocab)
        lines.append(line)
    return lines

# prepare bag-of-words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    #tokenizer.fit_on_texts(test_docs)
    #print(train_docs[0])
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    #print(Xtrain[0])
    # encode test data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest

def train_cnn(train_x, train_y, batch_size = 50, epochs = 10, verbose =2):

    n_words = train_x.shape[1]

    # n_words : the number of vocab
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(filters=100, kernel_size=5, activation='relu', input_shape=(n_words,1)),
        tf.keras.layers.MaxPool1D(2),

        # !!!! (Nina) more kernel layers!!!
        #tf.keras.layers.Conv1D(filters=100, kernel_size=5, activation='relu', input_shape=(n_words,1)),
        # tf.keras.layers.MaxPool1D(2),

        tf.keras.layers.Flatten(),

        tf.keras.layers.Dropout(0.5),

        # !!!! (Nina) 3 convolution layers & 3 dense layers
        tf.keras.layers.Dense( units=6, activation='softmax')
    ])

    model.compile( loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.fit(train_x, train_y, batch_size, epochs, verbose)
    return model

# Separate the sentences and the labels for training and testing
train_x = list(review[review.split=='train'].text)
train_y = np.array(review[review.split=='train'].stars)
print('train_x size: ', len(train_x))
print('train_y size: ', len(train_y))

test_x = list(review[review.split=='test'].text)
test_y = np.array(review[review.split=='test'].stars)
print('test_x size: ', len(test_x))
print('test_y size: ', len(test_y))

# Run Experiment of 4 different modes
# !!!!(Nina) 'count', 'tfidf', 'freq' -> need to normalise
modes = ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()

for mode in modes:
    print('mode: ', mode)

    # Instantiate a vocab object
    vocab = Counter()

    # Define a vocabulary for each fold
    vocab = add_doc_to_vocab(train_x, vocab)

    # Clean the sentences
    train_x = clean_docs(train_x, vocab)
    test_x = clean_docs(test_x, vocab)

    # encode data using freq mode
    Xtrain, Xtest = prepare_data(train_x, test_x, mode)

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    # train the model
    model = train_cnn(Xtrain, train_y)

    # evaluate the model
    loss, acc = model.evaluate(Xtest, test_y, verbose=0)
    print('Test Accuracy: {}'.format(acc*100))
    results[mode] = [acc*100]

print()
print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['split'][800:1000] = 'test'


train_x size:  800
train_y size:  800
test_x size:  200
test_y size:  200
mode:  binary
Epoch 1/10
16/16 - 8s - loss: 1.3919 - accuracy: 0.4850 - 8s/epoch - 528ms/step
Epoch 2/10
16/16 - 8s - loss: 0.9395 - accuracy: 0.6087 - 8s/epoch - 471ms/step
Epoch 3/10
16/16 - 7s - loss: 0.6210 - accuracy: 0.8050 - 7s/epoch - 422ms/step
Epoch 4/10
16/16 - 8s - loss: 0.4098 - accuracy: 0.8950 - 8s/epoch - 495ms/step
Epoch 5/10
16/16 - 11s - loss: 0.2621 - accuracy: 0.9538 - 11s/epoch - 673ms/step
Epoch 6/10
16/16 - 9s - loss: 0.1737 - accuracy: 0.9750 - 9s/epoch - 585ms/step
Epoch 7/10
16/16 - 9s - loss: 0.1238 - accuracy: 0.9887 - 9s/epoch - 586ms/step
Epoch 8/10
16/16 - 8s - loss: 0.0886 - accuracy: 0.9925 - 8s/epoch - 496ms/step
Epoch 9/10
16/16 - 7s - loss: 0.0669 - accuracy: 0.9962 - 7s/epoch - 458ms/step
Epoch 10/10
16/16 - 8s - loss: 0.0509 - accuracy: 0.9975 - 8s/epoch - 480ms/step
Test Accuracy: 43.50000023841858
mode:  count
Epoch 1/10
16/16 - 19s - loss: 1.4122 - accuracy: 0.4762 - 19s/