# 10. Neural Bag-of-Words Model for Sentiment Analysis Classification
## 10.1. Data preparation
1. Separate the data into training and test sets

2. Load and clean the data: remove punctuation, non-alphabetic tokens, stop words, and single-character tokens

3. Define a vocabulary of preferred words

In [3]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import re
import random


## load docs
def load_doc(fn):
    """Read a text file and return its entire contents as one string.

    Args:
        fn: Path of the file to read (opened in text mode with the
            platform's default encoding).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(fn, 'r') as handle:
        return handle.read()

## tokenize docs
def clean_doc(doc):
    """Split a document into cleaned word tokens.

    The text is split on whitespace, punctuation characters are stripped
    from every token, and a token is kept only if it is purely alphabetic,
    is not an English stop word, and is longer than one character.

    Args:
        doc: Raw document text.

    Returns:
        List of surviving tokens in their original order.
    """
    # Build the stop-word set once per call; rebuilding it inside the
    # comprehension repeated the work for every single token.
    stop_words = set(stopwords.words('english'))
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    stripped = (re_punc.sub('', w) for w in doc.split())
    return [
        w for w in stripped
        if w.isalpha() and w not in stop_words and len(w) > 1
    ]


def doc_to_line(fn, vocab):
    """Turn one document file into a single space-separated line.

    The file is loaded with ``load_doc``, tokenised with ``clean_doc``, and
    only tokens that are members of ``vocab`` are kept.

    Args:
        fn: Path of the document to load.
        vocab: Container of allowed words (anything supporting ``in``).

    Returns:
        The retained tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(fn))
    return ' '.join(tok for tok in cleaned if tok in vocab)

## process corpus
def process_docs(directory, vocab, is_train):
    """Convert the documents of one directory into vocabulary-filtered lines.

    Files whose names start with ``'cv9'`` form the held-out split: they are
    skipped when ``is_train`` is truthy and are the only files kept when it
    is falsy.

    Args:
        directory: Directory containing the document files.
        vocab: Container of allowed words, forwarded to ``doc_to_line``.
        is_train: Truthy to select the training files, falsy for the rest.

    Returns:
        List with one cleaned line per selected file, ordered by file name.
    """
    # A list/tuple vocabulary makes every token lookup a linear scan;
    # convert it to a set once here so each document gets O(1) lookups.
    if isinstance(vocab, (list, tuple)):
        vocab = set(vocab)
    want_train = bool(is_train)
    lines = []
    # listdir() returns names in arbitrary order; sorting keeps the
    # resulting dataset order reproducible across machines and runs.
    for fn in sorted(listdir(directory)):
        is_held_out = fn.startswith('cv9')
        # Training wants the non-'cv9' files, testing wants the 'cv9' ones.
        if is_held_out == want_train:
            continue
        lines.append(doc_to_line(directory + '/' + fn, vocab))
    return lines

## create vocab
def create_vocab(corpus):
    """Tally token occurrences over a whole corpus.

    Args:
        corpus: Iterable whose items are each passed to ``Counter.update``
            (typically one token list per document).

    Returns:
        A ``Counter`` holding the accumulated counts.
    """
    word_counts = Counter()
    for doc_tokens in corpus:
        word_counts.update(doc_tokens)
    return word_counts


def save_file(fn, doc):
    """Write a string to a text file, replacing any existing contents.

    Args:
        fn: Destination path (opened in text mode, truncating the file).
        doc: Text to write.
    """
    # The context manager flushes and closes the handle even if write() raises.
    with open(fn, 'w') as handle:
        handle.write(doc)
    

def load_dataset(vocab, is_train):
    """Assemble documents and labels for one split of the review corpus.

    Positive reviews are read from ``txt_sentoken/pos`` and negative ones
    from ``txt_sentoken/neg`` via ``process_docs``.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.
        is_train: Split selector, forwarded to ``process_docs``.

    Returns:
        ``(docs, labels)``: negative documents followed by positive ones,
        with label 0 for each negative and 1 for each positive document.
    """
    positive_docs = process_docs('txt_sentoken/pos', vocab, is_train)
    negative_docs = process_docs('txt_sentoken/neg', vocab, is_train)
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels
    
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [4]:
# Load the vocabulary as a set. split() (rather than split('\n')) drops the
# empty entry left by a trailing newline and any stray '\r', and set
# membership keeps the per-token vocabulary check O(1).
vocab_text = load_doc('vocab.txt')
vocab = set(vocab_text.split())

# is_train=True selects the training files, False the held-out ones.
X_train, y_train = load_dataset(vocab, True)
X_test, y_test = load_dataset(vocab, False)

# Fit the tokenizer on the training texts only, then encode both splits
# with the same fitted tokenizer so their columns line up.
tokenizer = create_tokenizer(X_train)
Xtrain_vec = tokenizer.texts_to_matrix(X_train, mode='freq')
Xtest_vec = tokenizer.texts_to_matrix(X_test, mode='freq')
display(Xtrain_vec.shape, Xtest_vec.shape)

(1800, 26897)

(200, 26897)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model
import numpy as np

def defined_model(n_words):
    """Build and compile the bag-of-words sentiment classifier.

    The network is a 50-unit ReLU hidden layer followed by a single sigmoid
    output unit, compiled with Adam, binary cross-entropy loss and an
    accuracy metric. As a side effect the model summary is printed and an
    architecture diagram is written to ``neuralBoW.png``.

    Args:
        n_words: Length of each input vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Side effects kept in the original order: text summary, then diagram.
    network.summary()
    plot_model(network, to_file='neuralBoW.png', show_shapes=True)
    return network

# The input width is the number of columns in the encoded matrices; read it
# from the training matrix, which is what the model is fitted on.
n_words = Xtrain_vec.shape[1]
model = defined_model(n_words)

# load_dataset returns labels as plain Python lists. Convert them to NumPy
# arrays: some TensorFlow 2.x releases refuse a NumPy feature matrix paired
# with a list of ints ("Failed to find data adapter").
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

model.fit(Xtrain_vec, y_train_arr, epochs=5, verbose=2)

loss, acc = model.evaluate(Xtest_vec, y_test_arr, verbose=0)
print(f'Test acc= {acc * 100}%')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 50)                1344900   
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 51        
Total params: 1,344,951
Trainable params: 1,344,951
Non-trainable params: 0
_________________________________________________________________


ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject

In [11]:
import pickle


# file = open('Xtrain_vec.dat', 'wb')
# pickle.dump(Xtrain_vec, file)
# file.close()

# file = open('y_train.dat', 'wb')
# pickle.dump(y_train, file)
# file.close()

# file = open('Xtest_vec.dat', 'wb')
# pickle.dump(Xtest_vec, file)
# file.close()

# file = open('y_test.dat', 'wb')
# pickle.dump(y_test, file)
# file.close()