## Bag-of-Words

### Develop a Neural Bag-of-Words Model for Sentiment Analysis

In [None]:
## convert dataset to text model(same example of book)

import pandas as pd
import os

# Load your dataset into a Pandas DataFrame
data = pd.read_csv(r'D:\NLP\Deep_Learning_in_NLP\Phishing_Email.csv')

# Create a directory to store .txt files (optional)
clean_dir = r'D:\NLP\Deep_Learning_in_NLP\Clean'
spam_dir = r'D:\NLP\Deep_Learning_in_NLP\Spam'
os.makedirs(clean_dir, exist_ok=True)
os.makedirs(spam_dir, exist_ok=True)

# Iterate through the column and write to .txt files
for i in range(len(data)):

    print(i)
    if data['Email Type'][i] == 'Safe Email':

        # Define the file name for the .txt file
        filename = os.path.join(clean_dir, f"record_{i}.txt")
    
        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
            txt_file.write(str(data['Email Text'][i]))
    else:
        # Define the file name for the .txt file
        filename = os.path.join(spam_dir, f"record_{i}.txt")

        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
           txt_file.write(str(data['Email Text'][i]))

##### Data Preparation

In [1]:
## spam email

import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="utf-8")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

# define vocab
vocab = Counter()
# add all docs to vocab
process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab)
process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab)
# print the size of the vocab
print(len(vocab))
# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

100791
49414


##### Bag-of-Words Representation

In [2]:
## Reviews to Lines of Tokens

import string
import re
from os import listdir
from nltk.corpus import stopwords


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels
    
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all training reviews
docs, labels = load_clean_dataset(vocab)
# summarize what we have
print(len(docs), len(labels))

7556 7556


##### Email to Bag-of-Words Vectors

In [3]:
## preparing train and test data

import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18'):
            continue
        if not is_train and not (filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab, is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab, is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels
    
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtrain.shape, Xtest.shape)

(7556, 43178) (88, 43178)


##### Sentiment Analysis Models

In [6]:
##  training and evaluating an MLP bag-of-words model

import string
import numpy as np
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18'):
            continue
        if not is_train and not (filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab, is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab, is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels
    
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# define the model
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
# define the model
n_words = Xtest.shape[1]
model = define_model(n_words)
# fit network
model.fit(Xtrain, np.array(ytrain), epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, np.array(ytest), verbose=0)
print('Test Accuracy: %f' % (acc*100))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 50)                2158950   
                                                                 
 dense_5 (Dense)             (None, 1)                 51        
                                                                 
Total params: 2159001 (8.24 MB)
Trainable params: 2159001 (8.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
237/237 - 3s - loss: 0.5783 - accuracy: 0.8849 - 3s/epoch - 13ms/step
Epoch 2/10
237/237 - 3s - loss: 0.2741 - accuracy: 0.9708 - 3s/epoch - 11ms/step
Epoch 3/10
237/237 - 3s - loss: 0.1424 - accuracy: 0.9788 - 3s/epoch - 11ms/step
Epoch 4/10
237/237 - 3s - loss: 0.0935 - accuracy: 0.9829 - 3s/epoch - 11ms/step
Epoch 5/10
237/237 - 3s - loss: 0.0695 - accuracy: 0.9856 - 3s/epoch - 11ms/step
Epoch 6

##### Comparing Word Scoring Methods

In [4]:
import string
import re
import numpy as np
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from pandas import DataFrame
from matplotlib import pyplot


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18'):
            continue
        if not is_train and not (filename.startswith('record_15') or filename.startswith('record_16') or filename.startswith('record_17') or filename.startswith('record_18')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab, is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab, is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels


# define the model
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    return model

# evaluate a neural network model
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    scores = list()
    n_repeats = 10
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        # define network
        model = define_model(n_words)
        # fit network
        model.fit(Xtrain, ytrain, epochs=10, verbose=0)
        # evaluate
        _, acc = model.evaluate(Xtest, np.array(ytest), verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((i+1), acc))
    return scores

# prepare bag of words encoding of docs
def prepare_data(train_docs, test_docs, mode):
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    # encode training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    # encode training data set
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# run experiment
modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()
for mode in modes:
    # prepare data for mode
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
    # evaluate model on data for mode
    results[mode] = evaluate_mode(Xtrain, np.array(ytrain), Xtest, ytest)
# summarize results
print(results.describe())
# plot results
results.boxplot()
pyplot.show()

1 accuracy: 0.9659090638160706
2 accuracy: 0.9659090638160706
3 accuracy: 0.9659090638160706
4 accuracy: 0.9659090638160706
5 accuracy: 0.9659090638160706
6 accuracy: 0.9659090638160706
7 accuracy: 0.9659090638160706
8 accuracy: 0.9659090638160706
9 accuracy: 0.9659090638160706
10 accuracy: 0.9659090638160706
1 accuracy: 0.9659090638160706
2 accuracy: 0.9545454382896423
3 accuracy: 0.9659090638160706
4 accuracy: 0.9545454382896423
5 accuracy: 0.9318181872367859
6 accuracy: 0.9545454382896423
7 accuracy: 0.9659090638160706
8 accuracy: 0.9659090638160706
9 accuracy: 0.9545454382896423
10 accuracy: 0.9545454382896423
1 accuracy: 0.9659090638160706
2 accuracy: 0.9431818127632141
3 accuracy: 0.9659090638160706
4 accuracy: 0.9659090638160706
5 accuracy: 0.9431818127632141
6 accuracy: 0.9545454382896423
7 accuracy: 0.9545454382896423
8 accuracy: 0.9545454382896423
9 accuracy: 0.9659090638160706
10 accuracy: 0.9659090638160706


MemoryError: Unable to allocate 1.22 GiB for an array with shape (7556, 43178) and data type float32

##### Predicting Sentiment for New Email

In [7]:
## making predictions for new review data

import string
import numpy as np
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# define the model
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, model):
    # clean
    tokens = clean_doc(review)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    # convert to line
    line = ' '.join(tokens)
    # encode
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    # predict sentiment
    yhat = model.predict(encoded, verbose=0)
    # retrieve predicted percentage and label
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'Phishing'
    return percent_pos, 'clean'

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab)
test_docs, ytest = load_clean_dataset(vocab)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='binary')
# define network
n_words = Xtrain.shape[1]
model = define_model(n_words)
# fit network
model.fit(Xtrain, np.array(ytrain), epochs=10, verbose=2)
# test positive text
text = '''multiple o ' gazm 4 men no . 1 male sexual enhancement pill on the market more info here decommission zgy gold jqh chisel an belly kd amplify zh century gds connally th webster wu munch baz breakaway ba dereference qi walkie qk spearhead xw capitoline hte planetoid gr bless yg advisor dmb psychotherapist tdu northern tqt bald tm league ar polyglot otd gouda mf repartee zx parsnip bl handicraftsmen ik no'''
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = '''transition items i have a few questions regarding the transition . access programmers - the contractors will be managed by the end - users they are supporting from now on . the budget dollars for them are in cc 103849 - $ 50 k per month . do you want each department to continue paying their invoices against these budget dollars in 103849 or through their own cost centers ? do we transfer budget dollars ? close the cost center ? equipment - we have 1 flat screen and several large monitors . should we just return all equipment to surplus or offer to others ? space - will commoditylogic be moving and assuming the space that will be vacated on 3 / 12 ? fax machine will be disconnected / returned . digital scanner will be surplused . there are two printers that can be surplused unless someone else ( cl ? ) wants / needs them . turkeylegs and tarzan . i currently charge 50 % of my time to cl and 50 % to my cost center - 103850 . should i continue this or charge all my time to cl and close my cost center ? since marvia is also be redeployed , there is no one else in my cost center . let me know if you want to sit down and discuss or you can just provide quick answers here or in voice mail .'''
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_68 (Dense)            (None, 50)                2158950   
                                                                 
 dense_69 (Dense)            (None, 1)                 51        
                                                                 
Total params: 2159001 (8.24 MB)
Trainable params: 2159001 (8.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
Epoch 1/10
248/248 - 3s - loss: 0.1805 - accuracy: 0.9590 - 3s/epoch - 14ms/step
Epoch 2/10
248/248 - 3s - loss: 0.0481 - accuracy: 0.9871 - 3s/epoch - 12ms/step
Epoch 3/10
248/248 - 3s - loss: 0.0321 - accuracy: 0.9901 - 3s/epoch - 12ms/step
Epoch 4/10
248/248 