This is a simple proof of concept of how we can apply basic nltk techniques to perform sentiment analysis of The Message (MSG) translation of the bible leveraging tensorflow.

High level approach:

* Create an array of the most frequntly occuring ([lemmatized](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)) words from the training samples
* Create feature/label sets for positive and negative sentiment data by counting the number of popular words in each sample, from the array created above
* Using the above labelled features as inputs, train a 3 layer feedfoward neural network which will output an array containing probability percentages for True and False
* We save the model for later use and later run it on the Bible (MSG) saving the results in a sqlite database

Note, the basis for this comes largely from [Rachit Mishra's work](https://becominghuman.ai/deep-learning-using-tensorflow-and-nltk-analyzing-corpuss-sentiments-part-1-bec9d6c1051). I amended with the below:

* added a method to create the layers of the neural network
* saved the model after training for later re-use
* logged training results to be able to view in tensorboard
* ran the model on every verse from 3 books of the bible
    * Ecclesiastes
    * Proverbs
    * Psalms
* saved the results of the above predictions to a sqlite db


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import json
from collections import Counter

import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

lemmatizer = WordNetLemmatizer()
# max number of lines we want/need to create the frequency array
hm_lines = 10000000

# lexicon - find all words in pos and neg data sets
def create_lexicon(pos, neg):
    lexicon = []
    for file in [pos, neg]:
        with open(file, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:  # upto however many lines we're gonna read
                all_words = word_tokenize(l.lower())  # tokenizing words per line
                lexicon += list(all_words)

    # lemmatize all these words
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    # stemming them into legitimate words
    # Input Vector is going to be that input vector
    # ieally - LEXICON should be shortest possible so that
    # we can have a decent sized model in terms of 3 layers
    # 1000
    w_counts = Counter(lexicon)
    # this gives us a dictionary like elements
    # w_counts = {'the':52000, 'and',:22323} EXAMPLE
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
            # because we dont want super common words like 'the' 'and' 'or' etc. - NOT VALUABLE
    logger.debug('lexicon contains {0} words'.format(len(l2)))
    return l2
    # l2 is the final lexicon


def sample_handling(sample, lexicon, classification):
    featureset = []  # [1 0] pos sentiment [0 1] negative sentiment
    with open(sample, 'r') as f:
        # parse in each line
        contents = f.readlines()
        # loop through each word of each line
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
        logger.debug('featureset contains {0} features'.format(len(featureset)))

    return featureset


def create_feature_sets_and_labels(pos, neg, test_size=0.1):
    lexicon = create_lexicon(pos, neg)
    global_lexicon = lexicon
    features = []
    logger.debug('Creating feature set for positive')
    features += sample_handling('pos.txt', lexicon, [1, 0])
    logger.debug('Creating feature set for negative')
    features += sample_handling('neg.txt', lexicon, [0, 1])
    random.shuffle(features)
    logger.debug('features length is {}'.format(len(features)))
    # does tf.agrmax([output]) == tf.argmax)[expectations]) was the final question
    # this was the question earlier

    # want to shuffle bcause that's how NN Model works - it's going to be shifting the
    # weights for RNN model to work
    features = np.array(features)
    testing_size = int(test_size * len(features))

    # x is features, y is labels
    train_x = list(features[:, 0][:-testing_size])
    train_y = list(features[:, 1][:-testing_size])

    test_x = list(features[:, 0][:-testing_size:])
    test_y = list(features[:, 1][:-testing_size:])

    return train_x, train_y, test_x, test_y, lexicon


if __name__ == '__main__':
    # Create the train/test groups
    train_x, train_y, test_x, test_y, global_lexicon = create_feature_sets_and_labels('pos.txt', 'neg.txt')
    '''
    with open('sentiment_set.pickle', 'wb') as f:
        pickle.dump([train_x, train_y, test_x, test_y], f)
    '''

# now we want to run this through a deep neural network


DEBUG:__main__:foo
DEBUG:__main__:lexicon contains 423 words
DEBUG:__main__:Creating feature set for positive
DEBUG:__main__:featureset contains 5331 features
DEBUG:__main__:Creating feature set for negative
DEBUG:__main__:featureset contains 5331 features
DEBUG:__main__:features length is 10662


In [10]:
import tensorflow as tf
#from TF_own_data_model import create_feature_sets_and_labels
import numpy as np

from tensorflow.python.framework.ops import reset_default_graph
reset_default_graph()

# can load the data from the pickle
    # or you could just write it down
#train_x, train_y, test_x, test_y = create_feature_sets_and_labels('pos.txt', 'neg.txt')
    # 3 hidden layers is probably good enough
        # no. of classes = 2 (pos and neg)
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500
n_classes = 2

batch_size = 100  # can do batches of 100 features at a time
x = tf.placeholder('float', [None, len(train_x[0])])  # [None by 423]
y = tf.placeholder('float')

def linear(X, n_input, n_output, activation=None, scope=None):
    with tf.variable_scope(scope or "linear"):
        W = tf.get_variable(
            name='W',
            shape=[n_input, n_output],
            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
        b = tf.get_variable(
            name='b',
            shape=[n_output],
            initializer=tf.constant_initializer())
        h = tf.matmul(X, W) + b
        if activation is not None:
            h = activation(h)
        return h

def neural_network(data):
    # layer 1
    h1 = linear(data, len(train_x[0]), n_nodes_hl1, tf.nn.relu, scope='layer1')
    # layer 2
    h2 = linear(h1, n_nodes_hl1, n_nodes_hl2, tf.nn.relu, scope='layer2')
    # layer 3
    h3 = linear(h2, n_nodes_hl2, n_nodes_hl3, tf.nn.relu, scope='layer3')
    # output
    output = linear(h3, n_nodes_hl3, n_classes, None, scope='output')
    # See the names of any operations in the graph
    #print([op.name for op in tf.get_default_graph().get_operations()])
    return output

    # now all we have to do is explain to TF, what to do with this model
    # need to specify how we want to run data through that model

def train_neural_network(x):
    prediction = neural_network(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))

    # AdamOptimizer seems to automatically adjust the learning rate as opposed to
    # GradientDescent's fixed
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

    # Create a saver object to save our graph
    saver = tf.train.Saver()
    
    n_epochs = 10

    tf.summary.scalar("cost", cost)
    summary_op = tf.summary.merge_all()
    
    with tf.Session() as sess:
        
        writer = tf.summary.FileWriter('summary/log', graph=sess.graph)
        sess.run(tf.global_variables_initializer())  # initializes our variables. Session has now begun.

        for epoch in range(n_epochs):
            epoch_loss = 0  # we'll calculate the loss as we go

            i = 0
            while i < len(train_x):
                #we want to take batches(chunks); take a slice, then another size)
                start = i
                end = i+batch_size

                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c, summary = sess.run([optimizer, cost, summary_op], feed_dict={x: batch_x, y: batch_y})
                
                # write log
                writer.add_summary(summary, epoch * n_epochs + i)
                
                # Create a checkpoint in every iteration
                #saver.save(sess, 'model/model_iter', global_step=epoch)
                
                epoch_loss += c
                i+=batch_size

            print('Epoch', epoch, 'completed out of', n_epochs, 'loss:', epoch_loss)

        pred_op = tf.nn.softmax(prediction, name='pred_op')
        # Save the final model
        saver.save(sess, 'model/model_final')
        
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        tf.summary.scalar("accuracy", accuracy)
        
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))
        
        # Run classification against data
        feed_dict = {x: np.array([test_x[35]]).astype('float32')}
        sentiment = sess.run(tf.nn.softmax(prediction), feed_dict)
        print('sentiment is {}'.format(sentiment))

train_neural_network(x)


Epoch 0 completed out of 10 loss: 66.7704710364
Epoch 1 completed out of 10 loss: 44.6850358844
Epoch 2 completed out of 10 loss: 29.4173834622
Epoch 3 completed out of 10 loss: 18.2085752636
Epoch 4 completed out of 10 loss: 16.9207520336
Epoch 5 completed out of 10 loss: 21.1601841375
Epoch 6 completed out of 10 loss: 19.8161383942
Epoch 7 completed out of 10 loss: 12.5131532773
Epoch 8 completed out of 10 loss: 8.32716379315
Epoch 9 completed out of 10 loss: 6.76637831237
Accuracy: 0.967903
sentiment is [[  9.99436796e-01   5.63225010e-04]]


In [2]:
def sample_handling(phrase, lexicon=global_lexicon):
    featureset = []  # [1 0] pos sentiment [0 1] negative sentiment
    current_words = word_tokenize(phrase.lower())
    current_words = [lemmatizer.lemmatize(i) for i in current_words]
    features = np.zeros(len(lexicon))
    for word in current_words:
        if word.lower() in lexicon:
            index_value = lexicon.index(word.lower())
            # like the example discussed earlier
            features[index_value] += 1
    features = list(features)
    
    return features

In [12]:
def run_prediction(phrase):
    # Parse in MSG bible
    with open('MSG.json','r') as foo:
        msg = foo.read()
        msg = json.loads(msg)
    
    with tf.Session() as sess:
        # Create a saver object to save our graph
        saver = tf.train.import_meta_graph('model/model_final.meta')
        saver.restore(sess, tf.train.latest_checkpoint('model/'))
        #sess.run(tf.global_variables_initializer())  # initializes our variables. Session has now begun.

        # Get default graph (supply your custom graph if you have one)
        graph = tf.get_default_graph()
        
        pred_op = graph.get_tensor_by_name("pred_op:0")
        
        verses = []
        
        '''
        sqlite> .schema bible
        CREATE TABLE bible (book text, chapter int, verse int, sentiment int, pos real, neg real);
        '''

        import sqlite3
        conn = sqlite3.connect('bible.db')
        c = conn.cursor()
        
        books = ('Psalms',)
        
        for book in books:
            for chap in msg[book]:
                for verse in msg[book][chap].items():
                    verse_info = {'chapter': chap,
                                  'verse': verse[0],
                                  'content': verse[1]}

                    verse_info['content'] = np.array([sample_handling(verse_info['content'])]).astype('float32')
                    feed_dict = {x: verse_info['content']}
                    sentiment = sess.run(tf.nn.softmax(pred_op), feed_dict)
                    negpos = {0:'negative', 1:'positive'}
                    summary = (book,
                               int(verse_info['chapter']),
                               int(verse_info['verse']),
                               int(sess.run(tf.argmin(sentiment,1))),
                               sentiment.tolist()[0][0],
                               sentiment.tolist()[0][1]
                              )
                    print(summary)
                    c.execute('INSERT INTO bible VALUES (?,?,?,?,?,?)', summary)
        conn.commit()
        conn.close()
        
run_prediction('x')

INFO:tensorflow:Restoring parameters from model/model_final


INFO:tensorflow:Restoring parameters from model/model_final


('Psalms', 146, 6, 1, 0.5609954595565796, 0.43900448083877563)
('Psalms', 146, 9, 1, 0.7307742238044739, 0.26922574639320374)
('Psalms', 146, 8, 1, 0.7182794809341431, 0.28172045946121216)
('Psalms', 146, 7, 0, 0.28153279423713684, 0.7184671759605408)
('Psalms', 146, 1, 0, 0.4697975516319275, 0.5302023887634277)
('Psalms', 146, 3, 1, 0.7310504913330078, 0.2689494788646698)
('Psalms', 146, 10, 1, 0.7308939695358276, 0.26910606026649475)
('Psalms', 146, 4, 1, 0.7147825360298157, 0.28521740436553955)
('Psalms', 146, 2, 0, 0.2690059542655945, 0.7309939861297607)
('Psalms', 146, 5, 0, 0.2756005525588989, 0.7243994474411011)
('Psalms', 30, 9, 0, 0.2689414322376251, 0.7310585975646973)
('Psalms', 30, 11, 0, 0.26894405484199524, 0.7310559153556824)
('Psalms', 30, 7, 0, 0.2692694365978241, 0.7307305932044983)
('Psalms', 30, 12, 0, 0.2692525386810303, 0.7307474613189697)
('Psalms', 30, 2, 0, 0.2850539982318878, 0.714945912361145)
('Psalms', 30, 5, 1, 0.7158778309822083, 0.284122109413147)
('Psal