In [1]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Activation
from keras.layers.embeddings import Embedding
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
np.random.seed(45)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('./bible/bible_data_set.csv')
yt =  df.iloc[:,1]
yt1 = df.iloc[:,4]

In [3]:
data = pd.concat ([yt1, yt], axis =1)

In [4]:
OT_books = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges',
            'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles',
            'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes',
            'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
            'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk',
            'Zephaniah', 'Haggai',    'Zechariah',    'Malachi']

In [5]:
NT_books = ['Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians',
            '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians',
            '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude',
            'Revelation']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['book'],
                                                    stratify=data['book'], 
                                                    test_size=0.3)

In [7]:
y_train_label = y_train.map(lambda x : 1 if x in OT_books else 2)
y_test_label = y_test.map(lambda x : 1 if x in OT_books else 2)

In [8]:
def clean_text(text):
    translator = str.maketrans('','', string.punctuation)
    text = text.translate(translator)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
        # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [9]:
#data['text'] = data['text'].map(lambda x: clean_text(x))
X_train_clean = X_train.map(lambda x: clean_text(x))
X_test_clean = X_test.map(lambda x: clean_text(x))

In [10]:
vocabulary_size = 20000
tokenizer_train = Tokenizer(num_words= vocabulary_size)
tokenizer_train.fit_on_texts(X_train_clean)
sequences_train = tokenizer_train.texts_to_sequences(X_train_clean)
outcome_train = pad_sequences(sequences_train, maxlen=100)
tokenizer_test = Tokenizer(num_words= vocabulary_size)
tokenizer_test.fit_on_texts(X_test_clean)
sequences_test = tokenizer_test.texts_to_sequences(X_test_clean)
outcome_test = pad_sequences(sequences_test, maxlen=100)


In [11]:
model_lstm = Sequential()
model_lstm.add(Embedding(20000, 100, input_length=100))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
model_lstm.fit(outcome_train, np.array(y_train_label), validation_split=0.4, epochs=3)

Train on 13062 samples, validate on 8709 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x123e6c710>

In [13]:
scores = model_lstm.evaluate(outcome_test, np.array(y_test_label), verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 74.42%


In [14]:
word_list = []
for word, i in tokenizer_train.word_index.items():
    word_list.append(word)

In [15]:
type(word_list)
light = word_list.index('light')
dark = word_list.index('dark')

In [16]:
lstm_embds = model_lstm.layers[0].get_weights()[0]

In [17]:
lstm_tsne_embds = TSNE(n_components=2).fit_transform(lstm_embds)


In [19]:
from sklearn.metrics.pairwise import cosine_distances

In [24]:
x =lstm_tsne_embds[light,].reshape(1,-1)
y = lstm_tsne_embds[dark,].reshape(1,-1)
print(cosine_distances(x,y))

[[0.12725514]]
