<a href="https://colab.research.google.com/github/gorogoro-uk/NLP-project/blob/master/Sherlock_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sherlock Holmes Text prediction project

In [2]:
# mount google drive
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# import code
import os
from pathlib import Path
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras import regularizers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# define working directory
os.chdir('./drive/My Drive/sherlock')
print(f"listdir:    {os.listdir(os.getcwd())}")
STORY_FILE = './sh_short.txt'
print(f"STORY_FILE: {STORY_FILE}")

listdir:    ['sh_short.txt']
STORY_FILE: ./sh_short.txt


In [6]:
# import story text file
with open(STORY_FILE) as reader:
    raw_data = reader.read()   # raw_data is very long string

print(f"raw_data type:    {type(raw_data)}")
print(f"raw_data length:  {len(raw_data)}")
print(raw_data[:300])

raw_data type:    <class 'str'>
raw_data length:  46514

I. A SCANDAL IN BOHEMIA.
I.
To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particular


In [7]:
# remove chapter numbers [I. II. III. etc]
tx1 = raw_data.replace('II.','')
tx1 = tx1.replace('III.','')
tx1 = tx1.replace('IV.','')
tx1 = tx1.replace('V.','')
tx1 = tx1.replace('VI.','')
tx1 = tx1.replace('VII.','')
tx1 = tx1.replace('VIII.','')
tx1 = tx1.replace('IX.','')
tx1 = tx1.replace(' X.',' ')
tx1 = tx1.replace('XI.','')
tx1 = tx1.replace('XII.','')
tx1 = tx1.replace('\nI.','\n')
print(f"tx1:\n{tx1[:300]}")

# replace end of line with space: out=long string
# keep sentence ending punctuation [.?!] to allow sentence splitting below
# include other angled quote marks found in the text [”“‘’]
print(f"punctuation:  {string.punctuation}")
punc_string = "\"#$%&'()*+,-/:;<=>@[\]^_`{|}~'”“‘’—"
print(f"punc string:  {punc_string}")
tt = tx1.maketrans('', '', punc_string)
tx2 = tx1.translate(tt)
tx2 = tx2.replace('\n',' ')
print(f"tx2:\n{tx1[:300]}")

# split into sentences: out=list of strings, each one a sentence
tx3 = nltk.tokenize.sent_tokenize(tx2)
print(f"tx3:\n{tx3[:300]}")
print(f"type:    {type(tx3)}")
print(f"length:  {len(tx3)}")

# remove sentence ending punctuation [.?!]
# make lowercase
# remove white space at start/end
tx4 = [s.translate(s.maketrans('','','.?!')) for s in tx3]
tx4 = [s.lower() for s in tx4]
tx4 = [s.strip() for s in tx4]
print(f"tx4:\n{tx4[:300]}")

tx1:

 A SCANDAL IN BOHEMIA.

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,

punctuation:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
punc string:  "#$%&'()*+,-/:;<=>@[\]^_`{|}~'”“‘’—
tx2:

 A SCANDAL IN BOHEMIA.

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,

tx3:
['  A SCANDAL IN BOHEMIA.', 'To Sherlock Holmes she is always the woman.', 'I have seldom heard him mention her under any other name.', 'In his eyes she eclipses and predominates the whole of her sex.', 'It was not that he felt any emotion akin to love for Irene Adler.', 'All emotions 

In [8]:
# remove stopwords
# keep all stopwords (may activate in future)
tx5 = tx4

In [9]:
# tokenize words, create word_index
tokenizer = Tokenizer(oov_token = '<oov>')
tokenizer.fit_on_texts(tx5)
total_words = len(tokenizer.word_index)
print(f"Total words: {total_words}")
print(f"tokenized text: {tokenizer.get_config()}")

# create numer sequences from word sentences
tx6 = tokenizer.texts_to_sequences(tx5)
print(f"tx6, tokenized sentences:\n{tx6[:300]}")

Total words: 1951
tokenized text: {'num_words': None, 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 'lower': True, 'split': ' ', 'char_level': False, 'oov_token': '<oov>', 'document_count': 669, 'word_counts': '{"a": 225, "scandal": 4, "in": 156, "bohemia": 8, "to": 245, "sherlock": 11, "holmes": 47, "she": 71, "is": 86, "always": 5, "the": 446, "woman": 11, "i": 259, "have": 62, "seldom": 3, "heard": 8, "him": 24, "mention": 1, "her": 48, "under": 8, "any": 4, "other": 12, "name": 6, "his": 105, "eyes": 9, "eclipses": 1, "and": 255, "predominates": 1, "whole": 4, "of": 236, "sex": 1, "it": 153, "was": 124, "not": 60, "that": 103, "he": 109, "felt": 2, "emotion": 2, "akin": 2, "love": 5, "for": 46, "irene": 13, "adler": 13, "all": 28, "emotions": 1, "one": 27, "particularly": 2, "were": 26, "abhorrent": 1, "cold": 2, "precise": 1, "but": 53, "admirably": 2, "balanced": 1, "mind": 6, "take": 7, "most": 11, "perfect": 1, "reasoning": 3, "observing": 2, "machine": 1, "world": 1, "has

In [10]:
# create n-grams of increasing length from each sentence sequence
tx7 = []
for sent_seq in tx6:
   for i in range(1, len(sent_seq)):
        n_gram_seq = sent_seq[:i+1]
        tx7.append(n_gram_seq)

print(f"tx7, n-gram sequences:\n{tx7[:300]}")

tx7, n-gram sequences:
[[7, 243], [7, 243, 8], [7, 243, 8, 134], [5, 95], [5, 95, 28], [5, 95, 28, 17], [5, 95, 28, 17, 15], [5, 95, 28, 17, 15, 198], [5, 95, 28, 17, 15, 198, 2], [5, 95, 28, 17, 15, 198, 2, 96], [3, 19], [3, 19, 321], [3, 19, 321, 135], [3, 19, 321, 135, 57], [3, 19, 321, 135, 57, 767], [3, 19, 321, 135, 57, 767, 27], [3, 19, 321, 135, 57, 767, 27, 136], [3, 19, 321, 135, 57, 767, 27, 136, 244], [3, 19, 321, 135, 57, 767, 27, 136, 244, 90], [3, 19, 321, 135, 57, 767, 27, 136, 244, 90, 172], [8, 13], [8, 13, 119], [8, 13, 119, 17], [8, 13, 119, 17, 768], [8, 13, 119, 17, 768, 4], [8, 13, 119, 17, 768, 4, 769], [8, 13, 119, 17, 768, 4, 769, 2], [8, 13, 119, 17, 768, 4, 769, 2, 245], [8, 13, 119, 17, 768, 4, 769, 2, 245, 6], [8, 13, 119, 17, 768, 4, 769, 2, 245, 6, 27], [8, 13, 119, 17, 768, 4, 769, 2, 245, 6, 27, 770], [9, 11], [9, 11, 22], [9, 11, 22, 14], [9, 11, 22, 14, 12], [9, 11, 22, 14, 12, 459], [9, 11, 22, 14, 12, 459, 244], [9, 11, 22, 14, 12, 459, 244, 460], 

In [12]:
# pre-pad sequences to common size
# convert to numpy array
max_seq_len = max([len(x) for x in tx7])
print(max_seq_len)
tx8 = np.array(pad_sequences(tx7, maxlen=max_seq_len, padding='pre'))
print(f"tx8, padded sequences:\n{tx8[:300]}")
print(tx8.dtype)

79
tx8, padded sequences:
[[  0   0   0 ...   0   7 243]
 [  0   0   0 ...   7 243   8]
 [  0   0   0 ... 243   8 134]
 ...
 [  0   0   0 ... 830  42   2]
 [  0   0   0 ...  42   2 335]
 [  0   0   0 ...   2 335   6]]
int32


In [13]:
# create predictors and label
features = tx8[:,:-1]
print(f"features:\n{features.shape}\n{features[:5,:]}")
labels = np.expand_dims(tx8[:,-1], axis=1)
print(f"labels:\n{labels.shape}\n{labels}")

# label = ku.to_categorical(label, num_classes=total_words)    # one hot encoding

features:
(7853, 78)
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   7]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   7 243]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   7 243   8]
 [  0   0 

In [25]:
# prepare model
embedding_size = 100

sh_model = Sequential([
  Embedding(total_words, embedding_size, input_length=max_seq_len-1),                   # [batch, sentence(79), embedding(100)] 
  Bidirectional(LSTM(150, return_sequences = True)),                                    # [batch, 79, 300]
  Dropout(0.2),                                                                         # [batch, 79, 300]
  LSTM(100),                                                                            # [batch, 100]
  Dense(total_words/2, activation='relu', kernel_regularizer=regularizers.l2(0.01)),    # [batch, 975]
  Dense(total_words, activation='softmax')                                              # [batch, 1951]
])                                             

sh_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
