## Imports & Configs

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re # para mexer com regexp

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## Loading data


In [4]:
# function to download the book from gutenberg's project
import urllib.request as urllib2

def gutenberg_download(link):
  ### https://www.gutenberg.org/

  start_string = ' START OF THE PROJECT GUTENBERG EBOOK '
  end_string = ' END OF THE PROJECT GUTENBERG EBOOK '
  data = urllib2.urlopen(link) 
  text = (data.read()).decode('utf-8')
  
  cut_start = text.find(start_string)+len(start_string)
  cut_end = text.find(end_string)+len(end_string)

  text = text[cut_start:cut_end]
  return text

In [5]:
books = [
  {"Alice’s Adventures in Wonderland":'https://www.gutenberg.org/files/11/11-0.txt'},
  {'Pride and Prejudice by Jane Austen': 'https://www.gutenberg.org/files/1342/1342-0.txt'},
  {'Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley': 'https://www.gutenberg.org/files/84/84-0.txt'},
  {'The Adventures of Sherlock Holmes by Arthur Conan Doyle':'https://www.gutenberg.org/files/1661/1661-0.txt'},
  {"Robert's Rules of Order by Henry M. Robert": 'https://www.gutenberg.org/cache/epub/9097/pg9097.txt'}
]

In [6]:
df_books = pd.DataFrame(columns=['title', 'link', 'text'])

for i in books:
  item = list(i.items())[0]

  df_books = df_books.append({'title': item[0], 'link': item[1], 'text': gutenberg_download(item[1])}, ignore_index=True)

df_books

Unnamed: 0,title,link,text
0,Alice’s Adventures in Wonderland,https://www.gutenberg.org/files/11/11-0.txt,ALICE’S ADVENTURES IN WONDERLAND ***\r\n\r\n[I...
1,Pride and Prejudice by Jane Austen,https://www.gutenberg.org/files/1342/1342-0.txt,PRIDE AND PREJUDICE ***\r\n\r\n\r\n\r\n\r\nTHE...
2,"Frankenstein; Or, The Modern Prometheus by Mar...",https://www.gutenberg.org/files/84/84-0.txt,FRANKENSTEIN ***\r\n\r\n\r\n\r\n\r\nFrankenste...
3,The Adventures of Sherlock Holmes by Arthur Co...,https://www.gutenberg.org/files/1661/1661-0.txt,THE ADVENTURES OF SHERLOCK HOLMES ***\r\n\r\nc...
4,Robert's Rules of Order by Henry M. Robert,https://www.gutenberg.org/cache/epub/9097/pg90...,ROBERT’S RULES OF ORDER ***\r\n\r\n\r\n\r\n\r\...


In [7]:
df_books.text[0]



## Process Text

In [8]:
# tokenização

def tokenize(text):
  tokens = nltk.tokenize.word_tokenize(text)
  return tokens


In [9]:
# Remoção StopWords

def remove_stop_words(tokens):
  result = [i for i in tokens if not i in stop_words]
  result = [e for e in result if e.isalnum()]
  return result

In [10]:
# Stemming
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

def stemming(result):
  stemado = [stemmer.stem(i) for i in result]
  # fd = nltk.FreqDist(w.lower() for w in stemado if w not in stop_words)

  return stemado

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [11]:
# prepare pipeline

def prepare_my_data(df, columns):
  for column in columns:
    df[f'{column}_processed'] = df[column] \
      .apply(lambda x: tokenize(x))

    df[f'{column}_processed'] = df[f'{column}_processed'] \
      .apply(lambda x: remove_stop_words(x))

    df[f'{column}_processed'] = df[f'{column}_processed'] \
    .apply(lambda x: stemming(x))

    df[f'{column}_processed'] = df[f'{column}_processed'] \
      .apply(lambda x: ' '.join(x))

  return df

df = prepare_my_data(df_books, ['text'])

In [12]:
df.head(1)

Unnamed: 0,title,link,text,text_processed
0,Alice’s Adventures in Wonderland,https://www.gutenberg.org/files/11/11-0.txt,ALICE’S ADVENTURES IN WONDERLAND ***\r\n\r\n[I...,alic s adventur in wonderland illustration ali...


## Modeling

#### imports

In [35]:
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, LSTM, Embedding, Bidirectional
from keras.models import Sequential

from tensorflow.keras.optimizers import Adam

#### Creating a dictionary with all books

* since the size of documents are too long, wi'll be setting a vocubalary with top 1000 words in the entire dataset

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [36]:
# set the maximun words in dict. It will take the most frequent words

max_words = 1000

tokenizer = Tokenizer()

data = ''.join([i for i in df_books.text])
corpus = data.lower().split(' ')

# filtering top words since we want to train in various documnets
corpus = get_top_n_words(corpus, max_words)
corpus = [i[0] for i in corpus]

tokenizer.fit_on_texts(corpus)

word_index = tokenizer.word_index 
total_words = len(tokenizer.word_index) + 1

In [37]:
len(word_index)

998

#### global helper functions

In [38]:
# helper function to make sequences of words
max_sequence_len = 4

def make_setences(data, max_seq_len=100):
  corpus = data.lower().split('\n')
  
  input_sequences = []
  for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
      n_gram_sequence = token_list[:i+1]
      input_sequences.append(n_gram_sequence) 
  
  input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre'))

  xs = input_sequences[:, :-1]
  labels = input_sequences[:, -1]

  ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
  return xs, ys

In [39]:
# helper function to predict new texts
def predict_text():
  input_text = input().strip().lower()
  encoded_text = tokenizer.texts_to_sequences([input_text])[0]
  pad_encoded = pad_sequences([encoded_text], maxlen=max_sequence_len-1, truncating='pre')

  for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    pred_word = tokenizer.index_word[i]
    print("Next word suggestion:", pred_word)

In [40]:
import matplotlib.pyplot as plt

def plot_loss(history):
  history_dict = history.history
  history_dict.keys()

  acc = history_dict['accuracy']
  val_acc = history_dict['val_accuracy']
  loss = history_dict['loss']
  val_loss = history_dict['val_loss']

  epochs = range(1, len(acc) + 1)

  # "bo" is for "blue dot"
  plt.plot(epochs, loss, 'bo', label='Training loss')
  # b is for "solid blue line"
  plt.plot(epochs, val_loss, 'b', label='Validation loss')
  plt.title('Training and validation loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend()

  plt.show()


In [41]:

def plot_accuracy(history):
  history_dict = history.history
  history_dict.keys()

  acc = history_dict['accuracy']
  val_acc = history_dict['val_accuracy']
  loss = history_dict['loss']
  val_loss = history_dict['val_loss']
  
  epochs = range(1, len(acc) + 1)

  plt.plot(epochs, acc, 'bo', label='Training acc')
  plt.plot(epochs, val_acc, 'b', label='Validation acc')
  plt.title('Training and validation accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.legend()

  plt.show()


#### Base Model

In [45]:
# base model
model = Sequential([
    Embedding(total_words, 240, input_length=max_sequence_len-1),
    Bidirectional(LSTM(75)),
    Dense(total_words, activation='softmax'),
])
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 3, 240)            239760    
                                                                 
 bidirectional_2 (Bidirectio  (None, 150)              189600    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 999)               150849    
                                                                 
Total params: 580,209
Trainable params: 580,209
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


#### Training with Alice's Book Model

In [46]:
# making setences only for alices book
book = 0 

xs, ys = make_setences(df_books.text[book], max_sequence_len)
print(xs.shape, ys.shape)

(16638, 3) (16638, 999)


In [47]:
history = model.fit(xs, ys, epochs=30, verbose=1)
model.save('alice_model.h5')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [48]:
predict_text()

he is not
Next word suggestion: to
Next word suggestion: the
Next word suggestion: this


In [49]:
predict_text()


she is not
Next word suggestion: the
Next word suggestion: to
Next word suggestion: of


In [50]:
predict_text()

this is a good
Next word suggestion: thing
Next word suggestion: opportunity
Next word suggestion: character


In [51]:
predict_text()


today is the
Next word suggestion: same
Next word suggestion: thing
Next word suggestion: mock


In [54]:
predict_text()


let's take the 
Next word suggestion: and
Next word suggestion: of
Next word suggestion: queen


#### Adding Pride and Judice Book 

In [None]:
from tensorflow import keras
model = keras.models.load_model('/content/alice_model.h5')


In [52]:
# making setences only for alices book
book = 1 

xs, ys = make_setences(df_books.text[book], max_sequence_len)
print(xs.shape, ys.shape)

(80241, 3) (80241, 999)


In [53]:
history = model.fit(xs, ys, epochs=30, verbose=1)
model.save('alice__judit_model.h5')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [54]:
predict_text()


he is not
Next word suggestion: have
Next word suggestion: know
Next word suggestion: be


In [55]:
predict_text()


she is not
Next word suggestion: know
Next word suggestion: have
Next word suggestion: listen


In [57]:
predict_text()


let me take
Next word suggestion: it
Next word suggestion: never
Next word suggestion: the


In [None]:
predict_text()


whats next
Next word suggestion: morning
Next word suggestion: day
Next word suggestion: to


In [69]:
model.save('alice_judit_model.h5')

#### Adding Frankestein's Book

In [16]:
from tensorflow import keras
model = keras.models.load_model('/content/alice_judit_model.h5')


In [58]:
# making setences only for alices book
book = 2

xs, ys = make_setences(df_books.text[book], max_sequence_len)
print(xs.shape, ys.shape)

(44862, 3) (44862, 999)


In [59]:
history = model.fit(xs, ys, epochs=30, verbose=1)
model.save('alice_judit_frankestein_model.h5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [60]:
predict_text()


i love my
Next word suggestion: father
Next word suggestion: heart
Next word suggestion: and


In [62]:
predict_text()


she is
Next word suggestion: to
Next word suggestion: not
Next word suggestion: the


In [64]:
predict_text()


he is
Next word suggestion: to
Next word suggestion: not
Next word suggestion: the


In [65]:
predict_text()


today is good
Next word suggestion: held
Next word suggestion: mean
Next word suggestion: read


#### Adding The Adventures of Sherlock Holmes Book

In [66]:
# making setences only for alices book
book = 3

xs, ys = make_setences(df_books.text[book], max_sequence_len)
print(xs.shape, ys.shape)

(64891, 3) (64891, 999)


In [67]:
history = model.fit(xs, ys, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [68]:
predict_text()


i love my
Next word suggestion: wife
Next word suggestion: father
Next word suggestion: surprise


In [69]:
predict_text()


she is
Next word suggestion: i
Next word suggestion: the
Next word suggestion: to


In [70]:
predict_text()


he is
Next word suggestion: to
Next word suggestion: i
Next word suggestion: and


In [71]:
predict_text()


he is not
Next word suggestion: been
Next word suggestion: be
Next word suggestion: know


In [72]:
predict_text()


she is not
Next word suggestion: easily
Next word suggestion: and
Next word suggestion: the


#### Adding Robert's Rules Book

In [73]:
# making setences only for alices book
book = 4

xs, ys = make_setences(df_books.text[book], max_sequence_len)
print(xs.shape, ys.shape)

(24472, 3) (24472, 999)


In [74]:
history = model.fit(xs, ys, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [75]:
predict_text()


i love my
Next word suggestion: order
Next word suggestion: point
Next word suggestion: find


In [76]:
predict_text()


she sad to
Next word suggestion: the
Next word suggestion: reconsider
Next word suggestion: adjourn


In [77]:
predict_text()


he sad to 
Next word suggestion: the
Next word suggestion: be
Next word suggestion: adjourn


In [79]:
predict_text()


my name is
Next word suggestion: put
Next word suggestion: etc
Next word suggestion: before


#### Fitting one more time with all books

In [80]:
# making setences only for alices book
xs, ys = make_setences(''. join([i for i in df_books.text]), max_sequence_len)
print(xs.shape, ys.shape)

(231107, 3) (231107, 999)


In [81]:
history = model.fit(xs, ys, epochs=30, verbose=1)
model.save('all_books_final.h5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
 926/7223 [==>...........................] - ETA: 39s - loss: 4.8317 - accuracy: 0.1268

KeyboardInterrupt: ignored

In [82]:
predict_text()


i love my
Next word suggestion: friend
Next word suggestion: dear
Next word suggestion: and


In [83]:
predict_text()


she sad to
Next word suggestion: the
Next word suggestion: be
Next word suggestion: her


In [84]:
predict_text()


he sad to
Next word suggestion: the
Next word suggestion: be
Next word suggestion: her


In [85]:
predict_text()


i'm good in
Next word suggestion: the
Next word suggestion: my
Next word suggestion: his


#### trying a simple model

In [86]:
from keras.layers import Dense, LSTM, Embedding, Bidirectional
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam

model_1 = Sequential([
    Embedding(total_words, 32, input_length=max_sequence_len-1),
    LSTM(16),
    Dense(total_words, activation='softmax'),
])
adam = Adam(lr=0.01)
model_1.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 3, 32)             31968     
                                                                 
 lstm_3 (LSTM)               (None, 16)                3136      
                                                                 
 dense_3 (Dense)             (None, 999)               16983     
                                                                 
Total params: 52,087
Trainable params: 52,087
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [87]:
history_1 = model_1.fit(xs, ys, epochs=30, verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30

KeyboardInterrupt: ignored

In [88]:
# helper function to predict new texts
def predict_text_1():
  input_text = input().strip().lower()
  encoded_text = tokenizer.texts_to_sequences([input_text])[0]
  pad_encoded = pad_sequences([encoded_text], maxlen=max_sequence_len-1, truncating='pre')

  for i in (model_1.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    pred_word = tokenizer.index_word[i]
    print("Next word suggestion:", pred_word)

predict_text_1()

she is 
Next word suggestion: to
Next word suggestion: in
Next word suggestion: not


In [89]:
predict_text_1()


he is
Next word suggestion: to
Next word suggestion: in
Next word suggestion: not


In [90]:
predict_text_1()


tomorow wi'll be
Next word suggestion: to
Next word suggestion: in
Next word suggestion: the


In [91]:
predict_text_1()


my friend have
Next word suggestion: been
Next word suggestion: the
Next word suggestion: to


#### Trying a roubust model

In [92]:
from keras.layers import Dense, LSTM, Embedding, Bidirectional
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam

model_2 = Sequential([
    Embedding(total_words, 240, input_length=max_sequence_len-1),
    Bidirectional(LSTM(150)),
    Dense(300, activation='relu'),
    Dense(total_words, activation='softmax'),
])
adam = Adam(lr=0.01)
model_2.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 3, 240)            239760    
                                                                 
 bidirectional_3 (Bidirectio  (None, 300)              469200    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 300)               90300     
                                                                 
 dense_5 (Dense)             (None, 999)               300699    
                                                                 
Total params: 1,099,959
Trainable params: 1,099,959
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [93]:
history_2 = model_2.fit(xs, ys, epochs=30, verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Conclusions:

* small books have a better accuracy than larger books
* memory was a problem, so i have to limit the maximum words in dictionary
* the roubust model is worse than the simplest model