# Natural Langauge Processing: Text Prediction

----

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from re import sub
from sklearn.model_selection import train_test_split

# Transforms strings into numerical tokens.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

----

### Preprocessing

In [2]:
# Getting the dataset and creating X,y
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
corpus = []

# Preprocessing the reviews.
for review in dataset.index:
    review = dataset['Review'][review].lower()
    review = sub('[^a-z]', ' ', review)

    review = review.split()    
    for index in range(1, len(review)):
        corpus.append(review[:index + 1])

----

### Text Tokenization

In [3]:
max_words = 500
oov_token = '<OOV>'
padding_type = 'pre'
embedding_dim = 8

# Creating the Word Index
tokenizer = Tokenizer(num_words = max_words, oov_token = oov_token)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

# Defining Input and Word Index Length
max_len = max(len(review) for review in corpus) - 1
total_words = len(word_index) + 1

# Creating X and y
corpus = tokenizer.texts_to_sequences(corpus)
X = [review[:-1] for review in corpus]
y = [review[-1:] for review in corpus]

X = pad_sequences(X, padding = padding_type, maxlen = max_len)
y = tf.keras.utils.to_categorical(y, num_classes = total_words)

----

### Creating the Model

In [4]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(total_words, 64, input_length = max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(total_words / 2, activation = 'relu'),
    tf.keras.layers.Dense(total_words, activation = 'sigmoid')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

----

### Viewing Model Details

In [5]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 64)            129536    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 1012)              130548    
_________________________________________________________________
dense_1 (Dense)              (None, 2024)              2050312   
Total params: 2,376,444
Trainable params: 2,376,444
Non-trainable params: 0
_________________________________________________________________


----

### Training the Neural Network & Testing

In [6]:
results =  model.fit(X, y, epochs = 50)

Train on 10040 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


----

### Predicting Text

In [7]:
seed_text = 'The selection on the menu was great and so were'
next_words = 2

for _ in range(next_words):
    tokenized_text = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([tokenized_text], maxlen = max_len, padding = padding_type)
    prediction = model.predict_classes(token_list)
    
    for word, index in word_index.items():
        if index == prediction:
            seed_text += (' ' + word)
            break
            
print(seed_text)

The selection on the menu was great and so were the prices
