<a href="https://colab.research.google.com/github/chrismoroney/TensorFlow-practice/blob/main/movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus used by the tokenization and padding examples that follow.
sentences = [
    'Today is a fantastic day.',
    'Today is a horrible day.',
    'Be a good person.',
    'I want to know what it is like to be a new person.'
]

In [None]:
# Fit a tokenizer on the toy corpus and show the resulting word -> index map.
# "<OOV>" is registered as the out-of-vocabulary token (it takes index 1 in the
# printed index).
tokenizer = Tokenizer(num_words = 100, oov_token = "<OOV>")
tokenizer.fit_on_texts(sentences)
word_idx = tokenizer.word_index
print(word_idx)

{'<OOV>': 1, 'a': 2, 'is': 3, 'today': 4, 'day': 5, 'be': 6, 'person': 7, 'to': 8, 'fantastic': 9, 'horrible': 10, 'good': 11, 'i': 12, 'want': 13, 'know': 14, 'what': 15, 'it': 16, 'like': 17, 'new': 18}


In [None]:
# Encode every toy sentence as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Show the integer-encoded sentences (one inner list per sentence).
print(sequences)

[[4, 3, 2, 9, 5], [4, 3, 2, 10, 5], [6, 2, 11, 7], [12, 13, 8, 14, 15, 16, 3, 17, 8, 6, 2, 18, 7]]


In [None]:
# Unseen sentence: "Will" does not occur in the toy corpus the tokenizer was fitted on.
test_data = ["Will today be a good day?"]

In [None]:
# Encode the unseen sentence with the tokenizer fitted on the toy corpus.
test_sequences = tokenizer.texts_to_sequences(test_data)

In [None]:
# The unknown first word is encoded as 1, the index of the "<OOV>" token.
print(test_sequences)

[[1, 4, 6, 2, 11, 5]]


<h1> Padding </h1>
Same sentences as above, but the encoded sequences are now padded (and, where needed, truncated) so that they all have the same length.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# With default arguments the zeros are added in front and every row is as long
# as the longest sequence (13 tokens in the printed result).
padded_output = pad_sequences(sequences)
print(padded_output)

[[ 0  0  0  0  0  0  0  0  4  3  2  9  5]
 [ 0  0  0  0  0  0  0  0  4  3  2 10  5]
 [ 0  0  0  0  0  0  0  0  0  6  2 11  7]
 [12 13  8 14 15 16  3 17  8  6  2 18  7]]


In [None]:
# Pad at the end and cap the length at 8. Truncation is left at its default,
# which keeps the last 8 tokens of the long sentence (see the final printed row).
padded_output = pad_sequences(sequences, padding = 'post', maxlen = 8)
print(padded_output)

[[ 4  3  2  9  5  0  0  0]
 [ 4  3  2 10  5  0  0  0]
 [ 6  2 11  7  0  0  0  0]
 [16  3 17  8  6  2 18  7]]


In [None]:
# Pad at the end, cap the length at 6, and truncate at the end as well, so the
# first 6 tokens of the long sentence are kept.
padded_output = pad_sequences(sequences, padding = 'post', maxlen = 6, truncating='post')
print(padded_output)

[[ 4  3  2  9  5  0]
 [ 4  3  2 10  5  0]
 [ 6  2 11  7  0  0]
 [12 13  8 14 15 16]]


<h1> Stopwords and cleaning up text </h1>

In [None]:
from bs4 import BeautifulSoup

# Run a sentence through BeautifulSoup and take back its plain text.
# The parser is named explicitly: without it bs4 picks whichever parser is
# installed (so results can differ between machines) and emits a
# GuessedAtParserWarning.
soup = BeautifulSoup("over the rainbow and into the woods.", "html.parser")
sentence = soup.get_text()
print(sentence)

over the rainbow and into the woods.


In [None]:
# Stop words to drop from the text, kept as a list in the original order.
# Entries such as "hed" or "theyll" look like contractions with the apostrophe
# already removed, which matches words after punctuation stripping.
stopwords = (
    "a about above after again against all am an and any are as at "
    "be because been before being below between both but by could did do "
    "does doing down during each few for from further had has have having "
    "he hed hes her here heres hers herself him himself his how "
    "hows i id ill im ive if in into is it its itself "
    "lets me more most my myself nor of on once only or other ought "
    "our ours ourselves out over own same she shed shell shes should "
    "so some such than that thats the their theirs them themselves then "
    "there theres these they theyd theyll theyre theyve this those through "
    "to too under until up very was we wed well were weve were "
    "what whats when whens where wheres which while who whos whom why "
    "whys with would you youd youll youre youve your yours yourself "
    "yourselves"
).split()

In [None]:
# Drop stop words from `sentence`. Every kept word is followed by one space,
# so the result ends with a trailing space. The filtered text is also added
# to the toy corpus.
words = sentence.split()
kept = []
for word in words:
  if word in stopwords:
    continue
  kept.append(word + " ")
filtered_sentence = "".join(kept)
sentences.append(filtered_sentence)

In [None]:
# Compare the original sentence with its stop-word-free version.
print(sentence)
print(filtered_sentence)

over the rainbow and into the woods.
rainbow woods. 


In [None]:
# Remove punctuation, then drop stop words.
import string

sentence = "Hey there! How are you doing?? Would you like a drink?"
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)
words = sentence.split()
kept = []
for word in words:
  word = word.translate(table)
  # The membership test is case-sensitive, so capitalised words such as
  # "How" and "Would" are kept even though "how" and "would" are stop words.
  if word in stopwords:
    continue
  kept.append(word + " ")
filtered_sentence = "".join(kept)
sentences.append(filtered_sentence)

In [None]:
# Original sentence first ...
print(sentence)
# ... then the version with punctuation stripped and stop words removed.
print(filtered_sentence)

Hey there! How are you doing?? Would you like a drink?
Hey How Would like drink 


<h1> Real dataset example </h1>

In [None]:
pip install tensorflow-datasets



In [None]:
import tensorflow_datasets as tfds

In [None]:
# Collect the raw IMDB training reviews and their labels.
imdb_sentences = []
imdb_labels = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))
for data in train_data:
  # 'text' is a bytes object: decode it. Calling str() on bytes yields the
  # literal "b'...'" repr, which puts the b-prefix, quotes and backslash
  # escapes into the text that gets tokenized.
  imdb_sentences.append(data['text'].decode('UTF-8'))
  imdb_labels.append(data['label'])


In [None]:
# Fit a new tokenizer on the raw IMDB reviews (no OOV token configured here)
# and encode the same reviews. This replaces the toy `tokenizer` and `sequences`.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [None]:
# Preview only the first 20 entries of the word index; printing the whole
# vocabulary floods the notebook with output.
print(dict(list(tokenizer.word_index.items())[:20]))



In [None]:
# Clean the IMDB training split: lower-case each review, strip HTML markup,
# strip punctuation from every word and drop stop words.
table = str.maketrans('', '', string.punctuation)
# Set membership is O(1); the stop-word list would otherwise be scanned once
# for every word of every review.
stopword_set = set(stopwords)

train_imdb_sentences = []
train_imdb_labels = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))

for item in train_data:
  sentence = item['text'].decode('UTF-8').lower()
  # Explicit parser: avoids bs4's GuessedAtParserWarning and keeps the result
  # independent of which optional parsers are installed.
  soup = BeautifulSoup(sentence, "html.parser")
  sentence = soup.get_text()
  words = sentence.split()
  filtered_sentence = ""
  for word in words:
    word = word.translate(table)
    if word not in stopword_set:
      # Each kept word is followed by a single space (trailing space included).
      filtered_sentence += word + " "
  train_imdb_sentences.append(filtered_sentence)
  train_imdb_labels.append(item['label'])


In [None]:
# Clean the IMDB test split with the same steps as the training split:
# lower-case, strip HTML markup, strip punctuation, drop stop words.
# `table` is the punctuation-deleting translation table built in an earlier cell.
test_imdb_sentences = []
test_imdb_labels = []
test_data = tfds.as_numpy(tfds.load('imdb_reviews', split="test"))

# Set membership is O(1); the stop-word list would otherwise be scanned once
# for every word of every review.
stopword_set = set(stopwords)

for item in test_data:
  sentence = item['text'].decode('UTF-8').lower()
  # Explicit parser: avoids bs4's GuessedAtParserWarning and keeps the result
  # independent of which optional parsers are installed.
  soup = BeautifulSoup(sentence, "html.parser")
  sentence = soup.get_text()
  words = sentence.split()
  filtered_sentence = ""
  for word in words:
    word = word.translate(table)
    if word not in stopword_set:
      # Each kept word is followed by a single space (trailing space included).
      filtered_sentence += word + " "
  test_imdb_sentences.append(filtered_sentence)
  test_imdb_labels.append(item['label'])


In [None]:
# Settings shared by the tokenizer, the padding calls and the model.
vocab_size = 25000  # passed to Tokenizer(num_words=...) and to the Embedding layer
max_length = 150  # maxlen handed to pad_sequences
trunc_type = 'post'  # truncating mode handed to pad_sequences
padding_type = 'post'  # padding mode handed to pad_sequences
oov_token = "<OOV>"  # out-of-vocabulary token handed to the Tokenizer

In [None]:
# Sanity-check the cleaned data, train split first and test split second:
# first review, its label, then the number of reviews and of labels.
for split_sentences, split_labels in (
    (train_imdb_sentences, train_imdb_labels),
    (test_imdb_sentences, test_imdb_labels),
):
  print(split_sentences[0])
  print(split_labels[0])
  print(len(split_sentences))
  print(len(split_labels))

absolutely terrible movie dont lured christopher walken michael ironside great actors must simply worst role history even great acting not redeem movies ridiculous storyline movie early nineties us propaganda piece pathetic scenes columbian rebels making cases revolutions maria conchita alonso appeared phony pseudolove affair walken nothing pathetic emotional plug movie devoid real meaning disappointed movies like ruining actors like christopher walkens good name barely sit 
0
25000
25000
films make careers george romero night living dead kevin smith clerks robert rodriguez el mariachi add list onur tukels absolutely amazing dingalingless flawless filmmaking assured professional aforementioned movies havent laughed hard since saw full monty even dont think laughed quite hard speak tukels talent considerable dingalingless chock full double entendres one sit copy script linebyline examination fully appreciate uh breadth width every shot beautifully composed clear sign surehanded director

In [None]:
# Re-create the tokenizer with the configured vocabulary size and OOV token,
# and fit it on the cleaned training reviews only (not on the test reviews).
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(train_imdb_sentences)

In [None]:
word_idx = tokenizer.word_index
# Preview only the first 20 entries of the word index; printing the whole
# vocabulary floods the notebook with output.
print(dict(list(word_idx.items())[:20]))



In [None]:
# Encode both splits with the tokenizer fitted on the training reviews.
training_sequences = tokenizer.texts_to_sequences(train_imdb_sentences)
testing_sequences = tokenizer.texts_to_sequences(test_imdb_sentences)

# Pad/truncate both splits to the same fixed width. Without maxlen each split
# is padded to the length of its own longest review, so train and test get
# different widths, and neither matches the max_length used when padding the
# sentences passed to model.predict later on.
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

In [None]:
import numpy as np

# Convert the padded sequences and the label lists to NumPy arrays; these four
# arrays are what model.fit receives as training and validation data.
training_padded = np.array(training_padded)
training_labels = np.array(train_imdb_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(test_imdb_labels)

In [None]:
# Binary sentiment classifier: embedding -> average over the sequence axis ->
# small L2-regularised dense layer with dropout -> sigmoid output.
layers = tf.keras.layers

model = tf.keras.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=12),
    layers.GlobalAveragePooling1D(),
    layers.Dense(
        units=24,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    ),
    layers.Dropout(rate=.25),
    layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with a small learning rate; the remaining arguments are
# spelled out explicitly.
adam = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)

model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
# summary is a method: it must be called. The bare attribute only displays
# "<bound method Model.summary ...>" instead of the layer table.
model.summary()

<bound method Model.summary of <keras.engine.sequential.Sequential object at 0x783ab1569030>>

In [None]:
# Train for a fixed number of epochs, using the test split as validation data.
num_epochs = 50
model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x783ab20a9f60>

In [None]:
# Hand-written positive review, wrapped in a list because the tokenizer takes a list of texts.
positive_sentence = ["The movie was amazing!"]

In [None]:
# Encode the positive review. "the" and "was" are in the stop-word list that
# was stripped from the training text, so they come out as 1 in the printed
# result (presumably the OOV index).
sequence = tokenizer.texts_to_sequences(positive_sentence)
print(sequence)

[[1, 2, 1, 354]]


In [None]:
# Encode, pad to max_length and score the positive review.
# Note: this reuses (and overwrites) the name `sequences` from earlier cells.
sequences = tokenizer.texts_to_sequences(positive_sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# [0][0] picks the single sigmoid output for the single input sentence.
print("Result:", model.predict(padded)[0][0])

Result: 0.70664716


In [None]:
# Hand-written negative review, wrapped in a list because the tokenizer takes a list of texts.
negative_sentence = ["The movie was horrible!"]

In [None]:
# Encode the negative review with the tokenizer fitted on the cleaned training reviews.
sequence = tokenizer.texts_to_sequences(negative_sentence)
print(sequence)

[[1, 2, 1, 394]]


In [None]:
# Encode, pad to max_length and score the negative review.
# The input must be `negative_sentence`. The previous `sentence` was a plain
# string left over from the preprocessing loops, so the tokenizer treated each
# character as a separate text and the score had nothing to do with the
# negative review.
sequences = tokenizer.texts_to_sequences(negative_sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# [0][0] picks the single sigmoid output for the single input sentence.
print("Result:", model.predict(padded)[0][0])

Result: 0.14515273
