### IMDB movie reviews sentiment classifier using TensorFlow

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
vocab_size = 10000
embedding_dim = 16
max_length = 500
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 35000

In [3]:
reviews = pd.read_csv('IMDB Dataset.csv')
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
sentences = reviews['review']
labels = reviews['sentiment']

In [5]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

In [7]:
import nltk
nltk.download('punkt')
filtered_sentences = []
for s in sentences:
    word_tokens = word_tokenize(s) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentences.append(" ".join(filtered_sentence))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [8]:
len(filtered_sentences[0])

1338

In [9]:
len(filtered_sentences)

50000

In [10]:
labels

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [11]:
def sentiment(word):
    if word == 'positive':
        return 1
    else:
        return 0
labels = labels.apply(sentiment)

In [12]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [13]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [14]:
len(training_sequences),training_padded

(35000,
 array([[ 28,   5,   2, ...,   0,   0,   0],
        [  4, 388, 119, ...,   0,   0,   0],
        [ 11, 190,  12, ...,   0,   0,   0],
        ...,
        [ 10, 811,  21, ...,   0,   0,   0],
        [  1, 125,  10, ...,   0,   0,   0],
        [ 12,  18,  91, ...,   0,   0,   0]]))

In [15]:
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
num_epochs = 10
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/10
1094/1094 - 7s - loss: 0.4860 - accuracy: 0.7815 - val_loss: 0.3249 - val_accuracy: 0.8649 - 7s/epoch - 6ms/step
Epoch 2/10
1094/1094 - 5s - loss: 0.2631 - accuracy: 0.8989 - val_loss: 0.2937 - val_accuracy: 0.8779 - 5s/epoch - 5ms/step
Epoch 3/10
1094/1094 - 5s - loss: 0.2177 - accuracy: 0.9177 - val_loss: 0.2968 - val_accuracy: 0.8781 - 5s/epoch - 5ms/step
Epoch 4/10
1094/1094 - 5s - loss: 0.1922 - accuracy: 0.9273 - val_loss: 0.2610 - val_accuracy: 0.8987 - 5s/epoch - 5ms/step
Epoch 5/10
1094/1094 - 6s - loss: 0.1735 - accuracy: 0.9355 - val_loss: 0.2654 - val_accuracy: 0.8982 - 6s/epoch - 5ms/step
Epoch 6/10
1094/1094 - 5s - loss: 0.1576 - accuracy: 0.9424 - val_loss: 0.2758 - val_accuracy: 0.8963 - 5s/epoch - 5ms/step
Epoch 7/10
1094/1094 - 5s - loss: 0.1469 - accuracy: 0.9479 - val_loss: 0.2903 - val_accuracy: 0.8922 - 5s/epoch - 4ms/step
Epoch 8/10
1094/1094 - 5s - loss: 0.1377 - accuracy: 0.9514 - val_loss: 0.2977 - val_accuracy: 0.8936 - 5s/epoch - 5ms/step
Epoch 9/

In [31]:
def predict_sentence(sentence):
    sentence = [sentence]
    sequences = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    sentiment = model.predict(padded)
    return ("Positive Sentiment") if sentiment > 0.5 else ("Negative Sentiment")

In [22]:
predict_sentence("""The show wants to be a serious mystery. It want's to be grim. There is a constant synth sound in 
nearly every scene foreshadowing grim things are coming,even on the most banal scenes.It feels like someone in post 
production wanted to play around with Dolby Atmos as much as possible instead of doing a proper sound mix that supports 
the action on screen. From a technical aspect the sound is well engineered, but it doesn't do anything for the show.
And you get a perfect exercise in overacting by the semi talented cast. Every line is delivered frowned with meaning, 
swollen as if there were an underlying meaning to every word, glimpse or motion.""")





'Negative'

In [23]:
predict_sentence(""" It continunes few days after where we left off last season and it focuses more on political drama and dilemmas inside the walls. This season gets straight to the point and introduces new villain named Kenny that turns out to be somehow related to Levi. There are many parties trying to get Eren and Historia. For what reason? It is yet to be reveald. Mystery we all feel in love with is still there and there are even more questions, but don't let that distract you from the fact that this season ,even tho is only few episodes in, already has some major reveals in it. Pacing is amazing and i wasn't bored for a moment while watching. Things are only getting more interesting when Erwin reveals how he's planning to ovethrow the governmant without using the force. This season has a very refreshing and different plot compared to previous ones as we can see from the lack of titans and focuse on the issues inside the walls rather than beyond. However it's still as, if not more interesting than before. There is world buidling,mystery,back stabbing and some major questions answered which there is hopfully more to come.""")



'Positive'

In [24]:
predict_sentence("""
Another She-Hulk episode, another half an hour wasted.
Where to begin? Firstly, its now incredibly clear that this show does not care about getting anywhere anytime soon. Plot may as well not exist in the world of she hulk, since we waste 30 minutes on issues such as dating, weddings and copyright.
I can give this episode one positive factor; the acting is mainly good. Of course, the actors cannot save this terrible script, but they mostly try.
Another weakness in this episode is the awful CGI and editing. Scenes just clumsily collide with no thought about how they flow. Also, the music is just terrible.
""")



'Negative'

In [32]:
import gradio as gr
def update(name):
    return f"Welcome to Gradio, {name}!"

with gr.Blocks() as demo:
    gr.Markdown("Start typing below and then click **Run** to see the output.")
    with gr.Row():
        inp = gr.Textbox(placeholder='Enter a review', label='User Review')
        out = gr.Label(label='Sentiment Analysis')
    btn = gr.Button("Analysis")
    btn.click(fn=predict_sentence, inputs=inp, outputs=out)

demo.launch()

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




