In [59]:
import os
import json
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# print(tf.config.list_physical_devices("GPU"))
data = pd.read_csv("../data/IMDB_reviews.csv")

data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [60]:
# data.shape


data["sentiment"].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [61]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

data["sentiment"].value_counts()
# data.head()


  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


sentiment
1    25000
0    25000
Name: count, dtype: int64

In [62]:
train_data, test_data = train_test_split(
    data, random_state=42, shuffle=True, test_size=0.1
)


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

y_train = train_data["sentiment"]
y_test = test_data["sentiment"]


In [66]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])




In [67]:
model.summary()


In [68]:
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.2)

# loss = model.evaluate(X_test, y_test)


Epoch 1/3
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m939s[0m 2s/step - accuracy: 0.7383 - loss: 0.5123 - val_accuracy: 0.8573 - val_loss: 0.3438
Epoch 2/3
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m987s[0m 2s/step - accuracy: 0.8597 - loss: 0.3413 - val_accuracy: 0.8479 - val_loss: 0.3750
Epoch 3/3
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1009s[0m 2s/step - accuracy: 0.8883 - loss: 0.2805 - val_accuracy: 0.8681 - val_loss: 0.3329
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 367ms/step - accuracy: 0.8714 - loss: 0.3267


In [70]:
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Loss :{loss}")
print(f"Accuracy :{accuracy}")


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 297ms/step - accuracy: 0.8714 - loss: 0.3267
Loss :0.3158178925514221
Accuracy :0.8766000270843506


In [1]:
def predict_sentiment(review):
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    # padded_sequence = pad_sequences(tokenizer.texts_to_sequences(review), maxlen=200)
    prediction = model.predict(padded_sequence)

    sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
    return sentiment


In [75]:
rev = "that movie was great"

print(predict_sentiment(rev))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 476ms/step
positive
