## Importing the dependencies

In [2]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Read the Kaggle API credentials from kaggle.json in the working directory;
# later cells read its "username" and "key" entries.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving an open handle behind for the garbage collector.
with open("kaggle.json") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [4]:
# Show only the field names of the credentials file, so the secret values
# themselves are not echoed into the notebook output.
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [5]:
# Export the Kaggle credentials as environment variables, one assignment per
# (environment variable, credentials-file field) pair.
for env_name, json_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[json_field]

In [6]:
# Shell escape: fetch the IMDB 50k movie-review archive with the Kaggle CLI.
# NOTE(review): presumably authenticates via the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables set in the previous cell — confirm against the CLI docs.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to C:\Users\Admin




  0%|          | 0.00/25.7M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.7M [00:00<00:22, 1.17MB/s]
  8%|7         | 2.00M/25.7M [00:01<00:10, 2.28MB/s]
 12%|#1        | 3.00M/25.7M [00:01<00:07, 3.31MB/s]
 16%|#5        | 4.00M/25.7M [00:01<00:05, 4.18MB/s]
 19%|#9        | 5.00M/25.7M [00:01<00:04, 5.01MB/s]
 23%|##3       | 6.00M/25.7M [00:01<00:03, 5.66MB/s]
 27%|##7       | 7.00M/25.7M [00:01<00:03, 5.81MB/s]
 31%|###1      | 8.00M/25.7M [00:01<00:03, 6.18MB/s]
 35%|###5      | 9.00M/25.7M [00:02<00:02, 6.51MB/s]
 39%|###8      | 10.0M/25.7M [00:02<00:02, 6.72MB/s]
 43%|####2     | 11.0M/25.7M [00:02<00:02, 6.48MB/s]
 47%|####6     | 12.0M/25.7M [00:02<00:02, 6.12MB/s]
 51%|#####     | 13.0M/25.7M [00:02<00:02, 6.30MB/s]
 54%|#####4    | 14.0M/25.7M [00:02<00:01, 6.35MB/s]
 58%|#####8    | 15.0M/25.7M [00:03<00:02, 5.08MB/s]
 66%|######6   | 17.0M/25.7M [00:03<00:01, 7.55MB/s]
 70%|#######   | 18.0M/25.7M [00:03<00:01, 7.39MB/s]
 74%|#######3  | 19.0M/25.7M [00:03<00:00, 7.48MB/s]
 

In [10]:
# Location of the downloaded archive.
# NOTE(review): this and the extraction folder are absolute, machine-specific
# paths that the later listing/loading cells repeat; change them together.
zip_file_path = "C:/Users/Admin/imdb-dataset-of-50k-movie-reviews.zip"

# Open the archive read-only and unpack every member into the dataset folder;
# the context manager closes the archive afterwards.
with ZipFile(file=zip_file_path, mode="r") as zip_ref:
    zip_ref.extractall(path="C:/Users/Admin/imdb-dataset")

In [12]:
# List the contents of the extraction folder to confirm which files the
# archive produced.
extracted_files = os.listdir("C:/Users/Admin/imdb-dataset")
print(extracted_files)

['IMDB Dataset.csv']


## Loading the dataset

In [13]:
# Load the extracted CSV into a DataFrame; later cells use its "review" and
# "sentiment" columns.
data = pd.read_csv("C:/Users/Admin/imdb-dataset/IMDB Dataset.csv")

In [14]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(50000, 2)

In [15]:
# Preview the first rows to check the review text and sentiment columns
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [16]:
# Preview the last rows as well
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [17]:
# Count reviews per sentiment label to check the class balance
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [18]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# An explicit map + astype replaces DataFrame.replace(..., inplace=True):
# replace() relied on silent object -> int downcasting, which pandas >= 2.2
# deprecates, so the column could stay non-numeric on newer versions.
# An unexpected label becomes NaN and makes astype raise, instead of being
# passed through silently.
# The dtype guard keeps the cell safe to re-run once the labels are encoded.
if not pd.api.types.is_integer_dtype(data["sentiment"]):
    data["sentiment"] = (
        data["sentiment"].map({"positive": 1, "negative": 0}).astype("int64")
    )

In [19]:
# Re-inspect the first rows after the label encoding in the previous cell
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [20]:
# Class counts again, now over the integer-encoded labels
data['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [21]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Report the (rows, columns) of each split, one per line
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


## Data Preprocessing

In [23]:
# Fit the tokenizer on the training reviews only (vocabulary capped with
# num_words=5000), so nothing from the test split influences the vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Convert each review into a list of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Bring every sequence to a fixed length of 200 so the batches are rectangular
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [24]:
# Show the padded training matrix (the printed output is abbreviated)
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [25]:
# Show the padded test matrix (the printed output is abbreviated)
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [26]:
# Targets for each split: the sentiment column of the matching frame
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [27]:
# Show the training labels (the sentiment column taken from train_data)
print(Y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


## LSTM - Long Short-Term Memory

In [28]:
# Assemble the network as a single layer list: embedding -> LSTM -> sigmoid.
# input_dim=5000 matches the tokenizer's num_words and input_length=200 matches
# the maxlen used when padding the sequences.
model = Sequential(
    [
        Embedding(input_dim=5000, output_dim=128, input_length=200),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

**Dense(1):**
Đây là lớp đầu ra với một neuron (1 unit), vì bài toán này là bài toán phân loại nhị phân (positive hoặc negative). Mỗi unit trong lớp đầu ra sẽ biểu diễn một giá trị xác suất.

**activation="sigmoid":**
Hàm kích hoạt sigmoid được sử dụng trong các bài toán phân loại nhị phân vì nó đưa ra một đầu ra trong khoảng từ 0 đến 1, phù hợp để biểu diễn xác suất. Giá trị càng gần 1 thì càng có khả năng sentiment là tích cực, còn gần 0 là tiêu cực.

In [29]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# 0/1 sentiment labels, and accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

**"binary_crossentropy"** được sử dụng trong các bài toán phân loại nhị phân (binary classification), nơi mà bạn có hai lớp (ví dụ: tích cực và tiêu cực).

## Training the model

In [31]:
# Train for 5 epochs in batches of 64, setting aside 20% of the training data
# for validation.
model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x227b1c9b250>

## Model Evaluation

In [32]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss followed by the accuracy metric.
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

Test Loss: 0.30397823452949524
Test Accuracy: 0.8740000128746033


## Building a Predictive System

In [33]:
def predict_sentiment(review, threshold=0.5):
    """Classify a single review as "positive" or "negative".

    Args:
        review: Raw review text (one string).
        threshold: Cut-off applied to the model's sigmoid score; scores
            strictly above it are reported as "positive". Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        The string "positive" or "negative".
    """
    # Encode with the tokenizer fitted on the training reviews and pad to the
    # same length (200) that was used for the training and test matrices.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    # One input row was passed in, so [0][0] is that row's single score
    prediction = model.predict(padded_sequence)
    return "positive" if prediction[0][0] > threshold else "negative"

In [34]:
# Example 1: a clearly positive review
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: positive


In [35]:
# Example 2: a negative opinion expressed through negation ("not that good")
new_review = "This movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative


In [36]:
# Example 3: a lukewarm review that mixes "ok" with a negated "good"
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: negative


In [38]:
# Example 4: a longer review with a criticism followed by a positive conclusion
new_review = "This movie was a bit logical because the plot structure was complicated.However, I still liked it"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

The sentiment of the review is: positive
