In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, while recent releases (3.9+)
# look for 'punkt_tab' instead, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

import warnings
# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')
# Switch matplotlib figures to seaborn's default look.
sns.set()

In [None]:
# Read the IMDB reviews CSV into a DataFrame and preview the first rows.
imdb = pd.read_csv(filepath_or_buffer="datasets/IMDB_Dataset.csv")
imdb.head(n=5)

In [None]:
# Class balance check: how many reviews carry each sentiment label.
imdb.loc[:, 'sentiment'].value_counts()

In [None]:
# Sanity check: show the first review, a divider, then its NLTK word tokens.
text = imdb.loc[0, 'review']
print(text, "<=======================>", word_tokenize(text), sep="\n")

In [None]:
# One list of lower-cased NLTK tokens per review, in dataset order.
corpus = [
    [token.lower() for token in word_tokenize(review)]
    for review in imdb['review']
]

In [None]:
# Vocabulary size: the number of DISTINCT lower-cased tokens across all
# reviews. (len(corpus) would be the number of reviews, not of words.)
# This value is later used as the Tokenizer vocabulary cap and as the
# Embedding layer's input_dim.
num_words = len({word for words in corpus for word in words})
print(num_words)

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
imdb.shape

In [None]:
# Sequential 80/20 split: the first 80% of rows become the training set,
# the remaining rows the test set (no shuffling).
train_size = int(len(imdb) * 0.8)

X_train, X_test = imdb['review'].iloc[:train_size], imdb['review'].iloc[train_size:]
y_train, y_test = imdb['sentiment'].iloc[:train_size], imdb['sentiment'].iloc[train_size:]

In [None]:
# Fit the Keras tokenizer on the training reviews only, then turn each review
# into a sequence of word indices padded/truncated at the end to 128 entries.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# Inspect the first encoded and padded training sequence.
X_train[0]

In [None]:
# Length of one padded training sequence (expected to equal maxlen).
X_train[0].shape[0]

In [None]:
# Encode the test reviews with the already-fitted tokenizer and apply the same
# end padding/truncation to 128 entries used for the training data.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# Inspect the first encoded and padded test sequence.
X_test[0]

In [None]:
# Length of one padded test sequence (expected to equal maxlen).
X_test[0].shape[0]

In [None]:
# Confirm features and labels line up for each split: train first, then test.
print(f"{X_train.shape} {y_train.shape}\n{X_test.shape} {y_test.shape}")

In [None]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [None]:
# Binary sentiment classifier:
# word indices -> embeddings -> two stacked LSTMs -> sigmoid probability.
model = Sequential()

# input_dim uses the same num_words cap the tokenizer was created with, and
# input_length matches the maxlen=128 used when padding the sequences.
model.add(Embedding(input_dim=num_words, output_dim=100,
                   input_length=128, trainable=True))
# The LSTM layers keep their default tanh activation. An unbounded ReLU inside
# the recurrent cell lets the state grow without limit (diverging / NaN loss)
# and also prevents Keras from using the fast cuDNN kernel.
# return_sequences=True passes the full sequence on to the second LSTM.
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
# Single sigmoid unit: probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Sigmoid output + 0/1 labels -> binary cross-entropy; track accuracy while training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for two passes over the training data in mini-batches of 32.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=2)