# **Sources**
* IMDB Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
import tensorflow as tf
# Display the installed TensorFlow version as the cell output.
tf.__version__

# **Data**

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime under /content/drive.
drive.mount('/content/drive')

# NOTE: despite its name, `url` is a filesystem path on the mounted drive
# (it is handed to pd.read_csv), not a web URL.
url = "/content/drive/MyDrive/Temp/Datasets/IMDB Dataset.csv"

In [None]:
import pandas as pd
# Load the review CSV, then keep only the first occurrence of each
# fully-duplicated row (the original row index is left as-is).
df = pd.read_csv(url)
df = df.drop_duplicates()

# Preview the first rows as the cell output.
df.head()

## Preparing Y

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment strings as integer class ids.
le = LabelEncoder()
Y = le.fit_transform(df["sentiment"])

# CLASSES[i] is the original label string for class id i.
CLASSES = le.classes_

## Preparing X

### Preprocessing

In [None]:
!pip install emoji

In [None]:
import nltk
# Fetch the NLTK resources used by this notebook: the stop-word corpus and
# the 'punkt' tokenizer data.
# NOTE(review): word_tokenize is imported further down but never called in
# the visible code, so 'punkt' may be unnecessary — confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import emoji
from nltk.tokenize import word_tokenize

import string
# ASCII punctuation characters stripped from the text during preprocessing.
punc = string.punctuation

# Chat-style abbreviations mapped to their expansions.
# Every key is upper-case because preprocess() looks words up via
# word.upper(); a lower-case key could never be matched.
abbv = {
    "AFAIK": "as far as I know",
    "IMO": "in my opinion",
    "IMHO": "in my humble opinion",
    "LGTM": "looks good to me",
    "AKA": "also known as",
    "ASAP": "as soon as possible",
    "BTW": "by the way",
    "FAQ": "frequently asked questions",
    "DIY": "do it yourself",
    "DM": "direct message",
    "FYI": "for your information",
    "IC": "i see",
    "IOW": "in other words",
    "IIRC": "If I Remember Correctly",
    "ICYMI": "In case you missed it",
    "CUZ": "because",
    "COS": "because",
    "NV": "nevermind",
    "PLZ": "please",
}

from nltk.corpus import stopwords
# NOTE(review): the returned stop-word list is discarded here, and
# preprocess() does not remove stop words — confirm whether stop-word
# filtering was intended.
stopwords.words('english')

import re
# Anything between a '<' and the nearest following '>' (non-greedy).
html_pattern = re.compile('<.*?>')

# http:// or https:// links, or bare "www." hosts, up to the next whitespace.
urls_pattern = re.compile(r'https?://\S+|www\.\S+')

# Code-point ranges matched by emoji_pattern.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
)
# One or more consecutive characters from any of the ranges above.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; preprocess() calls ps.stem() on each word.
ps = PorterStemmer()

def preprocess(text):
    """Clean one raw review and return it as space-separated stemmed words.

    Steps, in order: lowercase, remove HTML tags and URLs, strip ASCII
    punctuation, run emoji.demojize and drop any characters still matched
    by emoji_pattern, expand known abbreviations, stem every word.

    Args:
        text: The raw review text (a str).

    Returns:
        The cleaned text, with words joined by single spaces.

    NOTE(review): output differs from earlier versions of this function
    (words around removed tags are no longer fused, no empty tokens), so
    any cached corpus cleaned before this change should presumably be
    regenerated to keep training and inference text consistent — confirm.
    """
    # Lowercase
    text = text.lower()

    # HTML tags and URLs are replaced by a space, not by nothing: otherwise
    # "great.<br /><br />the" collapses into the single bogus word "greatthe"
    # once punctuation is stripped.
    text = html_pattern.sub(' ', text)
    text = urls_pattern.sub(' ', text)

    # Punctuation
    text = text.translate(str.maketrans("", "", punc))

    # Emojis
    text = emoji.demojize(text)
    text = emoji_pattern.sub(r'', text)

    new_text = []

    # split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings.
    for word in text.split():

        # Abbreviations (abbv is looked up by the upper-cased word)
        expansion = abbv.get(word.upper(), word)

        # An expansion can be several words; stem each one separately
        # instead of handing the whole phrase to the stemmer as one token.
        new_text.extend(ps.stem(part) for part in expansion.split())

    return " ".join(new_text)

# Quick manual check of preprocess() on a sample sentence (shown as cell output).
preprocess("This is the best movie I have ever watched")

In [None]:
# from tqdm import tqdm

# # cleaned = df.review.apply(preprocess)

# cleaned = []
# for i in tqdm(df.review):
#     cleaned.append(preprocess(i))

In [None]:
import json

# WRITING (kept for reference: dumps the cleaned reviews to the cache file)
# with open("/content/drive/MyDrive/Temp/dumps/cleaned_reviews1.json", 'w') as f:
#     json.dump(cleaned, f)

# READING: load the previously cleaned reviews back from the cache file.
CLEANED_REVIEWS_PATH = "/content/drive/MyDrive/Temp/dumps/cleaned_reviews1.json"
with open(CLEANED_REVIEWS_PATH, 'rb') as cache_file:
    cleaned = json.load(cache_file)

### Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on Y and
# uses a fixed seed so it is reproducible.
split_parts = train_test_split(cleaned, Y, test_size=0.2, random_state=42, stratify=Y)
cleaned_train, cleaned_test, Y_train, Y_test = split_parts

# **Hyper Parameters**

In [None]:
# Passed to the Tokenizer as num_words and used as the Embedding input size.
VOCAB_SIZE = 5_000
# Placeholder token the Tokenizer substitutes for out-of-vocabulary words.
OOV_TOKEN = "<OOV>"
# Sequence length used by pad_sequences and as the model's input shape.
MAXLEN = 2_400

# **Word to Vector**

In [None]:
# Inspect the first cleaned training review (shown as cell output).
cleaned_train[0]

## Tokenizing

In [None]:
# Initializing and Fitting Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer_options = dict(
    num_words=VOCAB_SIZE,    # vocab size
    oov_token=OOV_TOKEN,     # stand-in for words outside the vocab
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)
tokenizer = Tokenizer(**tokenizer_options)

# The vocabulary is built from the training texts only.
tokenizer.fit_on_texts(cleaned_train)

In [None]:
# Convert both splits to integer sequences with the fitted tokenizer.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts)
    for texts in (cleaned_train, cleaned_test)
)

## Padding

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every token sequence of both splits to length MAXLEN.
X_train_padded, X_test_padded = [
    pad_sequences(token_seqs, maxlen=MAXLEN)
    for token_seqs in (X_train_tokens, X_test_tokens)
]

## Converting to Tensors

In [None]:
# Wrap each (features, labels) pair in a tf.data.Dataset that yields one
# (sequence, label) element per sample.
to_dataset = tf.data.Dataset.from_tensor_slices
train_data = to_dataset((X_train_padded, Y_train))
test_data = to_dataset((X_test_padded, Y_test))

## Shuffle | Batch | Pad

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 32

# Shuffle the training data only. The evaluation split is deliberately left
# in its original order: shuffling it does not change the metrics, but it
# would break row alignment between model.predict(test_data) and Y_test and
# refill the shuffle buffer on every validation pass.
train_data = train_data.shuffle(BUFFER_SIZE)

# Batch both datasets (padded_batch pads each batch to its longest element;
# the sequences were already padded to MAXLEN by pad_sequences).
train_data = train_data.padded_batch(BATCH_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

# **Model**

## Training

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D, Conv1D, Bidirectional, LSTM

In [None]:
# Architecture / training settings
EMB_DIM = 16        # embedding vector size
LSTM_DIM = 12       # units per LSTM direction
FILTERS = 16        # Conv1D output channels
KERNEL_SIZE = 8     # Conv1D window length
EPOCHS = 5

# Embedding -> 1D convolution -> bidirectional LSTM -> dense head with a
# single sigmoid output (binary classification).
layer_stack = [
    Input(shape=(MAXLEN,)),
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_DIM),
    Conv1D(FILTERS, KERNEL_SIZE, activation='relu'),
    # GlobalAveragePooling1D(),
    Bidirectional(LSTM(units=LSTM_DIM)),
    Dense(units=80, activation="relu"),
    Dense(units=1, activation="sigmoid"),
]
model = Sequential(layer_stack)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train on the first GPU; the test split doubles as validation data.
with tf.device('/GPU:0'):
    fit_result = model.fit(train_data, validation_data=test_data, epochs=EPOCHS)

# Keep only the per-epoch metrics dict for plotting.
history = fit_result.history

## Plotting

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One panel per metric, each showing the training and validation curves.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for ax, metric, title in zip(axes, ('accuracy', 'loss'), ("Accuracy", "Loss")):
    ax.plot(history[metric], label='train')
    ax.plot(history['val_' + metric], label='validation')
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    # Without a legend the two curves cannot be told apart.
    ax.legend()

plt.show()

## Realtime Testing

In [None]:
text = "not a good movie at all"

# Run the sample through the same clean -> tokenize -> pad steps as the
# training data.
cleaned_text = preprocess(text)
token_text = tokenizer.texts_to_sequences([cleaned_text])
padded_text = pad_sequences(token_text, maxlen=MAXLEN)

# Single sample, single sigmoid unit: take the one scalar from the output.
pred = model.predict(padded_text)[0, 0]

# Threshold explicitly rather than calling round() on a NumPy scalar, which
# is not guaranteed to return a Python int usable as an index on every NumPy
# version. `pred > 0.5` gives the same result as round() for values in [0, 1]
# (round(0.5) == 0).
label = CLASSES[int(pred > 0.5)]

# NOTE: this is the raw sigmoid output, i.e. the model's score for
# CLASSES[1], not the confidence in whichever label was predicted.
probability = pred

print(label, probability)