# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Data Reading

In [None]:
# Load the review dataset from a CSV file expected in the working directory.
imdb_data=pd.read_csv('IMDB Dataset.csv')
# Report the (rows, columns) shape of the loaded frame.
print(imdb_data.shape)
# Last expression of the cell: the notebook renders the first rows.
imdb_data.head()

In [None]:
# Count how many rows carry each distinct value of the 'sentiment' column.
imdb_data['sentiment'].value_counts()


In [None]:
# Pick the review stored under index label 1 as the running example that the
# preprocessing cells below transform step by step.
review = imdb_data['review'].loc[1]
# Last expression of the cell: the notebook renders the raw review text.
review

# TEXT PREPROCESSING

In [None]:
#Removing the square brackets
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` group removed, brackets included.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which each opening bracket, everything up to the
        next closing bracket, and that closing bracket are deleted. Text
        without a complete ``[...]`` pair is returned unchanged.
    """
    # Raw string: in a normal literal '\[' is an invalid escape sequence,
    # which newer Python versions report with a warning.
    return re.sub(r'\[[^]]*\]', '', text)

# Show the sample review before and after dropping bracketed spans.
print(review, '---------------------', sep='\n')
review = remove_between_square_brackets(review)
print(review)

In [None]:
#Stripping the HTML markup
def strip_html(text):
    """Return the text content of *text* as extracted by BeautifulSoup.

    The input is parsed with the "html.parser" backend and the result of
    ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()


# Show the sample review before and after HTML stripping.
print(review, '---------------------', sep='\n')
review = strip_html(review)
print(review)

In [None]:
#clean the review
def remove_special_characters(text, with_lower=True):
    """Strip everything except ASCII letters, digits and whitespace.

    Args:
        text: The string to clean.
        with_lower: When exactly ``False`` the original casing is kept; for
            any other value (default ``True``) the result is lower-cased.

    Returns:
        The cleaned string.
    """
    # 'A-Z', not 'A-z': the range A-z also covers the characters
    # [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, so they were
    # previously kept even though they are special characters.
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    if with_lower is False:
        return text
    return text.lower()

# Show the sample review before and after special-character removal.
print(review, '---------------------', sep='\n')
review = remove_special_characters(review)
print(review)

In [None]:
#split the review
def split_review(text):
    """Tokenize *text* with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(text)

# Tokenize the cleaned sample review.
tokenized_review = split_review(review)
# Last expression of the cell: the notebook renders the token list.
tokenized_review

In [None]:
# Display the tokens of the sample review again.
tokenized_review

In [None]:
#Stemming the text
def porter_stemmer(text):
    """Stem every token in *text* with NLTK's Porter stemmer.

    Args:
        text: An iterable of token strings. Despite the name this is not a
            raw string: iterating a plain string would stem it character by
            character.

    Returns:
        A list with one stemmed string per input token, in input order.
    """
    # Use the PorterStemmer class imported at the top of the file rather
    # than reaching it through the ``nltk.porter`` attribute, which is not
    # a documented access path.
    ps = PorterStemmer()
    return [ps.stem(word) for word in text]

# Stem the sample tokens and print the original and stemmed text, one above
# the other.
tokenized_review_stemmed = porter_stemmer(tokenized_review)
print(
    ' '.join(tokenized_review),
    '---------------------',
    ' '.join(tokenized_review_stemmed),
    sep='\n',
)


In [None]:
#Lemmatizing the text
def wordnet_lemmatizer(text):
    """Lemmatize every token in *text* with NLTK's ``WordNetLemmatizer``.

    *text* is iterated token by token; the result is a list holding one
    lemmatized string per input token, in input order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in text]

# Lemmatize the sample tokens and print the original and lemmatized text,
# one above the other.
tokenized_review_lemmatized = wordnet_lemmatizer(tokenized_review)
print(
    ' '.join(tokenized_review),
    '---------------------',
    ' '.join(tokenized_review_lemmatized),
    sep='\n',
)



In [None]:
#set stopwords to english
# Keep NLTK's English stop words in a set so the membership test used when
# filtering tokens is a constant-time lookup.
stop=set(stopwords.words('english'))
print(stop)


In [None]:
def remove_stop_words(text, stop):
    """Return the tokens of *text* that are not contained in *stop*.

    Args:
        text: An iterable of tokens.
        stop: A container of tokens to drop (tested with ``in``).

    Returns:
        A new list with the surviving tokens in their original order;
        duplicates are preserved.
    """
    return [token for token in text if token not in stop]

# Show the lemmatized sample before and after stop-word removal.
print(' '.join(tokenized_review_lemmatized), '---------------------', sep='\n')
tokenized_review_lemmatized = remove_stop_words(tokenized_review_lemmatized, stop)
print(' '.join(tokenized_review_lemmatized))


## Before  and after preprocessing


In [None]:
# Raw sample review on top, fully preprocessed version underneath.
print(
    imdb_data['review'].loc[1],
    '---------------------',
    ' '.join(tokenized_review_lemmatized),
    sep='\n',
)


# ML Model

In [None]:
# Run every review through the cleaning pipeline and collect the results as
# space-joined token strings, in dataset order.
X = []
for i, raw_review in enumerate(imdb_data['review']):
    # Progress indicator every 100 reviews.
    if i % 100 == 0:
        print('At step', i)
    review = remove_special_characters(
        strip_html(remove_between_square_brackets(raw_review))
    )
    # Lemmatization via wordnet_lemmatizer is currently disabled here; the
    # tokens go straight to stop-word removal.
    tokenized_review = remove_stop_words(split_review(review), stop)
    X.append(' '.join(tokenized_review))

In [None]:
# Spot check: raw review on top, its preprocessed counterpart underneath.
index = 100

print(
    imdb_data['review'].iloc[index],
    '------------------',
    X[index],
    sep='\n',
)


In [None]:
from sklearn.model_selection import train_test_split

# Convert the cleaned reviews to an array, then hold out 20% of the rows as
# the test set (fixed seed so the split is reproducible).
X = np.array(X)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    imdb_data['sentiment'].values,
    test_size=0.2,
    random_state=42,
)
for message, features, targets in (
    ("Train shapes : X = {}, y = {}", X_train, y_train),
    ("Test shapes : X = {}, y = {}", X_test, y_test),
):
    print(message.format(features.shape, targets.shape))

In [None]:
# Learn the label -> integer mapping from the training labels only and reuse
# that same mapping for the test labels. Refitting on the test labels (as
# fit_transform would) could produce a different mapping if the test split
# does not contain exactly the same set of classes.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2))

# The vocabulary and IDF weights are learned from the training reviews only;
# the test reviews are merely projected onto that fitted vocabulary.
tfidf_vec_train = tfidf_vec.fit_transform(X_train)
tfidf_vec_test = tfidf_vec.transform(X_test)


In [None]:
# L2-regularised logistic regression trained on the TF-IDF features.
lr = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    random_state=42,
)
lr_tfidf = lr.fit(tfidf_vec_train, y_train)


In [None]:
# Predict a class label for every held-out review.
lr_tfidf_predict=lr.predict(tfidf_vec_test)


In [None]:
# Per-class metrics on the test split, with the two classes labelled
# 'Negative' and 'Positive' in the report.
print(
    "Classification Report: \n",
    classification_report(
        y_test,
        lr_tfidf_predict,
        target_names=['Negative', 'Positive'],
    ),
)


# Deep Learning Model

In [None]:
# Build the vocabulary from the training reviews only. Fitting on the full
# corpus X would let words from the held-out test reviews shape the
# vocabulary, leaking test information into the model input.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Encode both splits with the training vocabulary, then bring every
# sequence to a fixed length of 200.
training_sequence = tokenizer.texts_to_sequences(X_train)
testing_sequence = tokenizer.texts_to_sequences(X_test)
train_pad_sequence = pad_sequences(training_sequence, maxlen=200)
test_pad_sequence = pad_sequences(testing_sequence, maxlen=200)
print('Total Unique Words : {}'.format(len(word_index)))

In [None]:
# Show one cleaned training review next to its padded integer encoding.
# A bare expression is only rendered when it is the last statement of a
# cell, so the review text has to be printed explicitly.
print(X_train[1])
print('----------')
train_pad_sequence[1]

In [None]:
# Binary sentiment classifier: embedding -> bidirectional LSTM -> two dense
# layers with dropout -> single sigmoid output.
model = tf.keras.Sequential()
# NOTE(review): the embedding has 10001 + 1 rows while the tokenizer is
# created with num_words=10000 -- confirm whether the extra rows are needed.
model.add(
    tf.keras.layers.Embedding(10001 + 1, 64, input_length=200, trainable=True)
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

In [None]:
# Print the layer-by-layer overview of the network.
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked
# under the metric name 'acc'.
model.compile(
    optimizer='Adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['acc'],
)

# Train for 30 epochs; the test split doubles as the validation data.
history = model.fit(
    train_pad_sequence,
    y_train,
    epochs=30,
    validation_data=(test_pad_sequence, y_test),
)