In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras import regularizers

from numpy import array
import random as python_random
import tensorflow as tf
import tensorflowjs as tfjs

from sklearn.utils import shuffle
from tensorflow.keras import callbacks,optimizers
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras .layers.core import Activation,Dropout,Dense
from keras.layers import Flatten
from keras.layers.recurrent import LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D 
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer


Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Display the version string of the imported TensorFlow package.
tf.__version__

In [None]:
# Read the IMDB review CSV and shuffle its rows in one step; the fixed
# random_state makes the shuffled order the same on every run.
movie_reviews = shuffle(pd.read_csv("IMDB Dataset.csv"), random_state=42)
movie_reviews.head()

In [None]:
# Count how many reviews carry each distinct value of the 'sentiment' column.
movie_reviews['sentiment'].value_counts()

In [None]:
# True if at least one cell anywhere in the frame is missing (NaN/None).
movie_reviews.isna().values.any()

In [None]:
# (number of rows, number of columns) of the review frame.
movie_reviews.shape

In [None]:
# Show the raw review at *position* 10 of the shuffled frame.
# The frame was shuffled, so index labels no longer match row positions:
# a plain `[10]` is a label lookup and would display a different review than
# the cleaned `X[10]` shown further down, which is positional.
movie_reviews['review'].iloc[10]

In [None]:
import seaborn as sns

# Bar chart with one bar per sentiment label, bar height = number of reviews.
sns.countplot(data=movie_reviews, x='sentiment')

In [None]:
def preprocess_text(sen):
    """Clean one raw review string before tokenization.

    Steps, in order: strip HTML-style tags via ``remove_tags``, replace every
    non-letter character with a space, drop single letters that stand alone
    between whitespace, and collapse each whitespace run to one space.

    Parameters
    ----------
    sen : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuation and numbers (anything that is not an ASCII letter)
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)

    # Single character removal.  Lookarounds are used so the surrounding
    # whitespace is not consumed by the match; otherwise, in a run such as
    # " a b c ", the space after "a" is eaten and "b" is never matched.
    sentence = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [None]:
# Matches anything shaped like a tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Replace every HTML-style tag in ``text`` with a single space.

    A space (rather than an empty string) is substituted so that words
    separated only by a tag, e.g. ``"great<br />movie"``, are not fused into
    one token.

    Parameters
    ----------
    text : str
        Text that may contain tags.

    Returns
    -------
    str
        ``text`` with each tag replaced by ``' '``.
    """
    return TAG_RE.sub(' ', text)

In [None]:
# Clean every review, preserving row order: X[i] is the cleaned text of the
# i-th row (by position) of movie_reviews.
sentences = list(movie_reviews['review'])
X = [preprocess_text(raw_review) for raw_review in sentences]

In [None]:
# Show the cleaned review at list position 10.
X[10]

In [None]:
# Encode the labels as integers: 1 where the sentiment is exactly "positive",
# 0 for every other value.
y = np.where(movie_reviews['sentiment'] == "positive", 1, 0)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Number of training labels (and therefore training samples).
len(y_train)

In [None]:
# Number of test labels (and therefore test samples).
len(y_test)

### Preparing the embedding layer

In [None]:
# Build the word index from the training texts only (num_words=5000 limits
# which words are kept when texts are later converted to sequences).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert both splits from text to lists of integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
import pickle

# Serialize the fitted tokenizer to disk using the highest pickle protocol.
with open('new_tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# One row per word in the tokenizer's index, plus one for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)

# Every sequence is padded (at the end, 'post') or truncated to this length.
maxlen = 100

X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Load the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# Each line is split on whitespace: the first field is taken as the word and
# the remaining fields as its vector components.
embeddings_dict = dict()

# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = coefs

print('Found %s word vectors.' % len(embeddings_dict))



In [None]:
        
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words that have no GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in tokenizer.word_index.items():
    vector = embeddings_dict.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector


### Deep learning model
+ using a recurrent neural network (LSTM)
   - training

In [None]:

# Seed every random number generator involved so that weight initialisation
# (and hence training) is repeatable between runs, as far as the backend allows.
SEED = 42
np.random.seed(SEED)
python_random.seed(SEED)
tf.random.set_seed(SEED)

# Architecture: frozen pre-trained embeddings -> LSTM(128) -> one sigmoid unit.
# trainable=False keeps the GloVe weights fixed during training.
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))

# Single sigmoid output paired with binary cross-entropy for the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=2, validation_split=0.2,callbacks=[checkpointer])
# model = create_model(model)
# elst = callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min')
# save_ck = callbacks.ModelCheckpoint(filepath="model_weights.hdf5", verbose=1, save_best_only=True, monitor='val_loss', mode='min')

In [None]:
# Train for 6 epochs in mini-batches of 128 samples, holding back 20% of the
# training data for validation; `history` records the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# path = 'model_weights.hdf5'
# model.load_weights(path)

In [None]:
# Evaluate the trained model on the held-out test split.
# `score` is indexed below as [0] = loss and [1] = accuracy (the 'acc' metric
# passed to compile).
score = model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Report the test loss and test accuracy returned by model.evaluate.
test_loss, test_accuracy = score[0], score[1]
print(f"Test Score: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Plot the training and validation curves, one figure per metric.
# The second curve comes from the `validation_split` passed to `fit`, not from
# the held-out test set, so it is labelled "validation" rather than "test".
for metric_key, metric_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])

    plt.title('model ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Take the cleaned review at position 100 and wrap it in a one-element list,
# the list-of-texts form that the tokenizer call in the next cell receives.
instance = [X[100]]
print(instance)

In [None]:
# test = [
#     'This is an excellent movie',
#     'The move was fantastic I like it',
#     'You should watch it is brilliant',
#     'Exceptionally good',
#     'Wonderfully directed and executed I like it',
#     'Its a fantastic series',
#     'Never watched such a brillent movie',
#     'It is a Wonderful movie',
    
#      "horrible acting",
#     'waste of money',
#     'pathetic picture',
#     'It was very boring',
#     'I did not like the movie',
#     'The movie was horrible',
#     'I will not recommend',
#     'The acting is pathetic'
# ]

In [None]:
# Encode the text(s) in `instance` as integer sequences.  The result is kept
# in a new name so that `instance` still holds raw text and this cell can be
# re-run safely.
instance_sequences = tokenizer.texts_to_sequences(instance)

# Concatenate all token ids into one sequence (a no-op for a single review).
flat_list = [[token for sequence in instance_sequences for token in sequence]]

instance_padded = pad_sequences(flat_list, padding='post', maxlen=maxlen)

predict = model.predict(instance_padded)

# One padded row goes in and the model ends in Dense(1, sigmoid), so the
# score of interest is predict[0][0].
# `predict.any()` returns a boolean, so the former `predict.any() > 0.5`
# printed 'positive' for any non-zero score; compare the score itself instead.
positive_score = float(predict[0][0])

if positive_score > 0.5:
    print('positive')
elif positive_score < 0.5:
    print('negative')
else:
    print('neutral')
print(predict)

In [None]:
# classes = model.predict_classes(X_test, batch_size=128)
# Run the model over the whole test set in batches of 128 and print the raw
# sigmoid outputs.
proba=model.predict(X_test, batch_size=128)
print(proba)

In [None]:
# Export the trained Keras model with the TensorFlow.js converter, passing
# 'models' as the output location.
tfjs.converters.save_keras_model(model, 'models')

### Saving the model

In [None]:
#model.save('model.h5')
# Save the trained model to 'new_model2.h5'; the same file is loaded back
# further down with load_model.
model.save('new_model2.h5')

### Saving the tokenizer

In [None]:
# import pickle
# with open('tokenizer.pkl', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from keras.models import load_model

# Reload the model from the file written by model.save above.
new_model = load_model('new_model2.h5')

### Loading the tokenizer

In [None]:
# with open('tokenizer.pkl', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [None]:

# Read one review from the user and wrap it in a list of texts for the
# tokenizer.
instancen = [input()]
instancen = tokenizer.texts_to_sequences(instancen)

# Merge all token ids into a single sequence, then wrap it as a batch of one.
flat_list = [[token for sequence in instancen for token in sequence]]

instancen = pad_sequences(flat_list, padding='post', maxlen=100)

predict = new_model.predict(instancen)
print(predict)

# Print one label per model output: scores of 0.5 or more count as positive.
for row in predict:
    for val in row:
        print('POSITIVE' if val >= 0.5 else 'NEGATIVE')
       
        
# if predict > "0.5:
#     print('positive')
# elif predict < 0.5:
#     print('negative')
# else:
#     print('neutral')
# print(predict)