In [None]:
!pip install tensorflow-gpu==2.4

In [None]:
# Print the version of the CUDA compiler (nvcc) available in this environment.
!nvcc --version

In [None]:
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
import pandas as pd
import numpy as np
import nltk
import re
import os
%matplotlib inline

In [None]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras import layers

In [None]:
from sklearn.metrics import classification_report
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from copy import deepcopy
# Fetch the NLTK data packages requested by name below: stopword lists, the
# Punkt tokenizer models, WordNet, and the Open Multilingual WordNet (omw-1.4).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
!wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
# Load the dataset; later cells read its 'Review' and 'Sentiment' columns.
df = pd.read_csv('processed_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) before any filtering.
df.shape

In [None]:
# Number of rows per 'Sentiment' value before filtering.
df['Sentiment'].value_counts()

In [None]:
# Cast every review to str so the string operations in later cells can be applied to each row.
df['Review'] = df['Review'].astype(str)

In [None]:
# Print the rows whose review has fewer than 400 space-separated words.
print(df.loc[df['Review'].str.split(' ').str.len() < 400])

In [None]:
# Keep only reviews of at most 400 space-separated words (same rows the original
# drop() kept). Rebinding to a filtered copy avoids inplace mutation of the frame.
df = df[df['Review'].str.split(' ').str.len() <= 400].copy()

In [None]:
# (rows, columns) after removing the long reviews.
df.shape

In [None]:
# Number of rows per 'Sentiment' value after filtering.
df['Sentiment'].value_counts()

In [None]:
# Token -> embedding vector lookup, filled by add_to_dict below.
words = {}

def add_to_dict(d, filename):
  """Load space-separated word vectors from a text file into ``d``.

  Each line is expected to hold a token followed by its vector components,
  separated by single spaces. Lines that are blank, that carry no components,
  or that contain a non-numeric component are skipped.

  Args:
    d: Dictionary updated in place; maps token -> 1-D float ``np.ndarray``.
    filename: Path of the UTF-8 encoded vector file.
  """
  with open(filename, 'r', encoding="utf8") as f:
    # Iterate lazily so the whole file is never held in memory at once.
    for line in f:
      token, *components = line.rstrip('\n').split(' ')

      # Blank or token-only lines would otherwise create bogus keys with empty vectors.
      if not token or not components:
        continue

      try:
        d[token] = np.array(components, dtype=float)
      except ValueError:
        # A component is not a number: skip this line only, let other errors surface.
        continue

# Fill `words` from the GloVe 6B 50d vectors file extracted earlier.
add_to_dict(words, 'glove.6B.50d.txt')

In [None]:
# Number of tokens loaded into the embedding lookup.
len(words)

In [None]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()


def message_to_token_list(s):
  """Tokenize ``s`` on ``\\w+`` runs, lemmatize each token, and keep those present in ``words``.

  Args:
    s: Text to tokenize.

  Returns:
    List of lemmatized tokens that are keys of the module-level ``words`` dict,
    in their original order.
  """
  lemmas = (lemmatizer.lemmatize(raw_token) for raw_token in tokenizer.tokenize(s))
  return [lemma for lemma in lemmas if lemma in words]

In [None]:
def message_to_word_vectors(message, word_dict=words):
  """Convert a message into an array of word vectors, one row per known token.

  Args:
    message: Text to convert; tokenized with ``message_to_token_list``.
    word_dict: Mapping from token to embedding vector.

  Returns:
    Float ``np.ndarray`` built from the vectors of the tokens found in
    ``word_dict``; tokens missing from ``word_dict`` are skipped.
  """
  known_vectors = [
      word_dict[tok]
      for tok in message_to_token_list(message)
      if tok in word_dict
  ]
  return np.array(known_vectors, dtype=float)

In [None]:
def df_to_X_y(dff):
  """Turn a frame with 'Review' and 'Sentiment' columns into model inputs and labels.

  Args:
    dff: DataFrame providing the 'Review' texts and 'Sentiment' labels.

  Returns:
    Tuple ``(sequences, labels)``: ``sequences`` is a list with one word-vector
    array per review (a single all-zero row of width 50 when the review yields
    no vectors) and ``labels`` is the 'Sentiment' column as an int array.
  """
  labels = dff['Sentiment'].to_numpy().astype(int)

  sequences = []
  for review_text in dff['Review']:
    vector_seq = message_to_word_vectors(review_text)
    # Substitute one zero vector so every review has at least one time step.
    sequences.append(vector_seq if vector_seq.shape[0] != 0 else np.zeros(shape=(1, 50)))

  return sequences, labels

In [None]:
# Build the per-review word-vector sequences and the label array.
X, y = df_to_X_y(df)

# Number of reviews and the sequence length of the first one.
print(len(X), len(X[0]))

In [None]:
# Number of reviews and the sequence length of the third one.
print(len(X), len(X[2]))

In [None]:
def word_len(np_array):
  """Plot a histogram of the element lengths of ``np_array`` and print summary statistics.

  Args:
    np_array: Collection of sequences; ``len`` is taken of each element.
  """
  sequence_lengths = [len(sequence) for sequence in np_array]

  # Uses the notebook-level matplotlib import instead of re-importing it here.
  plt.hist(sequence_lengths)

  print(pd.Series(sequence_lengths).describe())

In [None]:
# Histogram and summary statistics of the per-review sequence lengths.
word_len(X)

In [None]:
def pad_X(X, desired_sequence_length=400):
  """Zero-pad or truncate every sequence in ``X`` to a fixed length and stack them.

  Sequences shorter than ``desired_sequence_length`` are padded at the end with
  zero rows; longer ones are cut to their first ``desired_sequence_length`` rows
  (previously these raised because the pad size became negative). The inputs
  are not modified.

  Args:
    X: Sequence of 2-D arrays shaped ``(sequence_length, embedding_dim)``, all
      sharing the same ``embedding_dim``.
    desired_sequence_length: Number of rows each output sequence has.

  Returns:
    Float ``np.ndarray`` of shape
    ``(len(X), desired_sequence_length, embedding_dim)``.
  """
  # Take the vector width from the data; fall back to 50 only for empty input.
  embedding_dim = X[0].shape[1] if len(X) else 50

  # Allocate the result once instead of deep-copying and concatenating per row.
  padded = np.zeros((len(X), desired_sequence_length, embedding_dim), dtype=float)

  for i, x in enumerate(X):
    kept_rows = min(x.shape[0], desired_sequence_length)
    padded[i, :kept_rows] = x[:kept_rows]

  return padded

In [None]:
# Replace the list of variable-length sequences with one fixed-length array.
X = pad_X(X)

X.shape

In [None]:
# Shape of the label array, for comparison with X.shape above.
y.shape

In [None]:
# List every physical device TensorFlow can see.
tf.config.list_physical_devices()

In [None]:
# True when TensorFlow reports at least one GPU.
bool(tf.config.list_physical_devices('GPU'))

In [None]:
# Three stacked 64-unit LSTM layers (each followed by 20% dropout) over inputs of
# shape (400, 50); the full output sequence is flattened into one sigmoid unit.
model = Sequential([
    layers.Input(shape=(400, 50)),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=True, unroll=False),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Checkpoint callback writing to 'model/' with save_best_only enabled.
cp = ModelCheckpoint(filepath='model/', save_best_only=True)

# Binary cross-entropy with Adam; accuracy and AUC are tracked as metrics.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=BinaryCrossentropy(),
    metrics=['accuracy', AUC(name='auc')],
)

In [None]:
# Row count per 'Sentiment' value, used to derive the class weights.
frequencies = df['Sentiment'].value_counts()

frequencies

In [None]:
# Weight of each class = total number of rows / number of rows in that class.
weights = {label: frequencies.sum() / frequencies[label] for label in (0, 1)}
weights

In [None]:

# 5-fold cross-validation.
k = 5
kf = KFold(n_splits=k, random_state=1, shuffle=True)
acc_score = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Start every fold from freshly initialised weights. Reusing one model across
    # folds lets it train on samples that later serve as test data, which inflates
    # the reported accuracy. clone_model keeps the architecture but creates new weights.
    model = tf.keras.models.clone_model(model)
    model.compile(optimizer=Adam(learning_rate=0.0001),
                  loss=BinaryCrossentropy(),
                  metrics=['accuracy', AUC(name='auc')])

    # New callback per fold so its "best so far" value does not carry over; after
    # the loop 'model/' holds the best checkpoint of the last fold, matching the
    # X_test / y_test left in scope for the evaluation cells.
    cp = ModelCheckpoint('model/', save_best_only=True)

    model.fit(X_train, y_train, epochs=50, validation_split=.2, callbacks=[cp], class_weight=weights)

    # predict() returns shape (n, 1); flatten to line up with the 1-D labels.
    pred_values = (model.predict(X_test) > 0.5).astype(int).ravel()

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score)/k

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

In [None]:
#model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, callbacks=[cp], class_weight=weights)

In [None]:
# Persist the model currently bound to `model` as an HDF5 file.
model.save('gfgModel.h5')
print('Model Saved!')
 

In [None]:
# Reload the model written to 'model/' by the ModelCheckpoint callback.
best_model = load_model('model/')

In [None]:
# Threshold the predicted probabilities at 0.5 and report per-class metrics.
# X_test / y_test are whatever the cross-validation loop last assigned.
test_predictions = (best_model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, test_predictions))

In [None]:
# Reload the HDF5 model saved above; NLP_pipeline reads this global for inference.
savedModel=load_model('gfgModel.h5')
savedModel.summary()

In [None]:
def NLP_pipeline(review, word_dict=words, desired_sequence_length=400):
  """Classify a raw review string with the module-level ``savedModel``.

  The text is lower-cased; URLs, e-mail addresses, punctuation and digits are
  removed; the remaining tokens are mapped to word vectors; and the sequence is
  zero-padded or truncated to ``desired_sequence_length`` rows before being
  passed to ``savedModel.predict``.

  Args:
    review: Raw review text.
    word_dict: Mapping from token to embedding vector.
    desired_sequence_length: Number of time steps in the model input.

  Returns:
    ``"positive review"`` when the predicted probability is above 0.5,
    otherwise ``"negative review"``.
  """
  review = review.lower()

  # Raw string: in a normal string literal "\b" is a backspace character, not a
  # word boundary, and the spaces around "|" were matched literally, so the
  # previous pattern never removed anything.
  email_urls = re.compile(r"\bhttp\S+|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
  review = email_urls.sub('', review)
  review = re.sub(r'[^\w\s]', '', review)
  # Remove digits
  review = re.sub(r'[0-9]', '', review)

  vectors = [
      word_dict[token]
      for token in message_to_token_list(review)
      if token in word_dict
  ]
  # Truncate over-long reviews; a negative pad size used to raise here.
  vectors = vectors[:desired_sequence_length]

  # Write into a pre-allocated zero array so that a review without any known
  # token yields an all-zero input instead of failing on concatenation.
  embedding_dim = len(vectors[0]) if vectors else 50
  model_input = np.zeros((1, desired_sequence_length, embedding_dim), dtype=float)
  if vectors:
    model_input[0, :len(vectors)] = np.array(vectors, dtype=float)

  probability = savedModel.predict(model_input)[0][0]

  if probability > 0.5:
    return "positive review"

  return "negative review"

In [None]:
# Read a review from the user, classify it, and print the returned label.
review = input("Enter your review: ")
sentiment = NLP_pipeline(review)
print(sentiment)