<a href="https://colab.research.google.com/github/devan1510/devan1510/blob/main/sentiment_analysis_on_amazon_reviews(using_recurrent_neural_network).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports for the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import libraries for language preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# make tensorflow imports
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN,LSTM,Dense,Dropout,Embedding,BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [None]:
# load the dataset
# (Colab only) open the browser upload dialog; the chosen file is saved into the
# working directory, as the "Saving amazon_reviews.csv ..." cell output shows
from google.colab import files
uploaded = files.upload()

Saving amazon_reviews.csv to amazon_reviews.csv


In [None]:
# read the uploaded CSV into a dataframe and preview its first rows
df= pd.read_csv("amazon_reviews.csv")
df.head()

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1
3,The cable was not compatible between my macboo...,1
4,The case is nice but did not have a glow light...,1


In [None]:
# exploratory data analysis
# discard every row that contains at least one missing value
df= df.dropna(axis= 'index')

In [None]:
# count how many reviews carry each Sentiment value
df.Sentiment.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
1,5000
2,5000
3,5000
4,5000
5,4999


In [None]:
import re as reg

# fetch the NLTK data used below: the stop-word corpus and the punkt sentence tokenizer
nltk.download('stopwords')
nltk.download('punkt')

# English stop words, held in a set
stop_words= set(stopwords.words('english'))
# text clean reviews
def clean_reviews(text):
  """
  Clean one review and return it as a normalised string.

  The review is stripped of HTML-style tags, of every character that is not
  a letter, digit or whitespace, and of digits; it is then lower-cased,
  tokenized with NLTK's word_tokenize and filtered against `stop_words`.

  Args:
    text: the raw review text (a str).

  Returns:
    The surviving lower-case tokens joined by single spaces. A str is
    returned (not a token list) so the result can be stored back in the
    dataframe and passed to Tokenizer.fit_on_texts.
  """
  # remove html and other brackets
  text= reg.sub(r'<.*?>','',text)

  # remove special characters like @,#,$
  text= reg.sub(r'[^a-zA-Z0-9\s]','',text)

  # remove numbers
  text= reg.sub(r'\d+','',text)

  # lower case the text
  text= text.lower()

  # tokenize of words
  tokens= word_tokenize(text)

  # remove stop words (filter the token list, not the characters of the string)
  # NOTE(review): NLTK's English stop-word list includes negations such as
  # "not"/"no"; consider keeping them for sentiment work.
  tokens= [word for word in tokens if word not in stop_words]

  # hand back the cleaned tokens, not the pre-tokenization text
  return ' '.join(tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# run every review through clean_reviews and store the result back in place
df['Review']= df['Review'].map(clean_reviews)

In [None]:
# tokenization
# the reviews as a plain python list, which is what the tokenizer is given
product_reviews= df['Review'].tolist()

# every example is padded/truncated to this many tokens
max_words= 500

# learn the vocabulary from the reviews
tokenizer= Tokenizer()
tokenizer.fit_on_texts(product_reviews)

# map each review to a sequence of integer word ids
text_sequences= tokenizer.texts_to_sequences(product_reviews)

# bring all sequences to the same length
padded_sequences= pad_sequences(text_sequences,
                                maxlen= max_words)

# from here on `text_sequences` names the padded numpy array
text_sequences= np.array(padded_sequences)

In [None]:
# preview the first rows of the dataframe at this point
df.head()

Unnamed: 0,Review,Sentiment_2,Sentiment_3,Sentiment_4,Sentiment_5
0,Fast shipping but this product is very cheaply...,0,0,0,0
1,This case takes so long to ship and its not ev...,0,0,0,0
2,Good for not droids Not good for iPhones You c...,0,0,0,0
3,The cable was not compatible between my macboo...,0,0,0,0
4,The case is nice but did not have a glow light...,0,0,0,0


In [None]:
# divide the dataframe into features and label
X= pad_sequences(text_sequences,maxlen= max_words)

# The Sentiment_* indicator columns are not created by any visible cell, so on a
# fresh kernel they are rebuilt here from `Sentiment`. drop_first=True reproduces
# the frame previewed above: rating 1 is the dropped reference level and its rows
# are all-zero across the four columns.
label_columns= ['Sentiment_2','Sentiment_3','Sentiment_4','Sentiment_5']
if not set(label_columns).issubset(df.columns):
  df= pd.get_dummies(df,columns= ['Sentiment'],drop_first= True,dtype= int)

# NOTE(review): with a 4-unit softmax and categorical_crossentropy an all-zero
# target contributes zero loss, so rating 1 is never learned. Consider a full
# 5-column one-hot together with a 5-unit output layer.
y= df[label_columns]

In [None]:
# train_test_split
from sklearn.model_selection import train_test_split
# hold out 20% of the examples for validation; the fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size= .2,random_state= 42)

In [None]:
# model building,compiling, and training
# 1. build a model(rnn)
# The recurrent layers use SimpleRNN's default tanh activation. ReLU is unbounded,
# so over max_words recurrent steps the hidden state can grow without limit; the
# recorded ReLU run ended with loss ~6.4e12 and val_loss = nan.
# NOTE(review): the embedding width is set to max_words (the sequence length);
# confirm that such a wide embedding is intended.
rnn_model= Sequential([
    Embedding(len(tokenizer.word_index)+1,
              max_words,
              input_length= max_words),
    # first recurrent layer emits its state at every timestep for the next one
    SimpleRNN(8,return_sequences= True),
    # second recurrent layer keeps only its final state
    SimpleRNN(8),
    # one output unit per label column
    Dense(4,activation= 'softmax')])



In [None]:
# show the layer-by-layer summary of the RNN model
rnn_model.summary()

array([[    0,     0,     0, ...,     3,  1290,     2],
       [    0,     0,     0, ...,     2,    64,    88],
       [    0,     0,     0, ...,     4,     1,   122],
       ...,
       [    0,     0,     0, ...,     6,    42,   348],
       [    0,     0,     0, ...,  3996,   140,  2322],
       [    0,     0,     0, ...,    45, 10131,   140]], dtype=int32)

In [None]:
# configure the optimizer, loss and reported metric for the RNN
rnn_model.compile(optimizer= 'adam',
                  loss= 'categorical_crossentropy',
                  metrics= ['accuracy'])

# one pass over the training split, validated on the held-out split
history= rnn_model.fit(
    X_train,
    y_train,
    validation_data= (X_test,y_test),
    epochs= 1,
    batch_size= 64)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 919ms/step - accuracy: 0.2532 - loss: 6383477456896.0000 - val_accuracy: 0.3966 - val_loss: nan


In [None]:
# LSTM model, assembled one layer at a time
lstm_model= Sequential()

# word ids -> embedding vectors
lstm_model.add(Embedding(len(tokenizer.word_index)+1,
                         max_words,input_length= max_words))

# recurrent block: lstm, then normalisation and dropout
lstm_model.add(LSTM(units= 8))
lstm_model.add(BatchNormalization())
lstm_model.add(Dropout(rate= 0.5))

# fully connected block with the same normalisation and dropout
lstm_model.add(Dense(units= 8,activation= 'relu'))
lstm_model.add(BatchNormalization())
lstm_model.add(Dropout(rate= 0.5))

# softmax output layer
lstm_model.add(Dense(4,activation= 'softmax'))



In [None]:
# show the layer-by-layer summary of the LSTM model
lstm_model.summary()

In [None]:
# compile the model
lstm_model.compile(loss= "categorical_crossentropy",
                   optimizer= 'adam',
                   metrics= ['accuracy'])
# training the model: fit the LSTM compiled above for one epoch,
# validating on the held-out split
history_lstm= lstm_model.fit(X_train, y_train,
                             batch_size=64,
                             epochs=1,
                             validation_data=(X_test, y_test))

In [None]:
# plotting the history
# `history` is a dict mapping each metric name to its list of per-epoch values
metrics= history_lstm.history

# plotting training and validation loss
# (markers are drawn so that a single-epoch run is still visible)
plt.figure()
plt.plot(history_lstm.epoch,metrics['loss'],marker= 'o',label= 'loss')
plt.plot(history_lstm.epoch,metrics['val_loss'],marker= 'o',label= 'val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

# plotting train and validation accuracy
plt.figure()
plt.plot(history_lstm.epoch,metrics['accuracy'],marker= 'o',label= 'accuracy')
plt.plot(history_lstm.epoch,metrics['val_accuracy'],marker= 'o',label= 'val_accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend();

In [None]:
# make a function to review the rating
def predict_review_rating(review,model= lstm_model):
  """
  Predict the star rating of a single review.

  Args:
    review: the raw review text (a str).
    model: a trained model whose output units line up with the label columns
      Sentiment_2 .. Sentiment_5; defaults to lstm_model.

  Returns:
    The predicted rating as an integer in 2..5. The label matrix has no
    column for rating 1, so that rating can never be returned.
  """
  # apply the same cleaning the training reviews went through
  cleaned= clean_reviews(review)
  # encode with the fitted tokenizer and pad to the training sequence length
  sequences= tokenizer.texts_to_sequences([cleaned])
  testing= pad_sequences(sequences,maxlen= max_words)
  # index of the most probable output unit for the single example
  class_index= np.argmax(model.predict(testing),axis= 1)[0]
  # output unit k was trained against column Sentiment_{k+2}
  return class_index + 2