<a href="https://colab.research.google.com/github/fork52/Sentiment-Analyzer/blob/master/Sentiment_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CONNECT TO DRIVE

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Set Drive Directory


In [2]:
cd "/content/drive/My Drive/Sentiment Analyzer/"

/content/drive/My Drive/Sentiment Analyzer


In [3]:
!ls

meta.tsv  reviews_Apps_for_Android_5.csv      reviews_new.pkl	 vecs.tsv
models	  reviews_Apps_for_Android_5.json.gz  sentences_new.pkl


## Import Necessary Libraries
 

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import json
import gzip
import pickle
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load data

## Extracting data from the .gz file
Do only for the first time

In [None]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

# load the dataframe in the variable df
df = getDF('reviews_Apps_for_Android_5.json.gz')

In [None]:
# Saving the data frame as csv
df.to_csv('reviews_Apps_for_Android_5.csv')

## Load data from csv file

In [None]:
df = pd.read_csv('reviews_Apps_for_Android_5.csv')

# Exploring the data
Make sure you loaded the dataframe

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,"11 2, 2013"
1,1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,"12 5, 2011"
2,2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,"05 21, 2012"
3,3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,"12 6, 2012"
4,4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,"02 1, 2014"


## No of records in the data

In [None]:
# No of records = 752937
print( 'No of records:' , len(df) )

No of records: 752937


## Taking a look at a few reviews.
Change the index to see different reviews and their ratings.


In [None]:
index = 41170
print('Review:\n  ', df.iloc[index]['reviewText'])
print('Summary:',df.iloc[index]['summary'])
print('Ratings:',df.iloc[index]['overall'] )

Review:
   nan
Summary: My picture is worth a 1000 words
Ratings: 5.0


In [None]:
df['overall'].describe()

count    752937.000000
mean          3.968931
std           1.342484
min           1.000000
25%           3.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: overall, dtype: float64

# Get Sentences and Get Lablels

## Obtain Sentences from the dataframe

In [None]:
def get_sentences(df):
    n = len(df)
    sentences = [ df.iloc[i]['reviewText'] for i in range(n) if isinstance(df.iloc[i]['reviewText'],str) ]
    return sentences

def get_labels(df):
    n = len(df)
    reviews = [ int(df.iloc[i]['overall'])-1 for i in range(n) if isinstance(df.iloc[i]['reviewText'],str) ]
    return reviews

In [None]:
sentences = get_sentences(df)
reviews = get_labels(df)

In [None]:
#save the files
with open('sentences_new.pkl', 'wb') as f:
    pickle.dump(sentences, f)
with open('reviews_new.pkl', 'wb') as f:
    pickle.dump(reviews, f)

## Load data from the .pkl files

In [5]:
with open('sentences_new.pkl', 'rb') as f:
    sentences = pickle.load(f)
with open('reviews_new.pkl', 'rb') as f:
    reviews = pickle.load(f)
print(len(sentences),len(reviews))

752927 752927


# Train-Test split

In [6]:
X_train, X_test, y_train, y_test=train_test_split(
    sentences, reviews , test_size=0.05, random_state=42
)

print( len(X_train) , len(y_train) )
print( len(X_test)  , len(y_test) )

715280 715280
37647 37647


# Tokenize the string and perform padding on the string

In [7]:
# Set hyperparameters
vocab_size = 20000
embedding_dim = 32
oov_tok = "<OOV>"
trunc_type='post'
max_length = 220
padding_type='post'


# Fit tokenizer on the training data
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

#save the tokenizer
with open('models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer,f)

In [31]:
def prepare_input(X):
    '''Get the padded sequences'''
    sequences = tokenizer.texts_to_sequences(X)

    padded = pad_sequences(
                           sequences,
                           maxlen=max_length,
                           truncating=trunc_type,
                           padding=padding_type
                        )
    return sequences

In [32]:
 # Prepare testing and training data
train_data = prepare_input(X_train)
testing_data = prepare_input(X_test)

# Defining the model

In [35]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          640000    
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 128)         49664     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 325       
Total params: 735,365
Trainable params: 735,365
Non-trainable params: 0
_________________________________________________________________


# Training the model

In [36]:
num_epochs = 6
model.fit(
          train_data, 
          np.array(y_train,dtype=int),
          epochs=num_epochs, 
          validation_data=( testing_data , np.array(y_test) )
)

ValueError: ignored

In [None]:
model.save('models/basic_model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/basic_model/assets


In [None]:
#load the saved model
new_model = tf.keras.models.load_model('models/basic_model')
model = new_model

In [None]:
new_sent = ['The product is just decent and amazing.']
new_data = prepare_input(new_sent)
rating = np.argmax( model.predict(new_data) ) + 1
print('Rating given by model is:',rating)

Rating given by model is: 5


In [None]:
# Reverse the model's index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Get the weights from the model's embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


# Write the embeddings to the files
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()


try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

(20000, 32)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>