# RSA-Lite: Review Sentiment Analysis Classifier Lite
**Model Specifications**
```
version: v1.0.0 Lite
total_param: 77,954
model_type: Custom Recurrent Neural Network (RNN)
model_layers: [
    Embedding,
    LSTM,
    Dense,
    Dense,
    Dense,
    Dropout,
    Dense
]
train_data_length: 1,280,000
test_data_length: 320,000
```

**WARNING:** 
1. **THIS MODEL IS STILL UNDER DEVELOPMENT AND MIGHT BE UNSTABLE!**

Data source: [Kaggle](https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews)

Kaggle public API (requires a 'kaggle.json' file; see [Kaggle API](https://www.kaggle.com/docs/api)): 
```
# command prompt
kaggle datasets download -d kritanjalijain/amazon-reviews
```

## Importing Dependencies

In [None]:
try:
    import numpy as np
    import pandas as pd
    import os
    import shutil
    from zipfile import ZipFile
    from IPython.display import display
    from sklearn.model_selection import train_test_split

    import tensorflow as tf
    import keras
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras import Sequential
    from keras.layers import Embedding, LSTM, Dense, Dropout

    import matplotlib.pyplot as plt
    from sklearn.metrics import classification_report
    import json
except Exception:
    raise Exception(f"Error on importing dependencies!")

# for google colab users
from google.colab import files

In [None]:
def kaggle(origin_path: str, dataset_urls=None, copy=True):
    """Install a Kaggle API token and optionally download datasets.

    The credentials file is placed at ``/root/.kaggle/kaggle.json`` and its
    permissions are restricted to the owner. When ``dataset_urls`` is given,
    every entry is downloaded through the Kaggle API client.

    Args:
        origin_path: Path of the credentials file to install.
        dataset_urls: Optional iterable of dataset identifiers, each passed
            to ``KaggleApi.dataset_download_cli``. ``None`` skips downloading.
        copy: Copy the credentials file when true, move it otherwise.
    """
    root_dir = "/root/.kaggle"
    file_name = "kaggle.json"
    file_loc = os.path.join(root_dir, file_name)

    # Creates missing parents too and is a no-op when the folder exists.
    os.makedirs(root_dir, exist_ok=True)

    # Target the final file path (not just the folder) so the token always
    # ends up as "kaggle.json" and a re-run overwrites instead of failing.
    transfer = shutil.copy if copy else shutil.move
    transfer(origin_path, file_loc)

    # Owner read/write only. The mode must be octal: decimal 600 would set
    # an unrelated permission pattern.
    os.chmod(file_loc, 0o600)
    print(f"'kaggle.json' file has been copied to {file_loc}!")

    if dataset_urls is not None:
        # Imported lazily: the client looks for credentials when loaded, so
        # the token has to be in place first.
        from kaggle.api.kaggle_api_extended import KaggleApi

        api = KaggleApi()
        api.authenticate()

        for dataset in dataset_urls:
            api.dataset_download_cli(dataset)

        print("Dataset has been successfully downloaded!")

def unzip(path: str, del_src: bool):
    """Extract a zip archive into the current working directory.

    Args:
        path: Location of the zip archive.
        del_src: When true, delete the archive after it has been extracted.
    """
    # The context manager closes the archive even if extraction raises.
    with ZipFile(path, 'r') as archive:
        archive.extractall(os.getcwd())

    if del_src:
        os.remove(path)

    print(f"\"{path}\" has been extracted!")

def ws_setup(ws_name: str, data_origins: list):
    """Create a new workspace folder, move data files into it and enter it.

    The folder is created inside the current working directory, which is
    then switched to the new folder.

    Args:
        ws_name: Name of the workspace directory to create.
        data_origins: Paths that are moved into the workspace.

    Raises:
        Exception: If a directory called ``ws_name`` already exists.
    """
    workspace = os.path.join(os.getcwd(), ws_name)

    # Refuse to reuse an existing workspace.
    if os.path.isdir(workspace):
        raise Exception(f"\"{ws_name}\" directory already exists!")
    os.mkdir(workspace)

    for source in data_origins:
        shutil.move(source, workspace)

    os.chdir(workspace)

    print(f"current workspace directory \"{os.getcwd()}\"")

def display_dfs(dataframes: list):
    """Render every item of ``dataframes`` with IPython's ``display``, in order."""
    for frame in dataframes:
        display(frame)

## Project Setup

In [None]:
# Ask for the kaggle.json upload through Colab, then install the token
# (moving the uploaded file) and download the review dataset.
files.upload()

kaggle(
    origin_path="./kaggle.json",
    dataset_urls=['kritanjalijain/amazon-reviews'],
    copy=False,
)

Saving kaggle.json to kaggle.json
'kaggle.json' file has been copied to /root/.kaggle/kaggle.json!
Downloading amazon-reviews.zip to /content


100%|██████████| 1.29G/1.29G [00:05<00:00, 255MB/s]


Dataset has been successfully downloaded!





In [None]:
# Extract the downloaded archive into the current directory and delete the
# zip file afterwards.
unzip("./amazon-reviews.zip", del_src=True)

"./amazon-reviews.zip" has been extracted!


In [None]:
# Create the "workspace" folder, move both CSV files into it and make it the
# working directory (raises if the folder already exists).
ws_setup(ws_name="workspace", data_origins=['./train.csv', './test.csv'])

current workspace directory "/content/workspace"


In [None]:
headers = ['class', 'title', 'review']

# Column names are supplied explicitly, so the first line of each file is
# read as data rather than as a header.
train, test = (
    pd.read_csv(csv_path, names=headers)
    for csv_path in ("./train.csv", "./test.csv")
)

# Stack train on top of test. Each frame keeps its own row labels, so the
# combined index is not unique.
df = pd.concat([train, test], axis=0)

display_dfs([df])

Unnamed: 0,class,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
399995,1,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...
399996,1,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...
399997,1,Disappointed !!!,"I bought this toy for my son who loves the ""Th..."
399998,2,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...


In [None]:
# Map the numeric classes to names. The result is assigned back instead of
# calling replace(..., inplace=True) on the selected column, which mutates an
# intermediate object and is not guaranteed to reach `df` (it does nothing
# under pandas Copy-on-Write).
df['class'] = df['class'].replace(to_replace=[1, 2], value=['negative', 'positive'])

# One-hot encode into 'negative' / 'positive' columns. The dtype is pinned so
# the labels are 0/1 integers on every pandas version (newer versions default
# to booleans).
dummy = pd.get_dummies(df['class'], dtype='uint8')

# Swap the original 'class' column for the two indicator columns.
df = pd.concat([df.drop(columns=['class']), dummy], axis=1)

display_dfs([df])

Unnamed: 0,title,review,negative,positive
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,0,1
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,0,1
2,Amazing!,This soundtrack is my favorite music of all ti...,0,1
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,0,1
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",0,1
...,...,...,...,...
399995,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...,1,0
399996,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...,1,0
399997,Disappointed !!!,"I bought this toy for my son who loves the ""Th...",1,0
399998,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...,0,1


In [None]:
# The "lite" subset is rows 400,000 .. 1,999,999 of `df`, taken by position.
#
# NOTE: `df` is a concat of train and test that keeps each file's own row
# labels, so labels 0 .. 399,999 occur twice. Dropping by label (e.g.
# `drop(df.index[2000000:4000000])`) therefore also removes the first 400,000
# train rows, because a label-based drop removes every row sharing a label.
# The positional slice selects exactly the rows that such a drop leaves
# behind, without depending on the duplicate labels.
df_lite = df.iloc[400000:2000000].copy()
display_dfs([df_lite, df])

Unnamed: 0,title,review,negative,positive
400000,Not So Simple Sand Castles,I found this book beyond what I wanted to get ...,1,0
400001,Great cooler,This is one nice cooler - large but not too la...,0,1
400002,Great size,The size is great for almost everything you co...,0,1
400003,Cooler great,My son was happy with his Christmas gift. He p...,0,1
400004,buy something else,"I will be searching for a different cooler, as...",1,0
...,...,...,...,...
1999995,Missed to many important scenes,I read this book prior to watching any adaptat...,1,0
1999996,terrible adaptation,I don't expect Jane Austen movie adaptations t...,1,0
1999997,Love Jane Austen,This is my favorite movie version of Jane Aust...,0,1
1999998,A nice film,I was not familiar with this story at all befo...,0,1


Unnamed: 0,title,review,negative,positive
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,0,1
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,0,1
2,Amazing!,This soundtrack is my favorite music of all ti...,0,1
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,0,1
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",0,1
...,...,...,...,...
399995,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...,1,0
399996,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...,1,0
399997,Disappointed !!!,"I bought this toy for my son who loves the ""Th...",1,0
399998,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...,0,1


In [None]:
# Shuffled 80/20 split of the lite subset; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(
    df_lite,
    test_size=0.2,
    shuffle=True,
    random_state=16,
)
display_dfs([train, test])

Unnamed: 0,title,review,negative,positive
685954,Not what I expected,I went into reading this book immediately afte...,1,0
1237038,Awesome for dog clipping.,These clippers come with an adjustable blade. ...,0,1
1913299,Notice.....,notice that the reviews from 2007 are all good...,1,0
893540,Thunder Wagon from the Wind River Series,"I have been reading the Wind River series, and...",0,1
1134243,Why all the fuss????,"If you will allow a voice of dissent here, I a...",1,0
...,...,...,...,...
1223748,Easy Install,Installation was very easy and the winch worke...,0,1
498939,the miracle in a bottle,"my skin was a wreck, dry, kind of scaly and th...",0,1
498885,What on Earth Am I Here For? Purpose Driven Life,It's great like all Warren's writing. Is very ...,0,1
1319673,DANTE IS THE MAN,This is by far 1 of the best books I've ever r...,0,1


Unnamed: 0,title,review,negative,positive
1297867,Un-American Expedition,I purchased this and an American Expedition co...,1,0
1896803,Very handy,"I am very glad I have this book. For some, the...",0,1
1774745,Dissapointing,"After the brilliant ""Go Now"", ""Godlike"" clocks...",1,0
1452126,Must remind myself not to spend time with book...,Crappy mystery novel about a forensic anthropo...,1,0
1794199,Great fit!!,"These mats slipped right in, and fit snug to k...",0,1
...,...,...,...,...
1931515,Dont waste your time with 'new' Lamb.,Lamb 1 and 2 - good.Lamb 3 and Songs for the F...,1,0
1620305,"It's OK, I guess.",I mulled over buying this funnel/strainer for ...,1,0
800650,call the bomb squad,Somehow the spirit of Christmas and gangsta ra...,1,0
506168,Not so durable,I have bought a couple Kong items for my three...,1,0


## Preprocessing

In [None]:
# Vocabulary size kept by the tokenizer and width of the embedding vectors.
voc_sz = 1500
embedding_dim = 16

# Label columns, in the order the model's two outputs are interpreted.
y = ['negative', 'positive']

train_text, train_label = train['review'].values, train[y].values
test_text, test_label = test['review'].values, test[y].values

# Use a dedicated out-of-vocabulary marker. A plain letter such as 'x' can
# also occur as a word in the reviews and would then clash with the OOV entry
# in the tokenizer's word index.
tokenizer = Tokenizer(num_words=voc_sz, oov_token='<OOV>')

# Build the vocabulary from the training split only, so the test split stays
# unseen data. Test words outside the vocabulary map to the OOV token.
tokenizer.fit_on_texts(train_text)

In [None]:
# Convert every review of both splits into a list of integer token ids.
train_seq, test_seq = map(tokenizer.texts_to_sequences, (train_text, test_text))

In [None]:
# Estimate the padded sequence length from the first 1001 training reviews.
samples = train_text[:1001]
print(type(samples))

# Character count of each sampled review.
length = [len(sample_text) for sample_text in samples]

avg_len = round(sum(length) / len(length))
# A quarter of the mean character count is used as the padding length.
max_len = round(avg_len / 4)
print(f'average text length: {avg_len}\npadding maxlen (sigma i=0 to n=301 train_i / 4): {max_len}')

<class 'numpy.ndarray'>
average text length: 410
padding maxlen (sigma i=0 to n=301 train_i / 4): 102


In [None]:
# Bring every sequence to exactly max_len ids; shorter ones are padded at the
# end.
pad_opts = dict(padding='post', maxlen=max_len)
padded_train_txt = pad_sequences(train_seq, **pad_opts)
padded_test_txt = pad_sequences(test_seq, **pad_opts)

## Modeling

In [None]:
# Embedding -> LSTM -> three ReLU dense layers -> dropout -> 2-unit sigmoid
# output (one unit per label column: negative, positive).
model = Sequential([
    Embedding(input_dim=voc_sz, output_dim=embedding_dim),
    LSTM(64),
    Dense(128, activation="relu"),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.6),
    Dense(2, activation="sigmoid"),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          24000     
                                                                 
 lstm (LSTM)                 (None, 64)                20736     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 2)                 1

In [None]:
class callback(keras.callbacks.Callback):
    """Stop training once the epoch's training accuracy exceeds 0.925."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the 'accuracy' metric after each epoch and stop if it is high enough.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric results for the epoch, or ``None``.
        """
        # A None default avoids sharing one mutable dict between calls.
        accuracy = (logs or {}).get('accuracy')

        # The metric is missing when 'accuracy' was not compiled into the
        # model; skip the check instead of comparing None with a float.
        if accuracy is not None and accuracy > 0.925:
            print("\naccuracy reached >92%)")
            self.model.stop_training = True

alpha_callback = callback()

## Training

In [None]:
# Train for at most 30 epochs; alpha_callback stops earlier once the training
# accuracy threshold is reached. The test split is used for validation.
hist = model.fit(
    x=padded_train_txt,
    y=train_label,
    epochs=30,
    validation_data=(padded_test_txt, test_label),
    callbacks=[alpha_callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
accuracy reached >92%)


In [None]:
# Save the trained model to a single file in the working directory
# (the .h5 extension presumably selects Keras' HDF5 format).
model.save("rsa_v1-lite.h5")

In [None]:
# Export in TensorFlow SavedModel format, then zip the export folder.
export_dir = 'rsa_v1-lite/'
tf.saved_model.save(model, export_dir)

# Kept as the last expression so the notebook shows the archive path.
shutil.make_archive(base_name="rsa_v1-lite", format="zip", root_dir="./rsa_v1-lite")



INFO:tensorflow:Assets written to: rsa_v1-lite/assets


INFO:tensorflow:Assets written to: rsa_v1-lite/assets


'/content/workspace/rsa_v1-lite.zip'

In [None]:
# Dump both tokenizer lookup tables as JSON text: word -> index and
# index -> word, each written under a .txt and a .py file name.
exports = (
    ('token.txt', tokenizer.word_index),
    ("token.py", tokenizer.word_index),
    ("token_index_w.txt", tokenizer.index_word),
    ("token_index_w.py", tokenizer.index_word),
)

for file_name, mapping in exports:
    with open(file_name, "w") as out:
        out.write(json.dumps(mapping))

## Testing

In [None]:
def rsa_predict(review: str, display_array=True):
    """Classify one review as negative or positive and print the result.

    Args:
        review: Raw review text.
        display_array: When true, also print the raw prediction array.

    Raises:
        Exception: If both output scores are equal, so no class can be chosen.
    """
    # texts_to_sequences expects a collection of texts. A bare string would be
    # iterated character by character, giving one sequence per character, so
    # the review is wrapped in a list to be encoded as a single text.
    seq = tokenizer.texts_to_sequences([review])
    padded_txt = pad_sequences(seq, padding="post", maxlen=max_len)

    raw_pred = model.predict(padded_txt)

    # One input row -> one prediction row: [negative score, positive score].
    negative_score = raw_pred[0][0]
    positive_score = raw_pred[0][1]

    if negative_score > positive_score:
        txt_class = "Negative"
    elif positive_score > negative_score:
        txt_class = "Positive"
    else:
        raise Exception(f"RSA-Lite was unable to classify the given text. Received array:\n{raw_pred}")

    print(f"Original Text: \n{review}")

    if display_array:
        print(f"Class: {txt_class}\n{raw_pred}")
    else:
        print(f"Class: {txt_class}")

## Evaluation

In [None]:
# Smoke test: classify the last review of the training split.
reviews = train['review'].tolist()
last_review = reviews[-1]

rsa_predict(last_review, display_array=False)

Original Text: 
This feeder is very nice. The ring can be full of birds yet the feeding holes remain open. So far the squirrels haven't been able to get in.
Class: Positive
