In [1]:
# Prerequisites
## %pip install datasets
## %pip install imbalanced-learn
## %pip install lxml and restart runtime

# References
# https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023
# https://lxml.de/installation.html

In [2]:
#from datasets import load_dataset

In [3]:
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
# print(dataset["full"][0])

In [4]:
# dataset["full"].to_csv("amazon_reviews.csv", index=False)

In [5]:
import pandas as pd
import numpy as np

In [6]:
## Load the dataset
data = pd.read_csv("amazon_reviews.csv")
data.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True


In [7]:
data = data[['text', 'rating']]
data.head()

Unnamed: 0,text,rating
0,This spray is really nice. It smells really go...,5.0
1,"This product does what I need it to do, I just...",4.0
2,"Smells good, feels great!",5.0
3,Felt synthetic,1.0
4,Love it,5.0


In [8]:
data.shape

(701528, 2)

In [9]:
## Missing Values
data.isnull().sum()

text      212
rating      0
dtype: int64

In [10]:
## Checking null records
data[data['text'].isnull()]

Unnamed: 0,text,rating
9078,,5.0
15159,,5.0
28358,,5.0
29389,,4.0
34286,,3.0
...,...,...
693459,,4.0
700225,,2.0
700357,,5.0
700744,,3.0


In [11]:
## Dropping null records
data = data.dropna()
data[data['text'].isnull()]

Unnamed: 0,text,rating


In [12]:
## Checking the dependent variable
data['rating'].unique()

array([5., 4., 1., 3., 2.])

In [13]:
## Checking if data is balanced
data['rating'].value_counts()

rating
5.0    420566
1.0    102067
4.0     79357
3.0     56294
2.0     43032
Name: count, dtype: int64

Imbalance dataset :(

We can use smothe to fix the problem or ensemble techniques

In [14]:
## Changing ratings to binary to have two classes:
### 1-3: Negative
### 4-5: Positive

In [15]:
data['rating'] = data['rating'].apply(lambda rating: 0 if rating < 3 else 1)

In [16]:
data.head()

Unnamed: 0,text,rating
0,This spray is really nice. It smells really go...,1
1,"This product does what I need it to do, I just...",1
2,"Smells good, feels great!",1
3,Felt synthetic,0
4,Love it,1


In [17]:
data['rating'].value_counts()

rating
1    556217
0    145099
Name: count, dtype: int64

In [18]:
## Print the percent of positive and negative reviews
print("Positive reviews: ", data['rating'].value_counts()[1]/len(data))
print("Negative reviews: ", data['rating'].value_counts()[0]/len(data))

Positive reviews:  0.7931046774920293
Negative reviews:  0.20689532250797074


In [19]:
## Preprocessing
### 1. Lower all the cases
data['text'] = data['text'].str.lower()

In [20]:
data.head()

Unnamed: 0,text,rating
0,this spray is really nice. it smells really go...,1
1,"this product does what i need it to do, i just...",1
2,"smells good, feels great!",1
3,felt synthetic,0
4,love it,1


In [21]:
import regex as re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [22]:
stopwords_set = set(stopwords.words('english'))

In [23]:
### 2. Cleaning the text

#### Removing urls
data['text'] = data['text'].apply(lambda text: re.sub(r'(http|https)://[\w_-]+\.[\w_-]+\.[\w_-]+[\w\-\/]+', '', text))

#### Removing html tags
data['text'] = data['text'].apply(lambda text: BeautifulSoup(text, 'lxml').get_text())

#### Removing special characters
data['text'] = data['text'].apply(lambda text : re.sub('[^a-zA-z0-9\s]+', '', text))

#### Triming the text
data['text'] = data['text'].apply(lambda text: text.strip())

#### Removing stopwords
data['text'] = data['text'].apply(lambda text: " ".join([word for word in text.split() if word not in stopwords_set]))

### Removing any additional spaces
data['text'] = data['text'].apply(lambda text: " ".join(text.split()))

data.head()

  data['text'] = data['text'].apply(lambda text: BeautifulSoup(text, 'lxml').get_text())


Unnamed: 0,text,rating
0,spray really nice smells really good goes real...,1
1,product need wish odorless soft coconut smell ...,1
2,smells good feels great,1
3,felt synthetic,0
4,love,1


In [24]:
#### Apply lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
data['text'] = data['text'].apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))

data.head()

Unnamed: 0,text,rating
0,spray really nice smell really good go really ...,1
1,product need wish odorless soft coconut smell ...,1
2,smell good feel great,1
3,felt synthetic,0
4,love,1


In [25]:
## Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['rating'], test_size=0.2, random_state=42)

In [26]:
## Vectorization
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer()
X_train_tf = tfid.fit_transform(X_train)
X_test_tf = tfid.transform(X_test)

In [None]:
#### Pending, implement also Word2Vec

In [None]:
## Reducing the dimensionality because of the sparsity which causes memory issues
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300, random_state=42)
X_train_bow_svd = svd.fit_transform(X_train_bow)
X_test_bow_svd = svd.transform(X_test_bow)

X_train_tf_svd = svd.fit_transform(X_train_tf)
X_test_tf_svd = svd.transform(X_test_tf)

In [42]:
## Training the model
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(X_train_bow_svd, y_train)
nb_model_tf = GaussianNB().fit(X_train_tf_svd, y_train)

In [36]:
## Evaluating the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [40]:
y_pred_bow = nb_model_bow.predict(X_test_bow_svd)
y_pred_tf = nb_model_tf.predict(X_test_tf_svd)

print("Naive Bayes with BOW")
print("Accuracy: ", accuracy_score(y_test, y_pred_bow))

print()
print(classification_report(y_test, y_pred_bow))

print()
print(confusion_matrix(y_test, y_pred_bow))

print("\nNaive Bayes with TF")
print("Accuracy: ", accuracy_score(y_test, y_pred_tf))

print()
print(classification_report(y_test, y_pred_tf))

print()
print(confusion_matrix(y_test, y_pred_tf))

Naive Bayes with BOW
Accuracy:  0.5465336793475161

              precision    recall  f1-score   support

           0       0.28      0.77      0.41     29119
           1       0.89      0.49      0.63    111145

    accuracy                           0.55    140264
   macro avg       0.59      0.63      0.52    140264
weighted avg       0.76      0.55      0.59    140264


[[22410  6709]
 [56896 54249]]

Naive Bayes with TF
Accuracy:  0.6746135858096162

              precision    recall  f1-score   support

           0       0.37      0.77      0.50     29119
           1       0.92      0.65      0.76    111145

    accuracy                           0.67    140264
   macro avg       0.64      0.71      0.63    140264
weighted avg       0.80      0.67      0.71    140264


[[22539  6580]
 [39060 72085]]


In [43]:
## Training the model
from sklearn.ensemble import RandomForestClassifier
rfc_model_bow = RandomForestClassifier().fit(X_train_bow_svd, y_train)
rfc_model_tf = RandomForestClassifier().fit(X_train_tf_svd, y_train)

KeyboardInterrupt: 

In [None]:
y_pred_bow = rfc_model_bow.predict(X_test_bow_svd)
y_pred_tf = rfc_model_tf.predict(X_test_tf_svd)

print("Naive Bayes with BOW")
print("Accuracy: ", accuracy_score(y_test, y_pred_bow))

print()
print(classification_report(y_test, y_pred_bow))

print()
print(confusion_matrix(y_test, y_pred_bow))

print("\nNaive Bayes with TF")
print("Accuracy: ", accuracy_score(y_test, y_pred_tf))

print()
print(classification_report(y_test, y_pred_tf))

print()
print(confusion_matrix(y_test, y_pred_tf))