In [14]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
! pip install bs4 
# in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz




[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Read Data

In [16]:
url = "https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz"
data = pd.read_csv(url, sep='\t',on_bad_lines='skip')

  data = pd.read_csv(url, sep='\t',on_bad_lines='skip')


## Keep Reviews and Ratings

In [17]:
# Exrtacting the required fields from the dataset
data = data[["review_body", "star_rating"]]

In [18]:
pd.set_option('display.max_colwidth', None)

print(data.head(3))

                                                                                                                                                   review_body  \
0                                                                                                                                               Great product.   
1  What's to say about this commodity item except, what have we come to in this world.<br />Having the need to bnuy captured and compressed air. &#60;lol&#62;   
2                                                                                                              Haven't used yet, but I am sure I will like it.   

  star_rating  
0           5  
1           5  
2           5  


 ## We form three classes and select 10000 reviews randomly from each class.



In [19]:
rating_stats = data.star_rating.value_counts()
for i , j in rating_stats.items():
    print(f'Number of reviews having rating {i} : {j}')
    
print()
# removing any rating except from 1, 2, 3, 4, and 5. There are some ratings with NaN and string values
data.loc[:, 'star_rating'] = pd.to_numeric(data.star_rating, errors="coerce")

data = data.dropna()
# data["star_rating"] = data["star_rating"].astype(int)
print()
rating_stats = data.star_rating.value_counts()
for i , j in rating_stats.items():
    print(f'Number of reviews having rating {i} : {j}')

# ignoring all neutral rating (class 3)
neutral_rating = data[data["star_rating"] == 3].shape[0]

data.drop(data[data.star_rating == 3].index, inplace=True)
# data = data[data['star_rating'] != 3]
data["sentiment"] = data["star_rating"].apply(lambda x: 1 if x > 3 else 0)

# selecting 100000 reviews from positive and negative class on a raandom basis
rating_1 = data[data["sentiment"] == 1].sample(n=100000, random_state=1)
rating_0 = data[data["sentiment"] == 0].sample(n=100000, random_state=1)

# number of total reviews in each class: positive, negative and neutral
positive_rating = data[data["sentiment"] == 1].shape[0]
negative_rating = data[data["sentiment"] == 0].shape[0]

print(f"\n# of Positive rating: {positive_rating}")
print(f"# of Negative rating: {negative_rating}")
print(f"# of Neutral rating: {neutral_rating}")

final_sample = pd.concat([rating_0, rating_1])

Number of reviews having rating 5 : 1459036
Number of reviews having rating 4 : 389612
Number of reviews having rating 1 : 286080
Number of reviews having rating 3 : 179871
Number of reviews having rating 2 : 129033
Number of reviews having rating 5 : 123776
Number of reviews having rating 4 : 28759
Number of reviews having rating 1 : 20899
Number of reviews having rating 3 : 13820
Number of reviews having rating 2 : 9351
Number of reviews having rating 2015-06-05 : 1
Number of reviews having rating 2015-02-11 : 1
Number of reviews having rating 2014-02-14 : 1


Number of reviews having rating 5.0 : 1582704
Number of reviews having rating 4.0 : 418348
Number of reviews having rating 1.0 : 306967
Number of reviews having rating 3.0 : 193680
Number of reviews having rating 2.0 : 138381

# of Positive rating: 2001052
# of Negative rating: 445348
# of Neutral rating: 193680


# Data Cleaning

# Pre-processing

In [20]:
avg_len_before = final_sample["review_body"].apply(len).mean()

In [21]:
# convert reviews to lowercase strings
final_sample["review_body"] = final_sample["review_body"].str.lower()

# to remove HTML tags
final_sample["review_body"] = final_sample["review_body"].apply(lambda text: re.sub(r'<.*?>', '', text)  if type(text) == str else '')

# to remove URLs
final_sample["review_body"] = final_sample["review_body"].apply(lambda text: re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE))

# to remove non-alphabetic characters except for aphostophes as they will be removed by expanding the contractions
final_sample["review_body"] = final_sample["review_body"].apply(lambda text: re.sub(r'[^a-z\s\']', '', text))

# to remove extra spaces
final_sample["review_body"] = final_sample["review_body"].apply(lambda text: re.sub(r'\s+', ' ', text).strip())

# contractions dictionary for expanding the same. For contractions with multiple expansion I have taken the ones that fit most closely according to me. 
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# expanding contractions
final_sample["review_body"] = final_sample["review_body"].apply(lambda text: ' '.join([contractions_dict[word] if word in contractions_dict else word for word in text.split()]))

In [22]:
avg_len_after = final_sample["review_body"].apply(len).mean()
print(f"Average length of reviews before and after data cleaning: {avg_len_before}, {avg_len_after}")

Average length of reviews before and after data cleaning: 317.722825, 301.98912


## remove the stop words 

In [23]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords_en = stopwords.words('english')

avg_len_before = data["review_body"].apply(len).mean()

# removing stop words
# final_sample["review_body"] = final_sample["review_body"].apply(lambda text: ' '.join([word if word not in stopwords_en else '' for word in text.split()]))
final_sample["review_body"] = final_sample["review_body"].apply( lambda x : ' '.join([i for i in x.split() if i not in (stopwords_en)]))

print(final_sample.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        review_body  \
2081506                                                                                                                                                                                                          

## perform lemmatization  

In [24]:
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from collections import defaultdict

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# creating a pos tag map for proper lemmatization
pos_tag_map = defaultdict(lambda : wn.NOUN)
pos_tag_map['J'] = wn.ADJ
pos_tag_map['V'] = wn.VERB
pos_tag_map['R'] = wn.ADV

lemmatizer = WordNetLemmatizer()

# for each review: we would contextually tokenize the sentence using word_tokenize, this will give us the word token and it's pos_tag. This tag will be mapped with pos_tag_map to get the appropriate wordnet pos_tag that will be feed into the lemmatizer.
final_sample["review_lemmatize"] = final_sample["review_body"].apply(lambda text: ' '.join([lemmatizer.lemmatize(word_token, pos_tag_map[word_pos_tag[0]])
 for word_token, word_pos_tag in pos_tag(word_tokenize(text))]))

avg_len_after = final_sample["review_body"].apply(len).mean()
print(f"\nAverage length of reviews before and after data cleaning: {round(avg_len_before, 2)}, {avg_len_after}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Average length of reviews before and after data cleaning: 279.14, 191.664955


In [25]:
pd.set_option('display.max_colwidth', None)

print(final_sample[["review_lemmatize", "review_body"]].head(3))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   review_lemmatize  \
2081506                                                                                                                                                                                                                                          

# TF-IDF Feature Extraction

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
tfidf_features = tfidf_vectorizer.fit_transform(final_sample["review_lemmatize"].tolist())

# to get the features that the vectorizer selected 
feature_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
# final_df = pd.concat([feature_df, final_sample])

print(feature_df.head())

    aa  aaa  aaa battery   ability  able  able find  able get  able print  \
0  0.0  0.0          0.0  0.000000   0.0        0.0       0.0         0.0   
1  0.0  0.0          0.0  0.097509   0.0        0.0       0.0         0.0   
2  0.0  0.0          0.0  0.000000   0.0        0.0       0.0         0.0   
3  0.0  0.0          0.0  0.000000   0.0        0.0       0.0         0.0   
4  0.0  0.0          0.0  0.000000   0.0        0.0       0.0         0.0   

   able use  absolute  ...  yield  young  youtube   yr  zebra  zero  \
0       0.0       0.0  ...    0.0    0.0      0.0  0.0    0.0   0.0   
1       0.0       0.0  ...    0.0    0.0      0.0  0.0    0.0   0.0   
2       0.0       0.0  ...    0.0    0.0      0.0  0.0    0.0   0.0   
3       0.0       0.0  ...    0.0    0.0      0.0  0.0    0.0   0.0   
4       0.0       0.0  ...    0.0    0.0      0.0  0.0    0.0   0.0   

   zero star  zip  zipper  zire  
0        0.0  0.0     0.0   0.0  
1        0.0  0.0     0.0   0.0  
2       

# Spliting Data

In [27]:
from sklearn.model_selection import train_test_split

X = feature_df
y = final_sample["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Perceptron

In [29]:
from sklearn.linear_model import Perceptron

# Initialize the Perceptron model
perceptron = Perceptron()

# Train the model
perceptron.fit(X_train, y_train)

# Make predictions
y_train_pred = perceptron.predict(X_train)
y_test_pred = perceptron.predict(X_test)

# Calculate metrics for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Calculate metrics for testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print metrics
print("Training Metrics:")
print("Accuracy:", accuracy_train)
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1 Score:", f1_train)

print("\nTesting Metrics:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)

Training Metrics:
Accuracy: 0.8590875
Precision: 0.9488397531057113
Recall: 0.759078919405448
F1 Score: 0.8434175070144734

Testing Metrics:
Accuracy: 0.85105
Precision: 0.9386748267033036
Recall: 0.7512870495326636
F1 Score: 0.8345918933925597


# Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model
logreg.fit(X_train, y_train)

# Make predictions on the training and testing sets
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

# Calculate metrics for the training set
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Calculate metrics for the testing set
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print the metrics
print("Training Metrics:")
print("Accuracy:", accuracy_train)
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1 Score:", f1_train)

print("\nTesting Metrics:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)


Training Metrics:
Accuracy: 0.90476875
Precision: 0.9091023956332761
Recall: 0.8994537021989424
F1 Score: 0.9042523109019273

Testing Metrics:
Accuracy: 0.898125
Precision: 0.9030968525452889
Recall: 0.8920377867746289
F1 Score: 0.8975332545449972


# SVM


In [31]:
from sklearn.svm import LinearSVC

# Initialize the SVM model
svm = LinearSVC(dual=True)

# Train the model
svm.fit(X_train, y_train)

# Make predictions
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

# Calculate metrics for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Calculate metrics for testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print metrics
print("Training Metrics:")
print("Accuracy:", accuracy_train)
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1 Score:", f1_train)

print("\nTesting Metrics:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)

Training Metrics:
Accuracy: 0.908825
Precision: 0.9113780740927102
Recall: 0.9057042491217981
F1 Score: 0.9085323033707865

Testing Metrics:
Accuracy: 0.896275
Precision: 0.8990437845998993
Recall: 0.8928874893787174
F1 Score: 0.8959550618150813


# Naive Bayes

In [32]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Multinomial Naive Bayes model
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions on the training and testing sets
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

# Calculate metrics for the training set
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Calculate metrics for the testing set
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print the metrics
print("Training Metrics:")
print("Accuracy:", accuracy_train)
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1 Score:", f1_train)

print("\nTesting Metrics:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)


Training Metrics:
Accuracy: 0.86659375
Precision: 0.864345708463794
Recall: 0.8696510944707662
F1 Score: 0.8669902852122111

Testing Metrics:
Accuracy: 0.864775
Precision: 0.8625571229882774
Recall: 0.8679462188234118
F1 Score: 0.8652432796033782
