In [5]:
import numpy as np 
import pandas as pd


### Best Practises 
1. preprocessing and Cleaning 
2. Train,Test split 
3. Bow , tfidf, word2vec 
4. Train Ml algorithm 

In [6]:

# load the dataset 
df = pd.read_json('Kindle_Store_5.json', lines=True)

In [7]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1F6404F1VG29J,B000F83SZQ,Avidreader,"[0, 0]",I enjoy vintage books and movies so I enjoyed ...,5,Nice vintage story,1399248000,"05 5, 2014"
1,AN0N05A9LIJEQ,B000F83SZQ,critters,"[2, 2]",This book is a reissue of an old one; the auth...,4,Different...,1388966400,"01 6, 2014"
2,A795DMNCJILA6,B000F83SZQ,dot,"[2, 2]",This was a fairly interesting read. It had ol...,4,Oldie,1396569600,"04 4, 2014"
3,A1FV0SX13TWVXQ,B000F83SZQ,"Elaine H. Turley ""Montana Songbird""","[1, 1]",I'd never read any of the Amy Brewster mysteri...,5,I really liked it.,1392768000,"02 19, 2014"
4,A3SPTOKDG7WBLN,B000F83SZQ,Father Dowling Fan,"[0, 1]","If you like period pieces - clothing, lingo, y...",4,Period Mystery,1395187200,"03 19, 2014"


In [8]:
df.shape

(982619, 9)

In [9]:
df.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [10]:
data = df[['reviewText', 'overall']]
data.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,5
1,This book is a reissue of an old one; the auth...,4
2,This was a fairly interesting read. It had ol...,4
3,I'd never read any of the Amy Brewster mysteri...,5
4,"If you like period pieces - clothing, lingo, y...",4


In [11]:
data.shape

(982619, 2)

In [12]:
## Missing Values 
data.isnull().sum()

reviewText    0
overall       0
dtype: int64

In [13]:
data['overall'].unique()

array([5, 4, 3, 2, 1], dtype=int64)

In [14]:
data['overall'].value_counts()

overall
5    575264
4    254013
3     96194
2     34130
1     23018
Name: count, dtype: int64

## preprocessing And cleaning the data 



In [15]:
## positive reviews is 1 and negative reviews is 0 
data['overall'] = data['overall'].apply(lambda x: 0 if x <3 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['overall'] = data['overall'].apply(lambda x: 0 if x <3 else 1)


In [16]:
data['overall'].value_counts()

overall
1    925471
0     57148
Name: count, dtype: int64

In [17]:
data.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,1
1,This book is a reissue of an old one; the auth...,1
2,This was a fairly interesting read. It had ol...,1
3,I'd never read any of the Amy Brewster mysteri...,1
4,"If you like period pieces - clothing, lingo, y...",1


### because the target is unsambling 

In [18]:
from sklearn.utils import resample

# Use the preprocessed 'data' DataFrame
df_majority = data[data.overall == 1]
df_minority = data[data.overall == 0]

# Only perform resampling if the minority class is not empty
if df_minority.shape[0] > 0:
    df_majority_downsampled = resample(
        df_majority,
        replace=False,
        n_samples=df_minority.shape[0],  # to match minority class
        random_state=42
    )
    df_balanced = pd.concat([df_majority_downsampled, df_minority])
    # Shuffle
    df_balanced = df_balanced.sample(frac=1, random_state=42)
else:
    print("No samples found in the minority class (overall == 0). Cannot perform downsampling.")
    df_balanced = data.copy()


In [19]:
df_balanced.shape

(114296, 2)

In [20]:
df_balanced['overall'].value_counts()

overall
0    57148
1    57148
Name: count, dtype: int64

In [21]:
df_balanced.head()

Unnamed: 0,reviewText,overall
571772,"Description sad shorts, that a big ole lie. Al...",0
130982,"I agree with another reviewer, it is time to m...",1
883433,&#34;His deep and piercing blue eyes were star...,1
686816,"Vampires, Sirens, and witches Oh my!This cover...",1
771649,if you like a good love story without b#$chy e...,1


In [22]:
## 1. Lower All the cases
df_balanced['reviewText'] = df_balanced['reviewText'].str.lower()

In [23]:
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
from bs4 import BeautifulSoup
#lxml
import html

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:


## Remove special characters 
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x : re.sub('[^a-z A-z 0-9]+', '', x))



In [25]:
## Remove any extra spaces
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x: " ".join(x.split()))

In [26]:

## Remove the Url 
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%:/~+#-]*[\w@?^=%&/~+#-])?', '', str(x)))

In [27]:


from bs4 import BeautifulSoup
# Remove HTML tags
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x: BeautifulSoup(x, "html").get_text())

In [28]:
# Remove stop words
stop_words = set(stopwords.words('english'))
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [29]:
df_balanced.shape

(114296, 2)

In [30]:
df_balanced.head()

Unnamed: 0,reviewText,overall
571772,description sad shorts big ole lie pages get i...,0
130982,agree another reviewer time move story lacy se...,1
883433,34his deep piercing blue eyes staring right sh...,1
686816,vampires sirens witches oh mythis cover awesom...,1
771649,like good love story without bchy ex girlfrien...,1


In [31]:
# lemmatization
from nltk.stem import WordNetLemmatizer
lemetizer = WordNetLemmatizer()


In [32]:
def lemmatize_text(text):
    return ' '.join([lemetizer.lemmatize(word) for word in text.split()])

In [33]:
df_balanced['reviewText'] = df_balanced['reviewText'].apply(lambda x: lemmatize_text(x))

In [34]:
df_balanced.head()

Unnamed: 0,reviewText,overall
571772,description sad short big ole lie page get int...,0
130982,agree another reviewer time move story lacy se...,1
883433,34his deep piercing blue eye staring right sho...,1
686816,vampire siren witch oh mythis cover awesome lo...,1
771649,like good love story without bchy ex girlfrien...,1


In [35]:
## train test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced['reviewText'], df_balanced['overall'], test_size=0.2, random_state=42)

In [36]:
X_train

954476    wow even short story something didnt enjoy ive...
933004    journey like dahlia choice would bite cat inst...
612       instalust someone whose face havent seen even ...
702643    blake karrington truth beast pen trapstar 3 ab...
31127     might interesting plot difficult read doesnt f...
                                ...                        
669157    probably shouldnt review book didnt finish ter...
50125     book 95 cent kindle keep money read like barba...
333507    good advice great tip bought get celebrity cha...
889090    5 starsi love bad boy archer much katy preache...
16155     battery ran third time used appliance impresse...
Name: reviewText, Length: 91436, dtype: object

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=2500)  # Limit to 5000 features for efficiency
X_train_bow = bow.fit_transform(X_train).toarray()
x_test_bow = bow.transform(X_test).toarray()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
x_test_tfidf = tfidf.transform(X_test).toarray()


In [None]:
X_train_bow.shape, x_test_bow.shape, X_train_tfidf.shape, x_test_tfidf.shape

((91436, 2500), (22860, 2500), (91436, 5000), (22860, 5000))

In [None]:
X_train_bow[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)


NameError: name 'X_train_bow' is not defined

In [None]:
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred_bow = nb_bow.predict(x_test_bow)
y_pred_tfidf = nb_tfidf.predict(x_test_tfidf)

print("Bag of Words Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_bow))
print("Classification Report:\n", classification_report(y_test, y_pred_bow))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bow))

In [None]:
print("TF-IDF Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tfidf))