In [3]:
import numpy as np
import pandas as pd
df=pd.read_csv('./all_kindle_review .csv')
df=df[['rating','reviewText']]
df

Unnamed: 0,rating,reviewText
0,3,"Jace Rankin may be short, but he's nothing to ..."
1,5,Great short read. I didn't want to put it dow...
2,3,I'll start by saying this is the first of four...
3,3,Aggie is Angela Lansbury who carries pocketboo...
4,4,I did not expect this type of book to be in li...
...,...,...
11995,4,Valentine cupid is a vampire- Jena and Ian ano...
11996,5,I have read all seven books in this series. Ap...
11997,3,This book really just wasn't my cuppa. The si...
11998,1,"tried to use it to charge my kindle, it didn't..."


Preprocessing and data Cleaning

In [4]:
df.isnull().sum()
print(df['rating'].unique())
# df['rating'].value_counts()


[3 5 4 2 1]


In [5]:
# rating is set to 0 or 1 as i want negative or positive
df['rating']=df['rating'].apply(lambda x:0 if x<3 else 1)
df.head()

Unnamed: 0,rating,reviewText
0,1,"Jace Rankin may be short, but he's nothing to ..."
1,1,Great short read. I didn't want to put it dow...
2,1,I'll start by saying this is the first of four...
3,1,Aggie is Angela Lansbury who carries pocketboo...
4,1,I did not expect this type of book to be in li...


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from bs4 import BeautifulSoup

In [7]:

# Convert to lowercase
df['reviewText'] = df['reviewText'].str.lower()

# Remove HTML tags
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

# Remove emails and hyperlinks
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r"(https?://\S+|www\.\S+|\S+@\S+\.\S+)", " ", x))

# Keep only alphabets, numbers, hyphens, and spaces
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s-]', '', x))

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y not in stop_words]))

# Normalize spaces
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

# Show cleaned text
df


Unnamed: 0,rating,reviewText
0,1,jace rankin may short hes nothing mess man hau...
1,1,great short read didnt want put read one sitti...
2,1,ill start saying first four books wasnt expect...
3,1,aggie angela lansbury carries pocketbooks inst...
4,1,expect type book library pleased find price right
...,...,...
11995,1,valentine cupid vampire- jena ian another vamp...
11996,1,read seven books series apocalypticadventure o...
11997,1,book really wasnt cuppa situation man capturin...
11998,0,tried use charge kindle didnt even register ch...


In [8]:
lemmatizer=WordNetLemmatizer()
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))


In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['reviewText'],df['rating'],test_size=0.2,random_state=45)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))
tfidf=TfidfVectorizer(max_features=2500,ngram_range=(1,2))

In [11]:
X_cv_train=cv.fit_transform(X_train).toarray()
X_cv_test=cv.transform(X_test).toarray()

X_tfidf_train=tfidf.fit_transform(X_train).toarray()
X_tfidf_test=tfidf.transform(X_test).toarray()

In [12]:
from sklearn.naive_bayes import MultinomialNB
cv_model=MultinomialNB()
cv_model.fit(X_cv_train,y_train)
y_cv_predict=cv_model.predict(X_cv_test)

In [13]:
from sklearn.metrics import accuracy_score,classification_report
print("BOW Accuracy", accuracy_score(y_cv_predict,y_test))
print(classification_report(y_cv_predict,y_test))

BOW Accuracy 0.8191666666666667
              precision    recall  f1-score   support

           0       0.77      0.72      0.74       874
           1       0.85      0.88      0.86      1526

    accuracy                           0.82      2400
   macro avg       0.81      0.80      0.80      2400
weighted avg       0.82      0.82      0.82      2400



In [14]:
from sklearn.naive_bayes import MultinomialNB
tfidf_model=MultinomialNB()
tfidf_model.fit(X_tfidf_train,y_train)
y_tfidf_predict=tfidf_model.predict(X_tfidf_test)


In [15]:
from sklearn.metrics import accuracy_score,classification_report
print("TFIDF Accuracy", accuracy_score(y_tfidf_predict,y_test))
print(classification_report(y_tfidf_predict,y_test))

TFIDF Accuracy 0.8166666666666667
              precision    recall  f1-score   support

           0       0.57      0.84      0.68       552
           1       0.95      0.81      0.87      1848

    accuracy                           0.82      2400
   macro avg       0.76      0.83      0.78      2400
weighted avg       0.86      0.82      0.83      2400



In [16]:
X_wv_train=X_train.apply(lambda x:x.split()).tolist()
X_wv_test=X_test.apply(lambda x:x.split()).tolist()

In [17]:
import gensim
from gensim.models import Word2Vec
model=Word2Vec(X_wv_train,vector_size=500)

In [18]:
model.wv.index_to_key

['book',
 'read',
 'story',
 'like',
 'character',
 'one',
 'get',
 'love',
 'good',
 'would',
 'really',
 'make',
 'time',
 'author',
 'think',
 'go',
 'write',
 'find',
 'want',
 'know',
 'series',
 'end',
 'enjoy',
 'much',
 'first',
 'well',
 'even',
 'give',
 'didnt',
 'short',
 'could',
 'take',
 'great',
 'sex',
 '-',
 'say',
 'little',
 'dont',
 'way',
 'interest',
 'two',
 'see',
 'come',
 'keep',
 'start',
 'plot',
 'also',
 'romance',
 'work',
 'look',
 'seem',
 'im',
 'stories',
 'feel',
 'never',
 'lot',
 'try',
 'need',
 'kindle',
 'many',
 'bite',
 'life',
 'review',
 'leave',
 'another',
 'better',
 'tell',
 'hot',
 'though',
 'recommend',
 'back',
 'enough',
 'man',
 'use',
 'still',
 'people',
 'star',
 'cant',
 'something',
 'happen',
 'part',
 'doesnt',
 'page',
 'bad',
 'new',
 'free',
 'things',
 'wasnt',
 'next',
 'scenes',
 'put',
 'turn',
 'world',
 'felt',
 'together',
 'best',
 'years',
 'buy',
 'finish',
 'begin',
 'thing',
 'woman',
 'novel',
 'main',
 'liv

In [19]:
def avg_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [20]:
X_wv_train = np.array([avg_word2vec(tokens, model) for tokens in X_wv_train])
X_wv_test= np.array([avg_word2vec(tokens, model) for tokens in X_wv_test])

In [21]:
from sklearn.ensemble import RandomForestClassifier
wv_model=RandomForestClassifier()

In [22]:
wv_model.fit(X_wv_train,y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
wv_predict=wv_model.predict(X_wv_test)
# X_wv_test.shape
from sklearn.metrics import accuracy_score,classification_report
print("avgWV Accuracy", accuracy_score(y_test,wv_predict))
print(classification_report(wv_predict,y_test))

avgWV Accuracy 0.7575
              precision    recall  f1-score   support

           0       0.55      0.68      0.61       666
           1       0.86      0.79      0.82      1734

    accuracy                           0.76      2400
   macro avg       0.71      0.73      0.72      2400
weighted avg       0.78      0.76      0.76      2400



In [24]:
import pickle

# Save the CountVectorizer
with open('BoW.pickle', 'wb') as f:
    pickle.dump(cv, f)

# Save the BoW classifier model
with open('bow_model.pickle', 'wb') as f:
    pickle.dump(cv_model, f)


In [25]:
import pickle

# Save the TfidfVectorizer
with open('TfIdf.pickle', 'wb') as f:
    pickle.dump(tfidf, f)

# Save the TF-IDF classifier model
with open('tfidf_model.pickle', 'wb') as f:
    pickle.dump(tfidf_model, f)


In [26]:
model.save("word2vec.model")

In [27]:
import pickle
with open('model_w2v.pickle', 'wb') as f:
    pickle.dump(wv_model, f)
