In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np


# Build a multi class classification model to predict the rating given the review

In [3]:
with open("/content/drive/MyDrive/Cell_Phones_and_Accessories_5.json", "r") as f:
    data = pd.read_json(f, orient="records", lines=True)

In [4]:
data.shape

(194439, 9)

In [5]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [6]:
data.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

# Text preprocessing

In [7]:
#take review and rating 
df = data[["reviewText","overall"]]





In [8]:
df["overall"].value_counts()

5    108664
4     39993
3     21439
1     13279
2     11064
Name: overall, dtype: int64

In [9]:
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes, svm
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [10]:
df.columns

Index(['reviewText', 'overall'], dtype='object')

In [11]:
df.drop_duplicates(['reviewText', 'overall'], keep = 'first')

Unnamed: 0,reviewText,overall
0,They look good and stick good! I just don't li...,4
1,These stickers work like the review says they ...,5
2,These are awesome and make my phone look so st...,5
3,Item arrived in great time and was in perfect ...,4
4,"awesome! stays on, and looks great. can be use...",5
...,...,...
194434,Works great just like my original one. I reall...,5
194435,Great product. Great packaging. High quality a...,5
194436,"This is a great cable, just as good as the mor...",5
194437,I really like it becasue it works well with my...,5


In [12]:
# Step - b: Change all the text to lower case. This is required as python interprets 'data' and 'DATA' differently
df['reviewText'] = [sentence.lower() for sentence in df['reviewText']]



In [13]:
#change datatypes
df['reviewText'] = df['reviewText'].astype("category")

In [14]:
df.isna().sum()

reviewText    0
overall       0
dtype: int64

In [15]:
#defining the function to remove punctuation

import string
string.punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
df['clean_txt']= df['reviewText'].apply(lambda x:remove_punctuation(x))
df.head()

Unnamed: 0,reviewText,overall,clean_txt
0,they look good and stick good! i just don't li...,4,they look good and stick good i just dont like...
1,these stickers work like the review says they ...,5,these stickers work like the review says they ...
2,these are awesome and make my phone look so st...,5,these are awesome and make my phone look so st...
3,item arrived in great time and was in perfect ...,4,item arrived in great time and was in perfect ...
4,"awesome! stays on, and looks great. can be use...",5,awesome stays on and looks great can be used o...


In [16]:
df.isna().sum()

reviewText    0
overall       0
clean_txt     0
dtype: int64

In [17]:
#defining function for tokenization

import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
df['tokenised']= df['clean_txt'].apply(lambda x: tokenization(x))

In [18]:
df.head()

Unnamed: 0,reviewText,overall,clean_txt,tokenised
0,they look good and stick good! i just don't li...,4,they look good and stick good i just dont like...,[they look good and stick good i just dont lik...
1,these stickers work like the review says they ...,5,these stickers work like the review says they ...,[these stickers work like the review says they...
2,these are awesome and make my phone look so st...,5,these are awesome and make my phone look so st...,[these are awesome and make my phone look so s...
3,item arrived in great time and was in perfect ...,4,item arrived in great time and was in perfect ...,[item arrived in great time and was in perfect...
4,"awesome! stays on, and looks great. can be use...",5,awesome stays on and looks great can be used o...,[awesome stays on and looks great can be used ...


In [19]:
#removing stopwords
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
df['no_stopwords']= df['tokenised'].apply(lambda x:remove_stopwords(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
df.isna().sum()

reviewText      0
overall         0
clean_txt       0
tokenised       0
no_stopwords    0
dtype: int64

In [21]:
#defining the function for lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text
df['lemmatized_txt']=df['no_stopwords'].apply(lambda x:lemmatizer(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [22]:
df.head()

Unnamed: 0,reviewText,overall,clean_txt,tokenised,no_stopwords,lemmatized_txt
0,they look good and stick good! i just don't li...,4,they look good and stick good i just dont like...,[they look good and stick good i just dont lik...,[they look good and stick good i just dont lik...,[they look good and stick good i just dont lik...
1,these stickers work like the review says they ...,5,these stickers work like the review says they ...,[these stickers work like the review says they...,[these stickers work like the review says they...,[these stickers work like the review says they...
2,these are awesome and make my phone look so st...,5,these are awesome and make my phone look so st...,[these are awesome and make my phone look so s...,[these are awesome and make my phone look so s...,[these are awesome and make my phone look so s...
3,item arrived in great time and was in perfect ...,4,item arrived in great time and was in perfect ...,[item arrived in great time and was in perfect...,[item arrived in great time and was in perfect...,[item arrived in great time and was in perfect...
4,"awesome! stays on, and looks great. can be use...",5,awesome stays on and looks great can be used o...,[awesome stays on and looks great can be used ...,[awesome stays on and looks great can be used ...,[awesome stays on and looks great can be used ...


In [23]:
df.isna().sum()

reviewText        0
overall           0
clean_txt         0
tokenised         0
no_stopwords      0
lemmatized_txt    0
dtype: int64

In [24]:
df['lemmatized_txt'] = [','.join(map(str, l)) for l in df['lemmatized_txt']]

In [25]:
#tfidf vectorisation
tfidf_vectorizer = TfidfVectorizer(ngram_range =(1,2), lowercase = True) # stop_words='english'

tfidf_Xtrain = tfidf_vectorizer.fit_transform(df['lemmatized_txt'])
'''tfidf_Xtest = tfidf_vectorizer.fit_transform(X_test)
tfidf_Xval = tfidf_vectorizer.fit_transform(X_val)
#token_set = tfidf_vectorizer.get_feature_names()'''


'tfidf_Xtest = tfidf_vectorizer.fit_transform(X_test)\ntfidf_Xval = tfidf_vectorizer.fit_transform(X_val)\n#token_set = tfidf_vectorizer.get_feature_names()'

# Train Validation and Test splitting

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_Xtrain,df['overall'],stratify =df['overall'],  train_size=0.7)

In [27]:
X_train.shape

(136107, 2318094)

In [28]:
X_test, X_val, y_test, y_val = train_test_split(X_test,y_test,stratify = y_test, test_size=0.2)

In [29]:
X_test.shape

(46665, 2318094)

In [30]:
X_val.shape

(11667, 2318094)

In [31]:

df.columns

Index(['reviewText', 'overall', 'clean_txt', 'tokenised', 'no_stopwords',
       'lemmatized_txt'],
      dtype='object')

In [32]:
y_val.shape

(11667,)

In [33]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [34]:
df['lemmatized_txt']

0         they look good and stick good i just dont like...
1         these stickers work like the review says they ...
2         these are awesome and make my phone look so st...
3         item arrived in great time and was in perfect ...
4         awesome stays on and looks great can be used o...
                                ...                        
194434    works great just like my original one i really...
194435    great product great packaging high quality and...
194436    this is a great cable just as good as the more...
194437    i really like it becasue it works well with my...
194438    product as described i have wasted a lot of mo...
Name: lemmatized_txt, Length: 194439, dtype: object

In [35]:
token_set = tfidf_vectorizer.get_feature_names()
print(len(token_set))

2318094


In [36]:
y_train

122084    5
127200    5
77552     4
58869     5
20194     5
         ..
113969    5
82196     4
84466     5
181610    5
72036     5
Name: overall, Length: 136107, dtype: int64

# Model Building

In [37]:
#LOGISTIC REGRESSION 

from sklearn.linear_model import LogisticRegression
clr = LogisticRegression(multi_class='multinomial')
clr.fit(X_train, y_train)
val_pred = clr.predict(X_val)
test_pred = clr.predict(X_test) # accuracy


In [38]:
from sklearn.metrics import accuracy_score
print(accuracy_score(val_pred,y_val))

0.6713808176909232


In [39]:
print(accuracy_score(test_pred,y_test))

0.6655737704918033


In [40]:
#RANDOM FOREST

#grid search
from sklearn.model_selection import GridSearchCV
max_depth=[2, 8, 16]
n_estimators = [10,15]
criterion = ['gini', 'entropy']
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 20, max_depth = 5, criterion="gini", n_jobs = -1)
forest_params = {'max_depth': max_depth, 'n_estimators' : n_estimators, 'criterion' : criterion}
grid = GridSearchCV(estimator= rf, param_grid= forest_params, cv = 5)
grid.fit(X_train, y_train)
val_pred_rf = grid.predict(X_val)
test_pred_rf = grid.predict(X_test)

In [41]:
print(accuracy_score(test_pred_rf,y_test))

0.5588342440801457


In [42]:
print(accuracy_score(val_pred_rf,y_val))

0.5589268878032057


In [43]:
'''#SVM 
from sklearn import svm
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()

grid = GridSearchCV(estimator = svc,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
grid.fit(X_train, y_train)
val_pred_sv = grid.predict(X_val)
test_pred_sv = grid.predict(X_test)'''

"#SVM \nfrom sklearn import svm\nfrom sklearn.model_selection import GridSearchCV\nparameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\nsvc = svm.SVC()\n\ngrid = GridSearchCV(estimator = svc,\n             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})\ngrid.fit(X_train, y_train)\nval_pred_sv = grid.predict(X_val)\ntest_pred_sv = grid.predict(X_test)"

In [44]:
#DECISION TREES
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth = 5)
dt.fit(X_train, y_train)
val_pred_dt = dt.predict(X_val)
test_pred_dt = dt.predict(X_test)


KeyboardInterrupt: ignored

In [None]:
print(accuracy_score(val_pred_dt,y_val))

In [45]:
#XGBOOST
from xgboost import XGBClassifier
xg = XGBClassifier()
xg.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [46]:
val_pred_xg = xg.predict(X_val)
test_pred_xg = xg.predict(X_test)

In [47]:
print(accuracy_score(test_pred_xg,y_test))

0.6003428693881925


In [48]:
print(accuracy_score(val_pred_xg,y_val))

0.602554212736779


# Building our own Word2vec models

In [49]:
import gensim
import gensim.downloader as gensim_api


In [50]:
review_txt = df['lemmatized_txt'].apply(gensim.utils.simple_preprocess)

In [51]:
review_txt

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: lemmatized_txt, Length: 194439, dtype: object

In [52]:
### MODEL 1 
model1 = gensim.models.Word2Vec(size=200,   
            window=3, min_count=2, sg=1, workers = 3)

In [53]:
model1.build_vocab(review_txt, progress_per = 1000)

In [54]:
model1.epochs

5

In [55]:
model1.corpus_count

194439

In [56]:
model1.train(review_txt, total_examples = model1.corpus_count, epochs = model1.epochs)

(60588074, 82664015)

In [57]:
model1.save("model1_word2vec")

In [58]:
#### MODEL 2 
model2 = gensim.models.Word2Vec(size=100,   
            window=3, min_count=2, sg=1, workers = 3)
model2.build_vocab(review_txt, progress_per = 1000)
model2.train(review_txt, total_examples = model2.corpus_count, epochs = model2.epochs)
model2.save('model2_word2vec')


In [59]:
#### MODEL 3
model3 = gensim.models.Word2Vec(size=300,   
            window=3, min_count=2, sg=1, workers = 3)
model3.build_vocab(review_txt, progress_per = 1000)
model3.train(review_txt, total_examples = model3.corpus_count, epochs = model3.epochs)
model3.save('model3_word2vec')


In [60]:
#### MODEL 4
model4 = gensim.models.Word2Vec(size=100,   
            window=7, min_count=2, sg=1, workers = 3)
model4.build_vocab(review_txt, progress_per = 1000)
model4.train(review_txt, total_examples = model4.corpus_count, epochs =model4.epochs)
model4.save('model4_word2vec')


In [61]:
#### MODEL 5
model5 = gensim.models.Word2Vec(size=300,   
            window=7, min_count=2, sg=1, workers = 3)
model5.build_vocab(review_txt, progress_per = 1000)
model5.train(review_txt, total_examples = model5.corpus_count, epochs =model5.epochs)
model5.save('model5_word2vec')


In [62]:
#### MODEL 6
model6 = gensim.models.Word2Vec(size=200,   
            window=7, min_count=2, sg=1, workers = 3)
model6.build_vocab(review_txt, progress_per = 1000)
model6.train(review_txt, total_examples = model6.corpus_count, epochs =model6.epochs)
model6.save('model6_word2vec')


In [63]:
#### MODEL 7
model7 = gensim.models.Word2Vec(size=100,   
            window=7, min_count=5, sg=1, workers = 3)
model7.build_vocab(review_txt, progress_per = 1000)
model7.train(review_txt, total_examples = model7.corpus_count, epochs =model7.epochs)
model7.save('model7_word2vec')


In [64]:
#### MODEL 8
model8 = gensim.models.Word2Vec(size=200,   
            window=7, min_count=5, sg=1, workers = 3)
model8.build_vocab(review_txt, progress_per = 1000)
model8.train(review_txt, total_examples = model8.corpus_count, epochs =model8.epochs)
model8.save('model8_word2vec')


In [None]:
#### MODEL 9
model9 = gensim.models.Word2Vec(size=300,   
            window=7, min_count=5, sg=1, workers = 3)
model9.build_vocab(review_txt, progress_per = 1000)
model9.train(review_txt, total_examples = model9.corpus_count, epochs =model9.epochs)
model9.save('model9_word2vec')

In [None]:
#### MODEL 10
model10 = gensim.models.Word2Vec(size=100,   
            window=3, min_count=5, sg=1, workers = 3)
model10.build_vocab(review_txt, progress_per = 1000)
model10.train(review_txt, total_examples = model10.corpus_count, epochs =model10.epochs)
model10.save('model10_word2vec')

In [None]:
#### MODEL 11
model11 = gensim.models.Word2Vec(size=200,   
            window=3, min_count=5, sg=1, workers = 3)
model11.build_vocab(review_txt, progress_per = 1000)
model11.train(review_txt, total_examples = model11.corpus_count, epochs =model11.epochs)
model11.save('model11_word2vec')

In [None]:
#### MODEL 12
model12 = gensim.models.Word2Vec(size=300,   
            window=3, min_count=5, sg=1, workers = 3)
model12.build_vocab(review_txt, progress_per = 1000)
model12.train(review_txt, total_examples = model12.corpus_count, epochs =model12.epochs)
model12.save('model12_word2vec')

In [None]:
#indexing our model1 
model1.wv.index_to_key