<a href="https://colab.research.google.com/github/subhamyadav580/Amazon-Reviews-Predictions-of-Cell-Phones-and-Accessories/blob/main/Amazon_Review_Prediction_Cell_Phones_and_Accessories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd "/content/drive/My Drive/"

/content/drive/My Drive


## Load data


In [3]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('data/reviews_Cell_Phones_and_Accessories_5.json.gz')

In [4]:
df.isnull().sum()

reviewerID           0
asin                 0
reviewerName      3519
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

In [5]:
sum(df.overall) / len(df.overall)

4.129912208970422

In [6]:
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4.0,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5.0,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5.0,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4.0,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5.0,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"
...,...,...,...,...,...,...,...,...,...
194434,A1YMNTFLNDYQ1F,B00LORXVUE,eyeused2loveher,"[0, 0]",Works great just like my original one. I reall...,5.0,This works just perfect!,1405900800,"07 21, 2014"
194435,A15TX8B2L8B20S,B00LORXVUE,Jon Davidson,"[0, 0]",Great product. Great packaging. High quality a...,5.0,Great replacement cable. Apple certified,1405900800,"07 21, 2014"
194436,A3JI7QRZO1QG8X,B00LORXVUE,Joyce M. Davidson,"[0, 0]","This is a great cable, just as good as the mor...",5.0,Real quality,1405900800,"07 21, 2014"
194437,A1NHB2VC68YQNM,B00LORXVUE,Nurse Farrugia,"[0, 0]",I really like it becasue it works well with my...,5.0,I really like it becasue it works well with my...,1405814400,"07 20, 2014"


## Downloading the NLTK library


In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


We will convert all text to lower case.

In [8]:
df['reviewText'] = df['reviewText'].str.lower()

## **Tokenization**

We will use word_tokenize method from NLTK to split the review text into individual words (and you will see that punctuation is also produced as separate `words`

In [9]:
def identify_tokens(row):
    review = row['reviewText']
    tokens = word_tokenize(review)
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

df['words'] = df.apply(identify_tokens, axis=1)

## **Stemming**

Stemming reduces related words to a common stem. It is an optional process step, and it it is useful to test accuracy with and without stemming

In [10]:
def stem_list(row):
    my_list = row['words']
    stemmed_list = [stemmer.stem(word) for word in my_list]
    return (stemmed_list)

df['stemmed_words'] = df.apply(stem_list, axis=1)

## **Removing stop words**

`Stop words` are commonly used words that are unlikely to have any benefit in natural language processing. These includes words such as `a`, `the`, `is`.

As before we will define a function and apply it to our DataFrame.

We create a set of words that we will call `stops` (using a set helps to speed up removing stop words).

In [11]:
stops = set(stopwords.words("english"))                  

def remove_stops(row):
    my_list = row['stemmed_words']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

df['stem_meaningful'] = df.apply(remove_stops, axis=1)

## **Rejoin words**

Now we will rejoin our meaningful stemmed words into a single string.

In [12]:
def rejoin_words(row):
    my_list = row['stem_meaningful']
    joined_words = ( " ".join(my_list))
    return joined_words

df['processed'] = df.apply(rejoin_words, axis=1)

In [69]:
review_list = df['processed'].loc[0:14999].to_numpy()

In [77]:
len(review_list)

15000

In [72]:
review_labels = df['overall'].loc[0:14999].to_numpy()

In [73]:
review_labels

array([4., 5., 5., ..., 5., 1., 2.])

## **Creating Models**

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

### **Spliting Data**

In [78]:
def split_vals(a,n): 
  return a[:n].copy(), a[n:].copy()

n_valid = 5000  # same as Kaggle's test set size
n_trn = 15000 - n_valid
# raw_train, raw_valid = split_vals(df_raw, n_trn)
train_data, test_data = split_vals(review_list, n_trn)
train_labels, test_labels = split_vals(review_labels, n_trn)

train_data.shape, train_labels.shape, test_data.shape

((10000,), (10000,), (5000,))

In [79]:
counter = CountVectorizer()

In [80]:
counter.fit(train_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
counter.vocabulary_

In [84]:
len(counter.vocabulary_)

12302

In [85]:
training_counts = counter.transform(train_data)

In [86]:
test_counts = counter.transform(test_data)

In [87]:
training_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [88]:
test_counts.shape

(5000, 12302)

In [89]:
classifier = MultinomialNB()

In [90]:
classifier.fit(training_counts, train_labels)
print(classifier.predict(test_counts))
print(classifier.predict_proba(test_counts))

[5. 5. 5. ... 5. 4. 5.]
[[1.01135096e-15 1.00970809e-21 7.76715386e-16 3.09777126e-02
  9.69022287e-01]
 [7.28403929e-05 1.65690853e-06 2.90612058e-03 1.71994288e-01
  8.25025094e-01]
 [5.04002475e-02 1.19349792e-01 1.27978673e-01 3.04441496e-01
  3.97829791e-01]
 ...
 [1.20893555e-03 3.19307509e-04 4.28237011e-02 2.45652231e-01
  7.09995825e-01]
 [1.16333601e-02 2.96363120e-02 1.69929222e-01 4.21306742e-01
  3.67494364e-01]
 [3.35301765e-01 6.39751782e-04 8.22248096e-02 1.29875309e-02
  5.68846142e-01]]


In [91]:
print(classifier.score(test_counts, test_labels))

0.5528
