# Grammar and Online Product Reviews

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### This is a list of over 71,045 reviews from 1,000 different products provided by Datafiniti's Product Database. The dataset includes the text and title of the review, the name and manufacturer of the product, reviewer metadata, and more.

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Columns:
##### 1. id
##### 2. brand
##### 3. categories
##### 4. dateAdded
##### 5. dateUpdated
##### 6. ean
##### 7. keys
##### 8. manufacturer
##### 9. manufacturerNumber
##### 10. name
##### 11. reviews.date
##### 12. reviews.dateAdded
##### 13. reviews.dateSeen
##### 14. reviews.didPurchase
##### 15. reviews.doRecommend
##### 16. reviews.id
##### 17. reviews.numHelpful
##### 18. reviews.rating
##### 19. reviews.sourceURLs
##### 20. reviews.text
##### 21. reviews.title
##### 22. reviews.userCity
##### 23. reviews.userProvince
##### 24. reviews.username
##### 25. upc

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Import libraries

In [203]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import re
import datetime as dt
## from keras.preprocessing.text import Tokenizer
## from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score
from sklearn.utils import shuffle

## from keras.utils import np_utils
## from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from numpy import loadtxt
import nltk

import matplotlib.pyplot as plt

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Read the dataset

In [204]:
original_dataset = pd.read_csv('GrammarandProductReviews.csv')
nlp_dataset = original_dataset

#### -------------------------------------------------------------------------------------------------------------------------------------------------

## 1. Data preprocessing based on NLP Tasks

In [205]:
nlp_dataset.shape

(71044, 25)

In [206]:
nlp_dataset.isnull().sum()

id                          0
brand                       0
categories                  0
dateAdded                   0
dateUpdated                 0
ean                     31979
keys                        0
manufacturer              141
manufacturerNumber        203
name                        0
reviews.date               67
reviews.dateAdded           0
reviews.dateSeen            0
reviews.didPurchase     38886
reviews.doRecommend     10615
reviews.id              38886
reviews.numHelpful      38536
reviews.rating              0
reviews.sourceURLs          0
reviews.text               36
reviews.title             476
reviews.userCity        65634
reviews.userProvince    70595
reviews.username           96
upc                         2
dtype: int64

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Filtering columns `reviews.didPurchase` and `reviews.text` 

In [173]:
nlp_dataset.dropna(subset = ['reviews.text'], inplace=True) ## 36 null values
## nlp_dataset.dropna(subset = ['reviews.didPurchase'], inplace=True) 
nlp_dataset.dropna(subset = ['reviews.doRecommend'], inplace=True) ## 10615 null values 
nlp_dataset.dropna(subset = ['reviews.title'], inplace=True) ## 476 null values

#### Dimension of the dataset after filtering, removing the null values

In [174]:
nlp_dataset.shape

(60340, 25)

In [175]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

33159

In [176]:
nlp_dataset['reviews.text'].isnull().sum()

0

In [177]:
nlp_dataset['reviews.doRecommend'].isnull().sum()

0

In [178]:
nlp_dataset['reviews.title'].isnull().sum()

0

In [179]:
## (nlp_dataset['reviews.didPurchase'] == nlp_dataset['reviews.doRecommend']).where(nlp_dataset['reviews.didPurchase'].notnull()).value_counts()
nlp_dataset['reviews.doRecommend'].notnull().value_counts()

True    60340
Name: reviews.doRecommend, dtype: int64

In [180]:
nlp_dataset['reviews.didPurchase'].fillna(nlp_dataset['reviews.doRecommend'], inplace=True)

In [181]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

0

In [182]:
nlp_dataset.duplicated().sum()

0

In [183]:
##nlp_dataset = nlp_dataset.astype(object)

In [184]:
nlp_dataset.dtypes

id                       object
brand                    object
categories               object
dateAdded                object
dateUpdated              object
ean                      object
keys                     object
manufacturer             object
manufacturerNumber       object
name                     object
reviews.date             object
reviews.dateAdded        object
reviews.dateSeen         object
reviews.didPurchase        bool
reviews.doRecommend      object
reviews.id              float64
reviews.numHelpful      float64
reviews.rating            int64
reviews.sourceURLs       object
reviews.text             object
reviews.title            object
reviews.userCity         object
reviews.userProvince     object
reviews.username         object
upc                      object
dtype: object

In [185]:
nlp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60340 entries, 3 to 71043
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    60340 non-null  object 
 1   brand                 60340 non-null  object 
 2   categories            60340 non-null  object 
 3   dateAdded             60340 non-null  object 
 4   dateUpdated           60340 non-null  object 
 5   ean                   30576 non-null  object 
 6   keys                  60340 non-null  object 
 7   manufacturer          60199 non-null  object 
 8   manufacturerNumber    60162 non-null  object 
 9   name                  60340 non-null  object 
 10  reviews.date          60340 non-null  object 
 11  reviews.dateAdded     60340 non-null  object 
 12  reviews.dateSeen      60340 non-null  object 
 13  reviews.didPurchase   60340 non-null  bool   
 14  reviews.doRecommend   60340 non-null  object 
 15  reviews.id         

#### Input and output variables

In [186]:
reviews = nlp_dataset['reviews.text']
y = nlp_dataset['reviews.didPurchase']

#### Number of purchased and not purchased products

In [187]:
y.value_counts().to_frame().rename(columns = {'reviews.didPurchase' : 'Review Purchased'})

Unnamed: 0,Review Purchased
True,35149
False,25191


#### Removing columns `id`, `ean`, `keys`, `manufacturer`, `manufacturerNumber`, `reviews.id`, `reviews.sourceURLs`, `upc`, `reviews.UserCity`, `reviews.userProvince`, `reviews.username`, `reviews.numHelpful`

In [188]:
nlp_dataset.drop(['id', 'categories', 'ean', 'keys', 'manufacturer', 'manufacturerNumber', 'reviews.id', 'reviews.sourceURLs', 'upc', 'reviews.userCity', 'dateUpdated', 'reviews.dateSeen', 'reviews.userProvince', 'reviews.username', 'reviews.numHelpful'], axis=1, inplace=True)

In [189]:
nlp_dataset.shape

(60340, 10)

In [190]:
nlp_dataset.isnull().sum()

brand                  0
dateAdded              0
name                   0
reviews.date           0
reviews.dateAdded      0
reviews.didPurchase    0
reviews.doRecommend    0
reviews.rating         0
reviews.text           0
reviews.title          0
dtype: int64

In [201]:
##nlp_dataset['Date'] = pd.to_date(nlp_dataset.dateAdded, format='%Y-%m-%d')
nlp_dataset['dateAdded'] = pd.to_datetime(nlp_dataset['dateAdded']).dt.date

In [202]:
nlp_dataset['reviews.date'] = pd.to_datetime(nlp_dataset['reviews.date']).dt.date

ParserError: Unknown string format:  hooks slide or swivel into any desired position."

In [198]:
nlp_dataset.to_csv("nlp_dataset.csv", index = False)

In [161]:
docs = []  ## <class 'list'>
for sen in reviews:
    docs.append(sen)

In [162]:
stop_words = set(line.strip() for line in open('stopwords.txt'))
exclude = set(string.punctuation) 
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    #Remove stop words in documents
    docs = [[token for token in doc if not token in stop_words] for doc in docs]
    #Remove punctuations in documents
    docs = [[token for token in doc if not token in exclude] for doc in docs]
    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]
    
    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
  
    return docs
# Perform function on our document
documents = docs_preprocessor(docs)

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\fatba/nltk_data'
    - 'C:\\Users\\fatba\\anaconda3\\nltk_data'
    - 'C:\\Users\\fatba\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\fatba\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\fatba\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
