# Grammar and Online Product Reviews

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### This dataset contains 71,044 reviews of 1,000 different products, provided by Datafiniti's Product Database. It includes the text and title of each review, the name and manufacturer of the product, reviewer metadata, and more.

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Import libraries

In [31]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import re
import datetime as dt

ModuleNotFoundError: No module named 'keras'

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Read the dataset

In [2]:
## Load the raw reviews CSV (path is relative to the notebook's working directory).
original_dataset = pd.read_csv('GrammarandProductReviews.csv')
## Work on an independent copy: the cells below modify `nlp_dataset` in place
## (dropna/drop with inplace=True), and a plain assignment would only create a
## second name for the same frame, so `original_dataset` would change with it.
nlp_dataset = original_dataset.copy()

#### -------------------------------------------------------------------------------------------------------------------------------------------------

## 1. Data preprocessing based on NLP Tasks

In [3]:
nlp_dataset.shape

(71044, 25)

In [4]:
nlp_dataset.isnull().sum()

id                          0
brand                       0
categories                  0
dateAdded                   0
dateUpdated                 0
ean                     31979
keys                        0
manufacturer              141
manufacturerNumber        203
name                        0
reviews.date               67
reviews.dateAdded           0
reviews.dateSeen            0
reviews.didPurchase     38886
reviews.doRecommend     10615
reviews.id              38886
reviews.numHelpful      38536
reviews.rating              0
reviews.sourceURLs          0
reviews.text               36
reviews.title             476
reviews.userCity        65634
reviews.userProvince    70595
reviews.username           96
upc                         2
dtype: int64

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Dropping rows with null values in `reviews.text`, `reviews.doRecommend`, or `reviews.title`

In [5]:
## Drop every row that has a null in any of these columns:
##   reviews.text (36 nulls), reviews.doRecommend (10615 nulls), reviews.title (476 nulls).
## reviews.didPurchase is intentionally not part of the subset, so its nulls are kept.
nlp_dataset.dropna(
    subset=['reviews.text', 'reviews.doRecommend', 'reviews.title'],
    inplace=True,
)

#### Shape of the dataset after dropping the rows with null values

In [6]:
nlp_dataset.shape

(60340, 25)

In [7]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

33159

In [8]:
nlp_dataset['reviews.text'].isnull().sum()

0

In [9]:
nlp_dataset['reviews.doRecommend'].isnull().sum()

0

In [10]:
nlp_dataset['reviews.title'].isnull().sum()

0

In [11]:
## Disabled check: compares didPurchase with doRecommend on the rows where didPurchase is present.
## (nlp_dataset['reviews.didPurchase'] == nlp_dataset['reviews.doRecommend']).where(nlp_dataset['reviews.didPurchase'].notnull()).value_counts()
## Active check: count rows by whether reviews.doRecommend is present.
nlp_dataset['reviews.doRecommend'].notnull().value_counts()

True    60340
Name: reviews.doRecommend, dtype: int64

In [12]:
## Fill missing `reviews.didPurchase` values with the same row's `reviews.doRecommend` value.
## The result is assigned back to the frame instead of calling fillna(inplace=True) on the
## selected column: the in-place form acts on an intermediate Series, which pandas 2.x warns
## about and which does not update the frame when Copy-on-Write is enabled.
## `reviews.doRecommend` has no nulls at this point (checked above), so every gap is filled
## and the column can be stored explicitly as bool.
nlp_dataset['reviews.didPurchase'] = (
    nlp_dataset['reviews.didPurchase']
    .fillna(nlp_dataset['reviews.doRecommend'])
    .astype(bool)
)

In [13]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

0

In [14]:
nlp_dataset.duplicated().sum()

0

In [15]:
nlp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60340 entries, 3 to 71043
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    60340 non-null  object 
 1   brand                 60340 non-null  object 
 2   categories            60340 non-null  object 
 3   dateAdded             60340 non-null  object 
 4   dateUpdated           60340 non-null  object 
 5   ean                   30576 non-null  object 
 6   keys                  60340 non-null  object 
 7   manufacturer          60199 non-null  object 
 8   manufacturerNumber    60162 non-null  object 
 9   name                  60340 non-null  object 
 10  reviews.date          60340 non-null  object 
 11  reviews.dateAdded     60340 non-null  object 
 12  reviews.dateSeen      60340 non-null  object 
 13  reviews.didPurchase   60340 non-null  bool   
 14  reviews.doRecommend   60340 non-null  object 
 15  reviews.id         

#### Input and output variables

In [16]:
## Input variable: the raw review text.
## NOTE(review): both Series are taken before the 'Concept Housewares' rows are dropped
## further down, so they presumably still contain those rows — confirm that this is intended.
reviews = nlp_dataset['reviews.text']
## Output variable: the purchase flag.
y = nlp_dataset['reviews.didPurchase']

#### Number of reviews with and without a purchase

In [17]:
## Count the reviews per `reviews.didPurchase` value and show the counts as a one-column table.
## The column is named through to_frame(name=...) rather than
## rename(columns={'reviews.didPurchase': ...}): the name value_counts() gives its result
## depends on the pandas version (pandas 2.0 calls it 'count'), in which case the rename
## would silently do nothing.
y.value_counts().to_frame(name='Review Purchased')

Unnamed: 0,Review Purchased
True,35149
False,25191


#### Removing columns `id`, `categories`, `ean`, `keys`, `manufacturer`, `manufacturerNumber`, `reviews.id`, `reviews.sourceURLs`, `upc`, `reviews.userCity`, `dateUpdated`, `reviews.dateSeen`, `reviews.userProvince`, `reviews.username`, `reviews.dateAdded`, `reviews.numHelpful`

In [18]:
## Remove the 16 columns that are not kept for the rest of the notebook (25 -> 9 columns).
nlp_dataset.drop(
    columns=[
        # product / catalogue fields
        'id', 'ean', 'upc', 'keys', 'categories', 'manufacturer', 'manufacturerNumber',
        # review bookkeeping
        'reviews.id', 'reviews.sourceURLs', 'reviews.numHelpful',
        # timestamps
        'dateUpdated', 'reviews.dateAdded', 'reviews.dateSeen',
        # reviewer details
        'reviews.userCity', 'reviews.userProvince', 'reviews.username',
    ],
    inplace=True,
)

In [19]:
nlp_dataset.shape

(60340, 9)

In [20]:
nlp_dataset.isnull().sum()

brand                  0
dateAdded              0
name                   0
reviews.date           0
reviews.didPurchase    0
reviews.doRecommend    0
reviews.rating         0
reviews.text           0
reviews.title          0
dtype: int64

In [21]:
## Parse `dateAdded` as a timestamp and keep only its calendar date
## (the column then holds datetime.date objects, so its dtype stays `object`).
nlp_dataset['dateAdded'] = pd.to_datetime(nlp_dataset['dateAdded']).dt.date

In [22]:
## row 26327
## NOTE(review): the row number above presumably points at the record that prompted this check — confirm.
## Count how many reviews belong to the brand 'Concept Housewares' (True) versus all others (False).
(nlp_dataset['brand'] == 'Concept Housewares').value_counts()

False    60332
True         8
Name: brand, dtype: int64

In [23]:
## Remove every review whose brand is 'Concept Housewares'.
concept_housewares_rows = nlp_dataset.loc[nlp_dataset['brand'] == 'Concept Housewares']
nlp_dataset.drop(index=concept_housewares_rows.index, inplace=True)

In [24]:
## Parse `reviews.date` as a timestamp and keep only its calendar date (datetime.date objects).
## NOTE(review): this runs only after the 'Concept Housewares' rows have been dropped;
## presumably those rows hold `reviews.date` values that do not parse — confirm.
nlp_dataset['reviews.date'] = pd.to_datetime(nlp_dataset['reviews.date']).dt.date

In [25]:
nlp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60332 entries, 3 to 71043
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                60332 non-null  object
 1   dateAdded            60332 non-null  object
 2   name                 60332 non-null  object
 3   reviews.date         60332 non-null  object
 4   reviews.didPurchase  60332 non-null  bool  
 5   reviews.doRecommend  60332 non-null  object
 6   reviews.rating       60332 non-null  int64 
 7   reviews.text         60332 non-null  object
 8   reviews.title        60332 non-null  object
dtypes: bool(1), int64(1), object(7)
memory usage: 4.2+ MB


In [26]:
## Save the cleaned frame as nlp_dataset.csv (path relative to the working directory),
## leaving the index out of the file.
nlp_dataset.to_csv("nlp_dataset.csv", index = False)

In [27]:
def preprocess_text(sen):
    """Reduce a text to ASCII letters, digits and single spaces.

    Every character other than a-z, A-Z and 0-9 (punctuation, whitespace,
    accented letters, ...) is replaced by a space, and each run of
    whitespace is then collapsed to one space. Case is preserved and the
    result is not stripped, so it may start or end with a single space.

    Parameters
    ----------
    sen : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Step 1: blank out everything that is not an ASCII letter or digit.
    alnum_only = re.sub('[^a-zA-Z0-9]', ' ', sen)
    # Step 2: squeeze the resulting runs of spaces down to one.
    return re.sub(r'\s+', ' ', alnum_only)

In [29]:
## Clean every review text with preprocess_text; the result is a plain Python list of strings.
docs = [preprocess_text(review_text) for review_text in reviews]

In [30]:
## Small tokenizer demo: fit a tokenizer on two sample comments and display its word index.
## NOTE(review): `Tokenizer` is not imported by the import cell at the top of this notebook,
## and the recorded output of this cell is a NameError. Presumably this is the Keras text
## Tokenizer — confirm, then add the import to the import cell.
some_comments = [
    "A very good product", 
    "I like it very much" 
            ] 
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(some_comments) 
 
tokenizer.word_index

NameError: name 'Tokenizer' is not defined