# Grammar and Online Product Reviews

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### This is a list of over 71,045 reviews from 1,000 different products provided by Datafiniti's Product Database. The dataset includes the text and title of the review, the name and manufacturer of the product, reviewer metadata, and more.

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Import libraries

In [155]:
import string
import numpy as np
import pandas as pd
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from numpy import loadtxt

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Read the dataset

In [156]:
original_dataset = pd.read_csv('GrammarandProductReviews.csv')
nlp_dataset = original_dataset

#### -------------------------------------------------------------------------------------------------------------------------------------------------

## Data preprocessing based on NLP Tasks

In [157]:
nlp_dataset.shape

(71044, 25)

In [158]:
nlp_dataset.isnull().sum()

id                          0
brand                       0
categories                  0
dateAdded                   0
dateUpdated                 0
ean                     31979
keys                        0
manufacturer              141
manufacturerNumber        203
name                        0
reviews.date               67
reviews.dateAdded           0
reviews.dateSeen            0
reviews.didPurchase     38886
reviews.doRecommend     10615
reviews.id              38886
reviews.numHelpful      38536
reviews.rating              0
reviews.sourceURLs          0
reviews.text               36
reviews.title             476
reviews.userCity        65634
reviews.userProvince    70595
reviews.username           96
upc                         2
dtype: int64

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Removing columns `id`, `ean`, `keys`, `manufacturer`, `manufacturerNumber`, `reviews.id`, `reviews.sourceURLs`, `upc`, `reviews.UserCity`, `reviews.userProvince`, `reviews.username`, `reviews.numHelpful`

In [159]:
nlp_dataset.drop(['id', 'categories', 'ean', 'keys', 'manufacturer', 'manufacturerNumber', 'reviews.id', 'reviews.sourceURLs', 'upc', 'reviews.userCity', 'dateUpdated', 'reviews.dateSeen', 'reviews.userProvince', 'reviews.username', 'reviews.dateAdded', 'reviews.numHelpful'], axis=1, inplace=True)

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Filtering columns `reviews.didPurchase` and `reviews.text` 

In [160]:
nlp_dataset.dropna(subset = ['reviews.text'], inplace=True) ## 36 null values
## nlp_dataset.dropna(subset = ['reviews.didPurchase'], inplace=True) 
nlp_dataset.dropna(subset = ['reviews.doRecommend'], inplace=True) ## 10615 null values 
nlp_dataset.dropna(subset = ['reviews.title'], inplace=True) ## 476 null values

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Dimension of the dataset after filtering, removing the null values

In [161]:
nlp_dataset.shape

(60340, 9)

In [162]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

33159

In [163]:
nlp_dataset['reviews.text'].isnull().sum()

0

In [164]:
nlp_dataset['reviews.doRecommend'].isnull().sum()

0

In [165]:
nlp_dataset['reviews.title'].isnull().sum()

0

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Filling the missing data of `reviews.didPurchase`

In [166]:
nlp_dataset['reviews.doRecommend'].notnull().value_counts()

True    60340
Name: reviews.doRecommend, dtype: int64

In [167]:
nlp_dataset['reviews.didPurchase'].fillna(nlp_dataset['reviews.doRecommend'], inplace=True)

In [168]:
nlp_dataset['reviews.didPurchase'].isnull().sum()

0

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Removing the duplicated rows

In [169]:
nlp_dataset.duplicated().value_counts()

False    60228
True       112
dtype: int64

In [170]:
nlp_dataset.drop_duplicates(keep=False,inplace=True)

In [171]:
nlp_dataset.duplicated().value_counts()

False    60116
dtype: int64

#### -------------------------------------------------------------------------------------------------------------------------------------------------

In [172]:
nlp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60116 entries, 3 to 71043
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                60116 non-null  object
 1   dateAdded            60116 non-null  object
 2   name                 60116 non-null  object
 3   reviews.date         60116 non-null  object
 4   reviews.didPurchase  60116 non-null  bool  
 5   reviews.doRecommend  60116 non-null  object
 6   reviews.rating       60116 non-null  int64 
 7   reviews.text         60116 non-null  object
 8   reviews.title        60116 non-null  object
dtypes: bool(1), int64(1), object(7)
memory usage: 4.2+ MB


#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Input and output variables

In [173]:
reviews = nlp_dataset['reviews.text']
y = nlp_dataset['reviews.didPurchase']

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Number of purchased and not purchased products

In [174]:
y.value_counts().to_frame().rename(columns = {'reviews.didPurchase' : 'Review Purchased'})

Unnamed: 0,Review Purchased
True,35119
False,24997


#### -------------------------------------------------------------------------------------------------------------------------------------------------

In [175]:
nlp_dataset.shape

(60116, 9)

In [176]:
nlp_dataset.isnull().sum()

brand                  0
dateAdded              0
name                   0
reviews.date           0
reviews.didPurchase    0
reviews.doRecommend    0
reviews.rating         0
reviews.text           0
reviews.title          0
dtype: int64

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Formating the dates

In [177]:
nlp_dataset['dateAdded'] = pd.to_datetime(nlp_dataset['dateAdded']).dt.date

In [178]:
## row 26327
(nlp_dataset['brand'] == 'Concept Housewares').value_counts()

False    60108
True         8
Name: brand, dtype: int64

In [179]:
nlp_dataset.drop(nlp_dataset[nlp_dataset['brand'] == 'Concept Housewares'].index, inplace = True)

In [180]:
nlp_dataset['reviews.date'] = pd.to_datetime(nlp_dataset['reviews.date']).dt.date

In [181]:
nlp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60108 entries, 3 to 71043
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                60108 non-null  object
 1   dateAdded            60108 non-null  object
 2   name                 60108 non-null  object
 3   reviews.date         60108 non-null  object
 4   reviews.didPurchase  60108 non-null  bool  
 5   reviews.doRecommend  60108 non-null  object
 6   reviews.rating       60108 non-null  int64 
 7   reviews.text         60108 non-null  object
 8   reviews.title        60108 non-null  object
dtypes: bool(1), int64(1), object(7)
memory usage: 4.2+ MB


#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Saving the preprocessed dataset

In [182]:
nlp_dataset.to_csv("nlp_dataset.csv", index = False)

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Removing not needed characters

In [183]:
def preprocess_text(sen):
 
    sentence = sen
    # Remove everything except a-z, A-Z
    sentence = re.sub('[^a-zA-Z0-9]', ' ', sentence)
 
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
 
    return sentence

In [184]:
docs = []  ## <class 'list'>
for sen in reviews:
    docs.append(preprocess_text(sen))

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Removing not needed words

In [185]:
stop_words = set(line.strip() for line in open('stopwords.txt'))
exclude = set(string.punctuation) 
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    # Remove stop words in documents
    docs = [[token for token in doc if not token in stop_words] for doc in docs]
    # Remove punctuations in documents
    docs = [[token for token in doc if not token in exclude] for doc in docs]
    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]
  
    return docs
# Perform function on our document
documents = docs_preprocessor(docs)

#### -------------------------------------------------------------------------------------------------------------------------------------------------

#### Encoding the output variable

In [186]:
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)
dummy_y

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [187]:
some_comments = [
    "A very good product", 
    "I like it very much" 
            ] 
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(some_comments) 
 
tokenizer.word_index

{'very': 1,
 'a': 2,
 'good': 3,
 'product': 4,
 'i': 5,
 'like': 6,
 'it': 7,
 'much': 8}

In [188]:
tokenizer = Tokenizer(nb_words=16000, lower=True )

tokenizer.fit_on_texts(docs)

sequences = tokenizer.texts_to_sequences(docs)

word_index = tokenizer.word_index

print('U gjetën %s tokena unik.' % len(word_index))

# convert text to sequence of tokens and pad them to ensure equal length vectors 
x = pad_sequences(sequences, maxlen=25)
x



U gjetën 23677 tokena unik.


array([[   4, 9177,   18, ...,  179,  320,  373],
       [ 357,   99,    6, ...,   94,    6,  382],
       [  57,  280,   20, ...,   83,   29,   87],
       ...,
       [   0,    0,    0, ...,    0,  244,  144],
       [  18,  148, 1763, ..., 1741,   37,  626],
       [   0,    0,    0, ...,  149,  660, 6545]])