# Data Preparation

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

In [2]:
# Load the tab-separated Amazon review corpus and preview the first rows
dataset = pd.read_csv("amazon_reviews.txt", sep="\t")
dataset.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [3]:
# Checking the shape of dataset
dataset.shape

(21000, 9)

In [4]:
# Checking the name of columns in the data
dataset.columns

Index(['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
       'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT'],
      dtype='object')

In [5]:
# Recode the raw labels in place: "__label1__" -> '1' (fake) and
# "__label2__" -> '0' (not fake). Note the new values are the *strings*
# '1' and '0', not integers.
dataset.loc[dataset["LABEL"] == "__label1__", "LABEL"] = '1'
dataset.loc[dataset["LABEL"] == "__label2__", "LABEL"] = '0'

In [6]:
# Checking the number of fake and non-fake reviews
# Dataset is balanced with equal number of fake and non-fake reviews
dataset.groupby('LABEL').count()

Unnamed: 0_level_0,DOC_ID,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10500,10500,10500,10500,10500,10500,10500,10500
1,10500,10500,10500,10500,10500,10500,10500,10500


In [7]:
# Counting the number of categories in the dataset
# Dataset is balanced in terms of number of products per category too
dataset.groupby(dataset["LABEL"]).PRODUCT_CATEGORY.value_counts()

LABEL  PRODUCT_CATEGORY      
0      Apparel                   350
       Automotive                350
       Baby                      350
       Beauty                    350
       Books                     350
       Camera                    350
       Electronics               350
       Furniture                 350
       Grocery                   350
       Health & Personal Care    350
       Home                      350
       Home Entertainment        350
       Home Improvement          350
       Jewelry                   350
       Kitchen                   350
       Lawn and Garden           350
       Luggage                   350
       Musical Instruments       350
       Office Products           350
       Outdoors                  350
       PC                        350
       Pet Products              350
       Shoes                     350
       Sports                    350
       Tools                     350
       Toys                      350
       V

In [8]:
# Checking breakdown of verified purchase based on labels
dataset.groupby("VERIFIED_PURCHASE").LABEL.value_counts()

VERIFIED_PURCHASE  LABEL
N                  1        7623
                   0        1679
Y                  0        8821
                   1        2877
Name: LABEL, dtype: int64

# Text Analytics

In [9]:
# Character length of each review body (REVIEW_TEXT)
dataset['TXT_LENGTH'] = dataset['REVIEW_TEXT'].str.len()

# Average review length for label '1' and label '0'.
# Passing a renaming dict to SeriesGroupBy.agg is deprecated (see the warning
# this cell used to print) and raises SpecificationError in current pandas,
# so take the mean directly and keep the one-column DataFrame for display.
dataset.groupby("LABEL")["TXT_LENGTH"].mean().to_frame()

is deprecated and will be removed in a future version
  """


Unnamed: 0_level_0,TXT_LENGTH
LABEL,Unnamed: 1_level_1
0,428.102857
1,316.55


In [10]:
# Approximate sentence count per review: the number of '.'-separated pieces
# of REVIEW_TEXT (i.e. number of periods + 1, so a trailing period adds one)
dataset['num_sent'] = dataset['REVIEW_TEXT'].apply(lambda x: len(str(x).split('.')))
dataset.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TXT_LENGTH,num_sent
0,1,1,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",116,3
1,2,1,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,404,4
2,3,1,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,248,6
3,4,1,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,212,5
4,5,1,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,331,4


In [11]:
# Install package for text analytics 
#!pip install textstat

In [12]:
# Import text stat functionality
import textstat
from textstat import flesch_kincaid_grade
from textstat.textstat import textstat

In [13]:
# Creating a Flesch-Kincaid grade-level score for every review text
# Meaning: https://datawarrior.wordpress.com/2016/03/29/flesch-kincaid-readability-measure/
dataset['FK_grade_score'] = dataset['REVIEW_TEXT'].apply(textstat.flesch_kincaid_grade)

In [14]:
# Average Flesch-Kincaid grade per label. The built-in vectorised mean
# replaces a Python-level sum(x)/len(x) over every group.
dataset.groupby("LABEL")["FK_grade_score"].mean()

LABEL
0    13.803848
1     8.007886
Name: FK_grade_score, dtype: float64

In [15]:
# Install nltk package
#!pip install --user -U nltk

In [16]:
# Import nltk, fetch its stop-word corpus and load the English stop-word list
# Stop words are very common words such as "a", "an", "the"
# Reference: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
import nltk
nltk.download('stopwords')
# NOTE(review): wpt is not referenced by any other visible cell - confirm it is still needed
wpt = nltk.WordPunctTokenizer()
# Plain Python list; used by stopCount below for membership tests
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dipee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Counting number of stop words and creating a column with the info
def stopCount(x, vocabulary=None):
    """Count the whitespace-separated tokens of ``x`` that are stop words.

    Matching is exact and case-sensitive: tokens are compared as-is, so a
    token with different casing or attached punctuation ("The", "the,") is
    only counted if it appears in the vocabulary in exactly that form.

    Parameters
    ----------
    x : str
        Text to scan.
    vocabulary : iterable of str, optional
        Stop words to match against. When omitted, the notebook-level
        ``stop_words`` list is used.

    Returns
    -------
    int
        Number of tokens of ``x`` found in the vocabulary.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token; it also avoids shadowing builtin sum().
    lookup = frozenset(stop_words if vocabulary is None else vocabulary)
    return sum(1 for token in x.split() if token in lookup)
dataset['stop_count'] = dataset['REVIEW_TEXT'].apply(stopCount)

In [18]:
# Average number of stop words per label. The built-in vectorised mean
# replaces a Python-level sum(x)/len(x) over every group.
dataset.groupby("LABEL")["stop_count"].mean()

LABEL
0    32.519048
1    24.696190
Name: stop_count, dtype: float64

In [19]:
# Counting for the number of capital letters and creating a column 
def capsCount(x):
    """Return how many characters of ``x`` are one of the 26 ASCII capitals.

    Accented or other non-ASCII upper-case letters are not counted.
    """
    capitals = "QWERTYUIOPASDFGHJKLZXCVBNM"
    return len([letter for letter in x if letter in capitals])
dataset['caps_count'] = dataset['REVIEW_TEXT'].apply(capsCount)

# Average number of capital letters per label. The built-in vectorised mean
# replaces a Python-level sum(x)/len(x) over every group.
dataset.groupby("LABEL")["caps_count"].mean()

LABEL
0    12.099810
1     8.712667
Name: caps_count, dtype: float64

In [20]:
# Counting the number of punctuation marks in the text 
import string
def count(l1, l2):
    """Return the number of items of ``l1`` that are contained in ``l2``."""
    return len([item for item in l1 if item in l2])


def punctCount(x):
    """Return the number of characters of ``x`` found in ``string.punctuation``."""
    punctuation_marks = set(string.punctuation)
    return count(x, punctuation_marks)

# Creating a column with the punctuation count of each review text
dataset['punct_count'] = dataset['REVIEW_TEXT'].apply(punctCount)

# Average number of punctuation marks per label. The built-in vectorised
# mean replaces a Python-level sum(x)/len(x) over every group.
dataset.groupby("LABEL")["punct_count"].mean()

LABEL
0    15.571524
1    10.182571
Name: punct_count, dtype: float64

In [21]:
# Binary indicator: 1 when the review contains one of the emoticons
# ";)", ":)" or ":-)" as a standalone whitespace-separated token, else 0.
# The text is split once per review instead of up to three times.
EMOTICONS = {";)", ":)", ":-)"}
dataset["emojis"] = dataset["REVIEW_TEXT"].apply(
    lambda x: int(not EMOTICONS.isdisjoint(x.split()))
)

# Total number of reviews containing an emoticon, per label
dataset.groupby("LABEL")["emojis"].sum()

LABEL
0    107
1     85
Name: emojis, dtype: int64

In [22]:
# Checking the ratings distribution by label 
dataset.groupby(['LABEL','RATING'])['LABEL'].count()

LABEL  RATING
0      1          868
       2          565
       3          942
       4         1974
       5         6151
1      1          889
       2          627
       3          926
       4         1999
       5         6059
Name: LABEL, dtype: int64

In [23]:
# Importing nltk packages
import csv 
from nltk.classify import SklearnClassifier
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
#nltk.download()

###  Bag of Words

In [24]:
# Install Beautiful Soup and gensim
#!pip install BeautifulSoup4
#!pip install gensim

In [25]:
# Import packages
from bs4 import BeautifulSoup 
import re

In [26]:
 # Function to convert a raw review to a string of words
# Cache for stop-word sets so the NLTK list is converted only once per kernel
_STOPWORD_SETS = {}


def _english_stops():
    """Return the English NLTK stop words as a frozenset, built on first use."""
    if "english" not in _STOPWORD_SETS:
        _STOPWORD_SETS["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_SETS["english"]


def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter (anything outside
    a-z / A-Z) with a space, lower-case, split on whitespace, drop English
    stop words, and join the remaining words with single spaces.

    Parameters
    ----------
    raw_review : str
        The raw review text.

    Returns
    -------
    str
        The preprocessed review; empty string if no words remain.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so
    #    Beautiful Soup does not warn about a guessed parser on every call.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Set lookup is much faster than list lookup; the set is built once
    #    and reused across calls rather than rebuilt for every review.
    stops = _english_stops()

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one string separated by space
    return " ".join(meaningful_words)

In [27]:
# Cleaning reviews and creating a new dataset
dataset['clean_text'] = dataset['REVIEW_TEXT'].apply(review_to_words)
dataset.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TXT_LENGTH,num_sent,FK_grade_score,stop_count,caps_count,punct_count,emojis,clean_text
0,1,1,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",116,3,1.9,10,2,3,0,least think product save day keep around case ...
1,2,1,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,404,4,15.6,28,4,7,0,lithium batteries something new introduced mar...
2,3,1,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,248,6,2.6,25,6,8,0,purchased swing baby months pretty much grown ...
3,4,1,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,212,5,4.0,18,6,4,0,looking inexpensive desk calcolatur works ever...
4,5,1,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,331,4,8.3,34,4,6,0,use twice week results great used teeth whiten...


In [28]:
# Write to csv 
dataset.to_csv(r'text_features.csv', index=False)

In [36]:
# Bag-of-words count features built from the cleaned review text
from sklearn.feature_extraction.text import CountVectorizer

# Word-level vectorizer capped at 5000 vocabulary entries. Tokenizer,
# preprocessor and stop-word handling are left at None here; CountVectorizer
# could take a built-in option or a custom callable for each of them instead.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the cleaned reviews and encodes
# them as a document-term count matrix; .toarray() then turns that result
# into a dense numpy array (one row per review, one column per word).
train_data_features = vectorizer.fit_transform(dataset['clean_text']).toarray()

In [37]:
# Wrap the count matrix in a DataFrame with one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    column_name = list(vectorizer.get_feature_names_out())
else:
    column_name = vectorizer.get_feature_names()

# Reuse the reviews' index so a later column-wise concat with `dataset`
# lines up row for row even if `dataset` does not have a default index.
word_features = pd.DataFrame(data=train_data_features,
                             columns=column_name,
                             index=dataset.index)
word_features.head()


Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TXT_LENGTH,...,yrs,yummy,zero,zip,zipper,zippered,zippers,zombies,zone,zoom
0,1,1,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",116,...,0,0,0,0,0,0,0,0,0,0
1,2,1,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,404,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,248,...,0,0,0,0,0,0,0,0,0,0
3,4,1,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,212,...,0,0,0,0,0,0,0,0,0,0
4,5,1,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,331,...,0,0,0,0,0,0,0,0,0,0


In [38]:
word_features.columns 

Index(['aa', 'aaa', 'ab', 'ability', 'able', 'absolute', 'absolutely',
       'absorb', 'absorbs', 'abuse',
       ...
       'yrs', 'yummy', 'zero', 'zip', 'zipper', 'zippered', 'zippers',
       'zombies', 'zone', 'zoom'],
      dtype='object', length=5000)

In [44]:
# Features for beauty: restrict to reviews in the 'Beauty' product category
beauty = dataset.loc[dataset.PRODUCT_CATEGORY == 'Beauty']

# Word-level vectorizer capped at 100 vocabulary entries; tokenizer,
# preprocessor and stop-word handling are left at None.
vectorizer_b = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=100,
)

# Learn the Beauty vocabulary, encode the Beauty reviews as word counts,
# and convert the result to a dense numpy array.
train_data_features_b = vectorizer_b.fit_transform(beauty['clean_text']).toarray()

# Column names for the DataFrame. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only
# on older releases.
if hasattr(vectorizer_b, "get_feature_names_out"):
    col_name = list(vectorizer_b.get_feature_names_out())
else:
    col_name = vectorizer_b.get_feature_names()

word_features_b = pd.DataFrame(data=train_data_features_b,
                               columns=col_name)
word_features_b.head()

Unnamed: 0,acne,also,amazing,back,best,better,bit,bottle,bought,brush,...,use,used,using,way,well,without,work,works,would,years
0,0,1,0,0,0,0,0,0,0,0,...,2,1,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,1,0,0
4,0,2,0,0,0,0,0,0,1,0,...,1,3,1,0,0,0,0,0,1,0


In [45]:
word_features_b.shape

(700, 100)

In [48]:
word_features_b.columns

Index(['acne', 'also', 'amazing', 'back', 'best', 'better', 'bit', 'bottle',
       'bought', 'brush', 'buy', 'color', 'cream', 'day', 'days', 'definitely',
       'different', 'dry', 'easy', 'even', 'every', 'eye', 'eyes', 'face',
       'far', 'feel', 'feeling', 'feels', 'first', 'found', 'get', 'getting',
       'give', 'go', 'going', 'good', 'got', 'great', 'hair', 'helps',
       'highly', 'know', 'last', 'less', 'light', 'like', 'little', 'long',
       'look', 'looking', 'looks', 'lot', 'love', 'made', 'make', 'makes',
       'many', 'much', 'need', 'never', 'nice', 'oil', 'one', 'perfect',
       'pretty', 'price', 'product', 'products', 'put', 'quality', 'really',
       'recommend', 'results', 'scent', 'see', 'serum', 'shampoo', 'since',
       'skin', 'smell', 'smells', 'smooth', 'soft', 'still', 'stuff', 'think',
       'time', 'tried', 'try', 'two', 'use', 'used', 'using', 'way', 'well',
       'without', 'work', 'works', 'would', 'years'],
      dtype='object')

In [51]:
# Columns in beauty to subset the main dataset.
# Taken directly from the Beauty word-count frame instead of a pasted copy
# of its printed column list, so it cannot drift from the fitted vocabulary.
col_b = word_features_b.columns.tolist()

In [52]:
# Keep only the columns of the full-corpus word-count table that are listed
# in col_b (the Beauty vocabulary words); rebinds word_features to the subset
word_features = word_features.loc[:,col_b]

In [58]:
word_features.shape

(21000, 100)

In [57]:
# Checking dataset has no NAs
word_features.isna().sum()

acne          0
also          0
amazing       0
back          0
best          0
better        0
bit           0
bottle        0
bought        0
brush         0
buy           0
color         0
cream         0
day           0
days          0
definitely    0
different     0
dry           0
easy          0
even          0
every         0
eye           0
eyes          0
face          0
far           0
feel          0
feeling       0
feels         0
first         0
found         0
             ..
really        0
recommend     0
results       0
scent         0
see           0
serum         0
shampoo       0
since         0
skin          0
smell         0
smells        0
smooth        0
soft          0
still         0
stuff         0
think         0
time          0
tried         0
try           0
two           0
use           0
used          0
using         0
way           0
well          0
without       0
work          0
works         0
would         0
years         0
Length: 100, dtype: int6

In [60]:
# Join the original data (with its engineered text features) and the
# word-count columns side by side; axis=1 places the frames column-wise
dataset_2 = pd.concat([dataset, word_features], axis=1)
dataset_2.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TXT_LENGTH,...,use,used,using,way,well,without,work,works,would,years
0,1,1,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",116,...,0,0,0,0,0,0,0,0,0,0
1,2,1,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,404,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,248,...,0,0,0,0,1,0,0,0,0,0
3,4,1,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,212,...,0,0,0,0,0,0,0,1,0,0
4,5,1,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,331,...,2,1,1,0,0,0,0,0,1,0


In [61]:
# Checking the shape of the concatenated dataset
dataset_2.shape

(21000, 117)

In [62]:
# Writing to csv
dataset_2.to_csv(r'train_data.csv', index= False)

In [63]:
dataset_2.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TXT_LENGTH,...,use,used,using,way,well,without,work,works,would,years
0,1,1,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",116,...,0,0,0,0,0,0,0,0,0,0
1,2,1,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,404,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,248,...,0,0,0,0,1,0,0,0,0,0
3,4,1,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,212,...,0,0,0,0,0,0,0,1,0,0
4,5,1,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,331,...,2,1,1,0,0,0,0,0,1,0


In [66]:
dataset_2.isna().sum()

DOC_ID               0
LABEL                0
RATING               0
VERIFIED_PURCHASE    0
PRODUCT_CATEGORY     0
PRODUCT_ID           0
PRODUCT_TITLE        0
REVIEW_TITLE         0
REVIEW_TEXT          0
TXT_LENGTH           0
num_sent             0
FK_grade_score       0
stop_count           0
caps_count           0
punct_count          0
emojis               0
clean_text           0
acne                 0
also                 0
amazing              0
back                 0
best                 0
better               0
bit                  0
bottle               0
bought               0
brush                0
buy                  0
color                0
cream                0
                    ..
really               0
recommend            0
results              0
scent                0
see                  0
serum                0
shampoo              0
since                0
skin                 0
smell                0
smells               0
smooth               0
soft       

In [65]:
dataset_2.columns 

Index(['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
       'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT',
       'TXT_LENGTH',
       ...
       'use', 'used', 'using', 'way', 'well', 'without', 'work', 'works',
       'would', 'years'],
      dtype='object', length=117)