In [34]:
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud

In [35]:
filterwarnings("ignore")

In [36]:
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

In [37]:
## Text Preprocessing

In [38]:
df = pd.read_csv("datasets/amazon_reviews.csv", sep=",")

In [39]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [40]:
# Normalizing Case Folding

In [41]:
df['reviewText'] = df['reviewText'].str.lower()

In [42]:
df['reviewText'].head()

0                                           no issues.
1    purchased this for my device, it worked as adv...
2    it works as expected. i should have sprung for...
3    this think has worked out great.had a diff. br...
4    bought it with retail packaging, arrived legit...
Name: reviewText, dtype: object

In [43]:
# Punctuations

In [44]:
df['reviewText'] = df['reviewText'].str.replace('[^\w\s]', '')

In [45]:
# Numbers

In [46]:
df['reviewText'] = df['reviewText'].str.replace('\d', '')

In [47]:
df['reviewText'].head()

0                                           no issues.
1    purchased this for my device, it worked as adv...
2    it works as expected. i should have sprung for...
3    this think has worked out great.had a diff. br...
4    bought it with retail packaging, arrived legit...
Name: reviewText, dtype: object

In [48]:
# Stopwords

In [49]:
import nltk

In [50]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
sw = stopwords.words('english')

In [52]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [53]:
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))

In [54]:
df['reviewText'].head()

0                                              issues.
1    purchased device, worked advertised. never muc...
2    works expected. sprung higher capacity. think ...
3    think worked great.had diff. bran 64gb card we...
4    bought retail packaging, arrived legit, orange...
Name: reviewText, dtype: object

In [55]:
# Rare Words

In [56]:
temp_df = pd.Series(" ".join(df['reviewText']).split()).value_counts()

In [57]:
drops = temp_df[temp_df < 2].index

In [58]:
drops.shape

(10805,)

In [59]:
df['reviewText'] = df['reviewText'].apply(lambda x : " ".join(x for x in x.split() if x not in drops))

In [60]:
# Tokenization

In [61]:
df['reviewText'].apply(lambda x: TextBlob(x).words).head()

0                                             [issues]
1    [purchased, device, worked, advertised, never,...
2    [works, expected, higher, capacity, think, mad...
3    [think, worked, 64gb, card, went, south, 3, on...
4    [bought, retail, packaging, arrived, legit, ve...
Name: reviewText, dtype: object

In [62]:
# Lemmatization

In [64]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...


True

In [65]:
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [66]:
df['reviewText'].head()

0                                              issues.
1    purchased device, worked advertised. never muc...
2    work expected. higher capacity. think made bit...
3    think worked 64gb card went south 3 one held p...
4    bought retail packaging, arrived legit, versio...
Name: reviewText, dtype: object