In [None]:
##################################################
# Introduction to Text Mining and Natural Language Processing
##################################################

##################################################
# Sentiment Analysis and Sentiment Modeling for Amazon Reviews
##################################################

# 1. Text Preprocessing
# 2. Text Visualization
# 3. Sentiment Analysis
# 4. Feature Engineering
# 5. Sentiment Modeling

# !pip install nltk
# !pip install textblob
# !pip install wordcloud

In [None]:
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud

# Notebook-wide configuration.
# Suppress every warning so library notices do not clutter cell output.
filterwarnings('ignore')

# pandas display options: no cap on the number of columns shown, 200-character
# wide output, and floats rendered with two decimal places.
display_options = {
    'display.max_columns': None,
    'display.width': 200,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [22]:
##################################################
# 1. Text Preprocessing
##################################################

# Read the raw reviews from disk, then work on an independent copy so the
# frame returned by read_csv stays untouched by the preprocessing steps.
amazon_reviews = pd.read_csv(filepath_or_buffer="amazon_reviews.csv")
df = amazon_reviews.copy(deep=True)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [23]:
###############################
# Case Folding (Normalization)
###############################

# Lower-case every review so that differently-cased spellings of a word
# collapse to the same token in the later steps.
lowered_reviews = df['reviewText'].str.lower()
df['reviewText'] = lowered_reviews
df['reviewText']

Unnamed: 0,reviewText
0,no issues.
1,"purchased this for my device, it worked as adv..."
2,it works as expected. i should have sprung for...
3,this think has worked out great.had a diff. br...
4,"bought it with retail packaging, arrived legit..."
...,...
4910,i bought this sandisk 16gb class 10 to use wit...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable. it ...
4913,good amount of space for the stuff i want to d...


In [28]:
###############################
# Punctuation
###############################

# Replace every character that is neither a word character (\w) nor
# whitespace (\s) with a single space.
# The pattern is a raw string: in a normal string literal "\w" and "\s" are
# invalid escape sequences, which newer Python versions warn about and will
# eventually reject.
df['reviewText'] = df['reviewText'].str.replace(r"[^\w\s]", " ", regex=True)
df['reviewText']

Unnamed: 0,reviewText
0,no issues
1,purchased this for my device it worked as adv...
2,it works as expected i should have sprung for...
3,this think has worked out great had a diff br...
4,bought it with retail packaging arrived legit...
...,...
4910,i bought this sandisk 16gb class 10 to use wit...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable it ...
4913,good amount of space for the stuff i want to d...


In [29]:
###############################
# Numbers
###############################

# Replace every digit with a single space.
# The pattern is a raw string: in a normal string literal "\d" is an invalid
# escape sequence, which newer Python versions warn about and will eventually
# reject.
df['reviewText'] = df['reviewText'].str.replace(r"\d", " ", regex=True)
df['reviewText']

Unnamed: 0,reviewText
0,no issues
1,purchased this for my device it worked as adv...
2,it works as expected i should have sprung for...
3,this think has worked out great had a diff br...
4,bought it with retail packaging arrived legit...
...,...
4910,i bought this sandisk gb class to use wit...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable it ...
4913,good amount of space for the stuff i want to d...


In [30]:
###############################
# Stopwords
###############################

import nltk
nltk.download('stopwords')

# `sw` stays a list so any later cell that uses it keeps working; membership
# tests go through a set copy so each token lookup is a hash lookup instead of
# a scan over the whole stopword list.
sw = stopwords.words('english')
sw_lookup = set(sw)

# Missing reviews are replaced by an empty string first. Casting NaN with
# str() alone would turn it into the literal token "nan", which would then be
# treated as review text.
df['reviewText'] = df['reviewText'].fillna("").apply(
    lambda text: " ".join(token for token in str(text).split() if token not in sw_lookup)
)
df['reviewText']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,reviewText
0,issues
1,purchased device worked advertised never much ...
2,works expected sprung higher capacity think ma...
3,think worked great diff bran gb card went sout...
4,bought retail packaging arrived legit orange e...
...,...
4910,bought sandisk gb class use htc inspire months...
4911,used extending capabilities samsung galaxy not...
4912,great card fast reliable comes optional adapte...
4913,good amount space stuff want fits gopro say


In [41]:
###############################
# Rare Words
###############################

# Frequency of every whitespace-separated token across all reviews.
all_tokens = ' '.join(df['reviewText']).split()
temp_df = pd.Series(all_tokens).value_counts()

# Tokens that occur at most once are dropped from every review.
drops = temp_df[temp_df.le(1)]
# `in` on a Series tests its index labels (here: the tokens), so take the
# index explicitly and look tokens up in a set.
rare_tokens = set(drops.index)

df['reviewText'] = df['reviewText'].apply(
    lambda text: " ".join(token for token in text.split() if token not in rare_tokens)
)
df['reviewText']

Unnamed: 0,reviewText
0,issues
1,purchased device worked advertised never much ...
2,works expected higher capacity think made bit ...
3,think worked great gb card went south months o...
4,bought retail packaging arrived legit envelope...
...,...
4910,bought sandisk gb class use htc inspire months...
4911,used capabilities samsung galaxy note greatly ...
4912,great card fast reliable comes optional adapte...
4913,good amount space stuff want fits gopro say


In [42]:
###############################
# Tokenization
###############################

nltk.download("punkt")

# Preview only: the tokens are displayed, not stored back into `df`.
# Take the first five reviews before tokenizing so TextBlob runs on five rows
# rather than on the whole column; the displayed result is the same.
df["reviewText"].head().apply(lambda text: TextBlob(text).words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,reviewText
0,[issues]
1,"[purchased, device, worked, advertised, never,..."
2,"[works, expected, higher, capacity, think, mad..."
3,"[think, worked, great, gb, card, went, south, ..."
4,"[bought, retail, packaging, arrived, legit, en..."


In [45]:
###############################
# Lemmatization
###############################

nltk.download('wordnet')


def _lemmatize_review(text):
    """Return `text` with each whitespace-separated token passed through Word(token).lemmatize()."""
    return " ".join(Word(token).lemmatize() for token in text.split())


# Overwrite each review with its lemmatized form.
df['reviewText'] = df['reviewText'].apply(_lemmatize_review)
df['reviewText']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,reviewText
0,issue
1,purchased device worked advertised never much ...
2,work expected higher capacity think made bit e...
3,think worked great gb card went south month on...
4,bought retail packaging arrived legit envelope...
...,...
4910,bought sandisk gb class use htc inspire month ...
4911,used capability samsung galaxy note greatly ex...
4912,great card fast reliable come optional adapter...
4913,good amount space stuff want fit gopro say
