In [None]:
"""
Problem Statement No. 16 
Consider the Amazon Alexa Reviews Dataset. This dataset consists of nearly 3000 Amazon customer reviews (input 
text), star ratings, dates of review, variants and feedback for various Amazon Alexa products such as Alexa Echo, Echo Dots, 
Alexa Fire Sticks, etc. Perform the following operations on this dataset. 
(I) Remove all punctuation from the review text. 
(II) Tokenize the review text into words. 
(III) Remove the Stopwords from the tokenized text. 
(IV) Perform stemming & lemmatization on the review text. 
(V) Perform the word vectorization on review text using Bag of Words technique. 
(VI) Create representation of Review Text by calculating Term Frequency and Inverse Document Frequency (TF-IDF)
"""

In [63]:
import pandas as pd

In [64]:
# Load the reviews CSV (path is relative to the notebook's working directory).
df = pd.read_csv('Alexa-Dataset.csv')

In [65]:
df.sample(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
684,5,19-May-18,Black,I bought this for a family member as a gift. I...,1
136,5,30-Jul-18,Charcoal Fabric,Still love it,1
2492,3,30-Jul-18,Black Dot,It won’t work as a blue tooth speaker with my ...,1
1736,5,28-Jul-18,White Show,Device has great speakers and the screen quali...,1
2954,5,30-Jul-18,White Dot,I really like the echo dot. It’s amazing to be...,1


In [66]:
df['verified_reviews'][2]

'Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.'

In [67]:
# Lower-case the review text in place via the vectorised .str accessor,
# overwriting the original column.
df['verified_reviews'] = df['verified_reviews'].str.lower()

In [68]:
df['verified_reviews'][2]

'sometimes while playing a game, you can answer a question correctly but alexa says you got it wrong and answers the same as you.  i like being able to turn lights on and off while away from home.'

In [69]:
import string 

In [70]:
punc = string.punctuation

In [71]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [72]:
def remove_punctuation(data):
    """Return ``data`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    Typographic punctuation outside that set (for example the curly
    apostrophe ``’``) is left in place, and whitespace is never touched, so
    deleting a mark that sat between two spaces leaves both spaces behind.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        A copy of ``data`` without ASCII punctuation characters.
    """
    # A single translate() pass replaces one replace() pass per punctuation
    # character, and reading string.punctuation directly means the function
    # no longer depends on a notebook-level variable having been defined.
    return data.translate(str.maketrans('', '', string.punctuation))

In [73]:
text = 'i have a data . ,'

In [74]:
remove_punctuation(text)

'i have a data  '

In [75]:
# Cast every entry to str so remove_punctuation can call str.replace on it.
# NOTE(review): presumably guards against non-string entries such as missing
# values — confirm against the raw data.
df['verified_reviews'] = df['verified_reviews'].astype(str)

In [76]:
# Strip punctuation from each review, one row at a time.
df['verified_reviews'] = df['verified_reviews'].apply(remove_punctuation)

In [77]:
df['verified_reviews'][2]

'sometimes while playing a game you can answer a question correctly but alexa says you got it wrong and answers the same as you  i like being able to turn lights on and off while away from home'

In [86]:
import nltk

In [87]:
from nltk.tokenize import word_tokenize

In [88]:
def tokenize_word(data):
    """Split ``data`` into tokens using NLTK's ``word_tokenize``.

    Parameters
    ----------
    data : str
        Text to tokenise.

    Returns
    -------
    list
        Whatever ``nltk.word_tokenize`` produces for ``data``; punctuation
        marks still present in the text come back as tokens of their own.
    """
    return nltk.word_tokenize(data)

In [89]:
tokenize_word(text)

['i', 'have', 'a', 'data', '.', ',']

In [90]:
# Tokenise each review. After this cell every entry is whatever tokenize_word
# returns (a list of tokens) rather than a plain string.
df['verified_reviews'] = df['verified_reviews'].apply(tokenize_word)

In [91]:
df['verified_reviews'][2]

['sometimes',
 'playing',
 'game',
 'answer',
 'question',
 'correctly',
 'alexa',
 'says',
 'got',
 'wrong',
 'answers',
 'like',
 'able',
 'turn',
 'lights',
 'away',
 'home']

In [78]:
from nltk.corpus import stopwords

In [79]:
nltk.corpus.stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [80]:
# Bring every review back to one plain string before stop-word removal.
# When the tokenisation cell higher up has already run (as it does on
# Restart & Run All), each entry is a list of tokens; astype(str) would turn
# that into the list's repr ("['a', 'b']"), leaving quotes, commas and
# brackets glued to the words. Token lists are therefore joined with spaces;
# any other value is cast with str(), exactly as astype(str) did.
df['verified_reviews'] = df['verified_reviews'].apply(
    lambda review: ' '.join(review) if isinstance(review, list) else str(review)
)

In [81]:
def remove_stopwords(data):
    """Drop English stop words from a whitespace-separated string.

    The text is split on whitespace, each piece is compared in lower case
    against NLTK's English stop-word list, and the surviving pieces are
    re-joined with single spaces in their original order and casing.

    Parameters
    ----------
    data : str
        Text to filter.

    Returns
    -------
    str
        ``data`` without stop words; runs of whitespace collapse to one space.
    """
    english_stops = set(stopwords.words("english"))
    kept = [token for token in data.split() if token.lower() not in english_stops]
    return ' '.join(kept)

In [82]:
data = 'hii i am a coder'

In [83]:
remove_stopwords(data)

'hii coder'

In [84]:
# Filter English stop words out of every review, row by row.
df['verified_reviews'] = df['verified_reviews'].apply(remove_stopwords)

In [85]:
df['verified_reviews'][2]

'sometimes playing game answer question correctly alexa says got wrong answers like able turn lights away home'

In [92]:
from nltk.stem import PorterStemmer

In [93]:
stemmer = PorterStemmer()

In [100]:
def perform_stemming(text):
    """Stem every word of ``text`` and return the stems joined by spaces.

    Parameters
    ----------
    text : str or iterable of str
        Either a raw string, which is split on whitespace, or an already
        tokenised sequence of words, which is used as given.

    Returns
    -------
    str
        The stemmed words separated by single spaces; an empty string when
        there are no words.
    """
    # Iterating a str yields single characters, so a plain string would be
    # stemmed letter by letter; split it into words first.
    words = text.split() if isinstance(text, str) else text
    return ' '.join(stemmer.stem(word) for word in words)

In [103]:
text1 = 'study studying studied' 

In [106]:
 # Tokenise the sample sentence into a list of words.
 # NOTE(review): presumably the input for a stemming demo — the call that
 # consumes `words` is not visible in this part of the notebook.
 words = nltk.word_tokenize(text1)