In [31]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

def rem_en(input_txt):
    words = input_txt.lower().split()
    noise_free_words = [word for word in words if word not in stop] 
    noise_free_text = " ".join(noise_free_words)
    return noise_free_text

stop = set(stopwords.words('english'))
tokeniser = RegexpTokenizer(r"\w+")
lemmatiser = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/diya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
df = pd.read_csv('preprocessed_amazon_customer_reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2393392 entries, 0 to 2393391
Data columns (total 14 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   customer_id        int64  
 2   review_id          object 
 3   product_id         object 
 4   product_parent     int64  
 5   product_title      object 
 6   star_rating        float64
 7   helpful_votes      float64
 8   total_votes        float64
 9   vine               object 
 10  verified_purchase  object 
 11  review_headline    object 
 12  review_body        object 
 13  review_date        object 
dtypes: float64(3), int64(3), object(8)
memory usage: 255.6+ MB


In [63]:
#deal with NaN values in review heading / body columns 
df[['review_headline', 'review_body']] = df[['review_headline', 'review_body']].fillna('')

# merge review text columns 
df['review_text'] = df.review_headline.str.cat(df.review_body, sep=' . ')
df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_text
0,0,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",5.0,0.0,0.0,N,Y,Using these for years - love them.,"As a family allergic to wheat, dairy, eggs, nu...",2015-08-31,Using these for years - love them. . As a fami...
1,1,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",5.0,0.0,0.0,N,Y,Wonderful,"My favorite nut. Creamy, crunchy, salty, and ...",2015-08-31,"Wonderful . My favorite nut. Creamy, crunchy,..."
2,2,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,5.0,0.0,0.0,N,N,Five Stars,This green tea tastes so good! My girlfriend l...,2015-08-31,Five Stars . This green tea tastes so good! My...
3,3,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,5.0,0.0,0.0,N,Y,Five Stars,I love Melissa's brand but this is a great sec...,2015-08-31,Five Stars . I love Melissa's brand but this i...
4,4,18123821,RTWHVNV6X4CNJ,B004ZWR9RQ,552138758,"Stride Spark Kinetic Fruit Sugar Free Gum, 14-...",5.0,0.0,0.0,N,Y,Five Stars,good,2015-08-31,Five Stars . good


In [64]:
def remove_extra_whitespaces_func(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

def remove_accented_chars_func(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

def remove_html_tags_func(text):
    soup = BeautifulSoup(text, "html.parser")
    # Get all the text other than html tags.
    stripped_text = soup.get_text(separator=" ")
    return stripped_text

def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [65]:
#remove html tags
df["clean_text"] = df["review_text"].apply(remove_html_tags_func)

# remove links
df["clean_text"] = df["clean_text"].apply(lambda s: ' '.join(re.sub(r'https?://\S+|www\.\S+', "", s).split()))

## remove punctuation
df["clean_text"] = df["clean_text"].apply(lambda s: ' '.join(re.sub("[.,!?:;-='...@#_]", "", s).split()))

# remove numbers
df["clean_text"].replace('\d+', '', regex=True, inplace=True)

#remove emojis
df["clean_text"] = df["clean_text"].apply(lambda s: deEmojify(s))

#remove accented words
df["clean_text"] = df["clean_text"].apply(remove_accented_chars_func)

#remove extra white spaces 
df["clean_text"] = df["clean_text"].apply(remove_extra_whitespaces_func)

#make all words lowercase
df["clean_text"] = df["clean_text"].apply(lambda s: s.lower())

In [66]:
#checking processed text
df_processed = df[['review_text', 'clean_text']].head(20)
print(df_processed)

                                          review_text  \
0   Using these for years - love them. . As a fami...   
1   Wonderful . My favorite nut.  Creamy, crunchy,...   
2   Five Stars . This green tea tastes so good! My...   
3   Five Stars . I love Melissa's brand but this i...   
4                                   Five Stars . good   
5                  Not Happy . The popcorn was stale.   
6   Five Stars . Love these bars, but have to watc...   
7   Five Stars . Love the taste but the price was ...   
8   Great tasting! . I'm a member of the crowdtap ...   
9   Disgusting now and difficult on digestion . Us...   
10  If you like soy sauce, you'll really like this...   
11  Four Stars . Good flavor and seems concentrate...   
12              Five Stars . Great to use in recipes.   
13  Five Stars . It's rice. Have enough to last fo...   
14  Five Stars . Very good tasting and a great way...   
15  Excellent . They were perfect and came right o...   
16  YUMMY! . Wow, these are soo

In [None]:
from contractions import CONTRACTION_MAP 

def expand_contractions(text, map=CONTRACTION_MAP):
    pattern = re.compile('({})'.format('|'.join(map.keys())), flags=re.IGNORECASE|re.DOTALL)
    def get_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded = map.get(match) if map.get(match) else map.get(match.lower())
        expanded = first_char+expanded[1:]
        return expanded     
    new_text = pattern.sub(get_match, text)
    new_text = re.sub("'", "", new_text)
    return new_text

In [None]:
#remove stop words
df["clean_text"] = df["clean_text"].apply(lambda s: rem_en(s))

#tokenise words
df["clean_text"] = df["clean_text"].apply(lambda x: tokeniser.tokenize(x))

#lemmatise words
df["clean_text"] = df["clean_text"].apply(lambda tokens: [lemmatiser.lemmatize(token, pos='v') for token in tokens])

In [None]:
print(df['clean_text'].head(10))

In [None]:
# POS tagging
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

df['tagged'] = df['clean_text'].apply(nltk.pos_tag)

#extract nouns
df['nouns'] = df['tagged'].apply(lambda x: [word for word, tag in x if tag in ['NN', 'NNS', 'NNP', 'NNPS']])
# extract verbs
df['verbs'] = df['tagged'].apply(lambda x: [word for word, tag in x if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']])

df_pos_tagged = df[['clean_text', 'nouns', 'verbs', 'tagged']].head(20)

In [None]:
# chunking 

#Define the patterns we had identified
#Chunk 1 : Adjective followed by a Noun 
chunkGram = r"""chunk: {<JJ>+<NN>+}"""

#Chunk 2 : Noun and Adjective with other POS in between
chunkGram = r"""chunk: {<NN|NNP|NNS|NNPS>+<IN|DT|NN|VB.|RB>*<JJ>+}"""

#Chunk 3: Sequence of Nouns
chunkGram = r"""chunk: {<NN|NNP|NNS|NNPS>{2,9}}"""


#Passing the Chunk to a regex parser
chunkParser = nltk.RegexpParser(chunkGram)
#Parsing
chunked = chunkParser.parse(tagged)
print(chunked)


# Accessing the Chunk
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'chunk'):
    print('Filtered chunks= ',subtree)
    chunked_output = ' '.join([w for w, t in subtree.leaves()])
     #Visualize the output
     chunked.draw()


In [55]:
#create sample dataframe - first 20 reviews
test_df = df[['review_text', 'clean_text']].head(20)
final_text = []
for i in range(len(test_df)):
    tokenised_list = df.loc[i, "clean_text"]
    tokenised_text = ''.join(tokenised_list)
    final_text.append(tokenised_text)
test_df["final_text"] = final_text
print(test_df)

                                          review_text  \
0   Using these for years - love them. . As a fami...   
1   Wonderful . My favorite nut.  Creamy, crunchy,...   
2   Five Stars . This green tea tastes so good! My...   
3   Five Stars . I love Melissa's brand but this i...   
4                                   Five Stars . good   
5                  Not Happy . The popcorn was stale.   
6   Five Stars . Love these bars, but have to watc...   
7   Five Stars . Love the taste but the price was ...   
8   Great tasting! . I'm a member of the crowdtap ...   
9   Disgusting now and difficult on digestion . Us...   
10  If you like soy sauce, you'll really like this...   
11  Four Stars . Good flavor and seems concentrate...   
12              Five Stars . Great to use in recipes.   
13  Five Stars . It's rice. Have enough to last fo...   
14  Five Stars . Very good tasting and a great way...   
15  Excellent . They were perfect and came right o...   
16  YUMMY! . Wow, these are soo

In [57]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer(lowercase = False)
tfidf_response = vectorizer.fit_transform(test_df['final_text'])
df_tfidf_sklearn = pd.DataFrame(tfidf_response.toarray(),columns=vectorizer.get_feature_names())
df_tfidf_sklearn


Unnamed: 0,add,addict,allergens,allergic,allergy,allow,almond,also,amaze,aminos,...,whenever,whim,without,wonderful,workers,would,wow,years,youll,yummy
0,0.0,0.0,0.271162,0.135581,0.135581,0.135581,0.0,0.0,0.0,0.0,...,0.0,0.0,0.119178,0.0,0.0,0.0,0.0,0.271162,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.337031,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.187001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
#bow 
def calculateBOW(wordset,l_doc):
  tf_diz = dict.fromkeys(wordset,0)
  for word in l_doc:
      tf_diz[word]=l_doc.count(word)
  return tf_diz


full_text = []
for i in test_df["clean_text"]:
    full_text += i

wordset = pd.unique(full_text)
bow1 = calculateBOW(wordset,test_df["clean_text"][0])
bow2 = calculateBOW(wordset,test_df["clean_text"][3])
bow3 = calculateBOW(wordset,test_df["clean_text"][2])
df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow

Unnamed: 0,u,s,e,Unnamed: 4,y,a,r,l,o,v,...,t,d,n,p,b,k,x,j,q,z
0,10,17,39,49,8,22,21,23,15,4,...,18,3,16,2,2,1,2,0,0,0
1,0,8,7,10,0,7,4,3,2,2,...,4,3,4,0,2,0,0,0,0,0
2,0,2,7,7,0,3,4,2,3,2,...,4,2,2,0,0,0,0,0,0,0


In [59]:
#  Word2Vec data preparation

#Create a list of list of words for each sentence or review : Word2Vec requires a list of words for each review.
def data_preparation_w2v(dataset):
    """
    param dataset: list of documents.
    returns: returns a list of tokenize sentences along with their word count document wise.
    """
    
    word_2_vec_list=[]
    count_list_document_wise = []
    
    for review in dataset:
        review_list=[]
        count_dict = {}
        
        for word in str(review).split():
            review_list.append(word)
            if word not in count_dict:
                count_dict.update({word: 1})
            else:
                count_dict.update({word: count_dict[word] + 1})
        
        word_2_vec_list.append(review_list)
        count_list_document_wise.append(count_dict)
    
    print(f"tokenized review sample:\n{word_2_vec_list[0]}")
    print('='*50)
    print(f"review count of words sample:\n{count_list_document_wise[0]}")
    
    return word_2_vec_list, count_list_document_wise


tokenized_reviews_LoL, word_count_reviews_LoD = data_preparation_w2v(test_df['final_text'].values)

tokenized review sample:
['use', 'years', 'love', 'family', 'allergic', 'wheat', 'dairy', 'egg', 'nut', 'several', 'things', 'love', 'entire', 'crave', 'place', 'line', 'products', 'allow', 'us', 'bake', 'treat', 'minimal', 'effort', 'ingredients', 'allergy', 'free', 'gluten', 'free', 'mix', 'usually', 'omit', 'one', 'two', 'allergens', 'great', 'see', 'mix', 'create', 'without', 'many', 'common', 'allergens', 'note', 'still', 'soy', 'corn', 'consume', 'regular', 'basis', 'years']
review count of words sample:
{'use': 1, 'years': 2, 'love': 2, 'family': 1, 'allergic': 1, 'wheat': 1, 'dairy': 1, 'egg': 1, 'nut': 1, 'several': 1, 'things': 1, 'entire': 1, 'crave': 1, 'place': 1, 'line': 1, 'products': 1, 'allow': 1, 'us': 1, 'bake': 1, 'treat': 1, 'minimal': 1, 'effort': 1, 'ingredients': 1, 'allergy': 1, 'free': 2, 'gluten': 1, 'mix': 2, 'usually': 1, 'omit': 1, 'one': 1, 'two': 1, 'allergens': 2, 'great': 1, 'see': 1, 'create': 1, 'without': 1, 'many': 1, 'common': 1, 'note': 1, 'still

In [None]:
# WordCloud