In [26]:
# Link to amazon_food.csv
# https://drive.google.com/open?id=1J5GDMYqSmxc8z4FaloZ-tx3Le7n43iMg

import pandas as pd
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk

In [27]:
# Load the review dump; the first CSV column is used as the row index.
# encoding to utf-8 here is important
df = pd.read_csv(
    'amazon_food.csv',
    index_col=0,
    encoding='utf-8',
)
df.head()

Unnamed: 0,reviewText,overall
0,Just another flavor of Kit Kat but the taste i...,4.0
1,I bought this on impulse and it comes from Jap...,3.0
2,Really good. Great gift for any fan of green t...,4.0
3,"I had never had it before, was curious to see ...",5.0
4,I've been looking forward to trying these afte...,4.0


In [28]:
# Number of reviews loaded (one per row of the reviewText column).
len(df['reviewText'])

151254

In [30]:
# Decode HTML character references in every review and lowercase the result.
import HTMLParser
html_parser = HTMLParser.HTMLParser()

# NOTE(review): str() also stringifies non-text cells (a missing review would
# become the literal 'nan') -- confirm that is intended.
no_html = [html_parser.unescape(str(review)).lower() for review in df.reviewText]

print(len(no_html))

151254


In [31]:
# remove urls
# Raw string so the regex escapes are passed to `re` untouched.
# NOTE(review): this only matches https://www.<name>.com/<seg>/<seg> links;
# http:// links and other domains are left in the text.
url_re = r'https:\/\/www\.([a-zA-Z\d]+)\.com\/([a-zA-Z\d]+)\/([a-zA-Z\d]*)\/?'

no_url = []
for text in no_html:
    # re.sub deletes every match in a single pass over the string.
    text = re.sub(url_re, '', text)
    # Turn newlines into spaces instead of deleting them: deleting glued the
    # last word of one line to the first word of the next ("good\nbut" ->
    # "goodbut"), producing bogus tokens downstream.
    text = text.replace('\n', ' ')
    no_url.append(text)

print(len(no_url))

151254


In [32]:
# remove any screen names encountered
# Pattern: an '@' followed by any run of ASCII letters/digits (possibly empty).
scrname_re = r'@([a-zA-Z\d]*)'

# re.sub removes each match where it occurs. The previous str.replace() loop
# removed the matched text everywhere in the string, so stripping "@bob" first
# also chewed the front off a later "@bobby" and left "by" behind.
no_scrname = [re.sub(scrname_re, '', text) for text in no_url]

In [33]:
# remove all hashtags
# Pattern: '#' followed by one or more lowercase letters (text is already
# lowercased upstream).
hashtag_re = r'#([a-z]+)'

# re.sub removes each match where it occurs. The previous str.replace() loop
# removed the matched text everywhere in the string, so stripping "#foo" first
# turned a later "#foobar" into the stray fragment "bar".
no_hashtag = [re.sub(hashtag_re, '', text) for text in no_scrname]

In [34]:
# Characters deleted outright from every review.
spec_char = ['$', '(', ')','{','}','~','@','#','%','^','&','*',':','|','<',';','[',']','+','!','?','`']


def _strip_chars(text, unwanted):
    """Return `text` with every occurrence of each string in `unwanted` removed."""
    for ch in unwanted:
        text = text.replace(ch, '')
    return text


no_spec = [_strip_chars(text, spec_char) for text in no_hashtag]

In [35]:
# Strip every digit from each review.
num_re = '\d'

no_num = [re.sub(num_re, '', text) for text in no_spec]

In [36]:
# get all a-z characters only and remove emojis, punctuations and special characters
# Pattern: any run of characters that are neither a-z nor whitespace.
not_char_re = r'([^a-z\s]+)'

# re.sub deletes every such run. The previous findall + str.replace loop could
# leave characters behind: in "a.b,.c" the run "." was removed first, after
# which the run ",." no longer existed in the string and "," survived.
char_only = [re.sub(not_char_re, '', text) for text in no_num]

len(char_only)

151254

In [38]:
# The saved outputs show df.reviewText already holding the cleaned text at this
# point, yet no visible cell performs that assignment (execution count 37 is
# missing). Do it explicitly so a fresh Restart & Run All reproduces the
# outputs shown below.
# NOTE(review): assumes the lost cell stored `char_only` -- confirm.
df['reviewText'] = char_only

# Spot-check one cleaned review (row 3, first column = reviewText).
x = df.iloc[3,0]
print(x)

i had never had it before was curious to see what it was like smooth great subtle good flavor i am ordering more and plan to make it a routine


In [39]:
import time

start = time.time()

lemma_check =[]
wnl = WordNetLemmatizer()
for item in char_only:
    #item = str(item).decode('utf-8')
    t = " ".join([wnl.lemmatize(i) for i in item.split()])
    lemma_check.append(t) 
    
end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'

Time taken : 98.0313000679  seconds


In [40]:
# A custom stoplist
# Hand-picked additions on top of the NLTK and scikit-learn English stop words:
# contraction fragments, punctuation tokens and corpus-specific words.
_EXTRA_STOPWORDS = [
    "n't", "'s", "'m", "ca", "singapore", "food", "im", "street", "'ve", "'re",
    'porn', 'watch', 'video', 'centre', '0', '...',
    ':', ',', '.', '!', '/', '(', ')', '-', '&', '`', '~', '@', '#', '$', '%',
    '^', '*', '[', ']', '?', '\\', '{', '}', ';', "'", '"', '+', '=',
    'eat', 'day', 'time', 'cdataadsbygoogle', 'windowadsbygoogle',
    'wa', 'ha', 'come', 'place', 'dish', 'bring', 'think', 'quite', 'located',
    'month', 'went', 'probably', 'pm', 'say', 'said', 'including', 'year', 'item',
    'youre', 'sure', 'dont', 'came', 'really', 'got', 'thing', 'address', 'photo',
    'credit', 'opening', 'hour',
]

# Union of the three sources; duplicates collapse in the set.
STOPLIST = set(stopwords.words('english'))
STOPLIST.update(_EXTRA_STOPWORDS)
STOPLIST.update(ENGLISH_STOP_WORDS)
print(len(STOPLIST))

431


In [41]:
start = time.time()

tokens = []
for item in lemma_check:  
    try:
        Tokens = nltk.word_tokenize(item.lower())
        t = [tok for tok in Tokens if tok not in STOPLIST]
        # print t
    except:
        continue
    tokens.append(t)
    
end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'
    
print len(tokens)

Time taken : 67.7756071091  seconds
151254


In [42]:
# Wrap the per-review token lists in a Series (despite the name, df1 is a
# Series, not a DataFrame) for quick inspection.
df1 = pd.Series(data=tokens)

In [43]:
# Preview the token lists of the first ten reviews.
df1.head(n=10)

0    [flavor, kit, kat, taste, unique, bit, differe...
1    [bought, impulse, japan, amused, family, weird...
2    [good, great, gift, fan, green, tea, expensive...
3    [curious, like, smooth, great, subtle, good, f...
4    [ive, looking, forward, trying, hearing, popul...
5    [kitkats, good, looking, strong, green, tea, f...
6    [mitsuwa, marketplace, illinoisi, actually, ex...
7    [creamy, white, chocolate, infused, matcha, gr...
8    [hearing, mixed, opinion, kit, kat, decided, t...
9    [love, green, tea, love, kit, kat, belong, hat...
dtype: object

In [45]:
# Attach the token lists as a new column. `tokens` is a plain list, so it is
# matched to rows by position; it must therefore have one entry per row of df.
df['reviewText_tokenize'] = tokens
df.head()

Unnamed: 0,reviewText,overall,reviewText_tokenize
0,just another flavor of kit kat but the taste i...,4.0,"[flavor, kit, kat, taste, unique, bit, differe..."
1,i bought this on impulse and it comes from jap...,3.0,"[bought, impulse, japan, amused, family, weird..."
2,really good great gift for any fan of green te...,4.0,"[good, great, gift, fan, green, tea, expensive..."
3,i had never had it before was curious to see w...,5.0,"[curious, like, smooth, great, subtle, good, f..."
4,ive been looking forward to trying these after...,4.0,"[ive, looking, forward, trying, hearing, popul..."


In [46]:
# Persist the cleaned text and its tokens next to the ratings.
# NOTE(review): CSV stores each token list as its string form, so a reader
# has to parse that column back into a list -- confirm this is acceptable.
df.to_csv(
    path_or_buf='amazon_food-tokenize.csv',
    encoding='utf-8',
)