In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import time
import random
import sys
pd.set_option('display.max_colwidth', None)

In [2]:
#https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
#https://s3.amazonaws.com/amazon-reviews-pds/readme.html
#https://www.tensorflow.org/datasets/catalog/amazon_us_reviews
#https://stackoverflow.com/questions/39263929/how-can-i-read-tar-gz-file-using-pandas-read-csv-with-gzip-compression-option

In [3]:
#df = pd.read_csv('amazon_reviews_us_Video_Games_v1_00.tsv.gz', sep='\t', compression='gzip', error_bad_lines=False)


In [29]:
import re
from nltk.corpus import stopwords
swords = set(stopwords.words('english')) #set nltk stopword list equal to a variable

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+') #create tokenizer to remove punctuation

def tokem_lite(some_string):
    stok = tokenizer.tokenize(some_string)
    #stok = ' '.join(stok) #return string of words
    slem = [lemmatizer.lemmatize(word) for word in stok]
    #slem = lemmatizer.lemmatize(stok)
    cleansw=[word for word in slem if word not in swords]
    return ' '.join(cleansw)
    for item in stok:
        slems = []
        for word in item:
            slems.append(lemmatizer.lemmatize(word)) #make list of non-stop words
            
    return ' '.join(slems) #return string of words

def clean_amazon_data(file_name, new_name):
    df = pd.read_csv(file_name, sep='\t', compression='gzip', error_bad_lines=False, low_memory=False) #read in file
    df.drop(columns=['marketplace', 'vine', 'product_category'], inplace=True) #drop columns that won't be used
    df['verified_purchase']=df['verified_purchase'].map({'Y':1, 'N':0}) #change verified_purchase to 1/0 classifier
    df['review_date'] = pd.to_datetime(df['review_date']) #convert review date to date time object
    print(f'Initial size: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    print(f'Initial shape: {df.shape}') #preview shape
    
    products = pd.Series(df['product_id'].value_counts()>10) #create bool series for whether item appears more than 10 times
    prod_list = [] #create empty list
    prod_dict = dict(products) #create dictionary of products series matching bool and product id
    for key, value in prod_dict.items():
        if value == False:
            prod_list.append(key) #make a list of just product ids from series which appear less than 10 times
    prod_in = df[df['product_id'].isin(prod_list)].index #make list of indexes of those product ids
    df.drop(index=prod_in, inplace=True) #drop those indexes (all products with less than 10 reviews)
    print(f'Size after dropping products w/reviews < 10: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    print(f'Null Preview: {df.isnull().sum()}') #preview null values
    null_perc = round((df.isnull().sum().sum()/len(df)*100),2)
    print(f'Null Percentage: {null_perc}%') #% of null value rows out of all rows
    dropped=False #null values have not been dropped
    if null_perc < .1: #automatically drop nulls if they represent less than 1%
        dropped=True
        df.dropna(inplace=True)
        print(f'Size after dropna(): {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    else:  #if nulls are more than 1%, ask user to approve dropping
        answer = input('Would you like to drop all null values? Please enter yes or no: ') #option to drop nulls
        if answer.lower() == 'yes':
            dropped=True
            df.dropna(inplace=True)
            print(f'Size after dropna(): {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
        else:
            print('''
            As you wish...
            WARNING!
            Null values remain in Data
            ''')
    
    df['full_review'] = df['review_headline']+' '+df['review_body']#concatenate review header and body into one column for NLP
    df.drop(columns=['review_headline', 'review_body'], inplace=True)
    print(f'Size after concatenation: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    if dropped == True: #only do this if nulls have been dropped, otherwise it will break
        print('Tokenizing, lemmatizing, and removing stopwords...hold please')
        df['full_review'] = df['full_review'].map(lambda x: tokem_lite(x)) #tokenize, lemmatize, and remove stopwords
        print(f'Size after tokemmitization: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    df.to_csv(f'./data/{new_name}.csv', index=False)
    #print(f'File saved as {new_name}.csv')
    
    print(f'Final size: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    print(f'Final shape: {df.shape}') #preview shape
    return f'File saved as {new_name}.csv'

In [30]:
#print(f'Null Percentage: {round((new_df.isnull().sum().sum()/len(new_df)*100),2)}%')

In [31]:
%%time
new_df = clean_amazon_data('./data/amazon_reviews_us_Video_Games_v1_00.tsv.gz', 'video_games')

b'Skipping line 20630: expected 15 fields, saw 22\nSkipping line 28172: expected 15 fields, saw 22\nSkipping line 54791: expected 15 fields, saw 22\nSkipping line 75419: expected 15 fields, saw 22\nSkipping line 104832: expected 15 fields, saw 22\nSkipping line 138464: expected 15 fields, saw 22\nSkipping line 194849: expected 15 fields, saw 22\nSkipping line 201568: expected 15 fields, saw 22\nSkipping line 242567: expected 15 fields, saw 22\nSkipping line 493585: expected 15 fields, saw 22\nSkipping line 502478: expected 15 fields, saw 22\nSkipping line 660750: expected 15 fields, saw 22\n'


Initial size: 1.718119368
Initial shape: (1780268, 12)
Size after dropping products w/reviews < 10: 1.616999617
Null Preview: customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         0
star_rating           0
helpful_votes         0
total_votes           0
verified_purchase     0
review_headline      26
review_body          51
review_date          24
dtype: int64
Null Percentage: 0.01%
Size after dropna(): 1.615822037
Size after concatenation: 1.524033039
Tokenizing, lemmatizing, and removing stopwords...hold please
Size after tokemmitization: 1.181082599
Final size: 1.181082599
Final shape: (1648136, 11)
Wall time: 10min 41s


In [27]:
df2 = pd.read_csv('./data/video_games.csv', low_memory=False)
sys.getsizeof(df2)/1_000_000_000

1.265136394

In [17]:
new_df['total_votes'].value_counts()

0       817185
1       267812
2       139116
3        82764
4        58745
         ...  
560          1
563          1
1174         1
564          1
1017         1
Name: total_votes, Length: 714, dtype: int64

In [18]:
new_df.head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,verified_purchase,review_date,full_review
0,12039526,RTIS3L2M1F5SM,B001CXYMFS,737716809,Thrustmaster T-Flight Hotas X Flight Stick,Video Games,5,0,0,1,2015-08-31,"an amazing joystick. I especially love that you can twist ... Used this for Elite Dangerous on my mac, an amazing joystick. I especially love that you can twist the stick for different movement bindings as well as move it in the normal way."
2,2331478,R3BH071QLH8QMC,B0029CSOD2,98937668,Hidden Mysteries: Titanic Secrets of the Fateful Voyage,Video Games,1,0,1,1,2015-08-31,One Star poor quality work and not as it is advertised.
3,52495923,R127K9NTSXA2YH,B00GOOSV98,23143350,GelTabz Performance Thumb Grips - PlayStation 4 and PlayStation 3,Video Games,3,0,0,1,2015-08-31,"good, but could be bettee nice, but tend to slip away from stick in intense (hard pressed) gaming sessions."
4,14533949,R32ZWUXDJPW27Q,B00Y074JOM,821342511,Zero Suit Samus amiibo - Japan Import (Super Smash Bros Series),Video Games,4,0,0,1,2015-08-31,"Great but flawed. Great amiibo, great for collecting. Quality material to be desired, since its not perfect."
6,17521011,R2F0POU5K6F73F,B008XHCLFO,24234603,Protection for your 3DS XL,Video Games,5,0,0,1,2015-08-31,"A Must I have a 2012-2013 XL and this is very durable, comfortable, and really cool looking."


In [32]:
#new_df['verified_purchase']=new_df['verified_purchase'].map({'Y':1, 'N':0})

In [36]:
#new_df['verified_purchase'].value_counts(normalize=True)

1    0.655297
0    0.344703
Name: verified_purchase, dtype: float64

In [39]:
#new_df['full_review'] = new_df['review_headline'] + ' ' + new_df['review_body']

In [51]:
#new_df['full_review']

In [81]:
import re
from nltk.corpus import stopwords
swords = set(stopwords.words('english')) #set nltk stopword list equal to a variable

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+') #create tokenizer to remove punctuation

def tokem_lite(some_string):
    stok = tokenizer.tokenize(some_string)
    #stok = ' '.join(stok) #return string of words
    slem = [lemmatizer.lemmatize(word) for word in stok]
    #slem = lemmatizer.lemmatize(stok)
    cleansw=[word for word in slem if word not in swords]
    return ' '.join(cleansw)
    for item in stok:
        slems = []
        for word in item:
            slems.append(lemmatizer.lemmatize(word)) #make list of non-stop words
            
    return ' '.join(slems) #return string of words

In [82]:
#tokem_lite(new_df['full_review'][0]) #test on single row

'amazing joystick I especially love twist Used Elite Dangerous mac amazing joystick I especially love twist stick different movement binding well move normal way'

In [91]:
#new_df['full_review'].map(lambda x: tokem_lite(x))

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             amazing joystick I especially love twist Used Elite Dangerous mac amazing joystick I especially love twist stick different movement binding well move normal way
2                                                                                                                                                                                                                                                                                                                        

In [88]:
#new_df['revs_clean'] = [tokem_lite(review) for review in new_df['full_review']] #test on all reviews

In [89]:
#new_df['revs_clean']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             amazing joystick I especially love twist Used Elite Dangerous mac amazing joystick I especially love twist stick different movement binding well move normal way
2                                                                                                                                                                                                                                                                                                                        

In [6]:
import sys

In [7]:
sys.getsizeof(df)

2255759359

In [8]:
#df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [17]:
#df.isnull().sum()

marketplace           0
customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         0
product_category      0
star_rating           0
helpful_votes         0
total_votes           0
vine                  0
verified_purchase     0
review_headline      28
review_body          59
review_date          27
dtype: int64

In [20]:
#df.groupby('product_id').count() > 10

Unnamed: 0_level_0,marketplace,customer_id,review_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0000118532,False,False,False,False,False,False,False,False,False,False,False,False,False,False
006056038X,False,False,False,False,False,False,False,False,False,False,False,False,False,False
006073132X,False,False,False,False,False,False,False,False,False,False,False,False,False,False
007876355X,False,False,False,False,False,False,False,False,False,False,False,False,False,False
0078764343,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B014IG8QWU,False,False,False,False,False,False,False,False,False,False,False,False,False,False
B014KXMM94,False,False,False,False,False,False,False,False,False,False,False,False,False,False
B017W175WA,False,False,False,False,False,False,False,False,False,False,False,False,False,False
B01A0LTNJC,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [24]:
products = pd.Series(df['product_id'].value_counts()>10)

In [48]:
#df.loc[df[products]==False]

In [35]:
prod_list = []
prod_dict = dict(products)
for key, value in prod_dict.items():
    if value == False:
        prod_list.append(key)
prod_list[:5] #preview

['B00Q538I46', 'B00N1O2A1Y', 'B00HNWZE18', 'B00JPI9YH8', 'B00004SVVU']

In [53]:
prod_in = df[df['product_id'].isin(prod_list)].index

In [56]:
df.drop(index=prod_in, inplace=True)

In [57]:
df['product_id'].value_counts()

B00BGA9WK2    10318
B007FTE2VW     3971
B00178630A     3715
B0050SYILE     3545
B005CPGHAA     3399
              ...  
B00002SWA8       11
B0009350BC       11
B000O3EFRM       11
B0002SMN1Y       11
B0007W65FA       11
Name: product_id, Length: 20952, dtype: int64

In [12]:
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [32]:
%%writefile utils.py

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import time
import random
import sys
pd.set_option('display.max_colwidth', None)

import re
from nltk.corpus import stopwords
swords = set(stopwords.words('english')) #set nltk stopword list equal to a variable

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+') #create tokenizer to remove punctuation

def tokem_lite(some_string):
    stok = tokenizer.tokenize(some_string)
    #stok = ' '.join(stok) #return string of words
    slem = [lemmatizer.lemmatize(word) for word in stok]
    #slem = lemmatizer.lemmatize(stok)
    cleansw=[word for word in slem if word not in swords]
    return ' '.join(cleansw)
    for item in stok:
        slems = []
        for word in item:
            slems.append(lemmatizer.lemmatize(word)) #make list of non-stop words
            
    return ' '.join(slems) #return string of words

def clean_amazon_data(file_name, new_name):
    df = pd.read_csv(file_name, sep='\t', compression='gzip', error_bad_lines=False, low_memory=False) #read in file
    df.drop(columns=['marketplace', 'vine', 'product_category'], inplace=True) #drop columns that won't be used
    df['verified_purchase']=df['verified_purchase'].map({'Y':1, 'N':0}) #change verified_purchase to 1/0 classifier
    df['review_date'] = pd.to_datetime(df['review_date']) #convert review date to date time object
    print(f'Initial size: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    print(f'Initial shape: {df.shape}') #preview shape
    
    products = pd.Series(df['product_id'].value_counts()>10) #create bool series for whether item appears more than 10 times
    prod_list = [] #create empty list
    prod_dict = dict(products) #create dictionary of products series matching bool and product id
    for key, value in prod_dict.items():
        if value == False:
            prod_list.append(key) #make a list of just product ids from series which appear less than 10 times
    prod_in = df[df['product_id'].isin(prod_list)].index #make list of indexes of those product ids
    df.drop(index=prod_in, inplace=True) #drop those indexes (all products with less than 10 reviews)
    print(f'Size after dropping products w/reviews < 10: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    print(f'Null Preview: {df.isnull().sum()}') #preview null values
    null_perc = round((df.isnull().sum().sum()/len(df)*100),2)
    print(f'Null Percentage: {null_perc}%') #% of null value rows out of all rows
    dropped=False #null values have not been dropped
    if null_perc < .1: #automatically drop nulls if they represent less than 1%
        dropped=True
        df.dropna(inplace=True)
        print(f'Size after dropna(): {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    else:  #if nulls are more than 1%, ask user to approve dropping
        answer = input('Would you like to drop all null values? Please enter yes or no: ') #option to drop nulls
        if answer.lower() == 'yes':
            dropped=True
            df.dropna(inplace=True)
            print(f'Size after dropna(): {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
        else:
            print('''
            As you wish...
            WARNING!
            Null values remain in Data
            ''')
    
    df['full_review'] = df['review_headline']+' '+df['review_body']#concatenate review header and body into one column for NLP
    df.drop(columns=['review_headline', 'review_body'], inplace=True)
    print(f'Size after concatenation: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    if dropped == True: #only do this if nulls have been dropped, otherwise it will break
        print('Tokenizing, lemmatizing, and removing stopwords...hold please')
        df['full_review'] = df['full_review'].map(lambda x: tokem_lite(x)) #tokenize, lemmatize, and remove stopwords
        print(f'Size after tokemmitization: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    
    df.to_csv(f'./data/{new_name}.csv', index=False)
    #print(f'File saved as {new_name}.csv')
    
    print(f'Final size: {sys.getsizeof(df)/1_000_000_000}') # print size of file (in Gigs)
    print(f'Final shape: {df.shape}') #preview shape
    return f'File saved as {new_name}.csv'

Overwriting utils.py


In [6]:
import utils as ut