In [1]:
# Data Filtering and Cleaning

In [37]:
import gzip
# Location of the gzip-compressed, tab-separated review file to load.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs as-is on the machine it was written on.
path = '/home/cambam/Desktop/Python/Coursera/Basic Data Processing and Visualization/Data Processing and Visualization/Data/amazon_reviews_us_Gift_Card_v1_00.tsv.gz'
# Text mode ('rt') makes the handle yield decoded str lines instead of bytes.
# NOTE(review): the handle is not closed in the cells shown here.
f = gzip.open(filename=path, mode='rt')

In [38]:
import csv
# Tab-separated reader over the open file; the built-in 'excel-tab' dialect is
# the default dialect with the delimiter set to a tab character.
# NOTE(review): the default quoting rules still treat '"' as a quote character.
# If the file is unquoted TSV, a stray quote could merge rows -- confirm, and
# consider quoting=csv.QUOTE_NONE (that would change the row counts below).
reader = csv.reader(f, dialect='excel-tab')

In [39]:
# Consume the first row from the reader; it is used below as the column names
# when each remaining row is turned into a dict.
header = next(reader)

In [40]:
# Build the dataset: read the remaining rows one by one, pair each value with
# its column name from the header, convert the count/rating columns to int and
# the 'Y'/'N' flag columns to bool, then append the record to the list that
# the cleaning steps below operate on.
dataset = []
for row in reader:
    d = dict(zip(header, row))
    # These columns arrive as strings; int() raises ValueError on bad input.
    for key in ('helpful_votes', 'star_rating', 'total_votes'):
        d[key] = int(d[key])
    # Only the exact string 'Y' counts as True; anything else becomes False.
    for key in ('verified_purchase', 'vine'):
        d[key] = (d[key] == 'Y')
    dataset.append(d)

In [41]:
# Number of records parsed (the header row was consumed before the loop).
len(dataset)

148310

In [42]:
# Peek at the first record to check the int/bool conversions.
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31'}

In [43]:
# Keep only the records that actually have a 'review_date' key.
# dict(zip(header, ...)) stops at the shorter input, so a row with fewer
# columns than the header ends up without its trailing fields.
dataset = list(filter(lambda rec: 'review_date' in rec, dataset))

In [44]:
# Record count after dropping rows without a 'review_date'.
len(dataset)

148309

In [45]:
# Derive the review year as an integer.
# 'review_date' looks like 'YYYY-MM-DD' (see the sample record above), so the
# FIRST four characters are the year; store it in a new 'yearInt' field.
for d in dataset:
    year_text = d['review_date'][:4]
    d['yearInt'] = int(year_text)

In [46]:
# Inspect the first record again; it should now carry the 'yearInt' field.
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31',
 'yearInt': 2015}

In [47]:
# Keep only reviews written in 2010 or later, i.e. discard everything from
# 2009 and earlier ('yearInt' is always an int, so >= 2010 means > 2009).
dataset = [d for d in dataset if d['yearInt'] >= 2010]

In [48]:
# Record count after the year filter.
len(dataset)

148095

In [49]:
# Drop reviews that received at least 3 votes of which fewer than half were
# "helpful". Reviews with fewer than 3 total votes are always kept.
# Testing the vote count first means the division is only evaluated when
# total_votes >= 3, so it can never divide by zero.
dataset = [
    d for d in dataset
    if not (d['total_votes'] >= 3 and d['helpful_votes'] / d['total_votes'] < .5)
]

In [50]:
# Record count after the helpfulness filter.
len(dataset)

147801

In [51]:
# Filter out inactive users (customers with fewer than 2 reviews)

In [52]:
from collections import defaultdict

In [53]:
# Creates the per-customer review counter. defaultdict(int) gives any customer
# ID that has not been seen yet a starting count of 0.
nReviewsPerUser = defaultdict(int)

In [54]:
# Count how many reviews each customer has in the current (already filtered)
# dataset, keyed by customer ID.
# The counter is rebuilt from scratch in this cell so that re-running it does
# not pile a second round of counts on top of the first. Doubled counts would
# make every customer appear to have at least 2 reviews and defeat the
# ">= 2 reviews" filter that follows.
nReviewsPerUser = defaultdict(int)
for d in dataset:
    nReviewsPerUser[d['customer_id']] += 1

In [56]:
# Keep only reviews written by customers with two or more reviews, according
# to the per-customer counts (integer counts, so "> 1" means ">= 2").
dataset = [d for d in dataset if nReviewsPerUser[d['customer_id']] > 1]

In [57]:
# Record count after removing customers with fewer than 2 reviews.
len(dataset)

11172

In [63]:
# Keep only reviews whose body contains at least 10 words. str.split() with
# no argument splits on any run of whitespace, so the length of the result is
# the word count.
dataset = list(filter(lambda rec: len(rec['review_body'].split()) >= 10, dataset))

In [64]:
# Record count after removing reviews shorter than 10 words.
len(dataset)

7033