In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [49]:
# Read in the data
import gzip
with gzip.open('amazon-meta.txt.gz', 'rt', encoding='utf8') as f:
    data = f.read()
f.close()

In [51]:
# Parse and group the data by each product
grouped = []
add = []

for string in data.split('\n')[3:]:
    if string != '':
        add.append(string)
    else:
        grouped.append(add)
        add = []
        
grouped[0:3]

[['Id:   0', 'ASIN: 0771044445', '  discontinued product'],
 ['Id:   1',
  'ASIN: 0827229534',
  '  title: Patterns of Preaching: A Sermon Sampler',
  '  group: Book',
  '  salesrank: 396585',
  '  similar: 5  0804215715  156101074X  0687023955  0687074231  082721619X',
  '  categories: 2',
  '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Preaching[12368]',
  '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Sermons[12370]',
  '  reviews: total: 2  downloaded: 2  avg rating: 5',
  '    2000-7-28  cutomer: A2JW67OY8U6HHK  rating: 5  votes:  10  helpful:   9',
  '    2003-12-14  cutomer: A2VE83MZF98ITY  rating: 5  votes:   6  helpful:   5'],
 ['Id:   2',
  'ASIN: 0738700797',
  '  title: Candlemas: Feast of Flames',
  '  group: Book',
  '  salesrank: 168596',
  '  similar: 5  0738700827  1567184960  1567182836  0738700525  0738700940',
  '  categories: 2',
  '   |Books[283155]|Subjects[1000]|Re

In [52]:
ratings_dict = {}

for group in grouped:
    ident, total, downloaded, avg_rating = '', '', '', ''
    for item in group:
        if item.startswith('Id:'):
            ident = item.split()[-1]
        elif item.startswith('  reviews:'):
            total = item.split()[2]
            downloaded = item.split()[4]
            avg_rating = item.split()[7]
        elif item.startswith('  discontinued product'):
            skip = True
    if skip == False:
        ratings_dict[ident] = [total, downloaded, avg_rating]
    else:
        skip = False            

In [56]:
ratings_dict

{'1': ['2', '2', '5'],
 '2': ['12', '12', '4.5'],
 '3': ['1', '1', '5'],
 '4': ['1', '1', '4'],
 '5': ['0', '0', '0'],
 '6': ['17', '17', '4'],
 '7': ['3', '3', '4.5'],
 '8': ['15', '15', '4.5'],
 '9': ['0', '0', '0'],
 '10': ['6', '6', '4'],
 '11': ['1', '1', '5'],
 '12': ['12', '12', '4.5'],
 '13': ['0', '0', '0'],
 '14': ['0', '0', '0'],
 '15': ['8', '8', '4'],
 '16': ['10', '10', '4.5'],
 '17': ['3', '3', '3.5'],
 '18': ['15', '15', '5'],
 '19': ['8', '8', '3.5'],
 '20': ['1', '1', '5'],
 '21': ['140', '140', '4.5'],
 '22': ['4', '4', '4.5'],
 '23': ['3', '3', '3'],
 '24': ['2', '2', '4.5'],
 '25': ['2', '2', '3.5'],
 '26': ['1', '1', '4'],
 '27': ['2', '2', '3'],
 '28': ['0', '0', '0'],
 '29': ['0', '0', '0'],
 '30': ['0', '0', '0'],
 '31': ['2', '2', '5'],
 '32': ['11', '11', '4.5'],
 '33': ['4', '4', '5'],
 '34': ['6', '6', '4'],
 '35': ['5', '5', '4.5'],
 '36': ['0', '0', '0'],
 '37': ['7', '7', '3.5'],
 '38': ['3', '3', '4.5'],
 '39': ['22', '22', '3.5'],
 '40': ['8', '8', '5'

In [60]:
ratings = pd.DataFrame.from_dict(ratings_dict)

In [61]:
ratings.head(10)

Unnamed: 0,Unnamed: 1,1,10,100,1000,10000,100000,100001,100002,100003,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,,2,6,0,1,0,2.0,53.0,0,62,...,7,128.0,4.0,4.0,0,0,4,2,0,1
1,,2,6,0,1,0,2.0,53.0,0,62,...,7,5.0,4.0,4.0,0,0,4,2,0,1
2,,5,4,0,5,0,4.5,3.5,0,4,...,5,4.5,4.5,4.5,0,0,5,3,0,4


In [62]:
ratings = ratings.T.reset_index(drop=False)
ratings.columns = ['id','total','downloaded','avg_rating']

In [83]:
ratings.dropna(axis=0, how='any', inplace=True)

In [87]:
ratings.head()

Unnamed: 0,id,total,downloaded,avg_rating
1,1,2,2,5.0
2,10,6,6,4.0
3,100,0,0,0.0
4,1000,1,1,5.0
5,10000,0,0,0.0


In [85]:
ratings.dtypes

id            float64
total         float64
downloaded    float64
avg_rating    float64
dtype: object

In [91]:
ratings['id']  = pd.to_numeric(ratings['id']).astype(int)
ratings['total']  = pd.to_numeric(ratings['total']).astype(int)
ratings['downloaded']  = pd.to_numeric(ratings['downloaded']).astype(int)
ratings = ratings.sort_values('id')

In [93]:
ratings.set_index('id')

Unnamed: 0_level_0,total,downloaded,avg_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,2,5.0
2,12,12,4.5
3,1,1,5.0
4,1,1,4.0
5,0,0,0.0
6,17,17,4.0
7,3,3,4.5
8,15,15,4.5
9,0,0,0.0
10,6,6,4.0


Reliable/trustworthy ratings are more likely to come from those who have actually purchased the product. Obviously some people might have purchased the product elsewhere and decided to review on Amazon anyway, but if the number of reviews is significantly greater than the number downloaded, those are suspect.

In [100]:
print('There are {} products with more ratings than downloads.'.format(len(ratings[ratings['total'] > ratings['downloaded']])))

There are 8615 products with more ratings than downloads.


In [129]:
def diff(ratings):
    if ratings[ratings.downloaded > 0]:
        val = (ratings['total'] - ratings['downloaded']) / ratings['downloaded']
    else:
        val = 0
    return val

ratings['Pct Diff'] = diff(ratings)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### 1. Trustworthiness of ratings Ratings are susceptible to manipulation, bias etc. What can you say (quantitatively speaking) about the ratings in this dataset?

In [210]:
ratings.loc[ratings['downloaded'] > 0, 'Pct Diff'] = (ratings['total'] - ratings['downloaded']) / ratings['downloaded']

In [211]:
ratings.head()

Unnamed: 0,id,total,downloaded,avg_rating,Pct Diff
1,1,2,2,5.0,0.0
109902,2,12,12,4.5,0.0
219674,3,1,1,5.0,0.0
329477,4,1,1,4.0,0.0
439371,5,0,0,0.0,0.0


In [151]:
ratings.groupby(ratings['Pct Diff']).count()

Unnamed: 0_level_0,id,total,downloaded,avg_rating
Pct Diff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-0.600000,1,1,1,1
-0.500000,35,35,35,35
-0.400000,1,1,1,1
-0.333333,32,32,32,32
-0.250000,31,31,31,31
-0.200000,25,25,25,25
-0.125000,1,1,1,1
-0.111111,1,1,1,1
-0.083333,1,1,1,1
-0.037037,1,1,1,1


In [166]:
diff_range = [4.999, 5, 10, 20, 50, 100]

for i in diff_range:
    if i < 5:
        print('There are {} products with {}% or fewer reviews greater than the number of downloads.'\
                  .format(len(ratings[ratings['Pct Diff'] <= i]), i))
    if i >= 5:
        print('There are {} products with {}% more than reviews than the number of downloads.'\
              .format(len(ratings[ratings['Pct Diff'] > i]), i))


There are 541740 products with 4.999% or fewer reviews greater than the number of downloads.
There are 919 products with 5% more than reviews than the number of downloads.
There are 555 products with 10% more than reviews than the number of downloads.
There are 307 products with 20% more than reviews than the number of downloads.
There are 124 products with 50% more than reviews than the number of downloads.
There are 64 products with 100% more than reviews than the number of downloads.


I'd be suspicious of any reviews from products where there are more than 5-10% of reviews than downloads. That being said, most of the reviews do seem trustworthy.

#### 2. Category bloat Consider the product group named 'Books'. Each product in this group is associated with categories. Naturally, with categorization, there are tradeoffs between how broad or specific the categories must be.

For this dataset, quantify the following:<br>
a. Is there redundancy in the categorization? How can it be identified/removed?<br>
b. Is it possible to reduce the number of categories drastically (say to 10% of existing categories) by sacrificing relatively few category entries (say close to 10%)?

In [168]:
grouped[0:5]

[['Id:   0', 'ASIN: 0771044445', '  discontinued product'],
 ['Id:   1',
  'ASIN: 0827229534',
  '  title: Patterns of Preaching: A Sermon Sampler',
  '  group: Book',
  '  salesrank: 396585',
  '  similar: 5  0804215715  156101074X  0687023955  0687074231  082721619X',
  '  categories: 2',
  '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Preaching[12368]',
  '   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Sermons[12370]',
  '  reviews: total: 2  downloaded: 2  avg rating: 5',
  '    2000-7-28  cutomer: A2JW67OY8U6HHK  rating: 5  votes:  10  helpful:   9',
  '    2003-12-14  cutomer: A2VE83MZF98ITY  rating: 5  votes:   6  helpful:   5'],
 ['Id:   2',
  'ASIN: 0738700797',
  '  title: Candlemas: Feast of Flames',
  '  group: Book',
  '  salesrank: 168596',
  '  similar: 5  0738700827  1567184960  1567182836  0738700525  0738700940',
  '  categories: 2',
  '   |Books[283155]|Subjects[1000]|Re

Yes, it does appear there is quite a bit of redundancy in the categories, with deeper sub-categories being more specific (obviously).

In [195]:
num_books = 0
categories = []

for group in grouped:
    product = ''
    for item in group:
        if item.startswith('  group:'):
            product = item.split()[1]
        if product == 'Book':
            if item.startswith('   |Books'):
                categories.append(item.strip())
    if product == 'Book':
        num_books += 1
        
print('There are {} total books.'.format(num_books))
print('There are {} unique categories'.format(len(set(categories))))

There are 393561 total books.
There are 11316 unique categories


In [216]:
categories[0:10]

['|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Preaching[12368]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Sermons[12370]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Wicca[12484]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Witchcraft[12486]',
 '|Books[283155]|Subjects[1000]|Home & Garden[48]|Crafts & Hobbies[5126]|General[5144]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Reference[172810]|Commentaries[12155]|New Testament[12159]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Christian Living[12333]|Discipleship[12335]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Bibles[12059]|Translations[764432]|Life Application[572080]',
 '|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Bible

In [217]:
categories.split('|')

AttributeError: 'list' object has no attribute 'split'

In [218]:
sub_categories = []

for i in categories:
    for sub in i.split('|')[1:]:
        sub_categories.append(sub)
        
print('There are {} total subcategories.'.format(len(sub_categories)))
print('There are {} unique subcategories.'.format(len(set(sub_categories))))

There are 6919709 total subcategories.
There are 12930 unique subcategories.


In [219]:
set(sub_categories[0:60])

{'Bible & Other Sacred Texts[12056]',
 'Bible[764430]',
 'Bibles[12059]',
 'Books[283155]',
 'Christian Living[12333]',
 'Christianity[12290]',
 'Clergy[12360]',
 'Commentaries[12155]',
 'Crafts & Hobbies[5126]',
 'Discipleship[12335]',
 'Earth-Based Religions[12472]',
 'General[5144]',
 'General[572094]',
 'Home & Garden[48]',
 'Life Application[572080]',
 'New Testament[12159]',
 'New Testament[572082]',
 'Preaching[12368]',
 'Reference[172810]',
 'Religion & Spirituality[22]',
 'Sermons[12370]',
 'Study Guides, History & Reference[764438]',
 'Subjects[1000]',
 'Translations[764432]',
 'Wicca[12484]',
 'Witchcraft[12486]'}

There is clearly some redundancy here, with multiple subcategories for Bibles, General, New Testament, etc., not even including those that could be perceived as redundant. Note, this list excludes the general category "Books".

In [220]:
cat_df = pd.DataFrame({'sub_categories':sub_categories})

In [235]:
cat_df['sub_categories'].value_counts().head(10)

Books[283155]                       1286848
Subjects[1000]                      1222638
Children's Books[4]                  134263
Nonfiction[53]                       106966
Religion & Spirituality[22]           93648
Literature & Fiction[17]              84709
Business & Investing[3]               74124
Professional & Technical[173507]      67692
Computers & Internet[5]               66734
Health, Mind & Body[10]               66371
Name: sub_categories, dtype: int64

In [243]:
print('The "Books" and "Subjects" categories comprise {:.2f}% of the categories, and if eliminated will reduce the number of subjects by that much.'.format((cat_df['sub_categories'].value_counts().head(10)[0]+cat_df['sub_categories'].value_counts().head(10)[1])*100/cat_df['sub_categories'].value_counts().head(10).sum()))

The "Books" and "Subjects" categories comprise 78.32% of the categories, and if eliminated will reduce the number of subjects by that much.


B. (Suggested duration: 30 mins)
Give the number crunching a rest! Just think about these problems.

1. Algorithm thinking<br>
How would you build the product categorization from scratch, using similar/co-purchased information?

2. Product thinking<br>
Now, put on your 'product thinking' hat:<br>
a. Is it a good idea to show users the categorization hierarchy for items?<br>
b. Is it a good idea to show users similar/co-purchased items?<br>
c. Is it a good idea to show users reviews and ratings for items?<br>
d. For each of the above, why? How will you establish the same?

What is currently unclear is if the additional layers of categorization are causing some problem for the customers of the site. Yes, it seems redundant to have a single product fall into so many different categories, but unless this causes a problem for those customers being able to find what they want, I don't see a strong motivation to change it. 

That being said, if products could be classified using a well-defined clustering algorithm, it could reduce the number of different categories a single product ends up in. In order to make this happen, though, proper criteria would need to be established, and data from reviews and contents of the product itself (e.g. book text, video images, etc) would need to be made available for accurate clustering. 

I suspect that categories are chosen by the sellers, and if those sellers were restricted to choosing a single category for their products, this might actually achieve better results.

I don't think showing users the categorization hierarchy is a good idea, as it isn't how people tend to search for products. It could help in terms of parametric searching, which I am a fan of but is often seen as undesirable to the general population. It is sometimes helpful to know where a product ranks within a given category, particularly for books, but beyond that it is more of a distraction than a help.

I do think it is a good idea to show users both similar and co-purchased items, however, as it can help provide a balance. If those similar products were ones with high reviews (or could be filtered as such), this might make it easier for someone to decide between an item they came across through a raw search and one that might be better suited to their needs. 

I also think it is a good idea to show users reviews and ratings for items, as it can help provide some assurance that the product is genuine and provide justifications for the ratings. Personally I prefer to buy items that have at least 80% positive (4 and 5 star) reviews unless there is some reason not to. It does help if those reviews are from verified purchases, though. I am not necessarily dissuaded by reviews that were for products offered at a discount, however, because I think in most cases those reviewers are being as honest as they can be. Especially since Amazon has a policy that reviewers who have received items at a discount for review provide a disclaimer stating this. 