In [2]:
import pandas as pd
import numpy as np
import seaborn
import json
from tqdm.notebook import tqdm
from collections import Counter
import plotly.express as px


# Read data

In [27]:
def read_data(path: str, n: int = 100_000_000, filter_ids = None) -> pd.DataFrame:
    items = []
    with open(path, 'r') as data_file:      
        for _ in tqdm(range(n)):
            line = data_file.readline()
            if not line:
                break
            json_line = json.loads(line)
            if filter_ids is None:
                items.append(json_line)
            elif json_line['asin'] in filter_ids:
                items.append(json_line)
                if len(items) == len(filter_ids):
                    break
    return pd.DataFrame(items)

books = read_data('../data/raw/Books_5.json', 1_000_000)
book_ids = set(books['asin'].unique())
books_meta = read_data('../data/raw/meta_Books.json', filter_ids=book_ids)
print(f'There are {len(book_ids)} unique books in 500k first reviews.')
print(f'{books.shape=}')
print(f'{books_meta.shape=}')

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/100000000 [00:00<?, ?it/s]

There are 12521 unique books in 500k first reviews.
books.shape=(1000000, 12)
books_meta.shape=(12521, 18)


In [33]:
all_categories = np.concatenate(books_meta['category'].values)
all_categories = [x if '&amp;' not in x else x.replace('&amp;', '&') for x in all_categories if '&amp;' in x]

categories_counter = Counter(all_categories)
most_common_categories = categories_counter.most_common(20)
most_common_categories

[('Literature & Fiction', 1671),
 ('Mystery, Thriller & Suspense', 410),
 ('Biographies & Memoirs', 409),
 ('Growing Up & Facts of Life', 259),
 ('Politics & Social Sciences', 227),
 ('Teen & Young Adult', 220),
 ('Cookbooks, Food & Wine', 207),
 ('Christian Books & Bibles', 199),
 ('Thrillers & Suspense', 176),
 ('Science Fiction & Fantasy', 153),
 ('Business & Money', 147),
 ('Health, Fitness & Dieting', 135),
 ('Religion & Spirituality', 131),
 ('New, Used & Rental Textbooks', 130),
 ('Arts & Photography', 129),
 ('Politics & Government', 108),
 ('Humor & Entertainment', 99),
 ('Arts & Literature', 95),
 ('Science & Math', 84),
 ('Regional & International', 84)]

In [34]:
books_meta['category'] = books_meta['category'].apply(lambda x: set(x))

In [36]:
twenty_most_popular_categories = [(name, count, f'{count / books_meta.shape[0] * 100:.2f}%') for name, count in categories_counter.most_common(20)]
twenty_most_popular_categories_names = [name for name, _, _ in twenty_most_popular_categories]
twenty_most_popular_categories_names

['Literature & Fiction',
 'Mystery, Thriller & Suspense',
 'Biographies & Memoirs',
 'Growing Up & Facts of Life',
 'Politics & Social Sciences',
 'Teen & Young Adult',
 'Cookbooks, Food & Wine',
 'Christian Books & Bibles',
 'Thrillers & Suspense',
 'Science Fiction & Fantasy',
 'Business & Money',
 'Health, Fitness & Dieting',
 'Religion & Spirituality',
 'New, Used & Rental Textbooks',
 'Arts & Photography',
 'Politics & Government',
 'Humor & Entertainment',
 'Arts & Literature',
 'Science & Math',
 'Regional & International']

In [37]:
for cat_name in tqdm(twenty_most_popular_categories_names):
    books_meta[cat_name.replace(' ', '_').replace('&', 'and').replace(',', '_')] = books_meta['category'].apply(lambda x: int(cat_name in x))

  0%|          | 0/20 [00:00<?, ?it/s]

In [51]:
# books_meta['description'][:10].values
books_meta.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes',
       'Literature_and_Fiction', 'Mystery__Thriller_and_Suspense',
       'Biographies_and_Memoirs', 'Growing_Up_and_Facts_of_Life',
       'Politics_and_Social_Sciences', 'Teen_and_Young_Adult',
       'Cookbooks__Food_and_Wine', 'Christian_Books_and_Bibles',
       'Thrillers_and_Suspense', 'Science_Fiction_and_Fantasy',
       'Business_and_Money', 'Health__Fitness_and_Dieting',
       'Religion_and_Spirituality', 'New__Used_and_Rental_Textbooks',
       'Arts_and_Photography', 'Politics_and_Government',
       'Humor_and_Entertainment', 'Arts_and_Literature', 'Science_and_Math',
       'Regional_and_International'],
      dtype='object')

In [58]:
asin_value_counts = books_meta['asin'].value_counts()

if asin_value_counts.max() == 1:
    print('Each book presents once in metadata dataset')
else:
    print('There are duplicate "asin"-s in metadata dataset')

Each book presents once in metadata dataset


In [59]:
books = pd.merge(books, books_meta, how='left', on='asin')
books.head(3)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,...,Business_and_Money,Health__Fitness_and_Dieting,Religion_and_Spirituality,New__Used_and_Rental_Textbooks,Arts_and_Photography,Politics_and_Government,Humor_and_Entertainment,Arts_and_Literature,Science_and_Math,Regional_and_International
0,5.0,False,"03 30, 2005",A1REUF3A1YCPHM,1713353,{'Format:': ' Hardcover'},TW Ervin II,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,1112140800,...,0,0,0,0,0,0,0,0,0,0
1,5.0,True,"06 20, 2016",AVP0HXC9FG790,1713353,,Amazon Customer,The kids loved it!,Five Stars,1466380800,...,0,0,0,0,0,0,0,0,0,0
2,5.0,True,"01 24, 2016",A324TTUBKTN73A,1713353,{'Format:': ' Paperback'},Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,...,0,0,0,0,0,0,0,0,0,0


In [65]:
need_save = False
if need_save:
    books.to_csv('../data/interim/books_with_metadata.csv.zip', index=None, compression='zip')

# EDA

## Ratings distribution

In [66]:
px.bar(books[['asin', 'overall']].groupby('overall').agg({'overall': 'count'}))

### Ratings distribution by category (for twenty most popupal categories)

In [69]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=5, cols=4)

i = 0
j = 1
for cat in twenty_most_popular_categories_names:
    i += 1
    if i % 5 == 0:
        j += 1
        i = 1
    
    cat = cat.replace(' ', '_').replace('&', 'and').replace(',', '_')
    gr = books[books[cat] == 1].groupby('overall').agg({'overall': 'count'})
    x = gr.index
    y = np.reshape(gr.values, (1,5))[0]

    fig.add_trace(go.Bar(x=x, y=y, name=cat, ), row=j, col=i)


fig.update_layout(height=1000, width=1500, title_text="Plots")
fig.show()

# Summary report

In [78]:
def summary(data: pd.DataFrame) -> None:
    from IPython.display import display, Markdown
    percentiles = [5, 25, 50, 75, 95]

    unique_books_count = books["asin"].nunique()
    unique_users = books['reviewerID'].nunique()
    average_rating = books['overall'].mean()
    meadian_rating = books['overall'].median()
    
    books_by_review_count_and_overall_mean_rating = books[['asin', 'overall', 'title']].\
        groupby('asin').\
        agg(overall_mean=('overall', 'mean'), overall_count=('overall', 'count'), title=('title', 'max'))
    five_most_lovable_book_str = '\n'.join([f'    1. Book \"{item["title"]}\" has {item["overall_count"]} reviews with average rating {item["overall_mean"]:.2f}' \
        for idx, item in books_by_review_count_and_overall_mean_rating.sort_values(['overall_mean', 'overall_count'], ascending=False)[:5].iterrows()])
    five_less_lovable_book_str = '\n'.join([f'    1. Book \"{item["title"]}\" has {item["overall_count"]} reviews with average rating {item["overall_mean"]:.2f}' \
        for idx, item in books_by_review_count_and_overall_mean_rating.sort_values(['overall_mean', 'overall_count'], ascending=[True, False])[:5].iterrows()])

    books_per_user = books[['reviewerID', 'asin']].groupby('reviewerID').agg({'asin': 'count'})
    percentiles_values = np.percentile(books_per_user['asin'], percentiles)
    percentiles_str = '\n'.join([f'   1. {percentiles[i]}% users reviewed {percentiles_values[i]} books or less' for i in range(len(percentiles))])

    five_most_reviewed_books = books[['asin', 'overall', 'title']].\
        groupby('asin').\
        agg(asin_count=('asin', 'count'), overall_mean=('overall', 'mean'), title=('title', 'max')).\
        sort_values('asin_count', ascending=False)
    five_most_reviewed_books_str = '\n'.join([f'    1. Book **\"{item["title"]}\"** has {item["asin_count"]} reviews with average rating {item["overall_mean"]:.2f}' \
        for _, item in five_most_reviewed_books[:5].iterrows()])

    top_reviewers = books[['reviewerID', 'asin', 'overall']].\
        groupby('reviewerID').\
        agg(asin_count=('asin', 'count'), overall_mean=('overall', 'mean')).\
        sort_values('asin_count', ascending=False)
    five_top_reviewers_str = '\n'.join([f'    1. Reviewer "{idx}" has {item["asin_count"]} reviews with {item["overall_mean"]:.2f} mean rating' \
        for idx, item in top_reviewers[:5].iterrows()])

    report = f'''## Summary report

1. There are {unique_books_count} books
2. There are {unique_users} reviewers
4. Average rating is {average_rating:.2f}
5. Median rating is {meadian_rating:.2f}
6. Five most lovable books (books with largest review count and best ratings)
{five_most_lovable_book_str}
7. Five less lovable books (books with largest review count and worst ratings)
{five_less_lovable_book_str}
6. Five top reviewers
{five_top_reviewers_str}
8. Most reviewed books
{five_most_reviewed_books_str}
5. How many books user reviewed:
{percentiles_str}
'''
    display(Markdown(report))

In [79]:
summary(books)

## Summary report

1. There are 12521 books
2. There are 538995 reviewers
4. Average rating is 4.32
5. Median rating is 5.00
6. Five most lovable books (books with largest review count and best ratings)
    1. Book "Therapy and Technique" has 53 reviews with average rating 5.00
    1. Book "More Peanuts" has 29 reviews with average rating 5.00
    1. Book "Living the Infinite Way" has 22 reviews with average rating 5.00
    1. Book "Summer Story (Brambly Hedge)" has 21 reviews with average rating 5.00
    1. Book "My Great-Aunt Arizona" has 18 reviews with average rating 5.00
7. Five less lovable books (books with largest review count and worst ratings)
    1. Book "Collins Primary Geography Atlas For The Middle East" has 5 reviews with average rating 1.00
    1. Book "Medieval Lives: Eight Charismatic Men and Women of the Middle Ages" has 2 reviews with average rating 1.00
    1. Book "Always the Bridesmaid" has 2 reviews with average rating 1.00
    1. Book "Redesigning 50: The No-Plastic-Surgery Guide to 21st-Century Age Defiance" has 2 reviews with average rating 1.00
    1. Book "The Book of God and Physics: A Novel of the Voynich Mystery" has 2 reviews with average rating 1.00
6. Five top reviewers
    1. Reviewer "A1D2C0WDCSHUWZ" has 734.0 reviews with 4.59 mean rating
    1. Reviewer "A2F6N60Z96CAJI" has 623.0 reviews with 4.59 mean rating
    1. Reviewer "A1K1JW1C5CUSUZ" has 390.0 reviews with 4.46 mean rating
    1. Reviewer "A2TX179XAT5GRP" has 303.0 reviews with 4.68 mean rating
    1. Reviewer "A2OJW07GQRNJUT" has 302.0 reviews with 4.92 mean rating
8. Most reviewed books
    1. Book **"Divergent"** has 19657 reviews with average rating 4.42
    1. Book **"The Hobbit"** has 18500 reviews with average rating 4.68
    1. Book **"All the Light We Cannot See"** has 15922 reviews with average rating 4.54
    1. Book **"A Tale of Two Cities (Collins Classics)"** has 7894 reviews with average rating 4.50
    1. Book **"Allegiant"** has 7830 reviews with average rating 3.57
5. How many books user reviewed:
   1. 5% users reviewed 1.0 books or less
   1. 25% users reviewed 1.0 books or less
   1. 50% users reviewed 1.0 books or less
   1. 75% users reviewed 2.0 books or less
   1. 95% users reviewed 4.0 books or less
