In [82]:
%matplotlib inline
import ast
import gzip
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sb

In [4]:
def parse(path):
    """Lazily yield one parsed record per line of a gzip-compressed file.

    Each line is expected to hold a single Python-literal expression
    (typically a dict of strings, numbers and lists).

    Parameters
    ----------
    path : str
        Path to the gzip-compressed, one-record-per-line file.

    Yields
    ------
    object
        The literal value parsed from each line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or discarded; the original left the file open.
    # Text mode is needed because ast.literal_eval only parses `str` input.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            # SECURITY: this used to be eval(l), which executes arbitrary code
            # found in the data file. literal_eval only accepts literals
            # (dict/list/str/number/bool/None) and rejects everything else.
            yield ast.literal_eval(l)

def getDF(path):
    """Build a DataFrame holding every record produced by ``parse(path)``.

    Records are keyed by their 0-based position in the file, so the resulting
    frame has one row per record and an integer index in file order.
    """
    # enumerate() supplies the running row number the frame is indexed by.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [88]:
##### Reading files and transforming into DF #####

# Grocery and Gourmet Food (TAKES 3 MIN)
#reviews_food_df = getDF('data/reviews_Grocery_and_Gourmet_Food.json.gz')
#meta_food_df = getDF('data/meta_Grocery_and_Gourmet_Food.json.gz')

# Apps for Android
#reviews_app_df = getDF('data/reviews_Apps_for_Android.json.gz')

# Sports and Outdoors (CAREFUL, TAKES ~25 MIN)
#reviews_sport_df = getDF('data/reviews_Sports_and_Outdoors.json.gz')
#meta_sport_df = getDF('data/meta_Sports_and_Outdoors.json.gz')

In [92]:
# NaN never compares equal to anything (itself included), so the previous
# `!= np.nan` test was True for every row and filtered nothing.
# notna() actually keeps only the rows that have a review timestamp.
reviews_app_df[reviews_app_df.unixReviewTime.notna()].head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AUI0OLXAB3KKT,B004A9SDD8,A Customer,"[0, 0]",Glad to finally see this app on the android ma...,5.0,Great app!!!,1301184000.0,"03 27, 2011"
1,A1ZUSQ3TC3EC4C,B004A9SDD8,A. Lissak,"[12, 14]",this app works great on the Kindle Fire... kid...,5.0,Kid loves it,1321574000.0,"11 18, 2011"
2,AC05OAXD72X1V,B004A9SDD8,Allie,"[0, 0]",We love these monkey's and all the concepts th...,4.0,Love these monkeys!,1367366000.0,"05 1, 2013"
3,A2RVMFOKBVM21I,B004A9SDD8,Amazon Customer,"[0, 2]",cannot get my kindle away from my 2 year old g...,5.0,fun fun for toddlers,1350173000.0,"10 14, 2012"
4,A3NBSRGUWQGCMZ,B004A9SDD8,Amazon Customer,"[1, 3]",I start this app up whenever I forget what a f...,1.0,Might be great if it worked,1300838000.0,"03 23, 2011"


First, let's see which types of items people have reviewed by looking at the categories present in the "Grocery and Gourmet Food" and "Sports and Outdoors" data.

In [10]:
def create_categories_count_df(reviews_df, meta_df):
    """Count how many reviews fall under each product category.

    Every review is joined to its product's metadata on ``asin``; each
    category label found in that product's (nested) ``categories`` lists then
    contributes one count per review.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Reviews; must contain an ``asin`` column.
    meta_df : pandas.DataFrame
        Product metadata; must contain ``asin`` and ``categories`` columns,
        where ``categories`` holds a list of lists of category labels.

    Returns
    -------
    pandas.DataFrame
        One row per category (index named ``category``) with a single
        ``count`` column, sorted by descending count.
    """
    merged_df = pd.merge(meta_df[['asin', 'categories']], reviews_df[['asin']], on='asin')

    # Flatten the two nesting levels of `categories` and let Counter tally
    # the labels instead of maintaining the dict by hand.
    categories = Counter(
        cat
        for all_cats in merged_df['categories'].values
        for cats in all_cats
        for cat in cats
    )

    count_series = pd.Series(dict(categories), name='count')
    count_series.index.name = 'category'
    # The former `count_series.reset_index()` call discarded its result and
    # had no effect, so it has been dropped.
    return count_series.to_frame().sort_values('count', ascending=False)

In [11]:
# Count category occurrences over the Grocery and Gourmet Food reviews.
# NOTE(review): reviews_food_df / meta_food_df are not assigned in any visible
# cell (their getDF loads are commented out), so this presumably relies on
# frames still living in the kernel — confirm before a Restart & Run All.
food_cat_count_df = create_categories_count_df(reviews_food_df, meta_food_df)
# Display the first 20 rows of the category count frame.
food_cat_count_df.head(20)

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Grocery & Gourmet Food,1297156
Beverages,29736
Cooking & Baking,22663
"Canned, Dry & Packaged Foods",14287
Tea,11435
"Herbs, Spices & Seasonings",9921
Coffee,6909
Single Herbs & Spices,6602
"Cooking Oils, Vinegars & Sprays",5189
Oils,4610


In [14]:
# Same category count for the Sports and Outdoors reviews.
# NOTE(review): reviews_sport_df / meta_sport_df are not assigned in any
# visible cell (their getDF loads are commented out) — confirm they are loaded
# before this cell runs.
sport_cat_count_df = create_categories_count_df(reviews_sport_df, meta_sport_df)


In [16]:
# Keep only the categories whose count exceeds 10000.
sport_cat_count_df[sport_cat_count_df['count'] > 10000]

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Sports & Outdoors,3339650
"Clothing, Shoes & Jewelry",902890
Hunting & Fishing,681394
Clothing,619257
Accessories,579179
Hunting,490667
Men,459520
Outdoor Gear,416349
Camping & Hiking,404931
Exercise & Fitness,338671


# Mode discovery: evolution of the number of reviews over time

In [128]:
import datetime
from dateutil import parser
import os
import json
def count_review(acc,file_path):
    """Add the per-month review counts of one gzipped review file to ``acc``.

    Each line of the file is parsed as a Python-literal dict. The month key
    ('YYYY-MM') is derived from ``unixReviewTime`` when that key is present,
    otherwise from ``reviewTime`` (formatted like "03 27, 2011"). Rows with
    neither a usable timestamp nor a parsable date are skipped.

    Parameters
    ----------
    acc : dict
        Accumulator mapping 'YYYY-MM' to a number of reviews. It is updated in
        place so that several files can be counted into the same dict.
    file_path : str
        Path to a gzip-compressed file holding one review record per line.

    Returns
    -------
    None
        A summary line is printed when at least one row was skipped.
    """
    fallback_format = "%m %d, %Y"
    skipped = 0
    # The context manager closes the gzip handle (the original leaked it);
    # text mode is needed because ast.literal_eval only parses `str` input.
    with gzip.open(file_path, 'rt', encoding='utf-8') as g:
        for l in g:
            # SECURITY: this used to be eval(l), which executes arbitrary code
            # found in the data file. literal_eval only accepts literals.
            row = ast.literal_eval(l)
            try:
                # Interpret the timestamp in UTC rather than the machine's
                # local zone: e.g. 1301184000 pairs with "03 27, 2011", i.e.
                # midnight UTC, so a local zone west of UTC would push
                # first-of-month reviews into the previous month.
                date_key = datetime.datetime.fromtimestamp(
                    row['unixReviewTime'], datetime.timezone.utc
                ).strftime('%Y-%m')
            except KeyError:
                try:
                    date_key = datetime.datetime.strptime(
                        row['reviewTime'], fallback_format
                    ).strftime('%Y-%m')
                except (KeyError, ValueError):
                    skipped += 1
                    continue
            # The first review of a month must count as 1; the original
            # initialised new months to 0 and undercounted each by one.
            acc[date_key] = acc.get(date_key, 0) + 1
    if skipped:
        print('skiped {} rows because of KeyError (not present) or ValueError (not parsable)'.format(skipped))

In [None]:
# Monthly review counts over every reviews_*.json.gz file in data/, cached on
# disk because counting all files is slow.
AMAZON_EVOLUTION = 'data/amazon_evolution.npy'
FILES_NAME = []
if not os.path.isfile(AMAZON_EVOLUTION):
    acc =  {}
    # sorted() makes the processing (and progress output) order deterministic;
    # os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir('data')):
        if file.startswith('reviews') and file.endswith('.json.gz'):
            print('current file : {}'.format(file))
            count_review(acc, os.path.join('data', file))
    # np.save stores the dict as a pickled 0-d object array.
    np.save(AMAZON_EVOLUTION, acc)
else:
    # allow_pickle=True is required to read that object array back on current
    # NumPy releases. Only load cache files this notebook produced itself:
    # unpickling an untrusted file can execute arbitrary code.
    acc = np.load(AMAZON_EVOLUTION, allow_pickle=True).item()
evolution_df = pd.DataFrame(list(acc.items()),columns=['Date','Number'])
# The previous sort_values('Number') discarded its result and ordered by count.
# 'YYYY-MM' strings sort chronologically, so sorting on Date gives a timeline.
evolution_df = evolution_df.sort_values('Date')
evolution_df.plot(x='Date', y='Number')

current file : reviews_Amazon_Instant_Video.json.gz
current file : reviews_Apps_for_Android.json.gz


In [118]:
# Scratch check of strftime formatting: render today's date as " MM, YYYY".
# The leading space in the format string is reproduced in the printed value.
date_format = " %m, %Y"
today = datetime.datetime.today()
today_formated = today.strftime(date_format)
print(today_formated)


 11, 2017


'2012-11'

In [53]:
# Print the name of every entry in data/ that looks like a gzipped review
# file whose name starts with 'reviews_G'.
for file in os.listdir('data'):
    # Guard clause: skip anything that misses either the prefix or the suffix.
    if not file.startswith('reviews_G') or not file.endswith('.json.gz'):
        continue
    print('{}'.format(file))

reviews_Grocery_and_Gourmet_Food.json.gz
