In [None]:
%matplotlib inline
import gzip
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

plt.style.use('ggplot')

In [None]:
##### Functions for reading and parsing files #####

def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Every line is passed to ``eval``, so each one must be a valid Python
    expression.

    Parameters
    ----------
    path : str
        Path of the gzip file to read.

    Yields
    ------
    object
        The result of evaluating each line, in file order.
    """
    # SECURITY: eval() executes whatever code the file contains. Only feed this
    # trusted dumps. NOTE(review): ast.literal_eval looks like a safer
    # replacement if every line is a plain literal -- confirm before switching.
    #
    # The context manager closes the handle once the generator is exhausted or
    # discarded; the file was previously left open.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            yield eval(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in reading order; that number becomes
    the row label of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [None]:
##### Functions related to the DataFrames directly #####

def get_categories(item):
    """Yield every category found in ``item['categories']``.

    ``item['categories']`` is walked as a collection of collections; the inner
    elements are yielded one at a time, in order, duplicates included.
    """
    for group in item['categories']:
        yield from group
            
def create_categories_count_df(reviews_df, meta_df):
    """Count how many reviews fall under each product category.

    The metadata is inner-joined to the reviews on ``'asin'``, giving one row
    per matching review. Every category listed for the reviewed product then
    receives one count per row.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Reviews; only the ``'asin'`` column is used.
    meta_df : pandas.DataFrame
        Product metadata; the ``'asin'`` and ``'categories'`` columns are
        used. ``'categories'`` is iterated as a collection of collections of
        category names.

    Returns
    -------
    pandas.DataFrame
        A single ``'count'`` column indexed by ``'category'``, sorted by
        descending count.
    """
    merged_df = pd.merge(meta_df[['asin', 'categories']], reviews_df[['asin']], on='asin')

    # Iterate over the 'categories' column directly instead of iterrows():
    # building a Series for every row only to read one field is needlessly slow.
    category_counts = Counter(
        cat
        for groups in merged_df['categories']
        for group in groups
        for cat in group
    )

    count_series = pd.Series(dict(category_counts), name='count')
    count_series.index.name = 'category'
    # The former bare `count_series.reset_index()` call discarded its result
    # and was therefore a no-op; it has been removed.
    return count_series.to_frame().sort_values('count', ascending=False)

def df_with_datetime(df, col_name='datetime', out_format=None):
    """Add a column derived from ``df['unixReviewTime']`` and return ``df``.

    The epoch seconds are converted with ``pd.to_datetime(..., unit='s')``.
    When ``out_format`` is truthy the new column holds the timestamps rendered
    through ``strftime(out_format)``; otherwise it holds the timestamps
    themselves. The frame is modified in place and the same object is returned.
    """
    timestamps = pd.to_datetime(df['unixReviewTime'], unit='s')
    df[col_name] = timestamps.dt.strftime(out_format) if out_format else timestamps
    return df

In [None]:
##### Reading files and transforming into DF #####

# Grocery and Gourmet Food (TAKES 3 MIN)
reviews_food_df = getDF('data/reviews_Grocery_and_Gourmet_Food.json.gz')
# The metadata frames are used by the category counts and by the healthy-product
# filters further down, so they must be loaded here: with these lines commented
# out, a fresh-kernel "Run All" stops with a NameError.
meta_food_df = getDF('data/meta_Grocery_and_Gourmet_Food.json.gz')
#reviews_app_df = getDF('data/reviews_Apps_for_Android.json.gz')

# Sports and Outdoors (CAREFUL, TAKES ~25 MIN)
reviews_sport_df = getDF('data/reviews_Sports_and_Outdoors.json.gz')
meta_sport_df = getDF('data/meta_Sports_and_Outdoors.json.gz')

In [None]:
# Derive the 'yearTime' (%Y) and 'yearMonthTime' (%Y-%m) columns from
# unixReviewTime on both review frames.
reviews_food_df = df_with_datetime(
    df_with_datetime(reviews_food_df, 'yearTime', '%Y'),
    'yearMonthTime', '%Y-%m',
)

reviews_sport_df = df_with_datetime(
    df_with_datetime(reviews_sport_df, 'yearTime', '%Y'),
    'yearMonthTime', '%Y-%m',
)

First, let's see what types of items people have reviewed by looking at the categories present in the "Grocery and Gourmet Food" and "Sports and Outdoors" data.

In [None]:
# Count reviews per category for the food data, then display the 20 categories
# with the highest counts (the frame is already sorted by descending count).
food_cat_count_df = create_categories_count_df(reviews_food_df, meta_food_df)
food_cat_count_df.head(20)

In [None]:
# Count reviews per category for the sports data.
sport_cat_count_df = create_categories_count_df(reviews_sport_df, meta_sport_df)


In [None]:
# Display only the sports categories with more than 10000 counted reviews.
sport_cat_count_df[sport_cat_count_df['count'] > 10000]

We can see that the categories in the 'Grocery & Gourmet Food' file are not directly useful, because:
1. They do not tell us directly whether a food item is healthy or not
2. Many reviews are about products that belong to no category other than the main one ('Grocery & Gourmet Food')

In order to get the reviews related to a healthy product in this file, we can try the following: we could read the title (and/or description) of all products in the metadata and find the ones containing some keyword related to a healthy lifestyle (e.g. "organic", "natural", ...). Once we have those products, we can keep only the reviews about those products (using the 'asin' value).

In [None]:
##### Constants #####

# Metadata columns selected before merging product metadata with reviews.
METADATA_TO_KEEP = ['asin', 'title', 'categories', 'price']
# Review columns selected for the same merges.
REVIEWS_DATA_TO_KEEP = ['asin', 'overall', 'reviewText', 'yearMonthTime', 'yearTime']
# Lower-case substrings searched for in a product's lower-cased title and
# description by is_food_healthy.
HEALTHY_FOOD_KEYWORDS = ['organic', 'natural', 'sugar-free', 'healthy', 'vitamin',
                        'supplement', 'minerals', 'diet', 'vegan']
# Category names that is_sport_item_healthy matches exactly (case-sensitive)
# against a product's categories.
HEALTHY_SPORT_CATEGORIES = ['Exercise & Fitness', 'Cycling', 'Sport Watches', 'Team Sports',
                            'Strength Training Equipment', 'Action Sports', 'Cardio Training',
                           'Running']

In [None]:
##### Functions related to the 'healthiness' of items #####

def is_food_healthy(item, keywords=None):
    """Tell whether a product's title or description mentions a healthy keyword.

    Parameters
    ----------
    item : mapping (e.g. a DataFrame row or a dict)
        Looked up through ``item.get('title')`` and ``item.get('description')``.
        A field that is absent or is not a string (for example NaN) is ignored.
    keywords : iterable of str, optional
        Lower-case substrings to look for. Defaults to
        ``HEALTHY_FOOD_KEYWORDS``.

    Returns
    -------
    bool
        True if any keyword occurs in the lower-cased title or description.
    """
    if keywords is None:
        keywords = HEALTHY_FOOD_KEYWORDS

    # Lower-case each usable field once. Filtering on str replaces the former
    # bare `except: pass`, which also dropped the description check whenever
    # the title was not a string.
    texts = [
        value.lower()
        for value in (item.get('title'), item.get('description'))
        if isinstance(value, str)
    ]
    return any(kw in text for kw in keywords for text in texts)

def is_sport_item_healthy(item):
    """Tell whether ``item`` has at least one category in HEALTHY_SPORT_CATEGORIES.

    Categories are obtained through ``get_categories(item)`` and compared by
    exact membership; the scan stops at the first match.
    """
    return any(cat in HEALTHY_SPORT_CATEGORIES for cat in get_categories(item))

In [None]:
# Keep only the metadata rows that is_food_healthy flags as healthy food
meta_food_healthy_df = meta_food_df[meta_food_df.apply(is_food_healthy, axis=1)]
print(meta_food_healthy_df.shape)
meta_food_healthy_df.head()

In [None]:
# Join the healthy food metadata to the food reviews on 'asin', keeping only
# the selected columns from each side
merged_food_healthy_df = meta_food_healthy_df[METADATA_TO_KEEP].merge(
    reviews_food_df[REVIEWS_DATA_TO_KEEP],
    on='asin',
)
print(merged_food_healthy_df.shape)
merged_food_healthy_df.head()

In [None]:
# Show number of reviews of healthy products for 'Grocery & Gourmet Food' over time.
# 'yearTime' holds strings, which seaborn draws in order of first appearance,
# so the bar order is given explicitly to get a chronological x-axis.
sb.countplot(
    x='yearTime',
    data=merged_food_healthy_df,
    order=sorted(merged_food_healthy_df['yearTime'].dropna().unique()),
).set(
    title="Reviews of healthy 'Grocery & Gourmet Food' products per year",
    xlabel='Year',
    ylabel='Number of reviews',
);

In [None]:
# Show number of reviews for 'Grocery & Gourmet Food' over time.
# 'yearTime' holds strings, which seaborn draws in order of first appearance,
# so the bar order is given explicitly to get a chronological x-axis.
sb.countplot(
    x='yearTime',
    data=reviews_food_df,
    order=sorted(reviews_food_df['yearTime'].dropna().unique()),
).set(
    title="All 'Grocery & Gourmet Food' reviews per year",
    xlabel='Year',
    ylabel='Number of reviews',
);

Another file that seems interesting for finding healthy products is the 'Sports & Outdoors' one. As we can see, it contains categories that are easy to classify as healthy or not (e.g. 'Exercise & Fitness' or 'Cycling'), and its products are placed in more precise categories than in the 'Grocery & Gourmet Food' file. Thus, to get all reviews about products related to a healthy lifestyle, we can manually choose a set of 'healthy' categories and take every review of a product that belongs to at least one of them.

In [None]:
# Keep only the metadata rows that is_sport_item_healthy flags as healthy
meta_sport_healthy_df = meta_sport_df[meta_sport_df.apply(is_sport_item_healthy, axis=1)]
print(meta_sport_healthy_df.shape)
meta_sport_healthy_df.head()

In [None]:
# Join the healthy sport metadata to the sport reviews on 'asin', keeping only
# the selected columns from each side
merged_sport_healthy_df = meta_sport_healthy_df[METADATA_TO_KEEP].merge(
    reviews_sport_df[REVIEWS_DATA_TO_KEEP],
    on='asin',
)
print(merged_sport_healthy_df.shape)
merged_sport_healthy_df.head()

In [None]:
# Show number of reviews of healthy products for 'Sports & Outdoors' over time.
# 'yearTime' holds strings, which seaborn draws in order of first appearance,
# so the bar order is given explicitly to get a chronological x-axis.
sb.countplot(
    x='yearTime',
    data=merged_sport_healthy_df,
    order=sorted(merged_sport_healthy_df['yearTime'].dropna().unique()),
).set(
    title="Reviews of healthy 'Sports & Outdoors' products per year",
    xlabel='Year',
    ylabel='Number of reviews',
);

In [None]:
# Show number of reviews for 'Sports & Outdoors' over time.
# 'yearTime' holds strings, which seaborn draws in order of first appearance,
# so the bar order is given explicitly to get a chronological x-axis.
sb.countplot(
    x='yearTime',
    data=reviews_sport_df,
    order=sorted(reviews_sport_df['yearTime'].dropna().unique()),
).set(
    title="All 'Sports & Outdoors' reviews per year",
    xlabel='Year',
    ylabel='Number of reviews',
);

Now that we have reviews of healthy products from both files, we can concatenate them and use those as our healthy reviews.

In [None]:
# All reviews about any healthy products with corresponding metadata.
# Both inputs come out of pd.merge with their own 0..n-1 index; ignore_index
# renumbers the rows so the combined frame has no duplicate index labels.
merged_healthy_df = pd.concat([merged_food_healthy_df, merged_sport_healthy_df], ignore_index=True)
print(merged_healthy_df.shape)
merged_healthy_df.head()

# Trend discovery

We want to discover trends. To do so, we first compute the evolution of Amazon as a whole by counting the number of reviews per month over the entire dataset (17.5 GB when all category sets are included). To avoid repeating this expensive computation, the result is saved to a file once computed and simply read back from that file on later runs.

In [None]:
import datetime
from dateutil import parser
import os
import json
def get_date(item):
    """Return the review date of ``item`` as a naive ``datetime``.

    ``item['unixReviewTime']`` (epoch seconds) is used when present and is
    interpreted as UTC, matching ``pd.to_datetime(..., unit='s')`` used by
    ``df_with_datetime``. Otherwise ``item['reviewTime']`` is parsed with the
    format ``"%m %d, %Y"``.

    Raises
    ------
    KeyError
        If ``item`` has neither ``'unixReviewTime'`` nor ``'reviewTime'``.
    ValueError
        If ``'reviewTime'`` does not match the expected format.
    """
    # The body used to read an undefined name `row` instead of the `item`
    # parameter, which raised NameError unless a global `row` happened to exist.
    try:
        timestamp = item['unixReviewTime']
    except KeyError:
        date_format = "%m %d, %Y"
        return datetime.datetime.strptime(item['reviewTime'], date_format)

    # Convert in UTC and drop the tzinfo so the month a review falls in does
    # not depend on the time zone of the machine running the notebook.
    utc_date = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
    return utc_date.replace(tzinfo=None)


def count_review(acc, file_path):
    """Add the per-month review counts of one gzip file to ``acc``.

    Each line of the file is evaluated into a row, its date is obtained with
    ``get_date`` and formatted as ``'%Y-%m'``, and the matching entry of
    ``acc`` is incremented.

    Parameters
    ----------
    acc : dict
        Maps ``'YYYY-MM'`` strings to counts; updated in place so it can be
        shared across several files.
    file_path : str
        Path of the gzip file to read.

    Rows for which ``get_date`` raises KeyError or ValueError are skipped; the
    number of skipped rows is printed at the end if it is non-zero.
    """
    skipped = 0
    # The context manager closes the handle; it was previously left open.
    with gzip.open(file_path, 'rb') as handle:
        for line in handle:
            # SECURITY: eval() executes whatever code the file contains. Only
            # use on trusted dumps. NOTE(review): ast.literal_eval looks like a
            # safer replacement -- confirm the data are plain literals first.
            row = eval(line)
            try:
                date_key = get_date(row).strftime('%Y-%m')
            except (KeyError, ValueError):
                skipped += 1
                continue
            # Start from 0 and add 1, so the first review of a month counts
            # too (it used to be stored as 0, under-counting every month by one).
            acc[date_key] = acc.get(date_key, 0) + 1
    if skipped:
        print('skiped {} rows because of KeyError (not present) or ValueError (not parsable)'.format(skipped))

In [None]:
# Cache file for the per-month review counts. Delete it to force a
# recomputation (e.g. after the counting logic or the input files change).
AMAZON_EVOLUTION = 'data/amazon_evolution.npy'
FILES_NAME = []
if not os.path.isfile(AMAZON_EVOLUTION):
    acc = {}
    # sorted() makes the processing and log order independent of the filesystem
    for file in sorted(os.listdir('data')):
        if file.startswith('reviews') and file.endswith('.json.gz'):
            print('current file : {}'.format(file))
            count_review(acc, os.path.join('data', file))
    # np.save stores the dict as a pickled 0-d object array.
    np.save(AMAZON_EVOLUTION, acc)
else:
    # allow_pickle=True is required to read that object array back on recent
    # NumPy versions. SECURITY: unpickling runs code from the file, so only
    # load a cache this notebook wrote itself.
    acc = np.load(AMAZON_EVOLUTION, allow_pickle=True).item()

# 'YYYY-MM' strings sort chronologically. The previous sort_values('Number')
# call discarded its result, so the line was drawn against the row index in
# arbitrary order.
evolution_df = (
    pd.DataFrame(list(acc.items()), columns=['Date', 'Number'])
    .sort_values('Date')
    .reset_index(drop=True)
)
evolution_df.plot(x='Date', y='Number', title='Number of Amazon reviews per month');

In [118]:
# Scratch check of strftime formatting: prints today's date rendered with the
# " %m, %Y" format (note the leading space in the format string).
date_format = " %m, %Y"
today = datetime.datetime.today()
today_formated = today.strftime(date_format)
print(today_formated)


 11, 2017


'2012-11'

In [53]:
# Print the names of the files in data/ that start with 'reviews_G' and end
# with '.json.gz'.
for file in os.listdir('data'):
    if(file.startswith('reviews_G') and file.endswith('.json.gz')):
        print('{}'.format(file))

reviews_Grocery_and_Gourmet_Food.json.gz


In [None]:
def count_reviews_per_category(df, categories=None):
    """Yield one ``(category, date, 1)`` tuple per review and category.

    Parameters
    ----------
    df : pandas.DataFrame
        Iterated row by row; each row is passed to ``get_date`` and
        ``get_categories``.
    categories : container, optional
        When given, only categories contained in it are yielded. ``None``
        (the default) keeps every category.

    Yields
    ------
    tuple
        ``(cat, key_date, 1)`` where ``key_date`` is the value returned by
        ``get_date`` for the row.
    """
    for idx, item in df.iterrows():
        key_date = get_date(item)
        for cat in get_categories(item):
            # `is None` rather than `== None`: an equality test is evaluated
            # element-wise for array-like arguments and makes the `or` ambiguous.
            if categories is None or cat in categories:
                yield (cat, key_date, 1)
        