# Process Data for D3 Visualizations

In [1]:
import os
import pandas as pd
import json
# import numpy as np

## Merged Data

In [2]:
data_path = 'D:\DATA\OurFoods'
df = pd.read_csv(os.path.join(data_path, 'merged_amz-off_3.csv.gz'),\
                 dtype={'customer_id': 'object', 'product_parent': 'object', \
                        'star_rating': pd.Int64Dtype(), 'helpful_votes': pd.Int64Dtype(), 
                        'total_votes': pd.Int64Dtype(), \
                        'code': 'object'},
                 compression='gzip')
# convert reivew_date to datetime object
df.review_date = pd.to_datetime(df.review_date)
df.shape

(147304, 23)

## Data for Ridgeline Plot
+ Input: 
  + Merged Food Reviews dataset
  + A range of time, including "start date" and "end date"
+ Ouput:
  + Data of TOP10 Counts within the time range
  + Including these attributes:
    + For each category, data of one day is computed
      + **p: probability of reviews in a given date**
        + i.e. (count of day reviews) / (count of whole time reivews)
      + **p peak: normalize p with the maximum p (of whole range)**
        + i.e. (p of a date) / (max p of whole time range)
      + **p smooth: smooth p with adjacent p data**
        + i.e. (p of a date) / (sum of p data adjacent to it, including previous, current, next)
      

In [3]:
df.shape

(147304, 23)

### Preparation of Data
+ Keep only Valid Data
+ Slice by Date Range
+ Slice by TOP 10 Threshold

In [4]:
# keep only valid date, with [category, review_date, id]
sub = df[(df.energy_100g.notna()) & (df.energy_100g < 3000) & 
         (df.salt_100g < 100) & (df.review_date.notna()) & 
         (df.main_category_en.notna()) & (df.main_category_en.str.contains('^[A-Z].*'))]\
        .loc[:, ['main_category_en', 'review_date', 'review_id']]\
        .rename(mapper={'main_category_en': 'category', 'review_id': 'id'}, axis=1)\
        .reset_index(drop=True)
sub.shape

(59563, 3)

In [5]:
# slice by date range
start_date = '2014-01-01'
end_date = '2014-12-31'

sub = sub[(sub.review_date > start_date) & (sub.review_date < end_date)]
sub.shape

(17388, 3)

In [6]:
# get top 10 threshold
threshold = sub.groupby('category')[['id']].count()\
    .sort_values('id', ascending=False)\
    .iloc[9, 0] # get 10th category count
threshold

231

In [7]:
# Sice by TOP10 threshold
top10 = sub.assign(counts=lambda d: d.groupby('category')[['id']].transform('count'))\
        .query('counts >= {}'.format(threshold))\
        .reset_index(drop=True)
top10.shape

(15939, 4)

### Date Range index
+ Not Every Category has data for every date in the range
+ Supplement with 0 value 
+ Use Date Index

In [8]:
date_idx = []
for category in top10.category.unique():
    for date in pd.date_range(start_date, end_date, freq='D'):
        date_idx.append((category, date))
len(date_idx) # (number of category) * (number of days)

3650

### Calculate Plot Values
+ P
+ P Peak
+ P Smooth (use more days for smoothing?)

In [9]:
# add p attribute
data = top10.groupby(['category', 'review_date'])[['id']].count()\
    .reindex(date_idx, fill_value=0)\
    .reset_index()\
    .assign(byCategorySum=lambda d: d.groupby('category')[['id']].transform('sum'))\
    .assign(p=lambda d: d.id / d.byCategorySum)\
    .drop(['id', 'byCategorySum'], axis=1)
data.shape

(3650, 3)

In [10]:
# add p peak attribute
data = data.assign(byCategoryMaxP=lambda d: d.groupby('category')[['p']].transform(max))\
    .assign(p_peak=lambda d: d.p / d.byCategoryMaxP)\
    .drop(['byCategoryMaxP'], axis=1)
data.shape

(3650, 4)

In [11]:
# add p smooth attribute
data = data.assign(p_lag1=lambda d: d.groupby('category')[['p_peak']].shift(-1))\
    .assign(p_lead1=lambda d: d.groupby('category')[['p_peak']].shift(1))\
    .assign(p_smooth=lambda d: (d.p_lag1 + d.p_peak + d.p_lead1) / 3)\
    .drop(['p_lag1', 'p_lead1'], axis=1)\
    .fillna(method='ffill', axis=1)
data.shape

(3650, 5)

In [13]:
# add p smooth attribute, w/ 7 days smoothing
data = data\
    .assign(p_lag1=lambda d: d.groupby('category')[['p_peak']].shift(-1))\
    .assign(p_lag2=lambda d: d.groupby('category')[['p_peak']].shift(-2))\
    .assign(p_lag3=lambda d: d.groupby('category')[['p_peak']].shift(-3))\
    .assign(p_lead1=lambda d: d.groupby('category')[['p_peak']].shift(1))\
    .assign(p_lead2=lambda d: d.groupby('category')[['p_peak']].shift(2))\
    .assign(p_lead3=lambda d: d.groupby('category')[['p_peak']].shift(3))\
    .assign(p_smooth7=lambda d: (d.p_lag1 + d.p_lag2 + d.p_lag3 + 
                                 d.p_lead1 + d.p_lead2 + d.p_lead3 +
                                 d.p_peak) / 7)\
    .drop(['p_lag1', 'p_lag2', 'p_lag3', 'p_lead1', 'p_lead2', 'p_lead3'], axis=1)\
    .fillna(method='ffill', axis=1)
data.shape

(3650, 6)

In [14]:
# export to csv
data.to_csv('reviews.csv', index=False)

In [15]:
# export to json, as records
# [{category: ..., review_date: ..., p: ..., p_peak: ..., p_smooth: ...}, {}, {}, ...]
data.iloc[:3, :].to_json(orient='records')

'[{"category":"Plant-based foods and beverages","review_date":1388534400000,"p":0.0,"p_peak":0.0,"p_smooth":0.0,"p_smooth7":0.0},{"category":"Plant-based foods and beverages","review_date":1388620800000,"p":0.0041928721,"p_peak":0.5294117647,"p_smooth":0.3333333333,"p_smooth7":0.3333333333},{"category":"Plant-based foods and beverages","review_date":1388707200000,"p":0.0037269974,"p_peak":0.4705882353,"p_smooth":0.4803921569,"p_smooth7":0.4803921569}]'