## Environment Info

In [4]:
!python --version

Python 2.7.14 :: Anaconda custom (64-bit)


In [5]:
!ls -l -h -d adidas/
!ls adidas/ |wc

drwxrwxr-x 2 nlp nlp 4.0K Dec  8 11:09 adidas/
    145     145    1791



## EDA of features of 1 Json file

In [26]:
import json
from pprint import pprint

data = json.load(open('brooks/100029.json'))

In [27]:
data.keys()

[u'rating',
 u'colorVariation',
 u'color',
 u'gender',
 u'brand',
 u'reviewData',
 u'productName',
 u'reviews',
 u'productMeta',
 u'productId',
 u'productImageLink',
 u'price',
 u'modelId']

In [28]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'int'>
colorVariation	<type 'list'>
color	<type 'unicode'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
13


In [29]:
data['productMeta'].keys()

[u'specs', u'mediaData', u'productUrl', u'description', u'title']

In [30]:
data['reviewData'][0]

{u'rating': 5,
 u'ratingDate': u'12/01/2017',
 u'reviewAuthor': u'John',
 u'reviewComments': u'',
 u'reviewTitle': u'This shoe is truly an amazing thing!'}

In [31]:
data['colorVariation'][0]

{u'color': u'047',
 u'price': 120,
 u'productId': u'100029',
 u'productImageLink': u'http://www.brooksrunning.com/dw/image/v2/aaev_prd/on/demandware.static/-/Sites-BrooksCatalog/default/dw26966731/images/ProductImages/100029/100029_047_l_WR.jpg?sw=640',
 u'productUrl': u'http://www.brooksrunning.com/en_us/brooks-elmn8-v4-mens-womens-track-spikes/100029.html?dwvar_100029_width=D&dwvar_100029_color=047'}

#### Json file Observation

* ProductMeta has important product features in dict
    * Product features can be used for feature engineering and classification
* ReviewData is a list of dict. Key feature is rating, ratingDate, reviewComments, reviewTitle
    * reviewComments and reviewTitle can be combined for sentiment analysis
* We cannot get actually color information from colorVariation. It has a link to the shoe. 
    * price maybe different for different colors
    * Need to decide later on how to deal with this data



## Building data frame for product feature and reviews

* Each json file is 1 product from 1 website
* Process each json file at a time and produce
    * A data frame for reviews, multiple rows and need to have productID
    * A product feature data frame, 1 row and need to use the same productID
* Treat same product from different website
    * Match product ID
    * Join the table?

Data Process Functions

In [34]:
import json
import pandas as pd
import itertools
import sys
from collections import defaultdict

def reviewData2DF(review_data):
    df = pd.DataFrame.from_dict(review_data)
    if 'badges' in review_data:
        df = df.drop(['badges'], axis = 1)
    if 'images' in review_data:    
        df = df.drop(['images'], axis = 1)
    if 'reviewAuthor' in review_data:
        df = df.drop(['reviewAuthor'], axis = 1)
    if 'reviewID' in review_data:
        df = df.drop(['reviewId'], axis = 1)
    return df

def productmeta2DF(meta_data):
    df = pd.DataFrame.from_dict([meta_data])
    
    ## drop fields:
    if 'mediaData' in meta_data:    
        df = df.drop(['mediaData'], axis = 1)
    if 'productUrl' in meta_data:
        df = df.drop(['productUrl'], axis = 1)
    if 'title' in meta_data:
        df = df.drop(['title'], axis = 1)
    if 'subtitle' in meta_data:
        df = df.drop(['subtitle'], axis = 1)
    
    # process 'specs' 
    if 'specs' in meta_data:
        df.loc[0, 'specs'] = u' '.join(df.loc[0, 'specs']).encode('utf-8').strip()
    else:
        df.loc[0, 'specs'] = 'NaN'
    
    # process 'description'
    if 'description' not in meta_data:
        df.loc[0, 'description'] = 'NaN'
    return df

def read_json(json_file):
    # Process Json fields
    review_header = ['rating', 'ratingDate', 'reviewComments', 'reviewTitle', 'modelId']
    feature_header = ['description', 'specs', 'rating', 'brand', 'gender', 'price', 'productName', 'modelId']
    df_review = pd.DataFrame(columns=review_header)
    df_productspec = pd.DataFrame(columns=feature_header)
    
    for key,value in json_file.iteritems():
        if key == 'productMeta':
            df_productspec = productmeta2DF(value)
        elif key == 'reviewData':
            if value:
                df_review = reviewData2DF(value)
        elif key == 'rating':
            rating = value
        elif key == 'gender':
            gender = value
        elif key == 'productName':
            productName = value
        elif key == 'brand':
            brand = value
        elif key == 'price':
            price = value
        elif key == 'modelId':
            modelId = value
    
    # Complete product spec data frame fields
    df_productspec['rating'] = rating
    df_productspec['brand'] = brand
    df_productspec['gender'] = gender
    df_productspec['price'] = price
    df_productspec['productName'] = productName
    df_productspec['modelId'] = modelId
    
    # Complete Review data frame fields
    df_review['modelId'] = [modelId] * df_review.shape[0]
    
    return df_productspec, df_review

### Exam Json DataFrame

In [35]:
data = json.load(open('brooks/100029.json'))
df_product, df_review = read_json(data)

In [36]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'int'>
colorVariation	<type 'list'>
color	<type 'unicode'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
13


In [37]:
df_review

Unnamed: 0,rating,ratingDate,reviewAuthor,reviewComments,reviewTitle,modelId
0,5,12/01/2017,John,,This shoe is truly an amazing thing!,100029


In [38]:
df_product

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId
0,"An unrelenting middle-distance track spike, de...",Support: Neutral Midsole Drop: 0mm Weight: 4.2...,5,Brooks,M,120,ELMN8 v4,100029


### Process multiple Json files

In [39]:
import glob   

JSON_PATH = 'brooks/*.json'   
file_list = glob.glob(JSON_PATH) 

feature_dfs = []
review_dfs = []

count = 0
for json_file in file_list:
    # print "Process {} ...".format(json_file)
    json_data = json.load(open(json_file))
    df_product, df_review = read_json(json_data)
    feature_dfs.append(df_product)
    review_dfs.append(df_review)
    count += 1

print "Processed total {:d} JSON files".format(count)

Processed total 71 JSON files



### Generate and post-process product spec data frame

In [41]:
productspec_df = pd.concat(feature_dfs)
productspec_df.reset_index(drop=True, inplace=True)

print "Product Spec DF Shape: {}".format(productspec_df.shape)
productspec_df.head()

Product Spec DF Shape: (71, 8)


Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId
0,"Comfort styled to go anywhere, for runners loo...",Support: Neutral Midsole Drop: 12mm Weight: 8....,4.5,Brooks,M,100,Revel,120249
1,"Springy and supportive, for those who want to ...",Support: Support Midsole Drop: 10mm Weight: 8....,3.5,Brooks,M,110,Ravenna 8,120238
2,The smoothest ride possible with GORE-TEX® tec...,Midsole Drop: 12mm Weight: 11.3oz / 320.3g Arc...,5.0,Brooks,F,150,Ghost 10 GTX,110256
3,"Attack any terrain fast in our lightest, faste...",Support: Neutral Midsole Drop: 6mm Weight: 9.3...,0.0,Brooks,F,140,Mazama 2,110279
4,"Lightweight support to be one with your run, f...",Support: Support Midsole Drop: 4mm Weight: 7.5...,4.5,Brooks,M,110,PureCadence 6,120236


#### Check duplicates by the whole roles

In [42]:
productspec_df.drop_duplicates
productspec_df.shape

(71, 8)

#### Check duplicates by shoe models

In [43]:
productspec_df.duplicated('modelId')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
      ...  
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
53    False
54    False
55    False
56    False
57    False
58    False
59    False
60    False
61    False
62    False
63    False
64    False
65    False
66    False
67    False
68    False
69    False
70    False
Length: 71, dtype: bool

In [44]:
sum(productspec_df.duplicated('modelId'))

0

#### Duplicate models

In [45]:
duplicates = productspec_df[productspec_df.duplicated('modelId', keep=False)]
duplicates.sort_values(by = ['modelId'])

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId


#### After exam the rows, we can remove duplicate rows. The whole row does not match maybe due to while spaces

In [46]:
productspec_df.drop_duplicates(['modelId'], keep = 'first', inplace=True)
productspec_df.shape

(71, 8)

### Generate and post-process review data frame

In [47]:
reviewData_df = pd.concat(review_dfs)
reviewData_df.reset_index(drop=True, inplace=True)

print "Review Data DF Shape: {}".format(reviewData_df.shape)
reviewData_df.head()

Review Data DF Shape: (6166, 6)


Unnamed: 0,modelId,rating,ratingDate,reviewAuthor,reviewComments,reviewTitle
0,120249,5,12/07/2017,Stacie,"Very cushiony shoe. It's comfortable to me, bu...",Very comfortable!
1,120249,5,12/05/2017,Lanier,,Recommend!
2,120249,5,12/03/2017,Carolyn,I haven't worn these much but took them out a ...,Nice for soft ground
3,120249,5,12/02/2017,The Italian,The top is flexible allowing for foot deformat...,Good Shoe!
4,120249,5,12/01/2017,Jhd,,This is my third pair. Love them


#### Check for duplicated reivews

In [48]:
reviewData_df.drop_duplicates
reviewData_df.shape

(6166, 6)

In [63]:
sum(reviewData_df.duplicated(['reviewComments', 'reviewTitle']))

311

In [64]:
duplicates = reviewData_df[reviewData_df.duplicated(['reviewComments', 'reviewTitle'])]
duplicates.sort_values(by = ['reviewComments'])

Unnamed: 0,modelId,rating,ratingDate,reviewAuthor,reviewComments,reviewTitle
61,120249,5,09/23/2017,ole girl,,I would buy this shoe again
4572,120225,5,04/03/2017,B,,Love them!
4563,120225,5,04/10/2017,Lauren,,Love them
4556,120225,5,04/14/2017,Haley,,Awesome
4523,120225,5,05/10/2017,Sheila the Runner,,Love these!
4500,120225,5,05/31/2017,v-,,Excellent!
4487,120225,5,06/09/2017,Lone runner,,Love them!
4485,120225,5,06/10/2017,Bri,,Love these shoes!
4474,120225,5,06/12/2017,Hallie,,Awesome shoes
4463,120225,5,06/19/2017,Meg,,Love these shoes!


#### Remove duplications

In [85]:
reviewData_df.drop_duplicates(['reviewComments'], keep = 'first', inplace=True)
reviewData_df.shape

(4288, 5)

## Check Point 1: writing DF to .csv before modeling

In [91]:
reviewData_df.to_csv('product_reviews.csv', encoding='utf-8', index=False)
productspec_df.to_csv('product_features.csv', encoding='utf-8', index=False)

In [92]:
!ls -lh

total 2168
-rw-------    1 qianyu  staff    70K Dec 20 11:19 JSON_EDA.ipynb
drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas[m[m
-rw-------    1 qianyu  staff   4.4K Dec 14 17:19 amazon_crawler.py
drwxr-xr-x@ 165 qianyu  staff   5.5K Dec 12 00:21 [34masics[m[m
drwxr-xr-x@  73 qianyu  staff   2.4K Dec 14 03:14 [34mbrooks[m[m
-rw-------    1 qianyu  staff   1.8K Dec 17 14:49 process_json.py
-rw-------    1 qianyu  staff   131K Dec 20 11:20 product_features.csv
-rw-r--r--    1 qianyu  staff   864K Dec 20 11:20 product_reviews.csv
