## Environment Info

In [1]:
!python --version

Python 2.7.14 :: Anaconda, Inc.


In [2]:
!ls -l -h -d adidas/
!ls adidas/ |wc

drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas/[m[m
     145     145    1791



## EDA of features of 1 Json file

In [3]:
import json
from pprint import pprint

data = json.load(open('adidas/15007715_W.json'))

In [4]:
data.keys()

[u'rating',
 u'colorVariation',
 u'gender',
 u'brand',
 u'reviewData',
 u'productName',
 u'reviews',
 u'productMeta',
 u'productId',
 u'productImageLink',
 u'price',
 u'modelId']

In [5]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'float'>
colorVariation	<type 'list'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
12


In [6]:
data['productMeta'].keys()

[u'specs', u'mediaData', u'productUrl', u'description', u'title']

In [7]:
data['reviewData'][1]

{u'badges': [],
 u'images': [],
 u'rating': 1,
 u'ratingDate': u'05/08/2017',
 u'reviewAuthor': u'CJS2017',
 u'reviewComments': u'The actual product came out looking much different than the online rendering.',
 u'reviewId': u'86186230',
 u'reviewTitle': u'Disappointing'}

In [8]:
data['colorVariation'][0]

{u'price': 160,
 u'productId': u'15007711_W',
 u'productImageLink': u'https://mifootwear.adidas.com/api2/rewrite/adicad//is/image/adidasAG/agm?&src=ir(adidasAGRender/FTW17_aad_9?&obj=a/f/nvr&show&obj=a/m/str&src=ad3_easy_blue&show&obj=a/m/toe&src=ad3_easy_blue&show&obj=a/m/hel&src=ad3_easy_blue&show&obj=a/m/lin&src=ad3_easy_blue&show&obj=a/m/2la&src=ad3_mystery_blue&show&obj=a/m/out&src=ad3_clay_brown&show&obj=a/m/soc&src=ad3_white&show&obj=a/m/slo&src=ad3_core_black&show&obj=a/m/clo&src=ad3_core_black&show&obj=a/o/bas&src=ad3_easy_blue&show&obj=a/o/lac_j0&src=ad3_easy_blue&show&obj=a/o/tla_t1&src=ad3_easy_blue&show&obj=a/o/mpa_m2&src=ad3_stone&show&obj=a/o/wla_p0&src=ad3_white&show&obj=a/o/tlc_c0&src=ad3_white&show&obj=a/o/adi_t1&src=ad3_silver_metallic&show&obj=a/o/jew_j0&show&obj=a/o/mp2_w1&src=ad3_white&show&obj=a/o/plo_p0&show&obj=a/o/emb_n0&decal&res=60&pos=0,0&src=fxg(undefined?&$variant=name_embroidery_off&$text=%20&$color=%20)&show&obj=a&req=object)&resMode=sharp2&op_usm=1.2,1

#### Json file Observation

* ProductMeta has important product features in dict
    * Product features can be used for feature engineering and classification
* ReviewData is a list of dict. Key feature is rating, ratingDate, reviewComments, reviewTitle
    * reviewComments and reviewTitle can be combined for sentiment analysis
* We cannot get actually color information from colorVariation. It has a link to the shoe. 
    * price maybe different for different colors
    * Need to decide later on how to deal with this data



## Building data frame for product feature and reviews

* Each json file is 1 product from 1 website
* Process each json file at a time and produce
    * A data frame for reviews, multiple rows and need to have productID
    * A product feature data frame, 1 row and need to use the same productID
* Treat same product from different website
    * Match product ID
    * Join the table?

Data Process Functions

In [9]:
import json
import pandas as pd
import itertools
import sys
from collections import defaultdict

def reviewData2DF(review_data):
    df = pd.DataFrame.from_dict(review_data)
    df = df.drop(['badges', 'images', 'reviewAuthor', 'reviewId'], axis = 1)
    return df

def productmeta2DF(meta_data):
    df = pd.DataFrame.from_dict([meta_data])
    
    ## drop fields:
    if 'mediaData' in meta_data:    
        df = df.drop(['mediaData'], axis = 1)
    if 'productUrl' in meta_data:
        df = df.drop(['productUrl'], axis = 1)
    if 'title' in meta_data:
        df = df.drop(['title'], axis = 1)
    if 'subtitle' in meta_data:
        df = df.drop(['subtitle'], axis = 1)
    
    # process 'specs' 
    if 'specs' in meta_data:
        df.loc[0, 'specs'] = u' '.join(df.loc[0, 'specs']).encode('utf-8').strip()
    else:
        df.loc[0, 'specs'] = 'NaN'
    
    # process 'description'
    if 'description' not in meta_data:
        df.loc[0, 'description'] = 'NaN'
    return df

def read_json(json_file):
    # Process Json fields
    review_header = ['rating', 'ratingDate', 'reviewComments', 'reviewTitle', 'modelId']
    feature_header = ['description', 'specs', 'rating', 'brand', 'gender', 'price', 'productName', 'modelId']
    df_review = pd.DataFrame(columns=review_header)
    df_productspec = pd.DataFrame(columns=feature_header)
    
    for key,value in json_file.iteritems():
        if key == 'productMeta':
            df_productspec = productmeta2DF(value)
        elif key == 'reviewData':
            if value:
                df_review = reviewData2DF(value)
        elif key == 'rating':
            rating = value
        elif key == 'gender':
            gender = value
        elif key == 'productName':
            productName = value
        elif key == 'brand':
            brand = value
        elif key == 'price':
            price = value
        elif key == 'modelId':
            modelId = value
    
    # Complete product spec data frame fields
    df_productspec['rating'] = rating
    df_productspec['brand'] = brand
    df_productspec['gender'] = gender
    df_productspec['price'] = price
    df_productspec['productName'] = productName
    df_productspec['modelId'] = modelId
    
    # Complete Review data frame fields
    df_review['modelId'] = [modelId] * df_review.shape[0]
    
    return df_productspec, df_review

### Exam Json DataFrame

In [10]:
data = json.load(open('adidas/15007715_W.json'))
df_product, df_review = read_json(data)

In [11]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'float'>
colorVariation	<type 'list'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
12


In [12]:
df_review

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W


In [13]:
df_product

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId
0,mi adizero adios 3 Shoes feature an energy-ret...,Runner type: neut boost™ is our most responsiv...,1.8,Adidas,W,160,mi adizero adios 3 Shoes,4002178_W


### Process multiple Json files

In [14]:
import glob   

JSON_PATH = 'adidas/*.json'   
file_list = glob.glob(JSON_PATH) 

feature_dfs = []
review_dfs = []

count = 0
for json_file in file_list:
    # print "Process {} ...".format(json_file)
    json_data = json.load(open(json_file))
    df_product, df_review = read_json(json_data)
    feature_dfs.append(df_product)
    review_dfs.append(df_review)
    count += 1

print "Processed total {:d} JSON files".format(count)

Processed total 145 JSON files



### Generate and post-process product spec data frame

In [69]:
productspec_df = pd.concat(feature_dfs)
productspec_df.reset_index(drop=True, inplace=True)

print "Product Spec DF Shape: {}".format(productspec_df.shape)
productspec_df.head()

Product Spec DF Shape: (145, 8)


Unnamed: 0,brand,description,gender,modelId,price,productName,rating,specs
0,Adidas,mi adizero adios 3 Shoes feature an energy-ret...,W,4002178_W,160,mi adizero adios 3 Shoes,1.8,Runner type: neut boost™ is our most responsiv...
1,Adidas,,W,4002179_W,100,mi Energy Cloud Shoes,3.3,
2,Adidas,"Designed specifically for women, these shoes a...",W,4002255_W,150,mi PureBoost X Shoes,5.0,Select single-color mesh or graphic-print mesh...
3,Adidas,Create your own adidas Supernova with boost™ t...,W,4002435_W,150,mi Supernova Shoes,3.9,Choose your outsole color for a custom footpri...
4,Adidas,The boost™ midsole in these running shoes rele...,M,4002556_M,160,mi PureBoost Shoes,4.5,boost™ is our most responsive cushioning ever:...


#### Check duplicates by the whole roles

In [70]:
productspec_df.drop_duplicates
productspec_df.shape

(145, 8)

#### Check duplicates by shoe models

In [71]:
productspec_df.duplicated('modelId')

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7       True
8      False
9       True
10      True
11     False
12      True
13      True
14      True
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
115    False
116    False
117    False
118    False
119    False
120    False
121    False
122    False
123    False
124    False
125    False
126    False
127    False
128    False
129    False
130    False
131    False
132    False
133    False
134    False
135    False
136    False
137    False
138    False
139    False
140    False
141    False
142    False
143    False
144    False
Length: 145, dtype: bool

In [72]:
sum(productspec_df.duplicated('modelId'))

6

#### Duplicate models

In [76]:
duplicates = productspec_df[productspec_df.duplicated('modelId', keep=False)]
duplicates.sort_values(by = ['modelId'])

Unnamed: 0,brand,description,gender,modelId,price,productName,rating,specs
0,Adidas,mi adizero adios 3 Shoes feature an energy-ret...,W,4002178_W,160,mi adizero adios 3 Shoes,1.8,Runner type: neut boost™ is our most responsiv...
7,Adidas,mi adizero adios 3 Shoes feature an energy-ret...,W,4002178_W,160,mi adizero adios 3 Shoes,1.8,Runner type: neut boost™ is our most responsiv...
1,Adidas,,W,4002179_W,100,mi Energy Cloud Shoes,3.3,
9,Adidas,,W,4002179_W,100,mi Energy Cloud Shoes,3.3,
2,Adidas,"Designed specifically for women, these shoes a...",W,4002255_W,150,mi PureBoost X Shoes,5.0,Select single-color mesh or graphic-print mesh...
10,Adidas,"Designed specifically for women, these shoes a...",W,4002255_W,150,mi PureBoost X Shoes,5.0,Select single-color mesh or graphic-print mesh...
3,Adidas,Create your own adidas Supernova with boost™ t...,W,4002435_W,150,mi Supernova Shoes,3.9,Choose your outsole color for a custom footpri...
12,Adidas,Create your own adidas Supernova with boost™ t...,W,4002435_W,150,mi Supernova Shoes,3.9,Choose your outsole color for a custom footpri...
4,Adidas,The boost™ midsole in these running shoes rele...,M,4002556_M,160,mi PureBoost Shoes,4.5,boost™ is our most responsive cushioning ever:...
13,Adidas,The boost™ midsole in these running shoes rele...,M,4002556_M,160,mi PureBoost Shoes,4.5,boost™ is our most responsive cushioning ever:...


#### After exam the rows, we can remove duplicate rows. The whole row does not match maybe due to while spaces

In [42]:
productspec_df.drop_duplicates(['modelId'], keep = 'first', inplace=True)
productspec_df.shape

(139, 8)

### Generate and post-process review data frame

In [77]:
reviewData_df = pd.concat(review_dfs)
reviewData_df.reset_index(drop=True, inplace=True)

print "Review Data DF Shape: {}".format(reviewData_df.shape)
reviewData_df.head()

Review Data DF Shape: (4359, 5)


Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W
4,5,09/15/2017,They are comfortable sneakers for working out ...,awesome sneakers,4002179_W


#### Check for duplicated reivews

In [78]:
reviewData_df.drop_duplicates
reviewData_df.shape

(4359, 5)

In [79]:
sum(reviewData_df.duplicated('reviewComments'))

71

In [82]:
duplicates = reviewData_df[reviewData_df.duplicated('reviewComments', keep=False)]
duplicates.sort_values(by = ['reviewComments'])

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
111,5,06/15/2017,AWESOME QUALITY........lack of choices of mate...,AWESOME,4002435_W
28,5,06/15/2017,AWESOME QUALITY........lack of choices of mate...,AWESOME,4002435_W
18,1,08/31/2017,Adidas Superstar RT colour options are very po...,Lack of colour options,4002435_W
101,1,08/31/2017,Adidas Superstar RT colour options are very po...,Lack of colour options,4002435_W
26,5,06/21/2017,Awesome customization options for a variety of...,Love my custom sneakers! Adidas offers the bes...,4002435_W
109,5,06/21/2017,Awesome customization options for a variety of...,Love my custom sneakers! Adidas offers the bes...,4002435_W
21,5,08/09/2017,BEST PURCHASE EVER\r\nI like the new design ve...,Best purchase ever,4002435_W
104,5,08/09/2017,BEST PURCHASE EVER\r\nI like the new design ve...,Best purchase ever,4002435_W
146,2,10/19/2017,"Beautiful looking sneaker, I designed my own c...",,4002556_W
63,2,10/19/2017,"Beautiful looking sneaker, I designed my own c...",,4002556_W


#### Remove duplications

In [85]:
reviewData_df.drop_duplicates(['reviewComments'], keep = 'first', inplace=True)
reviewData_df.shape

(4288, 5)

## Check Point 1: writing DF to .csv before modeling

In [91]:
reviewData_df.to_csv('product_reviews.csv', encoding='utf-8', index=False)
productspec_df.to_csv('product_features.csv', encoding='utf-8', index=False)

In [92]:
!ls -lh

total 2168
-rw-------    1 qianyu  staff    70K Dec 20 11:19 JSON_EDA.ipynb
drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas[m[m
-rw-------    1 qianyu  staff   4.4K Dec 14 17:19 amazon_crawler.py
drwxr-xr-x@ 165 qianyu  staff   5.5K Dec 12 00:21 [34masics[m[m
drwxr-xr-x@  73 qianyu  staff   2.4K Dec 14 03:14 [34mbrooks[m[m
-rw-------    1 qianyu  staff   1.8K Dec 17 14:49 process_json.py
-rw-------    1 qianyu  staff   131K Dec 20 11:20 product_features.csv
-rw-r--r--    1 qianyu  staff   864K Dec 20 11:20 product_reviews.csv
