## Environment Info

In [1]:
!python --version

Python 2.7.14 :: Anaconda, Inc.


In [101]:
!ls -l -h -d adidas/
!ls adidas/ |wc

drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas/[m[m
     145     145    1791



## EDA of features of 1 Json file

In [163]:
import json
from pprint import pprint

data = json.load(open('data/reviews/nike/AH2142.json'))

In [164]:
data.keys()

[u'rating',
 u'colorVariation',
 u'price',
 u'reviewData',
 u'productName',
 u'reviews',
 u'productMeta',
 u'productId',
 u'productImageLink',
 u'brand',
 u'modelId']

In [165]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'int'>
colorVariation	<type 'list'>
price	<type 'int'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'NoneType'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
brand	<type 'unicode'>
modelId	<type 'unicode'>
11


In [166]:
data['productMeta'].keys()

[u'specs', u'mediaData', u'productUrl', u'sourcePrice', u'description']

In [154]:
data['colorVariation'][0]

{u'color': u'Pearl Pink/Bright Melon/Wolf Grey/Cool Grey',
 u'colorCode': u'604',
 u'price': 109.97,
 u'productId': u'11390978',
 u'productImageLink': u'https://c.static-nike.com/a/images/t_default/n1iph2moknkk7knitxlr/flyknit-racer-unisex-running-shoe-89TqXv1x.jpg'}

#### Json file Observation

* ProductMeta has important product features in dict
    * Product features can be used for feature engineering and classification
* ReviewData is a list of dict. Key feature is rating, ratingDate, reviewComments, reviewTitle
    * reviewComments and reviewTitle can be combined for sentiment analysis
* We cannot get actually color information from colorVariation. It has a link to the shoe. 
    * price maybe different for different colors
    * Need to decide later on how to deal with this data



## Building data frame for product feature and reviews

* Each json file is 1 product from 1 website
* Process each json file at a time and produce
    * A data frame for reviews, multiple rows and need to have productID
    * A product feature data frame, 1 row and need to use the same productID
* Treat same product from different website
    * Match product ID
    * Join the table?

Data Process Functions

In [171]:
import json
import pandas as pd
import itertools
import sys
from collections import defaultdict

def reviewData2DF(review_data):
    df = pd.DataFrame.from_dict(review_data)
    df['review'] = df['reviewTitle'] + ' ' + df['reviewComments']
    df = df.drop(['reviewTitle', 'reviewComments'], axis = 1)
    if 'badges' in df:
        df = df.drop(['badges'], axis = 1)
    if 'images' in df:    
        df = df.drop(['images'], axis = 1)
    if 'reviewAuthor' in df:
        df = df.drop(['reviewAuthor'], axis = 1)
    if 'reviewID' in df:
        df = df.drop(['reviewId'], axis = 1)
    return df

def productmeta2DF(meta_data):
    df = pd.DataFrame.from_dict([meta_data])
    
    ## drop fields:
    if 'mediaData' in meta_data:    
        df = df.drop(['mediaData'], axis = 1)
    if 'productUrl' in meta_data:
        df = df.drop(['productUrl'], axis = 1)
    if 'title' in meta_data:
        df = df.drop(['title'], axis = 1)
    if 'subtitle' in meta_data:
        df = df.drop(['subtitle'], axis = 1)
    
    # process 'specs' 
    if 'specs' in meta_data:
        df.loc[0, 'specs'] = u' '.join(df.loc[0, 'specs']).encode('utf-8').strip()
    else:
        df.loc[0, 'specs'] = 'NaN'
    
    # process 'description'
    if 'description' not in meta_data:
        df.loc[0, 'description'] = 'NaN'
    return df

def read_json(json_file):
    # Process Json fields
    review_header = ['rating', 'ratingDate', 'review', 'modelId']
    feature_header = ['description', 'specs', 'rating', 'brand', 'gender', 'price', 'productName', 'modelId']
    df_review = pd.DataFrame(columns=review_header)
    df_productspec = pd.DataFrame(columns=feature_header)
    # Default gender
    gender = 'N' # Neut
    
    for key,value in json_file.iteritems():
        if key == 'productMeta':
            df_productspec = productmeta2DF(value)
        elif key == 'reviewData':
            if value:
                df_review = reviewData2DF(value)
        elif key == 'rating':
            rating = value
        elif key == 'gender':
            gender = value
        elif key == 'productName':
            productName = value
        elif key == 'brand':
            brand = value
        elif key == 'price':
            price = value
        elif key == 'modelId':
            modelId = value
    
    # Complete product spec data frame fields
    df_productspec['rating'] = rating
    df_productspec['brand'] = brand
    df_productspec['gender'] = gender
    df_productspec['price'] = price
    df_productspec['productName'] = productName
    df_productspec['modelId'] = modelId
    
    # Complete Review data frame fields
    df_review['modelId'] = [modelId] * df_review.shape[0]
    
    return df_productspec, df_review

### Exam Json DataFrame

In [172]:
data = json.load(open('data/reviews/nike/AH2142.json'))
df_product, df_review = read_json(data)

In [157]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'float'>
colorVariation	<type 'list'>
color	<type 'unicode'>
gender	<type 'unicode'>
brand	<type 'unicode'>
colorCode	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'float'>
modelId	<type 'unicode'>
14


In [158]:
df_review

Unnamed: 0,rating,ratingDate,reviewId,review,modelId
0,2,12/05/2017,176755352,Would be perfect if they lasted.. Favourite tr...,526628
1,5,11/14/2017,175999398,So light... They feel so light you barely feel...,526628
2,4,10/21/2017,173858734,Defective??? I got a pair of these bad boys ea...,526628
3,5,10/19/2017,173819363,By far best shoes!!!!! I run in-between 60-70 ...,526628
4,5,10/10/2017,173269645,AWESOME SHOE Great shoe very light weight and ...,526628
5,5,10/10/2017,173172817,Fast mother f-er Crazy fast shoe - toe tight l...,526628
6,5,10/09/2017,173072592,Awesome Shoe! Purchased this shoe after having...,526628
7,5,09/23/2017,170274791,Great shoe I have close to 400 miles on my cur...,526628
8,2,09/13/2017,169975635,120 bucks for 4 weeks ? Indeed ?! My previous ...,526628
9,5,09/12/2017,169942969,Great! Super comfortable for walking and the a...,526628


In [173]:
df_product

Unnamed: 0,description,sourcePrice,specs,rating,brand,gender,price,productName,modelId
0,Nike Free Rn 2017 iD,70,Knit material is soft and breathable Flexible ...,0,Nike,N,70,Nike Free Rn 2017 iD,AH2142


### Process multiple Json files

In [175]:
import glob   

#JSON_PATH = 'brooks/*.json'   
JSON_PATH = 'data/reviews/nike/*.json'
file_list = glob.glob(JSON_PATH) 

feature_dfs = []
review_dfs = []

count = 0
for json_file in file_list:
    #print "Process {} ...".format(json_file)
    json_data = json.load(open(json_file))
    df_product, df_review = read_json(json_data)
    feature_dfs.append(df_product)
    review_dfs.append(df_review)
    count += 1

print "Processed total {:d} JSON files".format(count)

Processed total 140 JSON files



### Generate and post-process product spec data frame

In [159]:
productspec_df = pd.concat(feature_dfs)
productspec_df.reset_index(drop=True, inplace=True)

print "Product Spec DF Shape: {}".format(productspec_df.shape)
productspec_df.head()

Product Spec DF Shape: (118, 9)


Unnamed: 0,description,sourcePrice,specs,rating,brand,gender,price,productName,modelId
0,The Nike Flyknit Racer Unisex Running Shoe pro...,150.0,Forefoot Nike Zoom Air unit for ultra-responsi...,4.6,Nike,F/M,109.97,Nike Flyknit Racer,526628
1,Built for the future of running and those who ...,175.0,Flywire integrates with the laces for a dynami...,4.4,Nike,M,114.97,Nike LunarEpic Flyknit,818676
2,The Nike Revolution 3 Big Kids' Running Shoe h...,58.0,"Mesh upper has no-sew overlays for a snug, bre...",5.0,Nike,K,58.0,Nike Revolution 3,819413
3,The Nike Revolution 3 Little Kids' Running Sho...,52.0,"Mesh upper has no-sew overlays for a snug, bre...",3.6,Nike,K,52.0,Nike Revolution 3,819414
4,The Nike Revolution 3 Infant/Toddler Shoe has ...,44.0,"Mesh upper has no-sew overlays for a snug, bre...",,Nike,K,34.97,Nike Revolution 3,819415


#### Check duplicates by the whole roles

In [115]:
productspec_df.drop_duplicates
productspec_df.shape

(71, 8)

#### Check duplicates by shoe models

In [116]:
sum(productspec_df.duplicated('modelId'))

0

#### Duplicate models

In [117]:
duplicates = productspec_df[productspec_df.duplicated('modelId', keep=False)]
duplicates.sort_values(by = ['modelId'])

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId


#### After exam the rows, we can remove duplicate rows. The whole row does not match maybe due to while spaces

In [118]:
productspec_df.drop_duplicates(['modelId'], keep = 'first', inplace=True)
productspec_df.shape

(71, 8)

### Generate and post-process review data frame

In [119]:
reviewData_df = pd.concat(review_dfs)
reviewData_df.reset_index(drop=True, inplace=True)

print "Review Data DF Shape: {}".format(reviewData_df.shape)
reviewData_df.head()

Review Data DF Shape: (6166, 4)


Unnamed: 0,rating,ratingDate,review,modelId
0,5,12/01/2017,This shoe is truly an amazing thing!,100029
1,2,11/26/2017,"Not for me Just not a comfortable shoe for me,...",110196
2,5,11/18/2017,Yes,110196
3,5,11/02/2017,Best shoes ever Brooks gave me new life... I h...,110196
4,5,11/02/2017,My 10th pair My 10th Bought my 10th pair. I ha...,110196


#### Check for duplicated reivews

In [120]:
reviewData_df.drop_duplicates
reviewData_df.shape

(6166, 4)

In [121]:
sum(reviewData_df.duplicated('review'))

311

In [122]:
duplicates = reviewData_df[reviewData_df.duplicated('review', keep=False)]
duplicates.sort_values(by = ['review'])

Unnamed: 0,rating,ratingDate,review,modelId
640,5,10/14/2017,#ReviewSweeps,110227
4655,5,10/21/2017,#ReviewSweeps,120234
1184,5,10/29/2017,Adrenaline GTS 17,110241
1360,4,04/24/2017,Adrenaline GTS 17,110241
1449,5,06/24/2017,Awesome,110242
2047,5,08/22/2017,Awesome,110257
157,5,04/09/2017,Awesome,110211
6110,5,12/07/2017,Awesome,120268
3603,5,04/14/2017,Awesome,120225
4356,5,06/11/2017,Awesome shoes,120231


#### Remove duplications if review field is not empty

In [123]:
print sum(reviewData_df['review'] == ' ')
empty_review_rows = reviewData_df[reviewData_df['review'] == ' ']
empty_review_rows.head(5)

0


Unnamed: 0,rating,ratingDate,review,modelId


In [124]:
reviewData_df.drop_duplicates(['review'], keep = 'first', inplace=True)
reviewData_df.shape

(5855, 4)

In [125]:
# Add back the empty rows
reviewData_df = pd.concat([reviewData_df, empty_review_rows])
reviewData_df.shape

(5855, 4)

In [126]:
reviewData_df.head(5)

Unnamed: 0,rating,ratingDate,review,modelId
0,5,12/01/2017,This shoe is truly an amazing thing!,100029
1,2,11/26/2017,"Not for me Just not a comfortable shoe for me,...",110196
2,5,11/18/2017,Yes,110196
3,5,11/02/2017,Best shoes ever Brooks gave me new life... I h...,110196
4,5,11/02/2017,My 10th pair My 10th Bought my 10th pair. I ha...,110196


In [127]:
reviewData_df.tail(5)

Unnamed: 0,rating,ratingDate,review,modelId
6161,5,11/04/2017,FINALLY!! My Adrenaline has come home! I am a...,120268
6162,1,11/01/2017,WHY did they change the style????????!!!!!!!!!...,120268
6163,5,10/28/2017,I love these shoes I bought these shoes today ...,120268
6164,5,10/28/2017,Such an improvement I was lucky enough to purc...,120268
6165,5,10/25/2017,AWESOME UPDATES!!! I have worn many brands and...,120268


## Check Point 1: writing DF to .csv before modeling

In [128]:
reviewData_df.to_csv('product_reviews.csv', encoding='utf-8', index=False)
productspec_df.to_csv('product_features.csv', encoding='utf-8', index=False)

In [129]:
!ls -lh

total 20032
-rw-------    1 qianyu  staff    64K Jan 27 11:33 JSON_EDA.ipynb
-rw-r--r--    1 qianyu  staff   470B Dec 21 20:36 README.md
-rw-r--r--    1 qianyu  staff    45K Dec 22 22:38 Sentiment_model_survey.ipynb
-rw-r--r--    1 qianyu  staff    40K Jan  4 22:54 Sentiment_p2v_model.ipynb
drwxrwxr-x@   4 qianyu  staff   136B Jan  7 12:51 [34m__MACOSX[m[m
drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas[m[m
drwxr-xr-x@ 165 qianyu  staff   5.5K Dec 12 00:21 [34masics[m[m
drwxr-xr-x@  73 qianyu  staff   2.4K Dec 14 03:14 [34mbrooks[m[m
drwx------@   5 qianyu  staff   170B Jan  7 12:38 [34mdata[m[m
-rw-r--r--@   1 qianyu  staff   7.9M Jan 27 09:28 data_full.zip
-rw-r--r--    1 qianyu  staff    19K Jan  4 22:54 product_feature_EDA.ipynb
-rw-------    1 qianyu  staff    16K Jan 27 11:34 product_features.csv
-rw-r--r--    1 qianyu  staff   1.7M Jan 27 11:34 product_reviews.csv
drwxr-xr-x    7 qianyu  staff   238B Jan  4 22:54 [34mshared_lib[m[m


## Process at Amazon Search JSON Data

In [92]:
import json
import jsontree

data = json.load(open('data/searchData/adidas/amazon-15007709_M-p1.json'))

In [97]:
def dict_generator(indict, pre=None):
    pre = pre[:] if pre else []
    if isinstance(indict, dict):
        for key, value in indict.items():
            if isinstance(value, dict):
                for d in dict_generator(value, [key] + pre):
                    yield d
            elif isinstance(value, list) or isinstance(value, tuple):
                for v in value:
                    for d in dict_generator(v, [key] + pre):
                        yield d
            else:
                yield pre + [key, value]
    else:
        yield indict

In [98]:
x = dict_generator(data)

In [99]:
print x.

<generator object dict_generator at 0x1077318c0>


In [100]:
for i in x:
    print i

[u'Items', u'ItemSearchResponse', u'TotalPages', u'0']
[u'Items', u'ItemSearchResponse', u'TotalResults', u'0']
[u'Request', u'Items', u'ItemSearchResponse', u'IsValid', u'True']
[u'Error', u'Errors', u'Request', u'Items', u'ItemSearchResponse', u'Message', u'We did not find any matches for your request.']
[u'Error', u'Errors', u'Request', u'Items', u'ItemSearchResponse', u'Code', u'AWS.ECommerceService.NoExactMatches']
[u'ItemSearchRequest', u'Request', u'Items', u'ItemSearchResponse', u'Keywords', u'15007709_M']
[u'ItemSearchRequest', u'Request', u'Items', u'ItemSearchResponse', u'Brand', u'adidas']
[u'ItemSearchRequest', u'Request', u'Items', u'ItemSearchResponse', u'SearchIndex', u'Shoes']
[u'ItemSearchRequest', u'Request', u'Items', u'ItemSearchResponse', u'ItemPage', u'1']
ItemAttributes
VariationSummary
[u'Items', u'ItemSearchResponse', u'MoreSearchResultsUrl', u'https://www.amazon.com/gp/search?linkCode=xm2&SubscriptionId=AKIAINWLFC6Z6CCKHKUA&keywords=adidas%2015007709_M&tag=32

In [140]:
x = glob.glob('data/reviews/*/*.json')