## Environment Info

In [1]:
!python --version

Python 2.7.14 :: Anaconda, Inc.


In [2]:
!ls -l -h -d adidas/
!ls adidas/ |wc

drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas/[m[m
     145     145    1791



## EDA of features of 1 Json file

In [121]:
import json
from pprint import pprint

data = json.load(open('adidas/15007715_W.json'))

In [122]:
data.keys()

[u'rating',
 u'colorVariation',
 u'gender',
 u'brand',
 u'reviewData',
 u'productName',
 u'reviews',
 u'productMeta',
 u'productId',
 u'productImageLink',
 u'price',
 u'modelId']

In [123]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'float'>
colorVariation	<type 'list'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
12


In [124]:
data['productMeta'].keys()

[u'specs', u'mediaData', u'productUrl', u'description', u'title']

In [125]:
data['reviewData']

[{u'badges': [],
  u'images': [],
  u'rating': 4,
  u'ratingDate': u'08/15/2017',
  u'reviewAuthor': u'sondn',
  u'reviewComments': u'beautifullllllllllllllllllllllllllllllllllllllllllll',
  u'reviewId': u'92077037',
  u'reviewTitle': u''},
 {u'badges': [],
  u'images': [],
  u'rating': 1,
  u'ratingDate': u'05/08/2017',
  u'reviewAuthor': u'CJS2017',
  u'reviewComments': u'The actual product came out looking much different than the online rendering.',
  u'reviewId': u'86186230',
  u'reviewTitle': u'Disappointing'},
 {u'badges': [],
  u'images': [],
  u'rating': 1,
  u'ratingDate': u'04/10/2017',
  u'reviewAuthor': u'CBC22',
  u'reviewComments': u"These shoes look nothing like the picture! I expected a grey color...they aren't grey at all..more like cream..I'm so disappointed that I spent so much money on a pair of shoes that I hate and can't return.",
  u'reviewId': u'84185693',
  u'reviewTitle': u''},
 {u'badges': [],
  u'images': [u'https://adidas.ugc.bazaarvoice.com/1576-en_us/6409

In [8]:
data['colorVariation'][0]

{u'price': 160,
 u'productId': u'15007711_W',
 u'productImageLink': u'https://mifootwear.adidas.com/api2/rewrite/adicad//is/image/adidasAG/agm?&src=ir(adidasAGRender/FTW17_aad_9?&obj=a/f/nvr&show&obj=a/m/str&src=ad3_easy_blue&show&obj=a/m/toe&src=ad3_easy_blue&show&obj=a/m/hel&src=ad3_easy_blue&show&obj=a/m/lin&src=ad3_easy_blue&show&obj=a/m/2la&src=ad3_mystery_blue&show&obj=a/m/out&src=ad3_clay_brown&show&obj=a/m/soc&src=ad3_white&show&obj=a/m/slo&src=ad3_core_black&show&obj=a/m/clo&src=ad3_core_black&show&obj=a/o/bas&src=ad3_easy_blue&show&obj=a/o/lac_j0&src=ad3_easy_blue&show&obj=a/o/tla_t1&src=ad3_easy_blue&show&obj=a/o/mpa_m2&src=ad3_stone&show&obj=a/o/wla_p0&src=ad3_white&show&obj=a/o/tlc_c0&src=ad3_white&show&obj=a/o/adi_t1&src=ad3_silver_metallic&show&obj=a/o/jew_j0&show&obj=a/o/mp2_w1&src=ad3_white&show&obj=a/o/plo_p0&show&obj=a/o/emb_n0&decal&res=60&pos=0,0&src=fxg(undefined?&$variant=name_embroidery_off&$text=%20&$color=%20)&show&obj=a&req=object)&resMode=sharp2&op_usm=1.2,1

#### Json file Observation

* ProductMeta has important product features in dict
    * Product features can be used for feature engineering and classification
* ReviewData is a list of dict. Key feature is rating, ratingDate, reviewComments, reviewTitle
    * reviewComments and reviewTitle can be combined for sentiment analysis
* We cannot get actually color information from colorVariation. It has a link to the shoe. 
    * price maybe different for different colors
    * Need to decide later on how to deal with this data



## Building data frame for product feature and reviews

* Each json file is 1 product from 1 website
* Process each json file at a time and produce
    * A data frame for reviews, multiple rows and need to have productID
    * A product feature data frame, 1 row and need to use the same productID
* Treat same product from different website
    * Match product ID
    * Join the table?

Data Process Functions

In [129]:
import json
import pandas as pd
import itertools
import sys
from collections import defaultdict

def reviewData2DF(review_data):
    df = pd.DataFrame.from_dict(review_data)
    df['review'] = df['reviewTitle'] + ' ' + df['reviewComments']
    df = df.drop(['reviewTitle', 'reviewComments'], axis = 1)
    if 'badges' in df:
        df = df.drop(['badges'], axis = 1)
    if 'images' in df:    
        df = df.drop(['images'], axis = 1)
    if 'reviewAuthor' in df:
        df = df.drop(['reviewAuthor'], axis = 1)
    if 'reviewID' in df:
        df = df.drop(['reviewId'], axis = 1)
    return df

def productmeta2DF(meta_data):
    df = pd.DataFrame.from_dict([meta_data])
    
    ## drop fields:
    if 'mediaData' in meta_data:    
        df = df.drop(['mediaData'], axis = 1)
    if 'productUrl' in meta_data:
        df = df.drop(['productUrl'], axis = 1)
    if 'title' in meta_data:
        df = df.drop(['title'], axis = 1)
    if 'subtitle' in meta_data:
        df = df.drop(['subtitle'], axis = 1)
    
    # process 'specs' 
    if 'specs' in meta_data:
        df.loc[0, 'specs'] = u' '.join(df.loc[0, 'specs']).encode('utf-8').strip()
    else:
        df.loc[0, 'specs'] = 'NaN'
    
    # process 'description'
    if 'description' not in meta_data:
        df.loc[0, 'description'] = 'NaN'
    return df

def read_json(json_file):
    # Process Json fields
    review_header = ['rating', 'ratingDate', 'review', 'modelId']
    feature_header = ['description', 'specs', 'rating', 'brand', 'gender', 'price', 'productName', 'modelId']
    df_review = pd.DataFrame(columns=review_header)
    df_productspec = pd.DataFrame(columns=feature_header)
    
    for key,value in json_file.iteritems():
        if key == 'productMeta':
            df_productspec = productmeta2DF(value)
        elif key == 'reviewData':
            if value:
                df_review = reviewData2DF(value)
        elif key == 'rating':
            rating = value
        elif key == 'gender':
            gender = value
        elif key == 'productName':
            productName = value
        elif key == 'brand':
            brand = value
        elif key == 'price':
            price = value
        elif key == 'modelId':
            modelId = value
    
    # Complete product spec data frame fields
    df_productspec['rating'] = rating
    df_productspec['brand'] = brand
    df_productspec['gender'] = gender
    df_productspec['price'] = price
    df_productspec['productName'] = productName
    df_productspec['modelId'] = modelId
    
    # Complete Review data frame fields
    df_review['modelId'] = [modelId] * df_review.shape[0]
    
    return df_productspec, df_review

### Exam Json DataFrame

In [130]:
data = json.load(open('brooks/100029.json'))
df_product, df_review = read_json(data)

In [131]:
for key,value in data.iteritems():
    print "{}\t{}".format(key, type(value))

print len(data)

rating	<type 'int'>
colorVariation	<type 'list'>
color	<type 'unicode'>
gender	<type 'unicode'>
brand	<type 'unicode'>
reviewData	<type 'list'>
productName	<type 'unicode'>
reviews	<type 'int'>
productMeta	<type 'dict'>
productId	<type 'unicode'>
productImageLink	<type 'unicode'>
price	<type 'int'>
modelId	<type 'unicode'>
13


In [132]:
df_review

Unnamed: 0,rating,ratingDate,review,modelId
0,5,12/01/2017,This shoe is truly an amazing thing!,100029


In [133]:
df_product

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId
0,"An unrelenting middle-distance track spike, de...",Support: Neutral Midsole Drop: 0mm Weight: 4.2...,5,Brooks,M,120,ELMN8 v4,100029


### Process multiple Json files

In [134]:
import glob   

JSON_PATH = 'brooks/*.json'   
file_list = glob.glob(JSON_PATH) 

feature_dfs = []
review_dfs = []

count = 0
for json_file in file_list:
    # print "Process {} ...".format(json_file)
    json_data = json.load(open(json_file))
    df_product, df_review = read_json(json_data)
    feature_dfs.append(df_product)
    review_dfs.append(df_review)
    count += 1

print "Processed total {:d} JSON files".format(count)

Processed total 71 JSON files



### Generate and post-process product spec data frame

In [135]:
productspec_df = pd.concat(feature_dfs)
productspec_df.reset_index(drop=True, inplace=True)

print "Product Spec DF Shape: {}".format(productspec_df.shape)
productspec_df.head()

Product Spec DF Shape: (71, 8)


Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId
0,"An unrelenting middle-distance track spike, de...",Support: Neutral Midsole Drop: 0mm Weight: 4.2...,5.0,Brooks,M,120,ELMN8 v4,100029
1,"All-out speed at 400 meters or less, designed ...",Support: Neutral Midsole Drop: 0mm Weight: 4.7...,0.0,Brooks,F,110,QW-K v3,100030
2,"A nimble racing flat meets quick track spike, ...",Support: Neutral Midsole Drop: 0mm Weight: 4.7...,0.0,Brooks,F,120,Wire v5,100031
3,,Support: Support Arch: Flat Country of Origin:...,4.5,Brooks,F,120,Addiction 12,110196
4,,"Support: Neutral Arch: Flat, Medium, High Coun...",4.5,Brooks,F,130,Neuro,110211


#### Check duplicates by the whole roles

In [82]:
productspec_df.drop_duplicates
productspec_df.shape

(71, 8)

#### Check duplicates by shoe models

In [83]:
sum(productspec_df.duplicated('modelId'))

0

#### Duplicate models

In [84]:
duplicates = productspec_df[productspec_df.duplicated('modelId', keep=False)]
duplicates.sort_values(by = ['modelId'])

Unnamed: 0,description,specs,rating,brand,gender,price,productName,modelId


#### After exam the rows, we can remove duplicate rows. The whole row does not match maybe due to while spaces

In [85]:
productspec_df.drop_duplicates(['modelId'], keep = 'first', inplace=True)
productspec_df.shape

(71, 8)

### Generate and post-process review data frame

In [90]:
reviewData_df = pd.concat(review_dfs)
reviewData_df.reset_index(drop=True, inplace=True)

print "Review Data DF Shape: {}".format(reviewData_df.shape)
reviewData_df.head()

Review Data DF Shape: (6166, 5)


Unnamed: 0,modelId,rating,ratingDate,review,reviewAuthor
0,100029,5,12/01/2017,This shoe is truly an amazing thing!,John
1,110196,2,11/26/2017,"Not for me Just not a comfortable shoe for me,...",Gary
2,110196,5,11/18/2017,Yes,Zell
3,110196,5,11/02/2017,Best shoes ever Brooks gave me new life... I h...,Steve
4,110196,5,11/02/2017,My 10th pair My 10th Bought my 10th pair. I ha...,Lou


#### Check for duplicated reivews

In [136]:
reviewData_df.drop_duplicates
reviewData_df.shape

(6166, 5)

In [137]:
sum(reviewData_df.duplicated('review'))

311

In [138]:
duplicates = reviewData_df[reviewData_df.duplicated('review', keep=False)]
duplicates.sort_values(by = ['review'])

Unnamed: 0,modelId,rating,ratingDate,review,reviewAuthor
640,110227,5,10/14/2017,#ReviewSweeps,Grandaddy
4655,120234,5,10/21/2017,#ReviewSweeps,Jodi
1184,110241,5,10/29/2017,Adrenaline GTS 17,seventyplus
1360,110241,4,04/24/2017,Adrenaline GTS 17,Former Beast Lover
1449,110242,5,06/24/2017,Awesome,Trainmaster
2047,110257,5,08/22/2017,Awesome,Joe
157,110211,5,04/09/2017,Awesome,Matthew Purdy
6110,120268,5,12/07/2017,Awesome,amyb
3603,120225,5,04/14/2017,Awesome,Haley
4356,120231,5,06/11/2017,Awesome shoes,Cat


#### Remove duplications if review field is not empty

In [139]:
print sum(reviewData_df['review'] == ' ')
empty_review_rows = reviewData_df[reviewData_df['review'] == ' ']
empty_review_rows.head(5)

0


Unnamed: 0,modelId,rating,ratingDate,review,reviewAuthor


In [140]:
reviewData_df.drop_duplicates(['review'], keep = 'first', inplace=True)
reviewData_df.shape

(5855, 5)

In [141]:
# Add back the empty rows
reviewData_df = pd.concat([reviewData_df, empty_review_rows])
reviewData_df.shape

(5855, 5)

In [142]:
reviewData_df.head(5)

Unnamed: 0,modelId,rating,ratingDate,review,reviewAuthor
0,100029,5,12/01/2017,This shoe is truly an amazing thing!,John
1,110196,2,11/26/2017,"Not for me Just not a comfortable shoe for me,...",Gary
2,110196,5,11/18/2017,Yes,Zell
3,110196,5,11/02/2017,Best shoes ever Brooks gave me new life... I h...,Steve
4,110196,5,11/02/2017,My 10th pair My 10th Bought my 10th pair. I ha...,Lou


In [143]:
reviewData_df.tail(5)

Unnamed: 0,modelId,rating,ratingDate,review,reviewAuthor
6161,120268,5,11/04/2017,FINALLY!! My Adrenaline has come home! I am a...,RN/Soccer Mom/HIIT Addict
6162,120268,1,11/01/2017,WHY did they change the style????????!!!!!!!!!...,Kimbo
6163,120268,5,10/28/2017,I love these shoes I bought these shoes today ...,Ingrid
6164,120268,5,10/28/2017,Such an improvement I was lucky enough to purc...,Joy
6165,120268,5,10/25/2017,AWESOME UPDATES!!! I have worn many brands and...,Kimber


## Check Point 1: writing DF to .csv before modeling

In [23]:
reviewData_df.to_csv('product_reviews.csv', encoding='utf-8', index=False)
productspec_df.to_csv('product_features.csv', encoding='utf-8', index=False)

In [24]:
!ls -lh

total 2336
-rw-------    1 qianyu  staff    71K Jan  4 19:55 JSON_EDA.ipynb
-rw-r--r--    1 qianyu  staff   470B Dec 21 20:36 README.md
-rw-r--r--    1 qianyu  staff    45K Dec 22 22:38 Sentiment_model_survey.ipynb
-rw-r--r--    1 qianyu  staff    39K Dec 23 00:45 Sentiment_p2v_model.ipynb
drwxr-xr-x  147 qianyu  staff   4.9K Dec 14 17:22 [34madidas[m[m
drwxr-xr-x@ 165 qianyu  staff   5.5K Dec 12 00:21 [34masics[m[m
drwxr-xr-x@  73 qianyu  staff   2.4K Dec 14 03:14 [34mbrooks[m[m
-rw-r--r--    1 qianyu  staff   6.1K Dec 29 11:34 product_feature_EDA.ipynb
-rw-------    1 qianyu  staff   127K Jan  4 19:57 product_features.csv
-rw-r--r--    1 qianyu  staff   865K Jan  4 19:57 product_reviews.csv
drwxr-xr-x    7 qianyu  staff   238B Dec 23 00:21 [34mshared_lib[m[m
