In [1]:
import gzip
import pandas as pd

In [28]:
def get_meta_data_info(meta_data):
    item_id = meta_data['asin']
    
    # there's multiple category tag, the first [0] 
    # extract the first category and for some reason 
    # that's still in a list, thus the second [0] access
    # the single tag in that list
    category = meta_data['categories'][0][0]
    
    # title and image url are the fields that might be missing
    try:
        name = meta_data['title']
    except KeyError:
        name = None
    try:
        img_url = meta_data['imUrl']
    except KeyError:
        img_url = None

    info = {'item_id': item_id, 'category': category, 'img_url': img_url, 'name': name}
    return info

In [29]:
def get_all_meta_data(path):
    meta_data_infos = []
    g = gzip.open(path, 'r')
    for line in g:
        meta_data = eval(line)
        info = get_meta_data_info(meta_data)
        meta_data_infos.append(info)

    df_meta_data = pd.DataFrame(meta_data_infos)
    mask = ~df_meta_data['img_url'].isnull() & ~df_meta_data['name'].isnull()
    df_meta_data = df_meta_data[mask]
    df_meta_data = df_meta_data[df_meta_data['category'] == 'Clothing, Shoes & Jewelry']
    return df_meta_data

In [30]:
path = 'meta_Clothing_Shoes_and_Jewelry.json.gz'
df_meta_data = get_all_meta_data(path)
print(df_meta_data.shape)
df_meta_data.head()

(1435416, 4)


Unnamed: 0,category,img_url,item_id,name
0,"Clothing, Shoes & Jewelry",http://ecx.images-amazon.com/images/I/31mCncNu...,37214,Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...
1,"Clothing, Shoes & Jewelry",http://ecx.images-amazon.com/images/I/314qZjYe...,31887,Ballet Dress-Up Fairy Tutu
2,"Clothing, Shoes & Jewelry",http://ecx.images-amazon.com/images/I/413tGhqo...,123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...
3,"Clothing, Shoes & Jewelry",http://ecx.images-amazon.com/images/I/31QZTHxv...,456844570,RiZ Women's Beautify Crafted &frac12; Rimmed F...
4,"Clothing, Shoes & Jewelry",http://ecx.images-amazon.com/images/I/31UsrgT5...,456808574,Lantin White Visor Wrap Around Ski Style Aviat...


In [4]:
def get_review_data_info(review_data):
    item_id = review_data['asin']
    rating = review_data['overall']
    user_id = review_data['reviewerID']
    info = {'item_id': item_id, 'user_id': user_id, 'rating': rating}
    return info

In [None]:
def get_all_review_data(path):
    review_data_infos = []
    g = gzip.open(path, 'r')
    for line in g:
        review_data = eval(line)
        info = get_review_data_info(review_data)
        review_data_infos.append(info)

    df_review_data = pd.DataFrame(review_data_infos)
    return df_review_data

In [5]:
path = 'reviews_Baby.json.gz'
df_review_data = get_all_review_data(path)
print(df_review_data.shape)
df_review_data.head()

(915446, 3)


Unnamed: 0,item_id,rating,user_id
0,188399313,5.0,A28O3NP6WR5517
1,188399399,5.0,AX0M1Z6ZWO52J
2,188399518,4.0,A1KD7N84L7NIUT
3,188399518,3.0,A29CUDEIF4X1UO
4,316967297,4.0,A32592TYN6C9EM


In [6]:
data = df_review_data.merge(df_meta_data, on = 'item_id', how = 'inner')
print(data.shape)
data.head()

(913953, 6)


Unnamed: 0,item_id,rating,user_id,category,img_url,name
0,188399313,5.0,A28O3NP6WR5517,Baby,http://ecx.images-amazon.com/images/I/41Swthpd...,Lifefactory 4oz BPA Free Glass Baby Bottles - ...
1,188399399,5.0,AX0M1Z6ZWO52J,Baby,http://ecx.images-amazon.com/images/I/61x8h9u6...,Planetwise Wipe Pouch
2,188399518,4.0,A1KD7N84L7NIUT,Baby,http://ecx.images-amazon.com/images/I/41otjnA4...,Planetwise Flannel Wipes
3,188399518,3.0,A29CUDEIF4X1UO,Baby,http://ecx.images-amazon.com/images/I/41otjnA4...,Planetwise Flannel Wipes
4,316967297,4.0,A32592TYN6C9EM,Baby,http://ecx.images-amazon.com/images/I/51%2BZ1%...,Annas Dream Full Quilt with 2 Shams
