# Walmart Search and Taxonomy API overview
The Search API is used for lookups. We will use it first.

This method uses a search loop based on price ranges.  Walmart only allows a maximum of 1000 items, even though they have
20K laptops, for example.  Walmart does allow a search on a price range.  If we start at the lowest price and
end at the highest price, we can capture the entire dataset.

The method returns each search result and appends it to a JSON file.  We ran into several problems with JSON:
1. Above page 1 (pages 2 to n), the JSON was not readable into a DataFrame.  We solved this by adding a `\n` when appending and stripping it out when reading.  This worked.

2. The other problem is that if a single search returns a null result, the entire JSON file is unreadable.  This problem is not solved yet.  For this reason we abandoned this method for now.  The problem is solvable.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import urllib
import json
import sys
import pprint
import os
import random

# We are using 2 API keys for redundancy and because there is a limit to the
# number of calls.  Each key can be supplied through an environment variable
# so credentials do not have to be edited into the notebook; the literals are
# only fallbacks for when the variable is unset or empty.
# NOTE(review): the fallback keys are committed in plain text -- rotate them
# and drop the literals once the environment variables are in use.
api_key = os.environ.get("WALMART_API_KEY") or "4tsbjxvbrwnpcjdsh277csqu"
api_alt = os.environ.get("WALMART_API_ALT") or "anrtcd5dheh9758z5wgdy677"

# Base url for all API look ups
base_url = "http://api.walmartlabs.com/v1/"

# Endpoint fragments appended to the base url
search = "search?"
paginated_products = "paginated/items?"

# Full endpoint urls; query parameters are appended directly after the "?"
search_url = base_url + search
paginated_url = base_url + paginated_products



# Search API

In [None]:
# Search criteria for the Walmart Search API.
# Reference: https://developer.walmartlabs.com/docs/read/Search_API

query = "tablet"
categoryId = "3944"
format = "json"
numItems = 25  # 25 is the maximum; 10 is the default
start = 1
sort, order = "price", "desc"
facet = "on"  # boolean

# Lower and upper end of the price range used for a price-range search.
min, max = str(0), str(200)

# The [num1:num2] form does not work; the bounds must be joined with %20TO%20.
facet_range = "price:[" + min + "%20TO%20" + max + "]"

# NOTE: urls are created in 2 ways.  The `params` argument does not format the
# output in the manner the API needs for price ranges, so the price-range
# lookup uses a url assembled by hand; every other lookup uses `params`.

# Hand-built url, used for the price-range lookup only.
_manual_fields = (
    ("apiKey", api_alt),
    ("format", format),
    ("categoryId", categoryId),
    ("query", query),
    ("numItems", numItems),
    ("facet", facet),
    ("facet.range", facet_range),
)
manual_url = search_url + "&".join(
    f"{name}={value}" for name, value in _manual_fields
)

print(manual_url)

# API parameters for the other lookups.
# Deliberately left out for now:
#   "start", "facet", "facet.range" -- we can't get these to work yet
#   "sort"  -- allowed: relevance, price, title, bestseller, customerRating, new
#   "order" -- allowed: asc, desc
params = dict(
    apiKey=api_alt,
    format="json",  # json|xml
    categoryId=categoryId,
    query=query,
    numItems=numItems,
)

In [None]:
##################################
# STORE CATEGORY DATA TO BUILD LOOPS
##################################

#LAPTOPS

# Number of items found in our search
# there are 20,747 total results for laptop in electronics
# step 0 there are 3089 results in the price range $0-$200
# step 1 there are 3015 results in the price range $201-$400
# step 2 there are 2053 results in the price range $401-$600
# step 3 there are 2995 results in the price range $601-$1000
# step 4 there are 2021 results in the price range $1001-$1400
# step 5 there are 2726 results in the price range $1401-$1800
# step 6 there are 1873 results in the price range $1801-$2200
# step 7 there are 1011 results in the price range $2201-$2600
# step 8 there are 863 results in the price range $2601-3000
# step 9 there are 1096 results in the price range above $3000

# Laptop category data
# category_total = 20747 #hard coded from broad search
# step0 = 200*25//3089
# step1 = 200*25//3015
# step2 = 200*25//2053
# step3 = 400*25//2995
# step4 = 400*25//2021
# step5 = 400*25//2726
# step6 = 400*25//1873
# step7 = 400*25//1011
# step8 = 400*25//863

In [None]:
# Settings for sweeping a whole product category by price.
# To save time some numbers are hard coded here; revisit later if time allows.

# TABLETS
query = "tablet"
category_total = 485

high_price = 1200  # highest price in the category, looked up manually
step0 = 10  # number of dollars covered by each step

# create a function to perform the price search

def price_data(lowRange, highRange):
    """Run one Search API query restricted to a price range.

    The url is assembled by hand from the module-level search settings
    (``search_url``, ``api_alt``, ``format``, ``categoryId``, ``query``,
    ``numItems`` and ``facet``) because the price facet has to be written as
    ``price:[low%20TO%20high]``.

    Args:
        lowRange: Lower bound of the price range.
        highRange: Upper bound of the price range.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.Timeout: No response arrived within the timeout.
        ValueError: The response body is not valid JSON.
    """
    facet_range = f"price:[{lowRange}%20TO%20{highRange}]"
    price_url = (
        f"{search_url}apiKey={api_alt}&format={format}"
        f"&categoryId={categoryId}&query={query}&numItems={numItems}"
        f"&facet={facet}&facet.range={facet_range}"
    )

    # Without a timeout a stalled connection would block the sweep forever.
    response = requests.get(price_url, timeout=30)
    # Fail loudly on an error status instead of storing an error payload.
    response.raise_for_status()
    return response.json()

                


In [None]:
# Sweep the price axis in steps of `step0` dollars and append each search
# result to the output file as one JSON document per line.

searchCount = 0  # searches actually sent
itemCount = 0  # rough estimate of the items retrieved so far
sleepCount = 0  # searches sent since the last pause
low = 25
high = 0

# The output directory may not exist yet on a fresh checkout.
os.makedirs('Walmart_datasets', exist_ok=True)

with open(f'Walmart_datasets/walmart_{query}.json', mode='w') as f:

    # `category_total` only caps the number of passes so the loop always ends.
    for _ in range(1, category_total):
        if high >= high_price:
            print ('finished')
            print (f'item count = {itemCount}')
            print (f'final search count = {searchCount}')
            print (f'final high price is ${high}')
            break

        # We have had issues pulling all the data.  Walmart could be shutting
        # us down, so rest for a random time after every 100 searches.
        if sleepCount == 100:
            sleepTime = random.randint(60, 120)
            print(f"resting for {sleepTime} seconds")
            time.sleep(sleepTime)
            sleepCount = 0

        # Fetch the next price window; the next window starts where this
        # one ended.
        high = low + step0
        data = price_data(low, high)
        f.write(json.dumps(data) + '\n')
        low = high

        searchCount += 1
        sleepCount += 1
        # Guess of the average number of items per search: the maximum is 25,
        # but not every search yields that many.
        itemCount = 10 * searchCount
        print(f"search / item count = {searchCount} / {itemCount}")

# NOTE: categories that need a different step size per price tier (see the
# laptop step0..step8 notes) would need one extra branch per tier here.
               
      



In [None]:
# One-off check: fetch a single price range, round-trip it through a JSON
# string and print the decoded result.
for i in range(1):
    data = json.dumps(price_data(600, 700))
    j = json.loads(data)
    print(j)

In [None]:
from pprint import pprint
import os, sys

# Read the results file back: it holds one JSON document per line.
array = []

filepath = os.path.join(f'Walmart_datasets/walmart_{query}.json')
with open(filepath, 'r') as jsonfile:
    for line in jsonfile:
        data = line.strip()
        if not data:
            # A blank line is not a JSON document; skip it instead of crashing.
            continue
        json_data = json.loads(data)
        # Keep every parsed record so the cells that iterate `array` have data.
        array.append(json_data)

# Last record read
json_data

In [None]:
# Build a DataFrame from the contents of `array`; the frame is not assigned
# to a name.
pd.DataFrame(array)

In [None]:
# Preview the first two rows of a DataFrame built from each entry of `array`.
# head() only returns a value, and a bare expression inside a loop body shows
# nothing, so the preview is printed explicitly.
for line in array:
    df = pd.DataFrame(line)
    print(df.head(2))

In [None]:
# Pull the first entry of `array` into `one` for closer inspection.
df = None
one = None
for line in array:
    if isinstance(line, str):
        # Raw text line: strip the trailing newline and decode it.
        data = line.strip()
        json_data = json.loads(data)
    else:
        # Already a decoded record; use it as is.
        json_data = line
    one = json_data
    # Only the first entry is needed.
    break
    

In [None]:
# Tabulate the 'items' entry of the record held in `one`.
# NOTE(review): `one` starts as None and stays None when `array` is empty, in
# which case this subscript raises TypeError.
pd.DataFrame(one['items'])

# Taxonomy API (Walmart) & Category API (Best Buy)

These APIs pull out the categories that can be used to search for data.

In [None]:
# Taxonomy API endpoint for Walmart.
# The url is built with `api_key`; `api_alt` can be substituted for it.
taxonomy_url = "http://api.walmartlabs.com/v1/taxonomy?apiKey={}&format=json".format(api_key)


In [None]:
# store Walmart Taxonomy
#########################################
# KEEP TURNED OFF IF FILE ALREADY EXISTS#
#########################################

# walmart_data3 = requests.get(taxonomy_url)
# walmart_data3 = walmart_data3.json()

# # write the data we just gathered to a json file
# try:       
#     a = []
#     if not os.path.isfile('Walmart_datasets/walmart_taxonomy.json'):
#         a.append(walmart_data3)
#         with open('Walmart_datasets/walmart_taxonomy.json', mode='w') as f:
#             f.write(json.dumps(walmart_data3, indent=4))
#     else:
#         with open('Walmart_datasets/walmart_taxonomy.json') as feedsjson:
#             feeds = json.load(feedsjson)

#         feeds.append(walmart_data3)
#         with open('Walmart_datasets/walmart_taxonomy.json', mode='w') as f:
#             f.write(json.dumps(feeds, indent=4))

# except:
#     print ("Skipped line")

In [9]:
# open the json file and create a readable dataframe
from pprint import pprint
import pandas as pd
# Containers for the taxonomy data; only `array` is filled in below.
array, taxo_id, taxo_name, taxo_path = [], [], [], []
d = {}

# Not extracted yet: the nested levels, which would sit under
#   ['categories']['children']              -> 'id', 'name', 'path'
#   ['categories']['children']['children']  -> 'id', 'name', 'path'

filepath = os.path.join('Walmart_datasets/walmart_taxonomy.json')
with open(filepath, 'r') as jsonfile:
    json_data = json.load(jsonfile)

# Collect every entry of the 'categories' list.
array.extend(json_data['categories'])

# One row per collected entry, with the columns in a fixed order.
column_order = ['name', 'path', 'id', 'children']
taxonomy_df = pd.DataFrame(array)
taxonomy_df = taxonomy_df[column_order]

# Spot check: print one entry two 'children' levels down.
second_level = json_data['categories'][1]['children']
print(second_level[1]['children'][1])

{'id': '91083_1212910_1212923', 'name': 'Car Detailing Kits', 'path': 'Auto & Tires/Auto Detailing & Car Care/Car Detailing Kits'}


In [None]:
# Let pandas read the taxonomy file directly; the result is not assigned to
# a name.
pd.read_json('Walmart_datasets/walmart_taxonomy.json')

In [3]:
# Show the taxonomy DataFrame (columns: name, path, id, children).
taxonomy_df

Unnamed: 0,name,path,id,children
0,"Arts, Crafts & Sewing","Arts, Crafts & Sewing",1334134,"[{'id': '1334134_5899871', 'name': 'Art Suppli..."
1,Auto & Tires,Auto & Tires,91083,"[{'id': '91083_1074767', 'name': 'Auto Body', ..."
2,Baby,Baby,5427,"[{'id': '5427_491351', 'name': 'Baby & Toddler..."
3,Beauty,Beauty,1085666,"[{'id': '1085666_9892091', 'name': 'All Fragra..."
4,Books,Books,3920,"[{'id': '3920_582321', 'name': 'Arts & Enterta..."
5,Cell Phones,Cell Phones,1105910,"[{'id': '1105910_133161', 'name': 'Accessories..."
6,Clothing,Clothing,5438,"[{'id': '5438_1228424', 'name': 'Activewear fo..."
7,Electronics,Electronics,3944,"[{'id': '3944_1228636', 'name': '3D Printing',..."
8,Food,Food,976759,"[{'id': '976759_976780', 'name': 'Baking', 'pa..."
9,Gifts & Registry,Gifts & Registry,1094765,"[{'id': '1094765_133059', 'name': 'Gift Basket..."


In [None]:
def flatten_dict(d, prefix='__'):
    """Flatten nested dicts into a single-level dict.

    The key of a nested value is the chain of its parent keys joined with
    ``prefix``, so the flattened name still shows where the value came from.
    The default separator is ``__`` rather than ``_`` or ``.``.

    Args:
        d: Dict to flatten.  Keys that lead to nested dicts must be strings,
            because they are concatenated with ``prefix``.
        prefix: Separator placed between a parent key and a child key.  It is
            applied at every nesting level.

    Returns:
        A new dict with no dict values.  A nested dict that is empty
        contributes no keys.
    """
    flat = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # Pass `prefix` down so deeper levels use the same separator.
            for sub_key, sub_value in flatten_dict(value, prefix).items():
                flat[key + prefix + sub_key] = sub_value
        else:
            flat[key] = value
    return flat
