# Scraping Sneaker Data from the Stock X website

### There are 2 components to this notebook:

1. Using the individual shoe keys to scrape all the shoe info 
2. Using the shoe name search/ random shoe generator to get the price history for a specific shoe

In [50]:
import requests
from datetime import datetime
from datetime import timedelta
import pandas as pd
import glob
from urllib.request import Request, urlopen
import time
import random

In [235]:
# open a csv with all the sneakers (used for debugging purposes)
# second_path = r'/Users/gabbyvinco/Desktop/sneakers_df.csv'
# round2 = pd.read_csv(second_path, index_col=None, header=0)

In [234]:
# import all the csvs for the different sections of the Stock X website with the url additions
path = r'/Users/gabbyvinco/Desktop/sneaker_csv'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    
frame = pd.concat(li, ignore_index=True)    

In [236]:
# take transpose of data frame
frame = frame.T
# set an index so the column with the keys is no longer the index
frame = frame.reset_index()
# rename column to 'urlKeys' and rename dataframe
dataframe = frame.rename(columns={'index': 'urlKeys'})

In [237]:
# taking the urlKeys and inserting them in to th complete url with parameters
full_urls = []

for value in dataframe['urlKeys']:
    full = 'https://stockx.com/api/products/'+value+'?includes=market,360&currency=EUR&country=IT'
    #'https://stockx.com/api/products/'+value+
        # this part is determining which file/shoe you want to work with 
    #'?includes=market,360&currency=EUR&country=IT' 
        # this part is basically a string that is being passed down to your web browser to the web server
    # with this url we can send a GET request and fetch the information that is listed in the Inspect>Network>Preview
    full_urls.append(full)
    
# create a new column with the full urls    
dataframe['urlFull'] = full_urls

In [238]:
len(full_urls)

2471

In [243]:
# create a new list to hold the sneakers and all their information
info = []

# Shoe Info (getting the basic information regarding the shoe)

Here we use the keys that we grabbed in the previous script. These keys when added to the query take you to the page for that specific shoe. From there we were able to make a request and gather information like the Stock X shoe identification number, brand, colorway, release date, retail price,official shoe name, volatility, change percentage, and marketed gender of the shoe.

In [244]:
# create a function that loops through the full url list, extracts the variables that we want
# and prints them out in a neat format

def get_shoe_info(url_list):
    for url in url_list:
        headers = {
            "accept-encoding": "gzip, deflate, br",
            "sec-fetch-mode": "cors",
            "sec=fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        product = response.json()["Product"]
        try:
            id_num = product["id"]
        except:
            print("no id")
            id_num = 0
        try:
            brand = product["brand"]
        except:
            print("no brand")
            brand = 0
        try:
            colorway = product["colorway"]
        except:
            print("no colorway")
            colorway = 0
        try:    
            release_date = product["releaseDate"]
        except:
            print("no release date")
            release_date = 0
        try:
            retail_price = product["retailPrice"]
        except:
            print("no retail price")
            retail_price = 0
        try:
            shoe_name = product["shoe"]
        except:
            print("no shoe name")
            shoe_name = 0
        try:
            volatility = product["market"]["volatility"]
        except:
            print("no volatility")
            volatility = 0
        try:
            change_percentage = product["market"]["changePercentage"]
        except:
            print("no change percentage")
            change_percentage = 0
        try:
            gender = product["gender"]
        except:
            print("no gender")
            gender = 0

        info.append([id_num,
                     brand,
                     colorway,
                     release_date,
                     retail_price,
                     shoe_name,
                     volatility,
                     change_percentage,
                     gender])

        print("shoe info added")
        time.sleep(5)

    return 0
    

In [245]:
# (for personal use) brief explanation of __name__ == "__main__"
# the global variable = __name__ and the entry point = __main__ (or the name that you import the module by)
# so the code below this if statement will only run if the module == entry point to your program
# it allows the code in the module to be importable by other modules without executing the code beneath the block on import

if __name__ == "__main__":
    import sys
    sys.exit(get_shoe_info(full_urls))

shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
no release date
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
no release date
no retail price
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
shoe info added
no release date
shoe info added
shoe info added
shoe info added
no release date
shoe info added
shoe info added
shoe info added
shoe info added
no release date
shoe info added
shoe info added
shoe info added
no retail price
shoe info added
shoe info added
no release date
no retail price
shoe info added
no release date
shoe info added
shoe info added
shoe info added
shoe info added
no retail price
shoe info added
no retail price
shoe info added
no release date
shoe info added
shoe info added
no release date
shoe info added
shoe info added
shoe inf

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [247]:
# add info into sneakers_df
# sneaker_df = pd.DataFrame(info, columns = ["ID","Brand", "Colorway","ReleaseDate","RetailPrice","Name","Volatility","ChangePercentage","Gender"])

# re-ran the script because there was a timeout error
# so here we just added to the point at which it was left off
round2 = round2.append(info)

In [248]:
round2.to_csv (r'/Users/gabbyvinco/Desktop/sneakers_df.csv', index = False, header=True)

# Shoe Name Seach to generate a url for price history

This portion was created with the intention to use with our time-series analysis. Here we have two methods as to which we can gather our price history info. 
1. By using the shoe search function where we can enter in the shoe name we are searching for, then select the colorway we want, and finally it will provide us with the "sku_id" which is the Stock X shoe identification number. Using this number we can continue and request the information from the price history plot.
2. We use the random shoe generator which will provide us with "sku_id" for a random shoe on the Stock X website.

In [None]:
# import sneaker dataset
path = r'/Users/gabbyvinco/Desktop/Sneaker_Info_data.csv'
sneaker_info_df = pd.read_csv(path, index_col=None, header=0)

In [38]:
# select shoe by name
# make an input where the shoe name can be entered
print("What is the name of the shoe you want to search? ")
shoe_search = input()

# make input lowercase to eliminate any variation in sizing
shoe_search = shoe_search.lower()

What is the name of the shoe you want to search? 
Yeezy 700 V3


In [39]:
# create a list with all the sneaker names in the dataset ensure that they are all 
# lowercased so we can compare it with the input
shoe_names = sneaker_info_df["Name"].to_list()
lower_names = []

for i in shoe_names:
    i = i.lower()
    lower_names.append(i)

# unique brands    
unique_names = set(shoe_names)
# print(unique_names)

#unique brands by lowercase
unique_lower_names = set(lower_names)
# print(unique_lower_names)

# capitalize the first letter of the searched term from the input
shoe_search_capitalized = shoe_search.title()

In [78]:
if shoe_search in unique_lower_names:
    print("We have found that shoe in our data.")
    
    # count of how many types of that name shoe there are
    count_of_name = sum(shoe_search in s for s in lower_names)
    print(f'There are {count_of_name} different colorways of that shoe.')

    # take the shoes of that name and display the colorways to choose from
    sneaker_search_responses = sneaker_info_df.set_index(["Name"])
    print("     ")
    colorway_options = sneaker_info_df[sneaker_info_df["Name"] == shoe_search_capitalized]
    pd.set_option('display.max_rows', colorway_options.shape[0]+1)
    print(colorway_options[["Colorway","Gender"]])
    print("     ")
    
    # choose the colorway
    print("Please specify which colorway you would like: ")
    color_search = input()
    print("     ")
    
    # take the selected colorway and output the ID number
    selected_shoe = colorway_options[colorway_options["Colorway"] == color_search]
    sku_id = selected_shoe["ID"]
    print("ID has been grabbed, proceed to make the request")
#     print(id_num)

else:
    # error message if the shoe isnt in the dataset
    print("We're sorry, the brand you entered wasn't found in our data.")

We have found that shoe in our data.
There are 9 different colorways of that shoe.
     
                           Colorway     Gender
1721        Azareth/Azareth/Azareth        men
1722              Alvah/Alvah/Alvah        men
1726              Azael/Azael/Azael        men
1728        Kyanite/Kyanite/Kyanite        men
1733        Eremial/Eremial/Eremial        men
1768  Safflower/Safflower/Safflower        men
2070        Azareth/Azareth/Azareth    toddler
2112  Safflower/Safflower/Safflower    toddler
2167        Azareth/Azareth/Azareth  preschool
     
Please specify which colorway you would like: 
Azael/Azael/Azael
     
ID has been grabbed, proceed to make the request


In [97]:
# to check if the id was grabbed
print(sku_id)

['176409b2-977e-4272-b02b-8fab93796e8d']


In [106]:
# take the ID number from previous cell and insert it in the url
search_url = "https://stockx.com/api/products/" + sku_id + "/chart"
search_url = search_url.values[0]
# display the entire id not just the first few characters
pd.options.display.max_colwidth = 150

In [107]:
# check the link generated
print(search_url)

https://stockx.com/api/products/176409b2-977e-4272-b02b-8fab93796e8d/chart


# Random Shoe Generator to create url for price history

This part uses a random number generator to then select a shoe out of the dataset and then provide a url for which we can then access the price history.

In [332]:
# check the length of the dataset
len(sneaker_info_df)

2206

In [109]:
# create a random number generator to then select a shoe out of the dataset
n = random.randint(0,2205)
random_sneaker = sneaker_info_df.loc[n,:]
print(random_sneaker[["Name","Colorway"]])
random_id = random_sneaker["ID"]
# create the url using the id from the random generator
random_url = "https://stockx.com/api/products/" + random_id + "/chart"


Name                              Jordan 1 Low
Colorway    Sail/Gym Red-University Gold-Black
Name: 411, dtype: object


In [110]:
# check the link
print(random_url)

https://stockx.com/api/products/7c79530c-fa82-463c-931c-f99d033102a7/chart


# Price History Function (Getting the data from the interactive plots)

This portion now uses the url from either the shoe search fucntion or the random shoe generator to access the price history plot on the Stock X website. This information is visible on the website in an interactive plot, however we were able to make a request and gather all of the price history information.

In [111]:
# allow us to control the days for which we want the price history information
day_ago = datetime.today()- timedelta(days=1)
day_choose = datetime.today().strftime('%Y-%m-%d')

In [112]:
def get_price_history(url_with_id, day_get):

    params = {
        "start_date": "all",
        "end_date": day_get,
        "intervals": "100",
        "format": "highstock",
        "currency": "EUR",
        "country": "IT"
    }

    headers = {
        "accept-encoding": "gzip, deflate",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        "x-requested-with": "XMLHttpRequest"
    }

    response = requests.get(url_with_id, params=params, headers=headers)
    response.raise_for_status()
    price_history = response.json()["series"][0]["data"]
    
    y = []
    for timestamp, price in response.json()["series"][0]["data"]:
        date = datetime.utcfromtimestamp(int(timestamp) // 1000)
        print(f"[{date}]: €{price}")
        # append to list so data frame can be created
        y.append([date, price])
    
    return 0
    

## Don't forget to pass in the url variable (either search_url OR random_url)
You must use one or the other, but not both.

In [113]:
if __name__ == "__main__":
    import sys
    sys.exit(get_price_history(search_url, day_choose))

[2019-12-05 03:06:30]: €666
[2019-12-10 07:17:06]: €643
[2019-12-15 11:27:43]: €427
[2019-12-20 15:38:19]: €404
[2019-12-25 19:48:56]: €414
[2019-12-30 23:59:32]: €410
[2020-01-05 04:10:09]: €413
[2020-01-10 08:20:45]: €417
[2020-01-15 12:31:22]: €416
[2020-01-20 16:41:59]: €412
[2020-01-25 20:52:35]: €437
[2020-01-31 01:03:12]: €467
[2020-02-05 05:13:48]: €474
[2020-02-10 09:24:25]: €480
[2020-02-15 13:35:01]: €499
[2020-02-20 17:45:38]: €501
[2020-02-25 21:56:14]: €504
[2020-03-02 02:06:51]: €509
[2020-03-07 06:17:28]: €516
[2020-03-12 10:28:04]: €492
[2020-03-17 14:38:41]: €485
[2020-03-22 18:49:17]: €479
[2020-03-27 22:59:54]: €499
[2020-04-02 03:10:30]: €513
[2020-04-07 07:21:07]: €510
[2020-04-12 11:31:44]: €525
[2020-04-17 15:42:20]: €546
[2020-04-22 19:52:57]: €573
[2020-04-28 00:03:33]: €578
[2020-05-03 04:14:10]: €573
[2020-05-08 08:24:46]: €577
[2020-05-13 12:35:23]: €575
[2020-05-18 16:45:59]: €591
[2020-05-23 20:56:36]: €622
[2020-05-29 01:07:13]: €655
[2020-06-03 05:17:49

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [72]:
# create a dataframe from the y list
price_history_df = pd.DataFrame(y)

In [73]:
price_history_df

Unnamed: 0,0,1
0,2020-01-21 19:54:25,1041
1,2020-01-26 12:38:00,1041
2,2020-01-31 05:21:35,1041
3,2020-02-04 22:05:10,1041
4,2020-02-09 14:48:45,1041
...,...,...
95,2021-04-12 00:55:06,359
96,2021-04-16 17:38:41,359
97,2021-04-21 10:22:16,358
98,2021-04-26 03:05:51,363


In [None]:
# save the dataframe as a csv file
# price_history_df.to_csv (r'/Users/gabbyvinco/Desktop/sneakers_df.csv', index = False, header=True)