# Capstone Project - Which Shoe is the Best for You?

General Assembly passion project. Scrape or obtain data from resources online to develop a dataset to perform cleaning, EDA, and analysis on. Try to predict a model on common themes like: 

- Price
- If item is in category A or B
- Cluster and create groups
- Recommender

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup
import requests
from time import sleep, strftime

**Testing one website and finding appropriate keys**

Then run again on page 2.

In [2]:
result = requests.get('https://stockx.com/api/browse?page=1&category=152')
json_res = result.json()

# print json_res['Products'][0]['shortDescription']
# print json_res['Products'][0]['retailPrice']

In [12]:
result = requests.get('https://stockx.com/api/browse?page=2&category=152')
json_res = result.json()

print json_res['Products'][0]['shortDescription']
print json_res['Products'][0]['retailPrice']

Air-Jordan-5-Retro-Black-Metallic-2016
220


In [16]:
json_res['Products'][0]['market']

{u'absChangePercentage': 0.128205,
 u'annualHigh': 446,
 u'annualLow': 120,
 u'averageDeadstockPrice': 272,
 u'averageDeadstockPriceRank': 23,
 u'changePercentage': 0.128205,
 u'changeValue': 25,
 u'createdAt': u'2016-06-10T21:18:23+00:00',
 u'deadstockRangeHigh': 245,
 u'deadstockRangeLow': 195,
 u'deadstockSold': 3005,
 u'deadstockSoldRank': 37,
 u'highestBid': 220,
 u'lastHighestBidTime': 1497958653,
 u'lastLowestAskTime': 1497499216,
 u'lastSale': 220,
 u'lastSaleDate': u'2017-06-20T17:50:16+00:00',
 u'lowestAsk': 165,
 u'pricePremium': 0,
 u'pricePremiumRank': 32,
 u'productId': 0,
 u'productUuid': u'd682a939-b5ad-421a-9130-0003da9a3854',
 u'salesLast72Hours': 35,
 u'salesLastPeriod': 0,
 u'salesThisPeriod': 35,
 u'skuUuid': None,
 u'updatedAt': 1497981017,
 u'volatility': 0.115313}

### Functions to run for loop to scrape website

First function scrapes the web for an amount of pages (default 50). It will save the raw data as a csv as well. The second function will then clean the dataframe by taking out the unused columns.

In [120]:
def shoe_scraper(pages=50):
    '''Returns one dataframe of all results. 
    And will save into a new file.'''
    
    # Have to run requests first to get appropriate column names
    req = requests.get('https://stockx.com/api/browse?page=1&category=152')
    json_req = req.json()
    df = pd.DataFrame([], columns=json_req['Products'][0].keys())

    for i in range(1,pages):
        try:
            html = 'https://stockx.com/api/browse?page=' + str(int(i)) + '&category=152'
            result = requests.get(html)
            json_res = result.json()
            df = pd.concat([df, pd.DataFrame(json_res['Products'])])
            sleep(0.5)
        except:
            break
    
    # Drop row duplicates
    df.drop_duplicates(['shortDescription', 'urlKey'], inplace=True) 
    
    # Function to save as csv file under today's day as raw (before dropping)
    def csv_maker(df):
        filename = 'StockX_' + strftime("%m%d%H")
        df.to_csv(path_or_buf='C:\\Users\\Chris\\Desktop\\dsi-atl-3\\project\\Capstone\\datasets\\' + filename, encoding='utf-8')
    
    csv_maker(df)
    
    return df

In [164]:
def clean_df(busy_dataframe):
    '''Dropping columns that hold little to no information.
    Then reset the index since we are getting repeated indices'''
    
    # Market DataFrame
    market_df = pd.DataFrame([row for row in busy_dataframe['market']])
    
    
    # Drop unnecessary columns
    cleaner_dataframe = busy_dataframe.join(market_df)
    
    cleanest_dataframe = cleaner_dataframe.drop(['breadcrumbs', 'childId', 'countryOfManufacture', 'type', 
        'uuid', 'dataType', 'doppelgangers', 'condition', 'description', 'hidden', 'ipoDate', 'productCategory', 
        'shoeSize', 'urlKey', 'charityCondition', 'releaseTime', 'shortDescription', 'media', '_highlightResult', 
        'market', '_tags', 'id', 'objectID', 'lastHighestBidTime', 'lastLowestAskTime', 'styleId', 'productId',
        'productUuid', 'skuUuid', 'updatedAt', 'title', 'traits', 'tickerSymbol', 'salesLastPeriod'], axis=1)
    
    # Remember title = shoe + name

    # Reset the index, since we are getting repeated indices
    cleanest_dataframe.reset_index(drop=True, inplace=True)
    return cleanest_dataframe

In [122]:
scraped_shoe = shoe_scraper(5)

In [166]:
clean_df(scraped_shoe).ix[:, 8:].describe()

Unnamed: 0,year,absChangePercentage,annualHigh,annualLow,averageDeadstockPrice,averageDeadstockPriceRank,changePercentage,changeValue,deadstockRangeHigh,deadstockRangeLow,deadstockSold,deadstockSoldRank,highestBid,lastSale,lowestAsk,pricePremium,pricePremiumRank,salesLast72Hours,salesThisPeriod,volatility
count,156.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0
mean,2016.371795,0.085035,498.930818,165.698113,266.90566,48.201258,0.00018,1.037736,235.534591,185.72327,1585.63522,52.641509,259.09434,210.628931,196.188679,0.140289,54.509434,79.477987,79.477987,0.129403
std,1.004621,0.088628,292.232938,77.647268,100.069773,27.939532,0.123011,27.571285,105.857845,95.262323,1806.381793,29.547491,219.398799,99.471296,79.936776,0.425318,32.249594,133.330456,133.330456,0.078411
min,2012.0,0.0,275.0,90.0,149.0,6.0,-0.368421,-70.0,119.0,84.0,176.0,1.0,115.0,110.0,105.0,-0.368,1.0,34.0,34.0,0.02491
25%,2016.0,0.017205,321.5,120.0,202.5,21.5,-0.069343,-14.0,167.0,122.5,564.5,23.5,155.5,145.0,150.0,-0.1505,24.0,38.0,38.0,0.073435
50%,2017.0,0.069149,375.0,136.0,237.0,50.0,0.0,0.0,195.0,157.0,881.0,59.0,200.0,170.0,169.0,0.0,65.0,45.0,45.0,0.108345
75%,2017.0,0.108596,471.0,196.0,285.5,65.5,0.059498,12.0,263.5,207.0,1904.0,78.0,260.5,241.0,210.5,0.214,79.0,67.0,67.0,0.179335
max,2017.0,0.4,1500.0,419.0,552.0,100.0,0.4,100.0,549.0,450.0,8778.0,100.0,1450.0,495.0,435.0,1.5,98.0,911.0,911.0,0.300471
