In [184]:
# Import necessary packages for webscraping and data collection
import json
from urllib.parse import urljoin

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load SQL extension
%load_ext sql

# Matplot frontend
%matplotlib inline

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [38]:
# Empty DataFrame placeholder; no cell visible in this section reads it.
df = pd.DataFrame()

In [39]:
# Base URL of the paginated climbing-shoe listing; the page number is appended to it.
page_link = 'https://www.rei.com/c/climbing-shoes?page='

In [40]:
def scrape_page_link(link):
    '''Parses an REI page link and returns a list of individual product links.

    Args:
        link: URL of one page of the REI category listing.

    Returns:
        A list of absolute product URLs built from the matching anchors on
        the page; empty when the page contains no matching anchors.
    '''
    req = requests.get(link)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')

    # Product anchors are located by this CSS class from the listing markup.
    product_lst = soup.find_all('a', {'class': '_1A-arB0CEJjk5iTZIRpjPs'})

    # Only every other anchor is kept -- presumably each product is linked
    # twice on the page; TODO confirm against the live markup.
    # urljoin resolves each href against the site root, so root-relative
    # hrefs ('/product/...') get exactly one slash and absolute hrefs are
    # passed through unchanged.
    return [
        urljoin('https://www.rei.com/', product['href'])
        for product in product_lst[::2]
    ]

In [41]:
# Walk the paginated listing, collecting product links page by page.
num = 1
link = page_link + str(num)
ind_links = []

# Stop on ANY non-200 response (not just 404) so a persistent 403/500 cannot
# keep the loop running forever.
while requests.get(link).status_code == 200:
    page_links = scrape_page_link(link)
    # A page that yields no product links means we are past the last page,
    # even if the site still answers 200.
    if not page_links:
        break
    ind_links.extend(page_links)
    num = num + 1
    link = page_link + str(num)

In [229]:
# Number of product links collected; compare it against the product count listed on the REI site
len(ind_links)

131

In [233]:
def scrape_ind_link(link):
    '''Parses an REI shoe link and returns the shoe's features.

    Args:
        link: URL of a single product page. Links containing 'rei-garage'
            are read from the 'page-data' script tag; all other links are
            read from the 'product-details' and 'product-metadata' script
            tags.

    Returns:
        A one-entry dict mapping the product title to a dict of features:
        one key per spec name (holding that spec's 'values'), plus
        'review_count', 'average_rating' and 'ratings_histogram'. When the
        page carries no review summary with an 'overall' rating, the count
        is 0 and the other two review fields are NaN.
    '''
    req = requests.get(link)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')

    if 'rei-garage' in link:
        # Decode the embedded JSON once and read every field from it.
        all_info = soup.find('script', {'id': 'page-data'})
        product = json.loads(all_info.get_text())['product']

        specs = product['specifications']['specs']
        # A missing or null review summary is treated as "no reviews".
        reviews_summary = product.get('reviewsSummary') or {}
        product_name = product['title']
    else:
        product_info = soup.find('script', {'data-client-store': 'product-details'})
        product_meta = soup.find('script', {'data-client-store': 'product-metadata'})

        json_product = json.loads(product_info.get_text())
        specs = json_product['specs']
        # A missing or null review summary is treated as "no reviews".
        reviews_summary = json_product.get('reviewsSummary') or {}
        product_name = json.loads(product_meta.get_text())['title']

    product_dict = {spec['name']: spec['values'] for spec in specs}

    # Review fields are set exactly once, independent of the spec list, so
    # they are present even for a product that lists no specs.
    if 'overall' in reviews_summary:
        product_dict['review_count'] = reviews_summary['total']
        product_dict['average_rating'] = reviews_summary['overall']
        product_dict['ratings_histogram'] = reviews_summary['ratingHistogram']
    else:
        product_dict['review_count'] = 0
        product_dict['average_rating'] = np.nan
        product_dict['ratings_histogram'] = np.nan

    return {product_name: product_dict}

In [234]:
# Scrape every product page and merge the results into one dict keyed by
# product title; a later product with the same title replaces the earlier one.
shoes_dict = {}
for link in ind_links:
    scraped = scrape_ind_link(link)
    shoes_dict.update(scraped)

In [235]:
# Spot-check one product's entry to make sure the data was collected properly
shoes_dict['Tarantulace Climbing Shoes']

{'Best Use': ['Rock Climbing'],
 'review_count': 15,
 'average_rating': 4.4,
 'ratings_histogram': {'1': 0, '2': 0, '3': 2, '4': 5, '5': 8},
 'Climbing Shoe Type': ['Neutral'],
 'Last': ['Slip-lasted'],
 'Upper': ['Leather/synthetic leather'],
 'Lining': ['No'],
 'Outsole': ['FriXion RS rubber'],
 'Footwear Closure': ['Lace-up'],
 'Can Be Resoled': ['Yes'],
 'Gender': ['Unisex'],
 'Weight (Pair)': ['1 lb. 2.3 oz.']}

In [236]:
# Serialize the scraped shoe data and write it to data.json.
# Note: any np.nan review fields are emitted as the bare token NaN, which
# Python's json module accepts but strict JSON parsers may reject.
with open('data.json', 'w') as out_file:
    out_file.write(json.dumps(shoes_dict))