Name: Eric Wang

Project Name: REI Climbing Shoes Web Scraping

Hello! This notebook was used to generate a json file for use in UCSD's DSC106 - Intro to Data Visualization course's final project.

In [1]:
# Import necessary packages for webscraping and data collection
import bs4
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json

# Load SQL extension
%load_ext sql

# Matplot frontend
%matplotlib inline

In [2]:
# Set base page link
page_link = 'https://www.rei.com/c/climbing-shoes?page='

In [3]:
def scrape_page_link(link):
    '''Parses an REI page link and returns a list of individual product links'''
    req = requests.get(link)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    
    product_lst = soup.find_all('a', {'class': '_1A-arB0CEJjk5iTZIRpjPs'})
    
    links = []
    for product in product_lst[::2]:
        links.append('https://www.rei.com/' + product['href'])
    
    return links

In [4]:
num = 1
link = page_link + str(num)
ind_links = []

# Extracts individual shoe link on each page of the catalog
while requests.get(link).status_code != 404:
    ind_links.extend(scrape_page_link(link))
    num = num + 1
    link = page_link + str(num)

In [5]:
# Reference the listed number of products on REI site to make sure we got them all
len(ind_links)

132

Going through the list of links, I noticed a few of the urls containing "rei-garage" weren't being scraped properly because their page elements were a different format. Because of this, I added the case in the scraping function, and we're all set!

In [6]:
def scrape_ind_link(link):
    '''Parses an REI shoe link and returns a list of the shoe's features.'''
    req = requests.get(link)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    
    product_dict = {}
    
    if 'rei-garage' in link:
        all_info = soup.find('script', {'id': 'page-data'})
        json_product = json.loads(all_info.get_text())['product']['specifications']
                
        if 'reviewsSummary' in json.loads(all_info.get_text())['product']:
            reviews_dict = {'reviewsSummary': json.loads(all_info.get_text())['product']['reviewsSummary']}
        else:
            reviews_dict = {'reviewsSummary': {}}
        
        json_product.update(reviews_dict)
        product_name = json.loads(all_info.get_text())['product']['title']
        
        product_dict['price'] = json.loads(all_info.get_text())['product']['displayPrice']['min']
        
        product_dict['features'] = json.loads(all_info.get_text())['product']['features']
        
    else:
        product_info = soup.find('script', {'data-client-store': 'product-details'})
        product_name = soup.find('script', {'data-client-store': 'product-metadata'})
        product_price = soup.find('script', {'data-client-store': 'product-price-data'})
        
        json_product = json.loads(product_info.get_text())
        product_name = json.loads(product_name.get_text())['title']
        json_price = json.loads(product_price.get_text())
    
        product_dict['price'] = json_price['compareAt']
        product_dict['features'] = json_product['features']
    
    for spec in json_product['specs']:
        product_dict[spec['name']] = spec['values']
        
        if 'overall' in json_product['reviewsSummary'].keys():
            product_dict['review_count'] = json_product['reviewsSummary']['total']
            product_dict['average_rating'] = json_product['reviewsSummary']['overall']
            product_dict['ratings_histogram'] = json_product['reviewsSummary']['ratingHistogram']
        else:
            product_dict['review_count'] = 0
            product_dict['average_rating'] = 'NaN'
            product_dict['ratings_histogram'] = 'NaN'
    
    return {product_name: product_dict}

In [7]:
# Test it out on an rei-garage case
scrape_ind_link('https://www.rei.com//rei-garage/product/189841/scarpa-crux-approach-shoes-mens')

{"Scarpa Crux Approach Shoes - Men's": {'price': 89.73,
  'features': ["Tackle slabby scrambles, loose talus and rugged descents with confidence in the men's Scarpa Crux approach shoes. They boast sticky outsoles and ample support and protection.",
   'Durable suede uppers with lace-to-toe design offer a performance fit',
   'Full-coverage rubber toe rands provide abrasion resistance and protection',
   'Vibram® Megagrip outsoles supply excellent traction'],
  'Best Use': ['Hiking'],
  'review_count': 2,
  'average_rating': 5.0,
  'ratings_histogram': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 2},
  'Upper': ['Suede/polyester mesh'],
  'Lining': ['Polyester'],
  'Midsole': ['2D EVA-MP'],
  'Outsole': ['Vibram Vertical Approach Megagrip rubber'],
  'Can Be Resoled': ['Yes'],
  'Footwear Height': ['Ankle'],
  'Footwear Closure': ['Lace-up'],
  'Weight (Pair)': ['1 lb. 10.8 oz.'],
  'Gender': ["Men's"]}}

In [8]:
shoes_dict = {}

# Loop through links and store in shoes_dict dictionary
for link in ind_links:
    shoes_dict.update(scrape_ind_link(link))

In [9]:
# Check to make sure it collected data properly
shoes_dict['Tarantulace Climbing Shoes']

{'price': 85,
 'features': ['Tackle quick gym sessions and daylong multi-pitch missions in comfort with these La Sportiva Tarantulace climbing shoes. Their soft uppers and shape are made to perform without pinching your toes.',
  'Leather/synthetic leather uppers provide an accurate and secure fit',
  'Lined tongues help manage moisture and are comfortable next to skin',
  '5mm FriXion® RS rubber soles provide good grip and stand up to regular use',
  'Quick-pull lacing system delivers a snug, precise fit',
  'Shoes can be resoled'],
 'Best Use': ['Rock Climbing'],
 'review_count': 16,
 'average_rating': 4.4375,
 'ratings_histogram': {'1': 0, '2': 0, '3': 2, '4': 5, '5': 9},
 'Climbing Shoe Type': ['Neutral'],
 'Last': ['Slip-lasted'],
 'Upper': ['Leather/synthetic leather'],
 'Lining': ['No'],
 'Outsole': ['FriXion RS rubber'],
 'Footwear Closure': ['Lace-up'],
 'Can Be Resoled': ['Yes'],
 'Gender': ['Unisex'],
 'Weight (Pair)': ['1 lb. 2.3 oz.']}

In [10]:
# Convert to json file called data.json
with open('data.json', 'w') as fp:
    json.dump(shoes_dict, fp)

In [11]:
# DataFrame format
df = pd.DataFrame.from_dict(shoes_dict, orient='index')
df.head()

Unnamed: 0,price,features,Best Use,review_count,average_rating,ratings_histogram,Climbing Shoe Type,Last,Upper,Lining,...,Weight (Pair),Vegan,Midsole,Support,Footwear Height,Sustainability,Waterproof,Type Of Waterproofing,Material(s),Weight
Tarantulace Climbing Shoes,85.0,[Tackle quick gym sessions and daylong multi-p...,[Rock Climbing],16,4.4375,"{'1': 0, '2': 0, '3': 2, '4': 5, '5': 9}",[Neutral],[Slip-lasted],[Leather/synthetic leather],[No],...,[1 lb. 2.3 oz.],,,,,,,,,
Tarantulace Climbing Shoes - Women's,85.0,[Tackle quick gym sessions and daylong multi-p...,[Rock Climbing],10,4.9,"{'1': 0, '2': 0, '3': 0, '4': 1, '5': 9}",[Neutral],[Slip-lasted],[Leather/synthetic leather],[None],...,[1 lb. 2.6 oz. ounces],,,,,,,,,
Momentum Climbing Shoes - Ash - Men's,94.95,"[A great choice for new climbers, the Black Di...",[Rock Climbing],64,4.4375,"{'1': 1, '2': 1, '3': 5, '4': 19, '5': 38}",[Neutral],[Slip-lasted],[Synthetic knit],[Hemp],...,[15.6 ounces],,,,,,,,,
Asym VCS Climbing Shoes - Men's,125.0,[For beginners seeking performance and advance...,[Climbing],1,4.0,"{'1': 0, '2': 0, '3': 0, '4': 1, '5': 0}",[Neutral],[Slip-lasted],[Leather],[Leather],...,[1 lb. 1.6 oz.],,,,,,,,,
TC Pro Climbing Shoes,190.0,"[Designed for edging and crack climbing, the L...",[Rock Climbing],83,4.7952,"{'1': 0, '2': 2, '3': 3, '4': 5, '5': 73}",[Neutral],[Slip-lasted],[Leather],[Yes/unlined underfoot],...,[1 lb. 1.4 oz.],,,,,,,,,
