## Webscraping Guitar Listings from Reverb.com
This notebook will:
- get number of guitars to scrape
- create batches for data to scrape
- scrape initial data 
- use URL from initial data to scrape details 
- export data to csv

In [276]:
# importing libraries

import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
import html.parser
import time
import locale
from tqdm import tqdm
import requests


import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
driver = webdriver.Chrome(ChromeDriverManager().install()) #this ensures that the chrome driver is current when launched



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/embernardo/.wdm/drivers/chromedriver/mac64/98.0.4758.80/chromedriver] found in cache


### Web Scraping Functions

In [254]:
## web scrape functions
def login():
    config = configparser.ConfigParser()
    config.read('reverb.ini')
    
    reverb_user  = config['reverb']['reverb_user']
    reverb_pass = config['reverb']['reverb_pass']  
    
    url = 'https://reverb.com/price-guide'
    driver.get(url)
    time.sleep(2)
    
    # find element using xpath to click login
    driver.find_element(By.XPATH, '/html/body/header/div[2]/div[1]/div/nav/div[2]/ul/li[6]/a').click()
    time.sleep(2)
    #enter credentials
    
    
    driver.find_element(By.ID, 'user_session[login]').send_keys(username)
    time.sleep(1)
    driver.find_element(By.ID, 'user_session[password]').send_keys(password)
    time.sleep(1)
    #click login button
    driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/div[1]/div/div/div/div/div/div[2]/div/form/div[3]/input').click()
    

def getGuitars(html):
    '''
    This function retrieves details for each guitar listed
    '''
    guitar_list = []
    guitars = soup.find_all('div', {'class':'grid-card grid-card--redesign'})
    for guitar in guitars:
        guitar_dict = {}
        guitar_dict['title'] = guitar.find('h4',class_='grid-card__title').text
        guitar_dict['price'] = guitar.find('span',class_='price-display').text
        guitar_dict['condition'] = guitar.find('div',class_='condition-display__label').text
        guitar_dict['url'] = guitar.a['href']
        guitar_list.append(guitar_dict)
    return guitar_list



def getmin(vals): 
    nrange = vals.replace(' ','').split('-')
    newval = nrange[0]
    return newval


def getmax(vals):
    nrange = vals.replace(' ','').split('-')
    newval = nrange[1]
    return newval



<h2> Get number of guitars and pages to scrape</h2>

In [266]:
all_url = 'https://reverb.com/marketplace?product_type=electric-guitars'
driver.get(all_url)

In [267]:
html = driver.page_source
soup = bs(html,'html.parser')      


# get total # of guitars
totalguitars = soup.find('div',{'class':'search-overview__count'}).span.text.replace(' Results','')
totalguitars = totalguitars.replace(',','')



totalguitars = int(locale.atoi(totalguitars))

pages = totalguitars//60

print(f'total guitars: {totalguitars} across {pages} pages')

total guitars: 127768 across 2129 pages


## This divides the pages to be scraped into batches to avoid getting flagged by reverb

In [268]:
test_list = range(pages)
def create_scrape_batch(lst, n):  
    for i in range(1, len(lst), n): 
        yield lst[i:i + n] 

n = 300

batches = list(create_scrape_batch(test_list, n)) 
# test print to see if the 2k pages can be broken down in to batches
print(batches)

# test breaking down of batch to see if the range function works
batch1 = batches[:3]
batch2 = batches[3:4]

batch2

[range(1, 301), range(301, 601), range(601, 901), range(901, 1201), range(1201, 1501), range(1501, 1801), range(1801, 2101), range(2101, 2129)]


[range(901, 1201)]

## Cycle through all batches and scrape data initial data (Title, URL and image) using getGuitars function above

In [1]:

master_guitar_listing = []

for batch in batches:
    print(batch)
    for page in tqdm(batch):
        #print(page)
        #guitar_listing = []
        #print(f'scraping page {x} of {pages}')
        #driver.get(f'https://reverb.com/marketplace?product_type=electric-guitars'+
        #           f'&condition[]=used&condition[]=good&condition[]=very-good'+
        #           f'&condition[]=excellent&condition[]=mint&page={page}')
        driver.get(f'https://reverb.com/marketplace?product_type=electric-guitars&page={page}')


        time.sleep(2)

        html = driver.page_source
        soup = bs(html,'html.parser') 
        guitar_page_data = getGuitars(html)
        master_guitar_listing.extend(guitar_page_data)


## export data to csv

In [271]:
guitar_df_0211 = pd.DataFrame(master_guitar_listing)
#guitar_df_0210_600.to_csv('data/master_guitar_listing02102022_600.csv', encoding='utf-8') 
guitar_df_0211.head()

Unnamed: 0,title,price,condition,url
0,Gibson Zakk Wylde Moderne of Doom Limited Run ...,"$3,564.82",Used – Very Good,https://reverb.com/item/50653578-gibson-zakk-w...
1,Jay Turser Dragon Inlay 2000s Red Burst,$500,Used – Good,https://reverb.com/item/50480779-jay-turser-dr...
2,Custom Shop '67 Hendrix Tribute Strat #2 1991,"$5,995",Used – Excellent,https://reverb.com/item/44364257-custom-shop-6...
3,Ibanez UV77 Steve Vai Signature Universe Reiss...,"$7,499",Used – Mint,https://reverb.com/item/50653574-ibanez-uv77-s...
4,Jackson 2020 USA Signature Phil Collen PC1 Ele...,"$3,000",Used – Excellent,https://reverb.com/item/50653545-jackson-2020-...


## cycle though the list from previous scrape and get features (brand, color, origin etc) for each guitar.

In [277]:


import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
tqdm.pandas()

def newcols(url):
   
    driver.get(url)
    html = driver.page_source
    soup = bs(html,'html.parser') 
    specs = soup.find('table',{'class':'spec-list'})
    guit_details = {}
    
    try:
        guit_details['title']      = soup.find('h1').text#, text='Categories').find_next_sibling("td").text
    except:
        guit_details['title']      = 'unknown'
    
    try:
        guit_details['brand']      = specs.find('td', text='Brand').find_next_sibling("td").text
    except:
        guit_details['brand']      = 'generic'
    
    try:
        guit_details['condition']  = soup.find('div', class_='condition-display__label').text 
    except:
        guit_details['condition']  = 'used'
        
    try:
        guit_details['categories'] = specs.find('td', text='Categories').find_next_sibling("td").text
    except:
        guit_details['categories'] = 'unknown'
        
    try:
        guit_details['price']      = soup.find('span', class_='price-display').text.replace('$','')
    except:
        guit_details['price']      = 'unknown'
        

    try:    
        guit_details['type']       = specs.find('td', text='Model Family').find_next_sibling("td").text
    except:
        guit_details['type']       = 'unknown'
    
    try:
        guit_details['model']     = specs.find('td', text='Model').find_next_sibling("td").text
    except:
        guit_details['model']     = 'unknown'
    try:
        guit_details['finish']    = specs.find('td', text='Finish').find_next_sibling("td").text
    except:
        guit_details['finish']    = 'unknown'
        

    
    try:
        guit_details['origin']    = specs.find('td', text='Made In').find_next_sibling("td").text
    except:
        guit_details['origin']    = 'China'        
        
    try:
        guit_details['pickups']   = specs.find('td', text='Pickup Configuration').find_next_sibling("td").text
    except:
        guit_details['pickups']   = 'unknown'        
        
    
    try:
        guit_details['year']      = specs.find('td', text='Year').find_next_sibling("td").text
    except:
        guit_details['year']      = 'unknown'  

        
    try:
        guit_details['top']    = specs.find('td', text='Top Material').find_next_sibling("td").text
    except:
        guit_details['top']    = 'unknown'  
        

    try:
        guit_details['handed']    = specs.find('td', text='Right / Left Handed').find_next_sibling("td").text
    except:
        guit_details['handed']    = 'Right Handed'        
        
    try:
        guit_details['neck']    = specs.find('td', text='Neck Material').find_next_sibling("td").text
    except:
        guit_details['neck']    = 'generic'             
            
            
    try:
        guit_details['product_group']    = specs.find('td', text='Product Family').find_next_sibling("td").text
    except:
        guit_details['product_group']    = 'generic'   
        
    try:
        guit_details['body_type']    = specs.find('td', text='Body Type').find_next_sibling("td").text
    except:
        guit_details['body_type']    = 'generic'           
        
        
    try:
        guit_details['body_material']    = specs.find('td', text='Body Material').find_next_sibling("td").text
    except:
        guit_details['body_material']    = 'unknown'     

    try:
        guit_details['frets']    = specs.find('td', text='Number of Frets').find_next_sibling("td").text
    except:
        guit_details['frets']    = '22'          
        
        
        
        
    #print(f'scraping details for guitar {url} ')  
    time.sleep(1)
    
    return guit_details

## run the secondary scrape function (seen above) which gets the details then write out csv

In [1]:

from tqdm import tqdm
tqdm.pandas()

full_master_guitar_df['more_info'] = full_master_guitar_df.url.progress_apply(newcols)
time.sleep(10)
print('scrape done!')
full_master_guitar_df.info()

#test_df = pd.DataFrame(guitar_listing)
#time.sleep(10)
full_master_guitar_df.to_csv('full_scrape_02142022.csv', encoding='utf-8')    




## Sanity check: This is a small batch scrap test which will be used for the updates later on

In [182]:
## small batch testing of function
guitar_listing = []
x=1

for batch in tqdm(batches):
    #guitar_listing = []
    #print(f'scraping page {x} of {pages}')
    driver.get(f'https://reverb.com/marketplace?product_type=electric-guitars'+
               f'&condition[]=used&condition[]=good&condition[]=very-good'+
               f'&condition[]=excellent&condition[]=mint&page={x}')
    

    time.sleep(2)

    html = driver.page_source
    soup = bs(html,'html.parser') 
    guitar_page_data = getGuitars(html)
    guitar_listing.extend(guitar_page_data)
    
print('all pages scraped!')    
guitar_df = pd.DataFrame(master_guitar_listing)
time.sleep(10)
#master_guitar_listing.to_csv('master_guitar_listing.csv', encoding='utf-8')      

100%|█████████████████████████████████████████████████████████████| 11/11 [00:41<00:00,  3.75s/it]


all pages scraped!
