In [1]:
try:
    import google.colab
    IN_COLAB = True
  
  # # If using colab
  # from google.colab import drive
  # drive.mount('/content/drive')

  # import os
  # os.chdir('/content/drive/My Drive/Colab Notebooks/EBA5004')
  # print('Working directory changed to ' + os.getcwd())

except:
    IN_COLAB = False


In [0]:
%%capture
# Install dependencies -chromium, its driver, and selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
!pip install pyvirtualdisplay
!sudo apt-get install xvfb

In [3]:
# Set to Singapore
!rm /etc/localtime
!ln -s /usr/share/zoneinfo/Asia/Singapore /etc/localtime
!date

Mon Mar 23 17:48:58 +08 2020


In [4]:
from bs4 import BeautifulSoup as s
import pandas as pd
import re
from tqdm import tqdm

# set options to be headless
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)

# function definitions
def web_browser(web):
    # browser = webdriver.Safari()
    browser = webdriver.Chrome('chromedriver', options=chrome_options)
    # browser = webdriver.Chrome('chromedriver_79', options=chrome_options)   # When running locally
    browser.get(web)
    content = browser.page_source
    return content

def updateDict(dict, key, desc, src, bPreferShort = True):
    
    # preprocess key to remove unwanted characters such as ',",®, etc
    k = re.sub('[^\w\-\.\/]+',' ', key.lower()).strip()

    # update value (choose shorter/longer value if exists)
    if k in dict and (len(desc) > len(dict[k]['desc']) if bPreferShort else len(desc) < len(dict[k]['desc'])): 
        return

    dict[k] = {
        'desc': desc,
        'src': src
    }

def getDetailsDict(dict):
    print('Dictionary: ')
    print(' num keys: ', len(dict))

    v = list(dict.values())
    unique_src = set([ t['src'] for t in v])
    print(' sources:', unique_src)


## Scrape Data - Amazon

In [0]:
# Scrape from amazon.com
url="https://www.amazon.com/s?crid=19P9EMN5KVHGL&i=electronics-intl-ship&k=headphones&ref=nb_sb_ss_i_1_9&sprefix=headphone%2Celectronics-intl-ship%2C361&url=search-alias%3Delectronics-intl-ship"
#url="https://www.amazon.com/s?k=AKG+K52+Headphones&s=price-asc-rank&qid=1584865427&swrs=5FBBF8F8699D3F4E0E1B8CCA9222F191&ref=sr_st_price-asc-rank"
webpage = web_browser(url)
soup = s(webpage,'html.parser')
models = [p.text for p in soup.find_all("span", class_="a-size-medium a-color-base a-text-normal")]
prices=[]
# for price_div in soup.find_all(lambda tag: tag.name == 'span' and tag.get('class') == ['a-price']):
#     price_whole = price_div.find(class_='a-offscreen').text
#     prices.append(price_whole)
#     #print(prices)

for item in soup.find_all("span", class_="a-size-medium a-color-base a-text-normal"):
    price_whole =item.findNext(class_='a-offscreen').text
    prices.append(price_whole)

print(len(models))
print(len(prices))
dictionary = dict(zip(models, prices))
print(dictionary)

22
22
{'COWIN E7 Active Noise Cancelling Headphones Bluetooth Headphones with Microphone Deep Bass Wireless Headphones Over Ear, Comfortable Protein Earpads, 30 Hours Playtime for Travel/Work, Black': '$59.99', 'Bluetooth Headphones, LETSCOM 100 Hours Playtime Wireless Headphones Over Ear with Deep Bass, Hi-Fi Sound and Soft Earpads,Built-in Mic Wired Headset for Cell Phones PC Tablet Home Office-Silver': '$26.99', 'Panasonic ErgoFit In-Ear Earbud Headphones RP-HJE120-K (Black) Dynamic Crystal Clear Sound, Ergonomic Comfort-Fit': '$9.01', 'Jabra Move Wireless Stereo Headphones - Black': '$49.99', 'Sony MDRE9LP/BLK Ear Buds': '$9.28', 'Aux Headphones/Earphones/Earbuds 3.5mm Wired Headphones Noise Isolating Earphones with Built-in Microphone & Volume Control Compatible with iPhone 6 SE 5S 4 iPod iPad Samsung/Android MP3': '$11.99', 'House of Marley Smile Jamaica Wired Noise Cancelling Headphones with Microphone': '$24.99', 'USB C to 3.5mm Headphone Adapter,Jevtech Type C to Headphone Ada



---


## Search Amazon by ASIN

If we search amazon based on model name eg AKG MKII, and sort by price, search results can be misleading. For example we may get accessories for it instead of the actual item. So the best way for Amazon, is to get the product's Amazon Standard Identification Number (ASIN), and use it to search

In [0]:
# Get search URL in AMAZON
base_url="https://www.amazon.com/s?k="
sort = "&s=price-asc-rank"
dept = "&i=electronics-intl-ship"

key ='AKG K141 MKII' # <-------- Product name
key = key.replace(' ', '+')
url = f"{base_url}{key}{dept}{sort}"
print(url)

https://www.amazon.com/s?k=AKG+K141+MKII&i=electronics-intl-ship&s=price-asc-rank


In [9]:
import re

# A couple of possibilities
# Product has 
# (i) Both 'New' and 'Used' sellers
# (ii) Only 'New' sellers
# (iii) Only 'Used' sellers
# (iv) No sellers
# Currently only tried for (i) and (iv)

def get_ps(soup, id_str, class_str):
    s = None
    p = soup.find(id=id_str) #
    if p is not None:
        p = float(p.get_text().replace("$",""))
        s = soup.select(class_str)[0].get_text().strip().lower()
        
        if 'free shipping' in s: 
            s = 0
        else:
            s = float(re.sub('[+$a-z]', '', s))
        
    return p, s
    

def get_all_prices(soup):
    if soup.find(id="unqualifiedBuyBox_feature_div") != None:
        print("No sellers!?")
        return None, None, None, None
    
    # Only 1 seller? Or only New item
    price, shipping = get_ps(soup, "price_inside_buybox", "div#shippingMessageInsideBuyBox_feature_div div.a-section div.a-row")
    if price is not None:
        print(f"Buy: ${price}, with shipping ${shipping}, for a total of ${price + shipping}")
        return price, shipping, None, None
    
    # Might have both New and Used item??
    new_price, new_shipping = get_ps(soup, "newBuyBoxPrice", "div#shippingMessageInsideBuyBox_feature_div div.a-section div.a-row")
    if new_price is not None:
        print(f"Buy new: ${new_price}, with shipping ${new_shipping}, for a total of ${new_price + new_shipping}")
    
    used_price, used_shipping = get_ps(soup, "usedPrice", "div#usedBuyBoxShippingMessage_feature_div div.a-section div.a-row")
    if used_price is not None:
        print(f"Buy used: ${used_price}, with shipping ${used_shipping}, for a total of ${used_price + used_shipping}")
        
    if new_price is None and used_price is None:
        print("No sellers!?")
  
    return (new_price, new_shipping, used_price, used_shipping)


In [12]:
import csv
import pickle

# Turns out if we have the ASIN, the url is pretty simple. The 'best' item for this ASIN is automatically recommended by Amazon!!
# (Though it is possible that there are NO sellers for this item)
# See https://blog.refundsmanager.com/multiple-sellers-of-the-same-item/
asin = 'B0016MNBAM' # <--- 'best' seller (has multiple 'new' and 'used' sellers)
# asin = "B083F1LVZV" # <--- no seller in Singapore
# asin = 
url = f"https://www.amazon.com/dp/{asin}"
print(url)

toParse = False  # Set to false if we already have a saved version of it
temp = f'temp_{asin}'

if toParse:
    webpage = web_browser(url)
    soup = s(webpage,'html.parser')

    # Save to analyse html code, if necessary
    with open(temp+'.html', 'w+', encoding="utf-8-sig") as f:
        f.writelines(str(soup))

else:
    # We are testing out stuff.. Don't webscrape again! Just use the cached results, so that we don't 'waste' our web-scraping attempts
    with open(temp+'.html', 'r', encoding="utf-8-sig") as p:
        soup = s(p, 'html.parser')


https://www.amazon.com/dp/B0016MNBAM


In [13]:
# Get price and shipping details. NOTE: Need to run locally to be SG prices and shipping costs, else will be US prices and shipping!!!!
(new_price, new_shipping, used_price, used_shipping) = get_all_prices(soup)

Buy new: $124.17, with shipping $4.49, for a total of $128.66
Buy used: $63.52, with shipping $0, for a total of $63.52


# ----------------------------------------------------------------------------------------------------------

In [0]:
import pandas as pd
data_items = dictionary.items()
data_list = list(data_items)
pd.DataFrame.from_dict(data_list)

Unnamed: 0,0,1
0,COWIN E7 Active Noise Cancelling Headphones Bl...,$49.99
1,Panasonic ErgoFit In-Ear Earbud Headphones RP-...,$9.01
2,Sony MDRE9LP/BLK Ear Buds,$9.12
3,House of Marley Smile Jamaica Wired Noise Canc...,$24.99
4,Jabra Move Wireless Stereo Headphones - Black,$49.99
5,Skullcandy S2DUDZ058 Headphone Earbud (Discont...,$8.99
6,Elite Core EC-WBP 3.5mm FM to XLRF Wired Body ...,$21.99
7,Sennheiser CX300 II CX 300 II Precision Enha...,$25.00
8,1MORE Triple Driver in-Ear Earphones Hi-Res He...,$71.41
9,"Earbuds/Headphones/Earphones,3.5mm Wired Headp...",$14.99


## Scrape Data - Google shopping

In [0]:
# Scrape from google shopping
url="https://www.google.com/search?tbm=shop&hl=en-HK&source=hp&biw=&bih=&q=headphones&oq=headp&gs_l=products-cc.3.0.0l10.5626.6736.0.8092.5.5.0.0.0.0.76.348.5.5.0....0...1ac.1.34.products-cc..0.5.348.bgXmiKW5Ibw"
webpage = web_browser(url)
soup = s(webpage,'html.parser')
models = [p.text for p in soup.find_all("h3", class_="xsRiS")]
print(models)

prices = [p.text for p in soup.find_all("span", class_="Nr22bf")]
print(prices)

print(len(models))
print(len(prices))
dictionary = dict(zip(models, prices))
print(dictionary)

['Panasonic RP-HT161-K Over-Ear Headphones - Black', 'SleepPhones Wireless - AA Royal Blue (Breeze Fabric) / One Size Fits Most', 'Bose A20 Aviation Bluetooth Wireless Over-Ear Headset - Noise-Canceling', 'TV Wireless Headphones - Silver', 'Flow - Over-Ear Bluetooth Hybrid Noise Cancelling Wireless Headphone', 'Bowers & Wilkins PX7 Bluetooth Wireless Over-Ear Headphones with Mic - Noise-Canceling - Space Gray', 'Bose A20 Headset with Bluetooth High Impedance for Military', 'Flow II - Wireless Bluetooth Noise Cancelling Over Ear Headphone with Google Assistant', 'Samsung Galaxy Buds True Wireless In-Ear Headphones (Black)', 'Bose QuietComfort 25 Acoustic Over-Ear Headphones with Mic - Noise-Canceling - Black', 'Refurbished Beats by Dr. Dre Studio3 Wireless Matte Black Over Ear Headphones MQ562LL/A', 'Beats Studio3 Bluetooth Wireless Over-Ear Headphones with Mic - Noise-Canceling - Defiant Black/Red', 'Sony Noise Cancelling Wireless Bluetooth Over the Ear Headphones WH1000XM3 Black', 'Be

In [0]:
import pandas as pd
data_items = dictionary.items()
data_list = list(data_items)
pd.DataFrame.from_dict(data_list)

Unnamed: 0,0,1
0,Panasonic RP-HT161-K Over-Ear Headphones - Black,$22.99.
1,SleepPhones Wireless - AA Royal Blue (Breeze F...,$55.00.
2,Bose A20 Aviation Bluetooth Wireless Over-Ear ...,"$1,095.95."
3,TV Wireless Headphones - Silver,$99.99.
4,Flow - Over-Ear Bluetooth Hybrid Noise Cancell...,$199.99.
5,Bowers & Wilkins PX7 Bluetooth Wireless Over-E...,$348.98.
6,Bose A20 Headset with Bluetooth High Impedance...,"$1,095.95."
7,Flow II - Wireless Bluetooth Noise Cancelling ...,$279.99.
8,Samsung Galaxy Buds True Wireless In-Ear Headp...,$129.99.
9,Bose QuietComfort 25 Acoustic Over-Ear Headpho...,$299.95.
