In [1]:
#imports

from pymongo import MongoClient
import pprint

import copy

import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# Requests sends and recieves HTTP requests.
import requests

# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

import json
from time import sleep

In [2]:
from selectorlib import Extractor

In [3]:
def pause_to_seem_human(mean=4, std=1):
    #hahah joke's on you Bezos
    normal_random = stats.norm(loc=mean, scale=std).rvs()
    sleep(max(.25,normal_random))

pause_to_seem_human()

In [4]:
def amazon_url_from_id(product_id):
    return f'https://www.amazon.com/dp/{product_id}'

In [5]:
def scrape(url, pause=False):  

    #spoof your user-agent
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s"%url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    print(r.status_code)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
        return None
    
    #to avoid scrape detection
    if pause:
        pause_to_seem_human()
    #return the request object 
    return r

In [6]:
def get_default_color_product_ids(soup_content):
    
    #Check if color menu is dropdown (native) or image radio button
    tag = soup_content.find('option', {'id' : 'native_color_name_0'})
    if tag:
        #image radio button
        native = True
    else:
        #dropdown
        tag = soup_content.find('li', {'id' : 'color_name_0'})
        native = False
        
        #we may have a product without any color selection, in which case return [(color='default',id)]
        if not tag:
            tag = soup_content.find('link', {'rel' : 'canonical'})
            
            #if tag still None then we cannot process this product's info
            if not tag:
                print('parsing product failed')
                return
            
            product_id = find_product_tag(str(tag))
            return [('default', product_id)]
    
    #get list of html blocks where color and product ID are stored
    tag_type = 'option' if native else 'li'
    color_tags = tag.find_parent().find_all(tag_type)
    
    #parse through tags to find color and ID
    return [find_default_color_product_id(tag, native=native) for tag in color_tags]
        

In [7]:
def find_product_tag(html_string, start='B0', id_length=10):
    
    start_index = html_string.find(start)
    return html_string[start_index: start_index + id_length]
    

In [8]:
def find_default_color_product_id(html_tag, native=False, start='B0'):
    '''
    params:
        html_string (bs4.element.Tag): the html tag to parse
        start = 'B0': the beginning of the product ID (B0 includes all(*tm) men's t-shirts
        native=False (bool) : whether or not the colors are given as a drop down or image radio button
    
    returns:
        color (str): the name of the color associated with that tag
        product_tag (str): the first substring starting with @start
    '''
    product_id = find_product_tag(str(html_tag), start=start)
    if native:
        color = html_tag.text.strip()
    else:
        color = html_tag.find('img')['alt'].strip()
    return color, product_id

In [9]:
def get_size_product_ids(color_ids):
    #BEZOS_ALERT: makes len(color_ids) requests
    color_size_product_ids = []
    
    for color, product_id in color_ids:
        
        request = scrape(amazon_url_from_id(product_id), True)
        color_soup = BeautifulSoup(request.content, 'html')
        tag = color_soup.find('option', {'id' : 'native_size_name_-1'})
        size_tags = tag.find_parent().find_all('option')[1:]
        
        for tag in size_tags:
            
            #check if size is available, if so add to ids list
            if not 'Un' in tag['class'][0]:
                size, size_product_id = find_default_color_product_id(tag, native=True)
                color_size_product_ids.append((color, size, size_product_id))
    
    return color_size_product_ids

In [10]:
def get_price_from_id(product_id):
    #BEZOS_ALERT: makes 1 request to amazon.com
    
    #need to append string to end to get the correct size to be selected
    request = scrape(amazon_url_from_id(product_id)+'?th=1&psc=1', True)
    soup = BeautifulSoup(request.content, 'html')
    return soup.find('span', {'id' : 'priceblock_ourprice'}).text.strip()

In [11]:
def get_color_size_price_list(color_size_product_ids):
    #BEZOS_ALERT makes len(color_size_product_ids) requests to amazon.com
    color_size_price_list = []
    
    for color, size, product_id in color_size_product_ids:
        price = get_price_from_id(product_id)
        color_size_price_list.append((color, size, product_id, price))

In [12]:
price_request = scrape('https://www.amazon.com/dp/B00WARXD5U?th=1&psc=1', True)

Downloading https://www.amazon.com/dp/B00WARXD5U?th=1&psc=1
200


In [13]:
price_soup = BeautifulSoup(price_request.content, 'html')
print(price_soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="en-us" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title dir="ltr">
   Amazon.com
  </title>
  <meta content="width=device-width" name="viewport"/>
  <link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
  <script>
   if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
        ue_furl = "fls-na.amazon.com",
      

In [97]:
x = price_soup.find('span', {'id':'priceblock_ourprice'})
type(x)

NoneType

In [57]:
r = scrape(amazon_url_from_id('B07MY42QMV'), True)

Downloading https://www.amazon.com/dp/B07MY42QMV


In [58]:
color_ids = get_default_color_product_ids(BeautifulSoup(r.content, 'html'))

In [39]:
color_size_ids = get_size_product_ids(color_ids)

Downloading https://www.amazon.com/dp/B00WARWQMG
Downloading https://www.amazon.com/dp/B00WARVH26
Downloading https://www.amazon.com/dp/B00WARXK08
Downloading https://www.amazon.com/dp/B00WARVCFI
Downloading https://www.amazon.com/dp/B00WARVW2Q
Downloading https://www.amazon.com/dp/B00WARWK4U
Downloading https://www.amazon.com/dp/B00WARXBRA
Downloading https://www.amazon.com/dp/B00WARVWMG
Downloading https://www.amazon.com/dp/B00WARX5MQ
Downloading https://www.amazon.com/dp/B00WARXJV8
Downloading https://www.amazon.com/dp/B00WARWGX0
Downloading https://www.amazon.com/dp/B00WARWQG2
Downloading https://www.amazon.com/dp/B00WARWUKE


In [72]:
color_size_ids

[('Kelly Green', 'Large Tall', 'B00WARWQMG'),
 ('Kelly Green', 'X-Large Tall', 'B00WARWS1A'),
 ('Kelly Green', 'XX-Large Tall', 'B00WARWT82'),
 ('Kelly Green', '3X-Large Tall', 'B00WARWUPO'),
 ('Kelly Green', '4X-Large Tall', 'B00WARWVXU'),
 ('Neon Yellow', 'Large Tall', 'B00WARVH26'),
 ('Neon Yellow', 'X-Large Tall', 'B00WARVK3C'),
 ('Neon Yellow', '3X-Large Tall', 'B00WARVMEE'),
 ('White', 'Large Tall', 'B00WARXK08'),
 ('White', 'X-Large Tall', 'B00WARXL8O'),
 ('White', 'XX-Large Tall', 'B00WARXMJC'),
 ('Black', 'X-Large Tall', 'B00WARVCFI'),
 ('Black', 'XX-Large Tall', 'B00WARVEMO'),
 ('Deep Orange', 'Large Tall', 'B00WARVW2Q'),
 ('Deep Orange', 'X-Large Tall', 'B00WARVXF2'),
 ('Iron Grey', 'X-Large Tall', 'B00WARWK4U'),
 ('Iron Grey', 'XX-Large Tall', 'B00WARWLT4'),
 ('Maroon', 'Large Tall', 'B00WARXBRA'),
 ('Maroon', 'X-Large Tall', 'B00WARXD5U'),
 ('Sand', 'Large Tall', 'B00WARVWMG'),
 ('Sand', 'XX-Large Tall', 'B00WARVZAK'),
 ('Lime Shock', 'X-Large Tall', 'B00WARX5MQ'),
 ('Neon

In [76]:
first_full_list = get_color_size_price_list(color_size_ids)

Downloading https://www.amazon.com/dp/B00WARWQMG?th=1&psc=1


AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
amzn_test_url = 'https://www.amazon.com/dp/B00GJBI7JU'
another_test_url = 'https://www.amazon.com/Amazon-Essentials-Performance-Long-Sleeve-T-Shirt/dp/B07MY48KSW/ref=sr_1_230?dchild=1&qid=1623177527&s=apparel&sr=1-230'

In [8]:
wikipedia_color_urls = ['https://en.m.wikipedia.org/wiki/List_of_colors:_A%E2%80%93F', 'https://en.m.wikipedia.org/wiki/List_of_colors:_G%E2%80%93M', 'https://en.m.wikipedia.org/wiki/List_of_colors:_N%E2%80%93Z']

In [188]:
''.join([char for char in rows[0].find('th').text.strip().lower() if char not in [' ','-']])

'absolutezero'

In [11]:
color_dict = {}
for url in wikipedia_color_urls:
    request = requests.get(url)
    color_soup = BeautifulSoup(request.content, 'html')
    table = color_soup.find('tbody')
    rows = table.find_all('tr')[1:]
    for row in rows:
        color_dict[clean_string(row.find('th').text)] = row.find('td').text.strip()
    color_dict

In [13]:
with open('data/color_dict.json','w') as file:
    json.dump(color_dict, file)

In [9]:
def clean_string(string, remove=[' ','-']):
    return ''.join([char for char in string.strip().lower() if char not in remove])

In [12]:
color_dict

{'absolutezero': '#0048BA',
 'acidgreen': '#B0BF1A',
 'aero': '#7CB9E8',
 'aeroblue': '#C0E8D5',
 'africanviolet': '#B284BE',
 'airsuperiorityblue': '#72A0C1',
 'alabaster': '#EDEAE0',
 'aliceblue': '#F0F8FF',
 'alloyorange': '#C46210',
 'almond': '#EFDECD',
 'amaranth': '#E52B50',
 'amaranth(m&p)': '#9F2B68',
 'amaranthpink': '#F19CBB',
 'amaranthpurple': '#AB274F',
 'amaranthred': '#D3212D',
 'amazon': '#3B7A57',
 'amber': '#FFBF00',
 'amber(sae/ece)': '#FF7E00',
 'amethyst': '#9966CC',
 'androidgreen': '#3DDC84',
 'antiquebrass': '#CD9575',
 'antiquebronze': '#665D1E',
 'antiquefuchsia': '#915C83',
 'antiqueruby': '#841B2D',
 'antiquewhite': '#FAEBD7',
 'ao(english)': '#008000',
 'applegreen': '#8DB600',
 'apricot': '#FBCEB1',
 'aqua': '#00FFFF',
 'aquamarine': '#7FFFD4',
 'arcticlime': '#D0FF14',
 'armygreen': '#4B5320',
 'artichoke': '#8F9779',
 'arylideyellow': '#E9D66B',
 'ashgray': '#B2BEB5',
 'asparagus': '#87A96B',
 'atomictangerine': '#FF9966',
 'auburn': '#A52A2A',
 'aureol

## 