> Notes: 
> + [Web Scraping](https://automatetheboringstuff.com/chapter11/)
> + [JSON](http://docs.python-guide.org/en/latest/scenarios/json/)

In [1]:
# modules that make it easy to scrape web pages in Python.
# (1) webbrowser: comes with Python and opens a browser to a specific page.
# (2) Requests: downloads files and web pages from the Internet.
#     Requests is easier and faster to use then urllib2.
# (3) Beautiful Soup: parses HTML, the format that web pages are written in.
# (4) Selenium: launches and controls a web browser. 
#     Selenium is able to fill in forms and simulate mouse clicks in this browser.
# (5) pyperclip: A cross-platform clipboard module for Python. (only handles plain text for now)

In [14]:
# open a web page in a browser using webbrowser
import webbrowser
webbrowser.open('http://inventwithpython.com/')

True

In [16]:
# open a web page in a browser using webbrowser with input as either:
# (1) *args [list of arguments] or
# (2) **kwargs [list of key,value pairs] or
# (3) sys.argv [only works as an app] or
# (4) clipboard
import webbrowser, sys, pyperclip 

def map_it(*args, **kwargs):
    if (len(args)>0):
        address = ''.join(args[:])
        print('args: address: {}'.format(address))
    elif (len(kwargs)>0):
        values = []
        for k,v in kwargs.items():
            values.append(v)
        address = ''.join(values)
        print('kwargs: address: {}'.format(address))
    #elif (len(sys.argv)>1):
        #address = ' '.join(sys.argv[1:])
        #print('argv: address: {}'.format(address))
    else:
        address = pyperclip.paste()
        print('pyperclip: address: {}'.format(address))
    webbrowser.open('https://www.google.com/maps/place/' + address)

# test case - *args
map_it('870 Valencia St,',' San Francisco,',' CA 94110')    

# test case - **kwargs
#map_it(add1='870 Valencia St,',add2=' San Francisco,',add3=' CA 94110') 

# test case - sys.argv
# can do as an app, not in jupyter notebook

# test case - clipboard
#map_it()

args: address: 870 Valencia St, San Francisco, CA 94110


In [17]:
# opens links on a web page in individual tabs
# inputs:
# (1) web page link
# (2) max number of links on the web page to open in tabs
# (3) sleep time between opening links to give browser to finish loading
#     otherwise if window is not loaded completely, tabs could not be
#     opened, and a new window is initiated for the next link

from lxml import html
import requests, webbrowser, time

def open_tab(url, sleep_time):
    print('sleeping for {} before opening {}'.format(sleep_time, url))
    time.sleep(sleep_time)
    webbrowser.open_new_tab(url)
    
def open_page_links_in_browser_tabs(url, max_page_urls_to_open, sleep_time):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    hrefs = tree.xpath('//a/@href')
    hrefs_non_relative = []
    for href in hrefs:
        if 'http' in href:
            hrefs_non_relative.append(href)
    for href in hrefs_non_relative[:max_page_urls_to_open]:
        open_tab(href,sleep_time)

open_page_links_in_browser_tabs('https://automatetheboringstuff.com/',2,2)

sleeping for 2 before opening https://automatetheboringstuff.com/
sleeping for 2 before opening https://www.nostarch.com/automatestuff


In [21]:
# opens a browser with local weather
# inputs:
# (1) zip code
# (2) None - uses https://ipinfo.io/ to get local zip code

import requests, json, webbrowser

def open_weather(*zip):
    if (len(zip)>0):
        zip_code = zip
    else:
        res = requests.get('https://ipinfo.io')
        res_json = json.loads(res.text[:])
        zip_code = res_json['postal']
        
        # can use google in chrome to get local zip code
        # needs web scraping through xpath
        #res = requests.get('http://www.google.com/search?q=local zip code')
        #tree = html.fromstring(res.content)
        #divs = tree.xpath('//b/text()')
        #print(divs)

    webbrowser.open('https://weather.com/weather/today/l/'+zip_code)

# test case - zip code
#open_weather('06902')

# test case - none
open_weather()

In [31]:
import requests

# response object
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')
print('response type: {}'.format(type(res)))
print('response status ok? {}'.format(res.status_code == requests.codes.ok))
print('response content size: {} characters'.format(len(res.text)))
print('\nresponse content (first 250 characters):\n{}'.format(res.text[:250]))

# exception handling
res = requests.get('http://inventwithpython.com/page_that_does_not_exist')
try:
    res.raise_for_status()
except Exception as exc:
    print('\nThere was a problem: {}'.format(exc))

# saving content
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')
try:
    res.raise_for_status()
    # wb=write binary even though content is text
    playFile = open('RomeoAndJuliet.txt', 'wb') 
    for chunk in res.iter_content(100000):
        playFile.write(chunk)
    playFile.close()
    print('\nSaved.')
except Exception as exc:
    print('\nThere was a problem: {}'.format(exc))

response type: <class 'requests.models.Response'>
response status ok? True
response content size: 174130 characters

response content (first 250 characters):
ï»¿The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project

There was a problem: 404 Client Error: Not Found for url: http://inventwithpython.com/page_that_does_not_exist

Saved.


In [3]:
# pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
res = requests.get('http://nostarch.com')
res.raise_for_status()
# lxml to get rid of parser not specified warning
noStarchSoup = BeautifulSoup(res.text,"lxml") 
type(noStarchSoup)

bs4.BeautifulSoup

In [4]:
# Selector passed to the select() method -- Will match
# soup.select('div') -- All elements named <div>
# soup.select('#author') -- The element with an id attribute of author
# soup.select('.notice') -- All elements that use a CSS class attribute named notice
# soup.select('div span') -- All elements named <span> that are within an element named <div>
# soup.select('div > span') -- All elements named <span> that are directly within an element 
#                              named <div>, with no other element in between
# soup.select('input[name]') -- All elements named <input> that have a name attribute with any value
# soup.select('input[type="button"]') -- All elements named <input> that have an attribute named 
#                                        type with value button

In [7]:
# get the example file from  http://nostarch.com/automatestuff/
exampleFile = open('example.html')
exampleSoup = BeautifulSoup(exampleFile,"lxml")
type(exampleSoup)

bs4.BeautifulSoup

In [8]:
import bs4
exampleFile = open('example.html')
exampleSoup = bs4.BeautifulSoup(exampleFile.read(),"lxml")

print('----- id attribute of value author -----')
elems = exampleSoup.select('#author')
print('type elems: {}'.format(type(elems)))
print('len elems: {}'.format(len(elems)))
print('type elems[0]: {}'.format(type(elems[0])))
print('elems[0].getText(): {}'.format(elems[0].getText()))
print('str(elems[0]): {}'.format(str(elems[0])))
print('elems[0].attrs: {}'.format(elems[0].attrs))

print('\n\n----- all p tags -----')
p_elems = exampleSoup.select('p')
print('str(p_elems[0]): {}'.format(str(p_elems[0])))
print('p_elems[0].getText(): {}'.format(p_elems[0].getText()))
print('str(p_elems[1]): {}'.format(str(p_elems[1])))
print('p_elems[1].getText(): {}'.format(p_elems[1].getText()))
print('str(p_elems[2]): {}'.format(str(p_elems[2])))
print('p_elems[2].getText(): {}'.format(p_elems[2].getText()))

print('\n\n----- get() attrib value -----')
span_elem = exampleSoup.select('span')[0]
print('str(span_elem): {}'.format(str(span_elem)))
print('span_elem.get("id"): {}'.format(span_elem.get('id')))
print('span_elem.get("some_nonexistent_addr") == None: {}'.format(span_elem.get("some_nonexistent_addr")))
print('span_elem.attrs: {}'.format(span_elem.attrs))

type elems: <class 'list'>
len elems: 1
type elems[0]: <class 'bs4.element.Tag'>
elems[0].getText(): Al Sweigart
str(elems[0]): <span id="author">Al Sweigart</span>
elems[0].attrs: {'id': 'author'}


str(p_elems[0]): <p>Download my <strong>Python</strong> book from <a href="http://inventwithpython.com">my website</a>.</p>
p_elems[0].getText(): Download my Python book from my website.
str(p_elems[1]): <p class="slogan">Learn Python the easy way!</p>
p_elems[1].getText(): Learn Python the easy way!
str(p_elems[2]): <p>By <span id="author">Al Sweigart</span></p>
p_elems[2].getText(): By Al Sweigart


str(span_elem): <span id="author">Al Sweigart</span>
span_elem.get("id"): author
span_elem.get("some_nonexistent_addr") == None: None
span_elem.attrs: {'id': 'author'}


In [1]:
# Get search keywords.
# Retrieve the search results page.
# Open a browser tab for each result

import requests, webbrowser, bs4

def top_searches(search_key):
    res = requests.get('http://google.com/search?q=' + search_key)
    res.raise_for_status()
    
    soup = bs4.BeautifulSoup(res.text,"lxml")
    link_elems = soup.select('.r a')
    num_open = min(5,len(link_elems))
    for i in range(num_open):
        webbrowser.open('http://google.com' + link_elems[i].get('href'))   

top_searches('beautiful soup')

def top_amazon_products(search_key):
    res = requests.get('http://www.amazon.com/' + search_key)
    res.raise_for_status()
    
    soup = bs4.BeautifulSoup(res.text,"lxml")
    link_elems = soup.select('#merchandisedContent .a-link-normal')
    num_open = min(5,len(link_elems))
    for i in range(num_open):
        url = 'http://www.amazon.com/' + link_elems[i].get('href')
        webbrowser.open(url)   

top_amazon_products('nutribullet')


20
http://www.amazon.com//NutriBullet-Blender-8-piece-Certified-Refurbished/dp/B017DFJXXY
http://www.amazon.com//Magic-Bullet-NBR-5554D6-NutriBullet-Superfood/dp/B017NDTT12
http://www.amazon.com//NutriBullet-Blender-9-piece-Certified-Refurbished/dp/B017DFJWLM
http://www.amazon.com//Nutri-Bullet-N12-1201-Max-Silver/dp/B01M32BO0K
http://www.amazon.com//NutriBullet-Cup-Blade-Replacement-Set/dp/B00JUED8V0


In [4]:
# Downloading pages and following links are the basis of many web crawling programs. 
# Similar programs could also do the following:
# (1) Back up an entire site by following all of its links.
# (2) Copy all the messages off a web forum.
# (3) Duplicate the catalog of items for sale on an online store.

# XKCD [http://xkcd.com/] is a popular geek webcomic.
# Load the XKCD home page.
# Save the comic image on that page.
# Follow the Previous Comic link.
# Repeat until it reaches the first comic.

import requests, os, bs4

def get_all_xkcd():
    url = 'http://xkcd.com'              # starting url
    os.makedirs('xkcd', exist_ok=True)   # store comics in ./xkcd
    while not url.endswith('#'):
        # Download the page.
        print('Downloading page: {}...'.format(url))
        res = requests.get(url)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text)
        
        # Find the URL of the comic image.
        comicElem = soup.select('#comic img')
        if comicElem == []:    
            print('Could not find comic image.')
        else:
            try:
                comicUrl = 'http:' + comicElem[0].get('src')
                # Download the image.
                print('Downloading image %s...' % (comicUrl))
                res = requests.get(comicUrl)
                res.raise_for_status()
            except requests.exceptions.MissingSchema:
                # skip this comic
                prevLink = soup.select('a[rel="prev"]')[0]
                url = 'http://xkcd.com' + prevLink.get('href')
                continue

        # Save the image to ./xkcd.
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()

        # Get the Prev button's url.
        prevLink = soup.select('a[rel="prev"]')[0]
        url = 'http://xkcd.com' + prevLink.get('href')
    print('Done.')
    
get_all_xkcd()

Downloading page: http://xkcd.com...




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Downloading image http://imgs.xkcd.com/comics/unpublished_discoveries.png...
Downloading page: http://xkcd.com/1804/...
Downloading image http://imgs.xkcd.com/comics/video_content.png...
Downloading page: http://xkcd.com/1803/...
Downloading image http://imgs.xkcd.com/comics/location_reviews.png...
Downloading page: http://xkcd.com/1802/...
Downloading image http://imgs.xkcd.com/comics/phone.png...
Downloading page: http://xkcd.com/1801/...
Downloading image http://imgs.xkcd.com/comics/decision_paralysis.png...
Downloading page: http://xkcd.com/1800/...
Downloading image http://imgs.xkcd.com/comics/chess_notation.png...
Downloading page: http://xkcd.com/1799/...
Downloading image http://imgs.xkcd.com/comics/bad_map_projection_time_zones.png...
Downloading page: http://xkcd.com/1798/...
Downloading image http://imgs.xkcd.com/comics/box_plot.png...
Downloading page: http://xkcd.com/1797/...
Downloading image http://imgs.xkcd.com/comics/stardew_valley.png...
Downloading page: http://xkcd.

KeyboardInterrupt: 

In [10]:
# The selenium module lets Python directly control the browser by programmatically 
# clicking links and filling in login information, almost as though there is a 
# human user interacting with the page. Selenium allows you to interact with web 
# pages in a much more advanced way than Requests and Beautiful Soup; but because 
# it launches a web browser, it is a bit slower and hard to run in the background 
# if, say, you just need to download some files from the Web.

# install or upgrade selenium - run under ~/anaconda/bin
# pip install -U selenium

# Selenium requires a driver to interface with the chosen browser. 
# Chrome:	https://sites.google.com/a/chromium.org/chromedriver/downloads
# Edge:	https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
# Firefox:	https://github.com/mozilla/geckodriver/releases
# Safari:	https://webkit.org/blog/6900/webdriver-support-in-safari-10/
# Make sure it’s in your PATH, e.g., place it in /usr/bin or /usr/local/bin by soft linking
# as:
# sudo ln -s /Users/[username]/Documents/dev/selenium/chromedriver /usr/local/bin/chromedriver

from selenium import webdriver

browser = webdriver.Chrome()
print('type: {}'.format(type(browser)))
browser.get('http://inventwithpython.com')

type: <class 'selenium.webdriver.chrome.webdriver.WebDriver'>


In [12]:
# Selenium’s WebDriver Methods for Finding Elements

# Elements that use the CSS class name
    # browser.find_element_by_class_name(name)
    # browser.find_elements_by_class_name(name)

# Elements that match the CSS selector
    # browser.find_element_by_css_selector(selector)
    # browser.find_elements_by_css_selector(selector)

# Elements with a matching id attribute value
    # browser.find_element_by_id(id)
    # browser.find_elements_by_id(id)

# <a> elements that completely match the text provided
    # browser.find_element_by_link_text(text)
    # browser.find_elements_by_link_text(text)

# <a> elements that contain the text provided
# browser.find_element_by_partial_link_text(text)
# browser.find_elements_by_partial_link_text(text)

# Elements with a matching name attribute value
    # browser.find_element_by_name(name)
    # browser.find_elements_by_name(name)

# Elements with a matching tag name 
# (case insensitive; an <a> element is matched by 'a' and 'A')
    # browser.find_element_by_tag_name(name)
    # browser.find_elements_by_tag_name(name)

# Once you have the WebElement object, you can find out more about it 
# by reading the attributes or calling the methods as below:

# The tag name, such as 'a' for an <a> element
    # tag_name

# The value for the element’s name attribute
    # get_attribute(name)

# The text within the element, such as 'hello' in <span>hello</span>
    # text

# For text field or text area elements, clears the text typed into it
    # clear()

# Returns True if the element is visible; otherwise returns False
    # is_displayed()

# For input elements, returns True if the element is enabled; 
# otherwise returns False
    # is_enabled()

# For checkbox or radio button elements, returns True if the element 
# is selected; otherwise returns False
    # is_selected()

# A dictionary with keys 'x' and 'y' for the position of the element 
# in the page
    # location

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
try:
    elem = browser.find_element_by_class_name('bookcover')
    print('Found <{}> element with that class name!'.format(elem.tag_name))
except:
    print('Was not able to find an element with that name.')

Found <img> element with that class name!


In [13]:
# clicking a page

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
linkElem = browser.find_element_by_link_text('Read It Online')
print('type: {}'.format(type(linkElem)))
linkElem.click() # follows the "Read It Online" 

type: <class 'selenium.webdriver.remote.webelement.WebElement'>


In [21]:
# Filling Out and Submitting Forms

from selenium import webdriver
from time import sleep

browser = webdriver.Chrome()
browser.get('https://mail.yahoo.com')
emailElem = browser.find_element_by_id('login-username')
emailElem.send_keys('username')
emailElem.submit()
sleep(5)
passwordElem = browser.find_element_by_id('login-passwd')
passwordElem.send_keys('userpassword')
passwordElem.submit()

ElementNotVisibleException: Message: element not visible
  (Session info: chrome=56.0.2924.87)
  (Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Mac OS X 10.12.3 x86_64)
