> Notes: 
> + [Web Scraping](https://automatetheboringstuff.com/chapter11/)
> + [JSON](http://docs.python-guide.org/en/latest/scenarios/json/)

In [1]:
# modules that make it easy to scrape web pages in Python.
# (1) webbrowser: comes with Python and opens a browser to a specific page.
# (2) Requests: downloads files and web pages from the Internet.
#     Requests is easier and faster to use than urllib2.
# (3) Beautiful Soup: parses HTML, the format that web pages are written in.
# (4) Selenium: launches and controls a web browser. 
#     Selenium is able to fill in forms and simulate mouse clicks in this browser.
# (5) pyperclip: A cross-platform clipboard module for Python. (only handles plain text for now)

In [14]:
# open a web page in a browser using the standard-library webbrowser module
# (the value returned by webbrowser.open is displayed as the cell output)
import webbrowser
webbrowser.open('http://inventwithpython.com/')

True

In [16]:
# open a web page in a browser using webbrowser with input as either:
# (1) *args [list of arguments] or
# (2) **kwargs [list of key,value pairs] or
# (3) sys.argv [only works as an app] or
# (4) clipboard
import webbrowser, sys, pyperclip 

def map_it(*args, **kwargs):
    """Open Google Maps in the browser for a street address.

    The address is taken from the first source that is available:

    1. ``*args``   -- string fragments, concatenated in the order given
    2. ``**kwargs`` -- the keyword *values*, concatenated in dict iteration
       order (the keys are ignored); only used when no positional
       arguments were passed
    3. the clipboard (read with ``pyperclip.paste()``) when called with no
       arguments at all

    Fragments are joined with an empty separator, so callers supply their
    own spaces/commas. The chosen address is printed, then percent-encoded
    and appended to the Google Maps "place" URL.
    """
    # function-scope import keeps this cell runnable on its own
    from urllib.parse import quote

    if args:
        address = ''.join(args)
        print('args: address: {}'.format(address))
    elif kwargs:
        address = ''.join(kwargs.values())
        print('kwargs: address: {}'.format(address))
    # sys.argv input is left disabled: it is only meaningful when this code
    # runs as a standalone script, not inside a notebook.
    #elif (len(sys.argv)>1):
        #address = ' '.join(sys.argv[1:])
        #print('argv: address: {}'.format(address))
    else:
        address = pyperclip.paste()
        print('pyperclip: address: {}'.format(address))
    # Encode the address so characters such as '#', '?', '&' or '/' cannot be
    # misread as URL syntax (e.g. "Apt #5" would otherwise become a fragment).
    webbrowser.open('https://www.google.com/maps/place/' + quote(address, safe=''))

# test case - *args (fragments are concatenated as-is, hence the leading spaces)
map_it('870 Valencia St,',' San Francisco,',' CA 94110')    

# test case - **kwargs (disabled)
#map_it(add1='870 Valencia St,',add2=' San Francisco,',add3=' CA 94110') 

# test case - sys.argv
# only works when run as a standalone app, not inside a Jupyter notebook

# test case - clipboard (disabled; copy an address first, then call with no arguments)
#map_it()

args: address: 870 Valencia St, San Francisco, CA 94110


In [17]:
# opens links on a web page in individual tabs
# inputs:
# (1) web page link
# (2) max number of links on the web page to open in tabs
# (3) sleep time between opening links, to give the browser time to finish loading;
#     otherwise, if the window has not loaded completely, a tab may fail to
#     open and a new window is started for the next link instead

from lxml import html
import requests, webbrowser, time

def open_tab(url, sleep_time):
    """Announce the wait, pause ``sleep_time`` seconds, then open ``url`` in a new tab."""
    message = 'sleeping for {} before opening {}'.format(sleep_time, url)
    print(message)
    # pause before opening so the browser is not asked for a tab too early
    time.sleep(sleep_time)
    webbrowser.open_new_tab(url)
    
def open_page_links_in_browser_tabs(url, max_page_urls_to_open, sleep_time):
    """Open the first absolute links found on a web page, each in its own tab.

    Parameters
    ----------
    url : str
        Page to download and scan for ``<a href="...">`` links.
    max_page_urls_to_open : int
        Upper bound on how many of the absolute links are opened.
    sleep_time : int or float
        Seconds to wait before opening each link (passed to ``open_tab``).
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    hrefs = tree.xpath('//a/@href')
    # Keep only absolute http(s) links. Check the scheme prefix rather than
    # searching for the substring 'http', which would also accept relative
    # links such as '/docs/http-basics'.
    absolute_hrefs = [
        href for href in hrefs
        if href.lower().startswith(('http://', 'https://'))
    ]
    for href in absolute_hrefs[:max_page_urls_to_open]:
        open_tab(href, sleep_time)

# demo: open at most 2 links from the page, sleeping 2 seconds before each one
open_page_links_in_browser_tabs('https://automatetheboringstuff.com/',2,2)

sleeping for 2 before opening https://automatetheboringstuff.com/
sleeping for 2 before opening https://www.nostarch.com/automatestuff


In [21]:
# opens a browser with local weather
# inputs:
# (1) zip code
# (2) None - uses https://ipinfo.io/ to get local zip code

import requests, json, webbrowser

def open_weather(*zip):
    """Open the weather.com "today" page for a zip code in the browser.

    Parameters
    ----------
    *zip
        Optional zip code. Only the first positional argument is used. When
        nothing is passed, the zip code is taken from the ``postal`` field of
        the JSON returned by https://ipinfo.io.

    Raises
    ------
    KeyError
        If the ipinfo.io response has no ``postal`` field.
    """
    if zip:
        # *zip collects the positional arguments into a tuple; take the first
        # element (the tuple itself cannot be concatenated onto the URL).
        zip_code = str(zip[0])
    else:
        res = requests.get('https://ipinfo.io')
        res_json = json.loads(res.text)
        zip_code = res_json['postal']

        # can use google in chrome to get local zip code
        # needs web scraping through xpath
        #res = requests.get('http://www.google.com/search?q=local zip code')
        #tree = html.fromstring(res.content)
        #divs = tree.xpath('//b/text()')
        #print(divs)

    webbrowser.open('https://weather.com/weather/today/l/' + zip_code)

# test case - explicit zip code (disabled)
#open_weather('06902')

# test case - no argument: the zip code is looked up via ipinfo.io
open_weather()

In [31]:
import requests

# response object: download a text file and inspect the Response
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')
print('response type: {}'.format(type(res)))
print('response status ok? {}'.format(res.status_code == requests.codes.ok))
print('response content size: {} characters'.format(len(res.text)))
# NOTE(review): if the printed text starts with stray BOM characters, the body
# was probably decoded with the wrong charset -- consider setting res.encoding.
print('\nresponse content (first 250 characters):\n{}'.format(res.text[:250]))

# exception handling: raise_for_status() raises for an HTTP error status (e.g. 404)
res = requests.get('http://inventwithpython.com/page_that_does_not_exist')
try:
    res.raise_for_status()
except requests.exceptions.HTTPError as exc:
    print('\nThere was a problem: {}'.format(exc))

# saving content
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')
try:
    res.raise_for_status()
    # wb=write binary even though content is text: iter_content yields bytes.
    # The with-block closes the file even if a write fails part-way through.
    with open('RomeoAndJuliet.txt', 'wb') as playFile:
        for chunk in res.iter_content(100000):
            playFile.write(chunk)
    print('\nSaved.')
except (requests.exceptions.RequestException, OSError) as exc:
    # download/HTTP failures and file-system failures are reported, not raised
    print('\nThere was a problem: {}'.format(exc))

response type: <class 'requests.models.Response'>
response status ok? True
response content size: 174130 characters

response content (first 250 characters):
ï»¿The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project

There was a problem: 404 Client Error: Not Found for url: http://inventwithpython.com/page_that_does_not_exist

Saved.


In [3]:
# requires: pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
# download the page and stop here if the server answered with an HTTP error status
res = requests.get('http://nostarch.com')
res.raise_for_status()
# name the parser explicitly ("lxml") to get rid of the parser-not-specified warning
noStarchSoup = BeautifulSoup(res.text,"lxml") 
# last expression: its value is shown as the cell output
type(noStarchSoup)

bs4.BeautifulSoup

In [4]:
# Selector passed to the select() method -- Will match
# soup.select('div') -- All elements named <div>
# soup.select('#author') -- The element with an id attribute of author
# soup.select('.notice') -- All elements that use a CSS class attribute named notice
# soup.select('div span') -- All elements named <span> that are within an element named <div>
# soup.select('div > span') -- All elements named <span> that are directly within an element 
#                              named <div>, with no other element in between
# soup.select('input[name]') -- All elements named <input> that have a name attribute with any value
# soup.select('input[type="button"]') -- All elements named <input> that have an attribute named 
#                                        type with value button

In [7]:
# get the example file from http://nostarch.com/automatestuff/
# parse it inside a with-block so the file handle is closed once the soup is built
with open('example.html') as exampleFile:
    exampleSoup = BeautifulSoup(exampleFile,"lxml")
type(exampleSoup)

bs4.BeautifulSoup

In [14]:
import bs4
# read the file inside a with-block so the handle is closed after parsing
with open('example.html') as exampleFile:
    exampleSoup = bs4.BeautifulSoup(exampleFile.read(),"lxml")
# select() takes a CSS selector; '#author' matches the element whose id is "author"
elems = exampleSoup.select('#author')
print('type elems: {}'.format(type(elems)))
print('len elems: {}'.format(len(elems)))
print('type elems[0]: {}'.format(type(elems[0])))
print('elems[0].getText(): {}'.format(elems[0].getText()))
print('str(elems[0]): {}'.format(str(elems[0])))
print('elems[0].attrs: {}'.format(elems[0].attrs))

type elems: <class 'list'>
len elems: 1
type elems[0]: <class 'bs4.element.Tag'>
elems[0].getText(): Al Sweigart
str(elems[0]): <span id="author">Al Sweigart</span>
elems[0].attrs: {'id': 'author'}
