> Notes: 

> + [HTML Scraping](http://docs.python-guide.org/en/latest/scenarios/scrape/)
> + [Speed (concurrency)](http://docs.python-guide.org/en/latest/scenarios/speed/)

In [1]:
# lxml is a library written for parsing XML and HTML documents.
# The Requests module provides more speed and readability than the built-in urllib2 module (urllib.request in Python 3).

In [8]:
from lxml import html
import requests
import timeit

In [9]:
# looking at the 5 pages, these are the tags of interest
# <div title="buyer-name">Carson Busses</div>
# <span class="item-price">$29.95</span>

# using XPath (https://www.w3schools.com/xml/xpath_intro.asp)
# (lxml's cssselect / CSSSelector can be used instead)
def get_econpy():
    """Scrape buyer names and item prices from the five econpy example pages.

    Pages 001-005 are fetched one after another. For each page the text of
    every ``<div title="buyer-name">`` and ``<span class="item-price">``
    element is collected into one list per page. Each URL is printed as it
    is requested; afterwards the collected buyers, the collected prices and
    the elapsed time in seconds are printed.

    Returns:
        None. All results are reported via ``print``.
    """
    # timeit.timeit() with no arguments benchmarks a no-op statement and
    # returns that duration; default_timer() returns an actual clock reading,
    # so end - start is the elapsed time of the scraping loop.
    start = timeit.default_timer()
    buyers = []
    prices = []
    for pageno in range(1, 6):
        url = 'http://econpy.pythonanywhere.com/ex/00' + str(pageno) + '.html'
        print('url: {}'.format(url))
        page = requests.get(url)
        tree = html.fromstring(page.content)
        b = tree.xpath('//div[@title="buyer-name"]/text()')
        p = tree.xpath('//span[@class="item-price"]/text()')
        # One sub-list per page, kept in page order.
        buyers.append(b)
        prices.append(p)
    end = timeit.default_timer()
    print('buyers: {}'.format(buyers))
    print('\nprices: {}'.format(prices))
    print('\ntime: {}'.format(end - start))
get_econpy()

url: http://econpy.pythonanywhere.com/ex/001.html
url: http://econpy.pythonanywhere.com/ex/002.html
url: http://econpy.pythonanywhere.com/ex/003.html
url: http://econpy.pythonanywhere.com/ex/004.html
url: http://econpy.pythonanywhere.com/ex/005.html
buyers: [['Carson Busses', 'Earl E. Byrd', 'Patty Cakes', 'Derri Anne Connecticut', 'Moe Dess', 'Leda Doggslife', 'Dan Druff', 'Al Fresco', 'Ido Hoe', 'Howie Kisses', 'Len Lease', 'Phil Meup', 'Ira Pent', 'Ben D. Rules', 'Ave Sectomy', 'Gary Shattire', 'Bobbi Soks', 'Sheila Takya', 'Rose Tattoo', 'Moe Tell'], ['Les Toil', 'Lionel Train', 'N.V. Ubble', 'Mayflower van Lines', 'Chad A. While', 'Larry Yet Ann Arbor', 'Ted D. Baer', 'Etta Burger', 'Doris Closed', 'Bard Dahl', 'Barry Dellive', 'Robin Droppings', 'Io Ewe', 'May Flye', 'Alma Gedon', 'Hy Marks', 'Holly Hox', 'Chris Kraft', 'Ellis I. Land', 'Penny Loafer'], ['Dell Monte', 'Baxter Nature', 'Agatha L. Outtahere', 'Bunny Pellits', 'Cheri Pitts', 'Forrest Ranger', 'Ron Rico', 'Cole Shute

In [4]:
from threading import Thread, Lock
from lxml import html
import requests

# Lock used by the worker threads when they touch the shared lists below.
t_lock = Lock()

# Shared per-page results; both start empty and are shown before any
# thread has run.
prices = list()
buyers = list()
print('buyers: {}'.format(buyers))
print('\nprices: {}'.format(prices))
def get_econpy_thread(url):
    """Fetch one page and record its buyer names and item prices.

    Downloads ``url``, parses the HTML and extracts the text of every
    ``<div title="buyer-name">`` and ``<span class="item-price">`` element.
    While holding ``t_lock`` it prints the URL and appends the two lists to
    the module-level ``buyers`` and ``prices``.

    Args:
        url: Address of the page to scrape.
    """
    response = requests.get(url)
    document = html.fromstring(response.content)
    buyer_names = document.xpath('//div[@title="buyer-name"]/text()')
    item_prices = document.xpath('//span[@class="item-price"]/text()')
    # Printing and both appends happen under the same lock so that one
    # page's output and its two list entries stay together.
    t_lock.acquire()
    try:
        print('url: {}'.format(url))
        buyers.append(buyer_names)
        prices.append(item_prices)
    finally:
        t_lock.release()

# Start one worker thread per page, keeping a handle to each so they can
# be waited on below.
threads = []
for pageno in range(1,6):
    url = 'http://econpy.pythonanywhere.com/ex/00'+str(pageno)+'.html'
    t = Thread(target=get_econpy_thread,args=(url,))
    t.start()
    threads.append(t)

# Block until every worker has finished, so buyers/prices are fully
# populated before any later code reads them.
for t in threads:
    t.join()

buyers: []

prices: []
url: http://econpy.pythonanywhere.com/ex/001.html
url: http://econpy.pythonanywhere.com/ex/003.html
url: http://econpy.pythonanywhere.com/ex/005.html
url: http://econpy.pythonanywhere.com/ex/002.html
url: http://econpy.pythonanywhere.com/ex/004.html


In [5]:
print('buyers: {}'.format(buyers))
print('\nprices: {}'.format(prices))

buyers: [['Carson Busses', 'Earl E. Byrd', 'Patty Cakes', 'Derri Anne Connecticut', 'Moe Dess', 'Leda Doggslife', 'Dan Druff', 'Al Fresco', 'Ido Hoe', 'Howie Kisses', 'Len Lease', 'Phil Meup', 'Ira Pent', 'Ben D. Rules', 'Ave Sectomy', 'Gary Shattire', 'Bobbi Soks', 'Sheila Takya', 'Rose Tattoo', 'Moe Tell'], ['Dell Monte', 'Baxter Nature', 'Agatha L. Outtahere', 'Bunny Pellits', 'Cheri Pitts', 'Forrest Ranger', 'Ron Rico', 'Cole Shute', 'Gracie Spoon', 'May Zola Penny Ante', 'Ray Beeze', 'Barbara Blacksheep', 'Petey Bowt', 'Warner Brothers', 'Mel N. Colic', 'Petey Cue', 'B.V. Dease', 'Benny Fitt', 'Bette R. Haff', 'Desi Krashum'], ['Stella Constellation', 'Lee Derhosen', 'Stan Dupp', 'Hammond Ecks', 'Fanny Farmer', 'Golda Fish', 'Shirley U. Geste', 'Lou Gubrious', 'Beverly Hills', 'Castor Hoyle', 'Bella Katt', 'Al Lergy', 'Ida Lowers', 'Pete Moss', 'Rhoda Mule', 'Missy Perriad', 'Carolina Rice', 'Anna Septic', 'Cass Tigate', 'Val Voline'], ['Les Toil', 'Lionel Train', 'N.V. Ubble', 'M

In [6]:
from concurrent.futures import ThreadPoolExecutor
import requests

# Page addresses to fetch; filled in further down.
urls = list()

# Per-page results; both start empty and are shown before any fetch runs.
prices = list()
buyers = list()
print('buyers: {}'.format(buyers))
print('\nprices: {}'.format(prices))

def get_econpy_futures(url):
    """Fetch one page and return its buyer names and item prices.

    Downloads ``url``, parses the HTML, extracts the text of every
    ``<div title="buyer-name">`` and ``<span class="item-price">`` element,
    and prints the URL once the page has been processed.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``(buyers, prices)`` tuple holding the two extracted lists.
    """
    document = html.fromstring(requests.get(url).content)
    buyer_names = document.xpath('//div[@title="buyer-name"]/text()')
    item_prices = document.xpath('//span[@class="item-price"]/text()')
    print('url: {}'.format(url))
    return buyer_names, item_prices

# Build the list of page addresses to fetch.
for pageno in range(1,6):
    url = 'http://econpy.pythonanywhere.com/ex/00'+str(pageno)+'.html'
    urls.append(url)

# Use the executor as a context manager so its worker threads are shut down
# once all pages are processed, instead of leaving the pool open.
with ThreadPoolExecutor(max_workers=5) as pool:
    # map() yields results in the order of urls, so buyers/prices end up in
    # page order even though the fetches may finish out of order.
    for (b,p) in pool.map(get_econpy_futures,urls):
        buyers.append(b)
        prices.append(p)

buyers: []

prices: []
url: http://econpy.pythonanywhere.com/ex/003.html
url: http://econpy.pythonanywhere.com/ex/001.html
url: http://econpy.pythonanywhere.com/ex/002.html
url: http://econpy.pythonanywhere.com/ex/004.html
url: http://econpy.pythonanywhere.com/ex/005.html


In [7]:
print('buyers: {}'.format(buyers))
print('\nprices: {}'.format(prices))

buyers: [['Carson Busses', 'Earl E. Byrd', 'Patty Cakes', 'Derri Anne Connecticut', 'Moe Dess', 'Leda Doggslife', 'Dan Druff', 'Al Fresco', 'Ido Hoe', 'Howie Kisses', 'Len Lease', 'Phil Meup', 'Ira Pent', 'Ben D. Rules', 'Ave Sectomy', 'Gary Shattire', 'Bobbi Soks', 'Sheila Takya', 'Rose Tattoo', 'Moe Tell'], ['Les Toil', 'Lionel Train', 'N.V. Ubble', 'Mayflower van Lines', 'Chad A. While', 'Larry Yet Ann Arbor', 'Ted D. Baer', 'Etta Burger', 'Doris Closed', 'Bard Dahl', 'Barry Dellive', 'Robin Droppings', 'Io Ewe', 'May Flye', 'Alma Gedon', 'Hy Marks', 'Holly Hox', 'Chris Kraft', 'Ellis I. Land', 'Penny Loafer'], ['Dell Monte', 'Baxter Nature', 'Agatha L. Outtahere', 'Bunny Pellits', 'Cheri Pitts', 'Forrest Ranger', 'Ron Rico', 'Cole Shute', 'Gracie Spoon', 'May Zola Penny Ante', 'Ray Beeze', 'Barbara Blacksheep', 'Petey Bowt', 'Warner Brothers', 'Mel N. Colic', 'Petey Cue', 'B.V. Dease', 'Benny Fitt', 'Bette R. Haff', 'Desi Krashum'], ['Gill D. Lily', 'Cole Mines', 'Phil R. Monik', '