In [1]:
import requests as rq
import re
import pandas as pd
import numpy as np
import lxml.etree
import cssselect as css
from IPython.core.display import display, HTML

In [2]:
def parse_details(page_text):
    html_root = lxml.etree.HTML(page_text)

    name        = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.box-title span'))[0].text
    breed       = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.breed-dog p'))[0].text
    gender      = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.gender p'))[0].text
    age         = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.age p'))[0].text
    weight      = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.weight p'))[0].text
    location    = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.location p'))[0].xpath("string()")
    description = html_root.xpath(css.HTMLTranslator().css_to_xpath('h2:contains(Meet) + div'))[0].xpath("string()")
    
    age_match = re.fullmatch(r'(\d+) (Month(s?)|Year(s?))', age)
    if age_match: 
        if age_match.group(2).startswith('Month'):
            age = int(age_match.group(1)) / 12
        else :
            age = int(age_match.group(1))
    else:
        age = np.nan
    
    weight_match = re.fullmatch(r'(.*)lbs', weight)
    if weight_match:
        weight = float(weight_match.group(1))
    else:
        weight = np.nan
    
    description = re.sub('\s+', ' ',description)
    
    return {
        'Name': name,
        'Breed': breed,
        'Gender': gender,
        'Age (years)': age,
        'Weight (lbs)': weight,
        'Location': location,
        'Description':  description
    }

In [3]:
dogs_page = rq.get(r'http://www.pawschicago.org/our-work/pet-adoption/pets-available/#dogsResults')
some_dog_urls = re.findall(r'http://www.pawschicago.org/pet-available-for-adoption/showdog/.*/', dogs_page.text)

more_dogs_data_url = re.search('.*dogs-load-more.*data-url="(.*)"', dogs_page.text).group(1)
more_dogs_result = rq.get(more_dogs_data_url)
more_dog_urls = [dog['url'] for dog in more_dogs_result.json()['pets']]

dog_urls = set(some_dog_urls + more_dog_urls)
dog_urls

{'http://www.pawschicago.org/pet-available-for-adoption/showdog/ace-8/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/adonis-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alda/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aldo-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/angel-3/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/angel-and-arden/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/angelica-5/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/arista/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/atka/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aurelio/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/axel-4/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/becca-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/benita-1/',
 'http://www.pawschicag

In [4]:
dog_page_texts = map (lambda u: rq.get(u).text, dog_urls)
#dog_page_texts = map (lambda u: rq.get(u).text, list(dog_urls)[:10])

df = pd.DataFrame(map(parse_details, dog_page_texts))
df.set_index('Name',inplace=True)
df

Unnamed: 0_level_0,Breed,Gender,Age (years),Weight (lbs),Location,Description
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Faith,Chihuahu,Female,5.000000,4.09,Foster Home,Faith would love to meet you! She would do be...
Smokey,Boxer Mix,Male,0.250000,16.56,Glenn L. Felner North Shore Adoption Center,Smokey would love to meet you! Stop by our Gl...
Harry,Dachshund Mix,Male,5.000000,23.00,Foster Home,Featured 5 Harry would love to meet you! He w...
Darby,Labrador Retriever Mix,Male,2.000000,40.00,Foster Home,Darby would love to meet you! He would do bes...
Queenie,Husky Mix,Female,0.333333,18.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Queenie! Queenie will be ...
Wilma,Daschund,Female,0.250000,8.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Wilma! Wilma will be one ...
Merlock,Chihuahua Mix,Male,10.000000,11.25,Glenn L. Felner North Shore Adoption Center,Merlock would love to meet you! Stop by our G...
Aldo,"Terrier, Rat",Male,7.000000,26.06,Foster Home,Aldo would love to meet you! Aldo is currentl...
Sydney,Boxer Mix,Female,0.250000,13.13,Glenn L. Felner North Shore Adoption Center,Sydney would love to meet you! Stop by our Gl...
Ziggy,Wire Fox Terrier Mix,Female,0.333333,8.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Ziggy! Ziggy will be one ...
