In [48]:
import requests as rq
import re
import pandas as pd
import numpy as np
import lxml.etree
import cssselect as css
from IPython.core.display import display, HTML

In [61]:
def parse_details(page_text):
    html_root = lxml.etree.HTML(page_text)

    name        = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.box-title span'))[0].text
    breed       = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.breed-dog p'))[0].text
    gender      = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.gender p'))[0].text
    age         = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.age p'))[0].text
    weight      = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.weight p'))[0].text
    location    = html_root.xpath(css.HTMLTranslator().css_to_xpath('div.location p'))[0].text
    description = html_root.xpath(css.HTMLTranslator().css_to_xpath('h2:contains(Meet) + div'))[0].xpath("string()")
    
    age_match = re.fullmatch(r'(\d+) (Month(s?)|Year(s?))', age)
    if age_match: 
        if age_match.group(2).startswith('Month'):
            age = int(age_match.group(1)) / 12
        else :
            age = int(age_match.group(1))
    else:
        age = np.nan
    
    weight_match = re.fullmatch(r'(.*)lbs', weight)
    if weight_match:
        weight = float(weight_match.group(1))
    else:
        weight = np.nan
    
    return {
        'Name': name,
        'Breed': breed,
        'Gender': gender,
        'Age (years)': age,
        'Weight (lbs)': weight,
        'Location': location,
        'Description':  description
    }

In [3]:
dogs_page = rq.get(r'http://www.pawschicago.org/our-work/pet-adoption/pets-available/#dogsResults')
some_dog_urls = re.findall(r'http://www.pawschicago.org/pet-available-for-adoption/showdog/.*/', dogs_page.text)

more_dogs_data_url = re.search('.*dogs-load-more.*data-url="(.*)"', dogs_page.text).group(1)
more_dogs_result = rq.get(more_dogs_data_url)
more_dog_urls = [dog['url'] for dog in more_dogs_result.json()['pets']]

dog_urls = set(some_dog_urls + more_dog_urls)
dog_urls

{'http://www.pawschicago.org/pet-available-for-adoption/showdog/adler-3/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/adonis-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alaric/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alda/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aldo-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/angelica-5/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/becca-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bob-4/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/braxton-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/brazil/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/brody-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bromley-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bruno-7/',
 'http://www.pawschicago.

In [62]:
df = pd.DataFrame(map(lambda u: parse_details(rq.get(u).text), list(dog_urls)[:5]))
df.set_index('Name')
df

Unnamed: 0,Name,Breed,Gender,Age (years),Weight (lbs),Location,Description
0,Rani,Terrier/ Mix,Female,3.0,55.0,Foster Home,\n \n ...
1,Vedder,Pug/Beagle,Male,1.0,25.0,Foster Home,\n \n ...
2,Aldo,"Terrier, Rat",Male,7.0,26.06,Foster Home,\n \n ...
3,Becca,American Staffordshire/Mix,Female,1.0,42.0,Foster Home,\n \n ...
4,Colby,Labrador Retriever Mix,Male,0.916667,47.0,,\n \n ...
