In [1]:
import requests as rq
import re
import pandas as pd
import numpy as np
import lxml.etree
import cssselect as css
from IPython.core.display import display, HTML

In [26]:
def _first_match(html_root, translator, selector):
    """Return the first element matching the CSS selector, or None when nothing matches."""
    if html_root is None:
        # Guard in case the parser produced no root element for this page.
        return None
    matches = html_root.xpath(translator.css_to_xpath(selector))
    return matches[0] if matches else None


def _parse_age_months(age_text):
    """Convert text such as '5 Years' or '3 Months' to a whole number of months.

    Returns NaN when the text is missing or does not have that exact shape.
    """
    age_match = re.fullmatch(r'(\d+) (Months?|Years?)', (age_text or '').strip())
    if not age_match:
        return np.nan
    amount = int(age_match.group(1))
    # Years are normalised to months so the column has a single unit.
    return amount if age_match.group(2).startswith('Month') else amount * 12


def _parse_weight_lbs(weight_text):
    """Convert text such as '23.5 lbs' to a float; NaN when missing or not numeric."""
    weight_match = re.fullmatch(r'(.*)lbs', (weight_text or '').strip())
    if not weight_match:
        return np.nan
    try:
        return float(weight_match.group(1))
    except ValueError:
        # Something precedes 'lbs' but it is not a number (e.g. 'N/A lbs').
        return np.nan


def parse_details(page_text):
    """Extract one dog's details from the HTML text of its adoption page.

    Parameters
    ----------
    page_text : str
        Raw HTML of a single dog's detail page.

    Returns
    -------
    dict
        Keys 'Name', 'Breed', 'Gender', 'Age (months)', 'Weight (lbs)',
        'Location' and 'Description'.
        * 'Name', 'Breed', 'Gender' hold the leading text of the matched
          element, or None when the element is missing or has no text.
        * 'Age (months)' is an int (years are multiplied by 12) or NaN.
        * 'Weight (lbs)' is a float or NaN.
        * 'Location' is the full string value of the matched element, or ''
          when the element is missing.
        * 'Description' is the string value of the div that directly follows
          the h2 containing 'Meet', with whitespace runs collapsed to single
          spaces, or '' when there is no such div.

    A page lacking an expected element yields placeholder values instead of
    raising, so a single unexpected page does not abort a whole scrape.
    """
    html_root = lxml.etree.HTML(page_text)
    # One translator is enough for every selector on the page.
    translator = css.HTMLTranslator()

    def leading_text(selector):
        element = _first_match(html_root, translator, selector)
        return None if element is None else element.text

    def string_value(selector):
        element = _first_match(html_root, translator, selector)
        # string() concatenates the text of the element and all its descendants.
        return '' if element is None else element.xpath("string()")

    description = re.sub(r'\s+', ' ', string_value('h2:contains(Meet) + div'))

    return {
        'Name': leading_text('div.box-title span'),
        'Breed': leading_text('div.breed-dog p'),
        'Gender': leading_text('div.gender p'),
        'Age (months)': _parse_age_months(leading_text('div.age p')),
        'Weight (lbs)': _parse_weight_lbs(leading_text('div.weight p')),
        'Location': string_value('div.location p'),
        'Description': description
    }

In [9]:
def get_restrictions(description_text):
    """Return the first stated adoption restriction found in a description.

    Looks for the phrase 'would do best ' and returns the text that follows
    it, up to (but not including) the next period. Sentences phrased as
    'would also do best ...' do not match this pattern and are not reported.

    Parameters
    ----------
    description_text : str
        Free-text description of a dog.

    Returns
    -------
    str
        The restriction text, or '' when the phrase does not occur.
    """
    restriction_match = re.search(r'would do best (.*?)\.', description_text)
    return restriction_match.group(1) if restriction_match else ''

In [10]:
# Fetch the listing page; the first batch of dog links is embedded directly in its HTML.
dogs_page = rq.get(r'http://www.pawschicago.org/our-work/pet-adoption/pets-available/#dogsResults')
# Dots are escaped and the slug is limited to one path segment so a match can
# never run on past the end of a single URL.
some_dog_urls = re.findall(
    r'http://www\.pawschicago\.org/pet-available-for-adoption/showdog/[^/"\s<>]+/',
    dogs_page.text,
)

# The remaining dogs come from the JSON endpoint named in the data-url
# attribute that follows "dogs-load-more" on the same line of the page.
more_dogs_match = re.search(r'dogs-load-more.*?data-url="([^"]*)"', dogs_page.text)
if more_dogs_match:
    more_dogs_data_url = more_dogs_match.group(1)
    more_dogs_result = rq.get(more_dogs_data_url)
    more_dog_urls = [dog['url'] for dog in more_dogs_result.json()['pets']]
else:
    # No load-more element on the page: only the embedded links are available.
    more_dogs_data_url = None
    more_dogs_result = None
    more_dog_urls = []

# A set removes links that appear in both sources.
dog_urls = set(some_dog_urls + more_dog_urls)
dog_urls

{'http://www.pawschicago.org/pet-available-for-adoption/showdog/ace-8/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/adonis-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alda/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aldo-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/angel-and-arden/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/ariella/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/arista/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/atka/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aurelio/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/becca-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/benita-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bob-4/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/braxton-1/',
 'http://www.pawschicago.

In [19]:
# Download the HTML of every dog's detail page (one HTTP request per URL).
# For a quick trial run, restrict this to a few pages, e.g. list(dog_urls)[:6].
dog_page_texts = [rq.get(dog_url).text for dog_url in dog_urls]

In [27]:
# Parse each downloaded page into a record and build a table indexed by dog name.
df = pd.DataFrame(
    [parse_details(page_text) for page_text in dog_page_texts]
).set_index('Name')
df

Unnamed: 0_level_0,Breed,Gender,Age (months),Weight (lbs),Location,Description
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Faith,Chihuahu,Female,60,4.09,Foster Home,Faith would love to meet you! She would do be...
Harry,Dachshund Mix,Male,60,23.00,Foster Home,Featured 5 Harry would love to meet you! He w...
Ariella,Australian Cattle Dog Terrier Mix,Female,3,14.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Ariella! Ariella will be ...
Queenie,Husky Mix,Female,4,18.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Queenie! Queenie will be ...
Darby,Labrador Retriever Mix,Male,24,40.00,Foster Home,Darby would love to meet you! He would do bes...
Wilma,Daschund,Female,4,8.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Wilma! Wilma will be one ...
Merlock,Chihuahua Mix,Male,120,11.25,Lincoln Park Adoption Center,Merlock would love to meet you! Stop by the P...
Aldo,"Terrier, Rat",Male,84,26.06,Lincoln Park Adoption Center,Aldo would love to meet you! Aldo is currentl...
Estelle,Chihuahua,Female,132,8.13,Lincoln Park Adoption Center,Estelle would love to meet you! Stop by our P...
Ziggy,Wire Fox Terrier Mix,Female,4,8.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Ziggy! Ziggy will be one ...


In [29]:
# Derive a 'Restriction' column from each description, then show the dogs
# ordered by restriction text (ascending) and, within that, oldest first.
df['Restriction'] = df['Description'].map(get_restrictions)
df.sort_values(
    by=['Restriction', 'Age (months)'],
    ascending=[True, False],
)

Unnamed: 0_level_0,Breed,Gender,Age (months),Weight (lbs),Location,Description,Restriction
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Nish,Chihuahua/Mix,Male,156,12.88,Foster Home,Nish would love to meet you! Nish is looking ...,
Estelle,Chihuahua,Female,132,8.13,Lincoln Park Adoption Center,Estelle would love to meet you! Stop by our P...,
Merlock,Chihuahua Mix,Male,120,11.25,Lincoln Park Adoption Center,Merlock would love to meet you! Stop by the P...,
Velma,Miniature Poodle Mix,Female,120,7.00,Lincoln Park Adoption Center,Velma would love to meet you! Stop by the Pip...,
Phil,Maltese,Male,108,8.63,Foster Home,Phil would love to meet you! Phil is currentl...,
Elisa,"Pinscher, Miniature/Mix",Female,96,14.06,Foster Home,Elisa would love to meet you! Elisa is curren...,
Aldo,"Terrier, Rat",Male,84,26.06,Lincoln Park Adoption Center,Aldo would love to meet you! Aldo is currentl...,
Calvin,Chihuahua/Terrier,Male,72,11.56,Foster Home,Calvin would love to meet you! Calvin is curr...,
Patton,"Chihuahua, Short Coat",Male,72,6.38,Lincoln Park Adoption Center,Patton would love to meet you! Stop by the Pi...,
Hampton,Chihuahua,Male,48,8.00,"Angels With Tails Roscoe Village, Sunday July ...",AWT Roscoe Vil Meet Hampton! Hampton will be ...,
