In [2]:
import html
import re

import cssselect as css
import lxml.etree
import numpy as np
import pandas as pd
import requests as rq
from IPython.core.display import display, HTML

In [3]:
def safer_extract_text(html_root, css_pattern):
    """Return the leading text of the first element matching a CSS selector.

    Parameters
    ----------
    html_root : object exposing ``.xpath()`` (e.g. an lxml element)
        Root of the parsed document to search.
    css_pattern : str
        CSS selector; it is translated to XPath with ``cssselect``.

    Returns
    -------
    str
        The ``.text`` of the first match, or ``''`` when nothing matches or
        the matched element carries no leading text.
    """
    possible_hits = html_root.xpath(css.HTMLTranslator().css_to_xpath(css_pattern))
    if not possible_hits:
        return ''
    # ``.text`` is None when the element has no text before its first child
    # (e.g. <p><b>x</b></p>). Normalise that to '' so the result can always be
    # used as a string (regex matching, concatenation).
    return possible_hits[0].text or ''

In [4]:
def safer_extract_string(html_root, css_pattern):
    """Return the XPath ``string()`` value of the first node matching a CSS selector.

    The selector is translated to XPath via ``cssselect`` and evaluated
    against ``html_root``. When no node matches, ``''`` is returned.
    """
    xpath_query = css.HTMLTranslator().css_to_xpath(css_pattern)
    matches = html_root.xpath(xpath_query)
    return matches[0].xpath("string()") if len(matches) > 0 else ''

In [36]:
def _age_in_months(age_text):
    """Convert text such as '3 Months' or '2 Years' to a month count (NaN if unparseable)."""
    age_match = re.fullmatch(r'(\d+) (Months?|Years?)', (age_text or '').strip())
    if not age_match:
        return np.nan
    count = int(age_match.group(1))
    return count if age_match.group(2).startswith('Month') else count * 12


def _weight_in_lbs(weight_text):
    """Convert text such as '45 lbs' to a float number of pounds (NaN if unparseable)."""
    weight_match = re.fullmatch(r'(.*)lbs', (weight_text or '').strip())
    if not weight_match:
        return np.nan
    try:
        return float(weight_match.group(1))
    except ValueError:
        # The 'lbs' suffix is present but what precedes it is not a number
        # (e.g. 'N/A lbs' or a bare 'lbs').
        return np.nan


def parse_details(page_text):
    """Extract one dog's adoption details from the HTML of its detail page.

    Parameters
    ----------
    page_text : str
        Raw HTML of a dog detail page.

    Returns
    -------
    dict
        Keys ``'Name'``, ``'Breed'``, ``'Gender'``, ``'Age'`` (months, or NaN
        when unparseable), ``'Weight (lbs)'`` (float, or NaN when
        unparseable), ``'Location'``, ``'Description'`` (runs of whitespace
        collapsed to one space) and ``'Image Source'`` (``''`` when the page
        has no ``img.img-circle`` or it has no ``src``).
    """
    html_root = lxml.etree.HTML(page_text)

    # ``or ''``: an element that exists but has no leading text yields None.
    name        = safer_extract_text(html_root, 'div.box-title span') or ''
    breed       = safer_extract_text(html_root, 'div.breed-dog p') or ''
    gender      = safer_extract_text(html_root, 'div.gender p') or ''
    age         = _age_in_months(safer_extract_text(html_root, 'div.age p'))
    weight      = _weight_in_lbs(safer_extract_text(html_root, 'div.weight p'))
    location    = safer_extract_string(html_root, 'div.location p')
    description = safer_extract_string(html_root, 'h2:contains(Meet) + div')

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    description = re.sub(r'\s+', ' ', description)

    possible_images = html_root.xpath(css.HTMLTranslator().css_to_xpath('img.img-circle'))
    # ``.get('src')`` is None when the attribute is missing; keep the field a string.
    image_link = (possible_images[0].get('src') or '') if possible_images else ''

    return {
        'Name': name,
        'Breed': breed,
        'Gender': gender,
        'Age': age,
        'Weight (lbs)': weight,
        'Location': location,
        'Description':  description,
        'Image Source': image_link
    }

In [7]:
# Listing page: contains links to a first batch of dogs plus an element whose
# data-url attribute points at a JSON endpoint listing further dogs.
dogs_page = rq.get(r'http://www.pawschicago.org/our-work/pet-adoption/pets-available/#dogsResults', timeout=30)
dogs_page.raise_for_status()  # fail loudly instead of scraping an error page

# The slug is restricted to non-slash, non-quote, non-space characters so that
# several links on one line cannot be swallowed into a single greedy match.
some_dog_urls = re.findall(
    r'http://www\.pawschicago\.org/pet-available-for-adoption/showdog/[^/"\'\s<>]+/',
    dogs_page.text,
)

# ``[^"]*`` stops at the attribute's closing quote even when more attributes
# follow on the same line.
load_more_match = re.search(r'dogs-load-more.*data-url="([^"]*)"', dogs_page.text)
if load_more_match is None:
    raise RuntimeError('Could not find the "dogs-load-more" data-url on the listing page')
more_dogs_data_url = load_more_match.group(1)
more_dogs_result = rq.get(more_dogs_data_url, timeout=30)
more_dogs_result.raise_for_status()
more_dog_urls = [dog['url'] for dog in more_dogs_result.json()['pets']]

# A set removes URLs that appear in both sources.
dog_urls = set(some_dog_urls + more_dog_urls)
dog_urls

{'http://www.pawschicago.org/pet-available-for-adoption/showdog/ace-8/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/adler-3/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/adonis-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alaric/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/alda/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/aldo-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/benita-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bob-4/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/braxton-1/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/brody-2/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/bruno-7/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/burton/',
 'http://www.pawschicago.org/pet-available-for-adoption/showdog/cailean-1/',
 'http://www.pawschicago.org/

In [8]:
# Download every detail page, keeping each page's HTML paired with its URL.
# (While developing, iterating over list(dog_urls)[:6] instead limits the requests.)
dog_page_texts = [(rq.get(url).text, url) for url in dog_urls]

In [43]:
# One record per dog: the fields parsed from its page plus the page's own URL.
details_dicts = [
    {**parse_details(page_text), 'URL': page_url}
    for page_text, page_url in dog_page_texts
]
df = pd.DataFrame(details_dicts)
# The frame keeps its default integer index; it is not re-indexed by 'Name'.

In [44]:
def get_restrictions(description_text, include_secondary=False):
    """Pull "would do best ..." placement restrictions out of a description.

    Parameters
    ----------
    description_text : str
        Free-text description of the dog.
    include_secondary : bool, optional
        When False (default) only the text following the first
        "would do best" is returned. When True, the text following
        "would also do best" / "Would also do best" is appended, separated
        by ``'; '``.

    Returns
    -------
    str
        The captured restriction text (up to, not including, the next
        period), or ``''`` when the description contains no such phrase.
    """
    def _first_group(pattern):
        match = re.search(pattern, description_text)
        return match.group(1) if match else ''

    first_restriction = _first_group(r'would do best (.*?)\.')
    if not include_secondary:
        # Default path: skip the second search, whose result was previously
        # computed on every call and then discarded.
        return first_restriction
    second_restriction = _first_group(r'[Ww]ould also do best (.*?)\.')
    return '; '.join(part for part in (first_restriction, second_restriction) if part)

In [45]:
def make_name_link(row):
    """Build an HTML anchor that shows the dog's name and links to its page.

    ``row`` is any mapping (e.g. a DataFrame row) with ``'URL'`` and
    ``'Name'`` entries. Both values come from scraped pages, so they are
    HTML-escaped before being placed in the markup; a missing (None) value is
    treated as an empty string.
    """
    url = html.escape(row['URL'] or '', quote=True)
    name = html.escape(row['Name'] or '', quote=False)
    return '<a href="' + url + '">' + name + '</a>'

In [46]:
def make_image_link(row):
    """Build an HTML anchor wrapping the dog's image and linking to its page.

    ``row`` is any mapping (e.g. a DataFrame row) with ``'URL'`` and
    ``'Image Source'`` entries. Both values are HTML-escaped because they are
    placed inside double-quoted attributes; a missing (None) value is treated
    as an empty string.
    """
    page_url = html.escape(row['URL'] or '', quote=True)
    image_src = html.escape(row['Image Source'] or '', quote=True)
    return '<a href="' + page_url + '"><img src="' + image_src + '"></a>'

In [47]:
# Derived columns: the restriction text comes from the description alone; the two
# HTML columns are built row by row. Assignment order determines column order.
df['Restriction'] = df['Description'].map(get_restrictions)
for column_name, row_builder in (('Image Link', make_image_link), ('Name Link', make_name_link)):
    df[column_name] = df.apply(row_builder, axis=1)

In [12]:
def age_formater(age):
    """Render an age given in months as display text.

    Returns ``'NaN'`` for a NaN age, ``'<n> Month(s)'`` for ages under twelve
    months and ``'<n> Year(s)'`` (whole years, rounded down) otherwise. The
    unit is singular when the count is exactly 1.
    """
    if np.isnan(age):
        return 'NaN'
    months = int(age)
    if months < 12:
        count, unit = months, 'Month'
    else:
        count, unit = months // 12, 'Year'
    # Pluralise only when needed, so the output is "1 Month", not "1 Months".
    return str(count) + ' ' + unit + ('' if count == 1 else 's')

In [13]:
def name_formatter(name):
    """Format a dog's name as an HTML link to its page, using the global ``df``.

    Returns ``''`` for an empty name, the escaped name alone when no row of
    ``df`` carries that name, and otherwise an anchor pointing at the URL of
    the first matching row.
    """
    if not name:
        return ''
    # ``df`` is built with its default integer index, so ``df.loc[name, 'URL']``
    # raises KeyError. Match on the 'Name' column instead, falling back to the
    # index for a frame that has been re-indexed by name.
    names = df['Name'] if 'Name' in df.columns else df.index
    urls = df.loc[names == name, 'URL']
    label = html.escape(name, quote=False)
    if urls.empty:
        return label
    return '<a href="' + html.escape(urls.iloc[0], quote=True) + '">' + label + '</a>'

In [14]:
def image_formatter(image_link):
    """Format an image URL as an ``<img>`` tag; an empty link gives ``''``.

    The URL is HTML-escaped because it is placed inside a double-quoted
    attribute.
    """
    if not image_link:
        return ''
    return '<img src="' + html.escape(image_link, quote=True) + '">'

In [50]:
# Render first, so a failure in to_html cannot leave a half-written file behind.
# Rows are grouped by restriction text; within a group the oldest dogs come first.
dogs_table_html = df.sort_values(by=['Restriction','Age'], ascending=[True,False]).to_html(
    escape=False,  # 'Name Link' and 'Image Link' hold raw HTML that must stay unescaped
    columns = ['Name Link', 'Image Link','Breed','Gender','Age','Weight (lbs)','Restriction','Description', 'Location'],
    formatters = {'Age': age_formater},
    index=False
)

# Explicit UTF-8 (declared in the page as well): the platform default encoding
# may be unable to encode non-ASCII characters in the scraped text.
with open("dogs.html", "w", encoding="utf-8") as f:
    f.write('<html><head><meta charset="utf-8"></head><body>')
    f.write(dogs_table_html)
    f.write("</body></html>")

[Output](dogs.html)

In [None]:
# Inline preview of the same table. ``df`` has no 'Image' column (the image data
# lives in 'Image Source' / 'Image Link'), so the prebuilt 'Name Link' and
# 'Image Link' columns are shown instead; only 'Age' needs a formatter.
display(HTML(df.sort_values(by=['Restriction','Age'], ascending=[True,False]).to_html(
        escape=False,  # the link columns hold raw HTML that must stay unescaped
        columns = ['Name Link', 'Image Link','Breed','Gender','Age','Weight (lbs)','Restriction','Description', 'Location'],
        formatters = {'Age': age_formater}
    )))

In [None]:
import beakerx