This notebook is used to scrape dog breed data from a combination of sites and store it in a postgres database.

In [12]:
import pandas as pd
from selenium import webdriver
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import re
from bs4 import BeautifulSoup

In [5]:
#Create or load database
dbname = 'dog_breeds'
username = 'chris'
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://chris@localhost/dog_breeds
True


Get all of the dog breed links from the akc website

In [25]:
driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
driver.get('https://www.akc.org/dog-breeds/page/1/')
soup = BeautifulSoup(driver.page_source,"lxml")
#Now we need to parse the dog breeds
dog_breeds = []
for item in soup.find_all('option', value=True):
    dog_breeds.append(str(item['value']))
driver.close()
dog_breeds = list(set(dog_breeds))
#Clean up some of the links
remove = ['','https://www.akc.org/dog-breeds/sporting/','https://www.akc.org/dog-breeds/hound/','https://www.akc.org/dog-breeds/working/','https://www.akc.org/dog-breeds/terrier/','https://www.akc.org/dog-breeds/toy/',
          'https://www.akc.org/dog-breeds/non-sporting/','https://www.akc.org/dog-breeds/herding/','https://www.akc.org/dog-breeds/miscellaneous-class/',
          'https://www.akc.org/dog-breeds/foundation-stock-service/','https://www.akc.org/dog-breeds/smallest-dog-breeds/',
         'https://www.akc.org/dog-breeds/medium-dog-breeds/','https://www.akc.org/dog-breeds/largest-dog-breeds/',
         'https://www.akc.org/dog-breeds/hypoallergenic-dogs/','https://www.akc.org/dog-breeds/best-family-dogs/','https://www.akc.org/dog-breeds/best-guard-dogs/',
         'https://www.akc.org/dog-breeds/best-dogs-for-kids/','https://www.akc.org/dog-breeds/best-dogs-for-apartments-dwellers/',
         'https://www.akc.org/dog-breeds/hairless-dog-breeds/','https://www.akc.org/dog-breeds/smartest-dogs/']
dog_breeds_filtered = []
for breed in dog_breeds:
    if breed not in remove:
        dog_breeds_filtered.append(breed)

Follow the links and get all the breed info (raw)

In [58]:
def get_breed_info(link):
    #follow the links
    dog_dict = {}
    driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
    driver.get(link)
    soup = BeautifulSoup(driver.page_source,"lxml")
    driver.close()
    #Get temperament
    for item in soup.find_all('span', attrs={'class':'attribute-list__description attribute-list__text attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'}):
        dog_dict['temperament'] = str(item.text.encode('utf-8').strip())
    #Get attributes
    attributes = []
    for item in soup.find_all('span', attrs={'class': 'attribute-list__description attribute-list__text '}):
        attributes.append(str(item.text.encode('utf-8').strip()))
    #Get about
    for item in soup.find_all('div', attrs={'class':'breed-info__content-wrap'},limit=1):
        dog_dict['about'] = str(item.text.encode('utf-8').strip())
    #Get care
    for item in soup.find_all('div', attrs={'class':'tabs__tab-panel-content'}):
        if "NUTRITION" in str(item.text.encode('utf-8').strip()):
            dog_dict['nutrition'] = str(item.text.encode('utf-8').strip())
        if "GROOMING" in str(item.text.encode('utf-8').strip()):
            dog_dict['grooming'] = str(item.text.encode('utf-8').strip())
        if "EXERCISE" in str(item.text.encode('utf-8').strip()):
            dog_dict['exercise'] = str(item.text.encode('utf-8').strip())
        if "TRAINING" in str(item.text.encode('utf-8').strip()):
            dog_dict['training'] = str(item.text.encode('utf-8').strip())
        if "HEALTH" in str(item.text.encode('utf-8').strip()):
            dog_dict['health'] = str(item.text.encode('utf-8').strip())
    #Put attributes into dictionary
    for i in range(len(attributes)):
        if i == 0:
            dog_dict['rank'] = attributes[i]
        elif i == 1:
            dog_dict['height'] = attributes[i]
        elif i == 2:
            dog_dict['weight'] = attributes[i]
        elif i == 3:
            dog_dict['age'] = attributes[i]
        elif i == 4:
            dog_dict['group'] = attributes[i]
    return dog_dict

In [57]:
#Main loop to populate the database
get_breed_info(1)

{'about': "b'Greyhounds are the essence of the dog breeder\\xe2\\x80\\x99s credo \\xe2\\x80\\x9cForm follows function.\\xe2\\x80\\x9d From the narrow, aerodynamic skull to the shock-absorbing pads of the feet, Greyhounds are perfectly constructed for high-speed pursuit. The lean beauty of the Greyhound\\xe2\\x80\\x99s \\xe2\\x80\\x9cinverted S\\xe2\\x80\\x9d shape, created by the deep chest curving gently into a tightly tucked waist, has been an object of fascination for artists, poets, and kings for as long as human beings have called themselves civilized. Greyhounds are the template from which other coursing hounds have been struck.'",
 'age': "b'10-13 years'",
 'exercise': "b'EXERCISE\\n\\n\\n\\n\\n\\nThe Greyhound is the cheetah of the dog world. While perfectly happy to lounge around the house all day, he is capable of amazing speed and energy when faced with potential prey\\xe2\\x80\\x94or the chance to chase a tennis ball or a coursing lure. Greyhounds require a regular schedule