This notebook scrapes dog breed data from several websites (the American Kennel Club and DogTime) and stores it in a PostgreSQL database.

In [1]:
import pandas as pd
from selenium import webdriver
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import re
from bs4 import BeautifulSoup

  """)


In [2]:
# Create (or connect to) the PostgreSQL database that will hold the breed data.
dbname = 'dog_breeds'
username = 'chris'
# Use the 'postgresql://' scheme: the legacy 'postgres://' alias was removed in
# SQLAlchemy 1.4 and fails there with NoSuchModuleError, while 'postgresql://'
# is accepted by old and new versions alike.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
print(engine.url)
# Create the database on the first run; later runs find it and skip this step.
if not database_exists(engine.url):
    create_database(engine.url)
# Confirm that the database is now present.
print(database_exists(engine.url))

postgres://chris@localhost/dog_breeds
True


Get all of the dog breed links from the AKC website: https://www.akc.org/dog-breeds/page/1/

In [3]:
def get_akc_links(page):
    """Collect the individual breed-page URLs from an AKC breed listing page.

    The page is loaded in Chrome through Selenium and the ``value`` of every
    ``<option>`` element is read. Empty values and the known group/category
    listing pages are dropped.

    Parameters
    ----------
    page : str
        URL of the AKC listing page to load.

    Returns
    -------
    list of str
        The unique remaining option values. On Python 3.7+ they are returned
        in the order they first appear on the page.
    """
    driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
    try:
        driver.get(page)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even when the page load fails.
        driver.quit()

    # Option values that point at category listings rather than a single breed
    remove = {
        '',
        'https://www.akc.org/dog-breeds/sporting/',
        'https://www.akc.org/dog-breeds/hound/',
        'https://www.akc.org/dog-breeds/working/',
        'https://www.akc.org/dog-breeds/terrier/',
        'https://www.akc.org/dog-breeds/toy/',
        'https://www.akc.org/dog-breeds/non-sporting/',
        'https://www.akc.org/dog-breeds/herding/',
        'https://www.akc.org/dog-breeds/miscellaneous-class/',
        'https://www.akc.org/dog-breeds/foundation-stock-service/',
        'https://www.akc.org/dog-breeds/smallest-dog-breeds/',
        'https://www.akc.org/dog-breeds/medium-dog-breeds/',
        'https://www.akc.org/dog-breeds/largest-dog-breeds/',
        'https://www.akc.org/dog-breeds/hypoallergenic-dogs/',
        'https://www.akc.org/dog-breeds/best-family-dogs/',
        'https://www.akc.org/dog-breeds/best-guard-dogs/',
        'https://www.akc.org/dog-breeds/best-dogs-for-kids/',
        'https://www.akc.org/dog-breeds/best-dogs-for-apartments-dwellers/',
        'https://www.akc.org/dog-breeds/hairless-dog-breeds/',
        'https://www.akc.org/dog-breeds/smartest-dogs/',
    }

    option_values = (str(item['value'])
                     for item in soup.find_all('option', value=True))
    # dict.fromkeys de-duplicates like set() did, but keeps insertion order on
    # Python 3.7+, so repeated runs yield the links in the same order.
    return [value for value in dict.fromkeys(option_values)
            if value not in remove]

Follow each link on the AKC site and collect the raw breed information.

In [4]:
def get_akc_breed_info(link):
    """Scrape the raw descriptive fields from one AKC breed page.

    Parameters
    ----------
    link : str
        URL of the breed page to load in Chrome.

    Returns
    -------
    dict
        Maps field name to the scraped, whitespace-stripped text. ``url`` is
        always present; ``temperament``, ``about``, ``nutrition``,
        ``grooming``, ``exercise``, ``training``, ``health``, ``rank``,
        ``height``, ``weight``, ``age`` and ``group`` are present only when
        the matching element was found on the page.
    """
    driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even when the page load fails.
        driver.quit()

    # Text is kept as str. The previous str(text.encode('utf-8')) form
    # produces "b'...'" repr strings with escaped bytes on Python 3.
    dog_dict = {}

    # Temperament: when several spans match, the last one wins.
    for item in soup.find_all('span', attrs={'class':'attribute-list__description attribute-list__text attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'}):
        dog_dict['temperament'] = item.text.strip()

    # About: only the first matching block is used.
    for item in soup.find_all('div', attrs={'class':'breed-info__content-wrap'},limit=1):
        dog_dict['about'] = item.text.strip()

    # Care tabs: a panel is filed under every heading word its text contains.
    care_sections = (
        ('NUTRITION', 'nutrition'),
        ('GROOMING', 'grooming'),
        ('EXERCISE', 'exercise'),
        ('TRAINING', 'training'),
        ('HEALTH', 'health'),
    )
    for item in soup.find_all('div', attrs={'class':'tabs__tab-panel-content'}):
        panel_text = item.text.strip()
        for heading, key in care_sections:
            if heading in panel_text:
                dog_dict[key] = panel_text

    # Quick-facts attributes. The trailing space in the class string is
    # intentional: it must equal the page's class attribute exactly.
    attributes = [
        item.text.strip()
        for item in soup.find_all('span', attrs={'class': 'attribute-list__description attribute-list__text '})
    ]
    # NOTE(review): values are assigned purely by position, so a page that
    # omits one attribute shifts every later field -- verify against the site.
    # zip() stops at the shorter sequence, so extra attributes are ignored.
    dog_dict.update(zip(('rank', 'height', 'weight', 'age', 'group'), attributes))

    dog_dict['url'] = link
    return dog_dict

In [24]:
# Scrape every American Kennel Club breed page into one DataFrame (df_akc).
page = 'https://www.akc.org/dog-breeds/page/1/'
dog_breeds_akc = get_akc_links(page)
# Collect one dict per breed and build the frame once. Growing a DataFrame
# with DataFrame.append inside a loop is quadratic, and that method was
# removed in pandas 2.0. This form also copes with an empty link list.
akc_records = [get_akc_breed_info(breed) for breed in dog_breeds_akc]
# Rows get a 0..n-1 index, one row per breed in link order; a key missing
# from a given breed's dict becomes NaN in that row.
df_akc = pd.DataFrame(akc_records)

Let's scrape from another source, DogTime: https://dogtime.com/dog-breeds/profiles/

In [5]:
def get_dogtime_links(page):
    """Collect the breed-profile URLs from a DogTime breed listing page.

    Parameters
    ----------
    page : str
        URL of the listing page to load in Chrome.

    Returns
    -------
    list of str
        The ``href`` of every ``<a class="list-item-title">`` element, in
        page order. Anchors without an ``href`` are skipped.
    """
    driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
    try:
        driver.get(page)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even when the page load fails.
        driver.quit()

    # has_attr guards against a title anchor with no href, which would
    # otherwise raise KeyError and abort the whole scrape.
    return [
        anchor['href']
        for anchor in soup.find_all('a', attrs={'class':'list-item-title'})
        if anchor.has_attr('href')
    ]

In [10]:
def get_dogtime_breed_info(link):
    """Scrape the introduction, ratings and vital stats from a DogTime breed page.

    The URL is printed as a progress log once the page has been fetched.

    Parameters
    ----------
    link : str
        URL of the breed profile to load in Chrome.

    Returns
    -------
    dict
        ``url`` is always present. ``about`` holds the introduction text when
        found. Each rating is stored under the text that precedes its first
        digit, with that digit (as a string) as the value. ``group``,
        ``height``, ``weight`` and ``age`` hold the text after the first colon
        of each vital-stat box, or ``''`` for all four when any box has no
        colon.
    """
    driver = webdriver.Chrome('/Users/chris/Downloads/chromedriver')
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even when the page load fails.
        driver.quit()

    dog_dict = {}
    print(link)

    # Text is kept as str. The previous str(text.encode('utf-8')) form
    # produces "b'...'" repr strings with escaped bytes on Python 3.

    # Introduction: when several blocks match, the last one wins.
    for item in soup.find_all('div', attrs={'class':'breeds-single-intro'}):
        dog_dict['about'] = item.text.strip()

    # Ratings: the label is everything before the first digit, the score is
    # that digit.
    for item in soup.find_all('div', attrs={'class':'js-list-item child-characteristic'}):
        rating_text = item.text.strip()
        score = re.search('[0-9]', rating_text)
        if score is None:
            # No digit in this entry: skip it instead of failing on
            # score.group(0) with AttributeError.
            continue
        dog_dict[rating_text[:score.start()]] = score.group(0)

    # Vital stats, assigned by position.
    vital_keys = ('group', 'height', 'weight', 'age')
    try:
        attributes = [
            item.text.strip().split(':')[1]
            for item in soup.find_all('div', attrs={'class':'vital-stat-box'})
        ]
    except IndexError:
        # A box without a colon has no [1] element. Blank all four fields,
        # as before, but no longer hide unrelated errors behind a bare except.
        for key in vital_keys:
            dog_dict[key] = ''
    else:
        # zip() stops at the shorter sequence, so boxes past the fourth are
        # ignored and absent boxes leave their key unset.
        dog_dict.update(zip(vital_keys, attributes))

    dog_dict['url'] = link
    return dog_dict

In [11]:
# Scrape every DogTime breed profile into one DataFrame (df_dogtime).
page = 'https://dogtime.com/dog-breeds/profiles/'
dogtime_breeds = get_dogtime_links(page)
# Collect one dict per breed and build the frame once. Growing a DataFrame
# with DataFrame.append inside a loop is quadratic, and that method was
# removed in pandas 2.0. This form also copes with an empty link list.
dogtime_records = [get_dogtime_breed_info(breed) for breed in dogtime_breeds]
# Rows get a 0..n-1 index, one row per breed in link order; a key missing
# from a given breed's dict becomes NaN in that row.
df_dogtime = pd.DataFrame(dogtime_records)

https://dogtime.com/dog-breeds/afador
https://dogtime.com/dog-breeds/affenhuahua
https://dogtime.com/dog-breeds/affenpinscher
https://dogtime.com/dog-breeds/afghan-hound
https://dogtime.com/dog-breeds/airedale-terrier
https://dogtime.com/dog-breeds/akbash
https://dogtime.com/dog-breeds/akita
https://dogtime.com/dog-breeds/akita-chow
https://dogtime.com/dog-breeds/akita-pit
https://dogtime.com/dog-breeds/akita-shepherd
https://dogtime.com/dog-breeds/alaskan-klee-kai
https://dogtime.com/dog-breeds/alaskan-malamute
https://dogtime.com/dog-breeds/american-bulldog
https://dogtime.com/dog-breeds/american-english-coonhound
https://dogtime.com/dog-breeds/american-eskimo-dog
https://dogtime.com/dog-breeds/american-foxhound
https://dogtime.com/dog-breeds/american-hairless-terrier
https://dogtime.com/dog-breeds/american-leopard-hound
https://dogtime.com/dog-breeds/american-pit-bull-terrier
https://dogtime.com/dog-breeds/american-pugabull
https://dogtime.com/dog-breeds/american-staffordshire-terri

https://dogtime.com/dog-breeds/french-bulldog
https://dogtime.com/dog-breeds/french-bullhuahua
https://dogtime.com/dog-breeds/french-spaniel
https://dogtime.com/dog-breeds/frenchton
https://dogtime.com/dog-breeds/frengle
https://dogtime.com/dog-breeds/german-longhaired-pointer
https://dogtime.com/dog-breeds/german-pinscher
https://dogtime.com/dog-breeds/german-shepherd-dog
https://dogtime.com/dog-breeds/german-shepherd-pit-bull
https://dogtime.com/dog-breeds/german-shepherd-rottweiler-mix
https://dogtime.com/dog-breeds/german-sheprador
https://dogtime.com/dog-breeds/german-shorthaired-pointer
https://dogtime.com/dog-breeds/german-spitz
https://dogtime.com/dog-breeds/german-wirehaired-pointer
https://dogtime.com/dog-breeds/giant-schnauzer
https://dogtime.com/dog-breeds/glen-of-imaal-terrier
https://dogtime.com/dog-breeds/goberian
https://dogtime.com/dog-breeds/goldador
https://dogtime.com/dog-breeds/golden-cocker-retriever
https://dogtime.com/dog-breeds/golden-mountain-dog
https://dogti

https://dogtime.com/dog-breeds/treeing-walker-coonhound
https://dogtime.com/dog-breeds/valley-bulldog
https://dogtime.com/dog-breeds/vizsla
https://dogtime.com/dog-breeds/weimaraner
https://dogtime.com/dog-breeds/welsh-springer-spaniel
https://dogtime.com/dog-breeds/welsh-terrier
https://dogtime.com/dog-breeds/west-highland-white-terrier
https://dogtime.com/dog-breeds/westiepoo
https://dogtime.com/dog-breeds/whippet
https://dogtime.com/dog-breeds/whoodle
https://dogtime.com/dog-breeds/wirehaired-pointing-griffon
https://dogtime.com/dog-breeds/xoloitzuintli
https://dogtime.com/dog-breeds/yorkipoo
https://dogtime.com/dog-breeds/yorkshire-terrier


In [12]:
# Save the scraped DogTime table, then preview it. The preview comes last
# because a notebook cell only renders the value of its final expression;
# placed before to_csv, the head() result was silently discarded.
df_dogtime.to_csv('dogtime_breeds.csv')
df_dogtime.head()