This notebook scrapes dog breed data from several websites and stores it in a PostgreSQL database.

In [6]:
import pandas as pd
from selenium import webdriver
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import re
from bs4 import BeautifulSoup

In [7]:
# Create the database on first run, otherwise connect to the existing one.
dbname = 'dog_breeds'
username = 'chris'
# Use the canonical "postgresql" dialect name: the legacy "postgres" alias
# is rejected by SQLAlchemy 1.4+ (NoSuchModuleError), while "postgresql"
# is accepted by old and new releases alike.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
print(engine.url)
# Create the database if it doesn't exist yet.
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://chris@localhost/dog_breeds
True


Get all of the dog breed links from the AKC (American Kennel Club) website: https://www.akc.org/dog-breeds/page/1/

In [8]:
def get_akc_links(page, driver_path='/Users/chris/Downloads/chromedriver'):
    """Return the breed-page URLs offered on an AKC breed listing page.

    Loads ``page`` in Chrome, reads the ``value`` of every ``<option>``
    element and drops the values that point at group/list pages
    (e.g. ``.../sporting/``, ``.../smartest-dogs/``) or are empty.

    Parameters
    ----------
    page : str
        URL of the AKC listing page to load.
    driver_path : str, optional
        Location of the chromedriver executable.

    Returns
    -------
    list of str
        Unique URLs, in the order they first appear on the page.
    """
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(page)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even if the page load fails.
        driver.quit()
    # Option values that should not be followed as breed pages.
    remove = {
        '',
        'https://www.akc.org/dog-breeds/sporting/',
        'https://www.akc.org/dog-breeds/hound/',
        'https://www.akc.org/dog-breeds/working/',
        'https://www.akc.org/dog-breeds/terrier/',
        'https://www.akc.org/dog-breeds/toy/',
        'https://www.akc.org/dog-breeds/non-sporting/',
        'https://www.akc.org/dog-breeds/herding/',
        'https://www.akc.org/dog-breeds/miscellaneous-class/',
        'https://www.akc.org/dog-breeds/foundation-stock-service/',
        'https://www.akc.org/dog-breeds/smallest-dog-breeds/',
        'https://www.akc.org/dog-breeds/medium-dog-breeds/',
        'https://www.akc.org/dog-breeds/largest-dog-breeds/',
        'https://www.akc.org/dog-breeds/hypoallergenic-dogs/',
        'https://www.akc.org/dog-breeds/best-family-dogs/',
        'https://www.akc.org/dog-breeds/best-guard-dogs/',
        'https://www.akc.org/dog-breeds/best-dogs-for-kids/',
        'https://www.akc.org/dog-breeds/best-dogs-for-apartments-dwellers/',
        'https://www.akc.org/dog-breeds/hairless-dog-breeds/',
        'https://www.akc.org/dog-breeds/smartest-dogs/',
    }
    # De-duplicate while keeping page order so repeated runs yield the same
    # list (a plain set() would return the links in arbitrary order).
    dog_breeds_filtered = []
    seen = set()
    for item in soup.find_all('option', value=True):
        url = str(item['value'])
        if url in remove or url in seen:
            continue
        seen.add(url)
        dog_breeds_filtered.append(url)
    return dog_breeds_filtered

Follow each link on the AKC site and collect the raw breed information.

In [9]:
def get_akc_breed_info(link, driver_path='/Users/chris/Downloads/chromedriver'):
    """Scrape one AKC breed page into a dictionary of raw text fields.

    Parameters
    ----------
    link : str
        URL of the breed page.
    driver_path : str, optional
        Location of the chromedriver executable.

    Returns
    -------
    dict
        Always contains ``'url'``. Depending on what is found on the page it
        may also contain ``'temperament'``, ``'about'``, ``'nutrition'``,
        ``'grooming'``, ``'exercise'``, ``'training'``, ``'health'``,
        ``'rank'``, ``'height'``, ``'weight'``, ``'age'`` and ``'group'``.
    """
    def _clean(tag):
        # Stripped text of a tag as a native ``str``. On Python 3,
        # str(bytes) would produce the literal "b'...'", so decode instead;
        # on Python 2 the UTF-8 byte string is already a ``str``.
        raw = tag.text.encode('utf-8').strip()
        return raw if isinstance(raw, str) else raw.decode('utf-8')

    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even if the page load fails.
        driver.quit()

    dog_dict = {}
    #Get temperament (if several elements match, the last one wins)
    for item in soup.find_all('span', attrs={'class':'attribute-list__description attribute-list__text attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'}):
        dog_dict['temperament'] = _clean(item)
    #Get attributes (the trailing space in the class string is intentional:
    #it is matched against the exact class attribute value)
    attributes = [
        _clean(item)
        for item in soup.find_all('span', attrs={'class': 'attribute-list__description attribute-list__text '})
    ]
    #Get about
    for item in soup.find_all('div', attrs={'class':'breed-info__content-wrap'},limit=1):
        dog_dict['about'] = _clean(item)
    #Get care: each tab panel is stored under every topic whose upper-case
    #heading appears in its text
    care_topics = (
        ("NUTRITION", 'nutrition'),
        ("GROOMING", 'grooming'),
        ("EXERCISE", 'exercise'),
        ("TRAINING", 'training'),
        ("HEALTH", 'health'),
    )
    for item in soup.find_all('div', attrs={'class':'tabs__tab-panel-content'}):
        panel_text = _clean(item)
        for heading, key in care_topics:
            if heading in panel_text:
                dog_dict[key] = panel_text
    #Put attributes into dictionary; they are identified purely by position
    #on the page, and zip() stops at the shorter sequence
    for key, value in zip(('rank', 'height', 'weight', 'age', 'group'), attributes):
        dog_dict[key] = value
    dog_dict['url'] = link
    return dog_dict

In [24]:
#Main loop: scrape every AKC (American Kennel Club) breed page into one DataFrame
page = 'https://www.akc.org/dog-breeds/page/1/'
dog_breeds_akc = get_akc_links(page)
# Collect one dict per breed and build the frame once at the end.
# DataFrame.append inside the loop copied the whole frame on every iteration
# and no longer exists in pandas 2.x; this form also copes with an empty
# link list instead of failing on dog_breeds_akc[0].
akc_records = []
for breed in dog_breeds_akc:
    akc_dog_dict = get_akc_breed_info(breed)
    akc_records.append(akc_dog_dict)
# Rows are indexed 0..n-1 in scrape order; columns are sorted so the layout
# does not depend on which fields the first breed happened to have.
df_akc = pd.DataFrame(akc_records).sort_index(axis=1)

Let's scrape a second source, DogTime: https://dogtime.com/dog-breeds/profiles/

In [None]:
def get_dogtime_links(page, driver_path='/Users/chris/Downloads/chromedriver'):
    """Return the breed profile links found on a dogtime listing page.

    Parameters
    ----------
    page : str
        URL of the listing page to load.
    driver_path : str, optional
        Location of the chromedriver executable.

    Returns
    -------
    list
        The ``href`` of every ``<a class="list-item-title">`` element, in
        page order.
    """
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(page)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even if the page load fails.
        driver.quit()
    #Now we need to parse the dog breeds; href=True skips anchors without a
    #target, which would otherwise raise KeyError on item['href']
    return [
        item['href']
        for item in soup.find_all('a', attrs={'class':'list-item-title'}, href=True)
    ]

In [None]:
def get_dogtime_breed_info(link, driver_path='/Users/chris/Downloads/chromedriver'):
    """Scrape one dogtime breed profile into a dictionary.

    Parameters
    ----------
    link : str
        URL of the breed profile.
    driver_path : str, optional
        Location of the chromedriver executable.

    Returns
    -------
    dict
        Always contains ``'url'``. May also contain ``'about'``, one entry
        per rated characteristic (key: the text before the first digit,
        value: that digit as a string) and the positional vital stats
        ``'group'``, ``'height'``, ``'weight'`` and ``'age'``.
    """
    def _clean(tag):
        # Stripped text of a tag as a native ``str``. On Python 3,
        # str(bytes) would produce the literal "b'...'", so decode instead;
        # on Python 2 the UTF-8 byte string is already a ``str``.
        raw = tag.text.encode('utf-8').strip()
        return raw if isinstance(raw, str) else raw.decode('utf-8')

    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, and the finally clause runs even if the page load fails.
        driver.quit()

    dog_dict = {}
    #get introduction
    for item in soup.find_all('div', attrs={'class':'breeds-single-intro'}):
        dog_dict['about'] = _clean(item)
    #get ratings
    for item in soup.find_all('div', attrs={'class':'js-list-item child-characteristic'}):
        text = _clean(item)
        score = re.search('[0-9]', text)
        if score is None:
            # No digit in this entry: nothing to record, and calling
            # .group() on None would abort the whole scrape.
            continue
        # Everything before the first digit is the characteristic name.
        dog_dict[text[:score.start()]] = score.group(0)
    #get vital info: keep the text between the first and second colon
    attributes = []
    for item in soup.find_all('div', attrs={'class':'vital-stat-box'}):
        parts = _clean(item).split(':')
        # Keep a placeholder when there is no colon so the positional
        # mapping below stays aligned.
        attributes.append(parts[1] if len(parts) > 1 else None)
    # Vital stats are identified purely by position on the page.
    for key, value in zip(('group', 'height', 'weight', 'age'), attributes):
        dog_dict[key] = value
    dog_dict['url'] = link
    return dog_dict

In [None]:
#Main loop: scrape every dogtime breed profile into one DataFrame
dogtime_breeds = get_dogtime_links('https://dogtime.com/dog-breeds/profiles/')
# Keep every scraped record; previously each dict was overwritten by the
# next iteration, so nothing survived the loop.
dogtime_records = []
for breed in dogtime_breeds:
    dogtime_dog_dict = get_dogtime_breed_info(breed)
    dogtime_records.append(dogtime_dog_dict)
df_dogtime = pd.DataFrame(dogtime_records)

In [26]:
# Save the AKC table to CSV, then preview it. head() has to be the last
# expression in the cell for the notebook to render it.
df_akc.to_csv('akc_breeds.csv')
df_akc.head()