In [1]:
import numpy as np
import pandas as pd
import requests
from requests import get
from bs4 import BeautifulSoup
import pickle

## dogtime.com webscrape

#### Dogtime has information on over 200 breeds of dogs, including attributes related to adaptability, friendliness, trainability, exercise needs, etc. -- all with ratings from 1-5.  This will be very helpful in quantifying a breed by its traits.

In [2]:
url = 'https://dogtime.com/dog-breeds'
# Get contents of webpage
webpage = requests.get(url).content
# Process html for scraping
soup = BeautifulSoup(webpage,'html.parser')

In [3]:
# All breeds are located in an 'a' tag with a class that says 'post-title'
soup.find(class_='post-title')

<a class="post-title" href="https://dogtime.com/dog-breeds/affenpinscher">Affenpinscher</a>

In [4]:
# Using find_all to get the html & name of every breed on dogtime.com
breeds = soup.find_all(class_='post-title')
# URL of the dog can be separated using this
breeds[0]['href']

'https://dogtime.com/dog-breeds/affenpinscher'

In [5]:
# Breed can be split off and put into its own list for later comparison to the
# dog classifier list
breeds[0]['href'].split('/')[-1]

'affenpinscher'

In [6]:
# List comprehension to get all urls on webpage
breed_urls = [x['href'] for x in breeds]
breed_urls[:10]

['https://dogtime.com/dog-breeds/affenpinscher',
 'https://dogtime.com/dog-breeds/afghan-hound',
 'https://dogtime.com/dog-breeds/airedale-terrier',
 'https://dogtime.com/dog-breeds/akita',
 'https://dogtime.com/dog-breeds/alaskan-klee-kai',
 'https://dogtime.com/dog-breeds/alaskan-malamute',
 'https://dogtime.com/dog-breeds/american-bulldog',
 'https://dogtime.com/dog-breeds/american-english-coonhound',
 'https://dogtime.com/dog-breeds/american-eskimo-dog',
 'https://dogtime.com/dog-breeds/american-foxhound']

In [7]:
# List comprehension to get all dog names on webpage
dogtime_breed_list = [x.split('/')[-1] for x in breed_urls]
dogtime_breed_list[:10]

['affenpinscher',
 'afghan-hound',
 'airedale-terrier',
 'akita',
 'alaskan-klee-kai',
 'alaskan-malamute',
 'american-bulldog',
 'american-english-coonhound',
 'american-eskimo-dog',
 'american-foxhound']

In [8]:
def page_scrape(url):
    '''
    INPUT: web address
    OUTPUT: Dictionary detailing dog attributes as keys and ratings as values
    '''
    output = {}
    
    # Download html content
    response = get(url)
    
    # Use Beautiful Soup to clean html
    html_soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the div containing the attributes and ratings, [0] makes sure it's not a list
    content = html_soup.find_all('div', class_ = "js-listing-box dashed-box characteristics-ratings")
    content = content[0]
    
    # Categories and subcategories are organized in a different way in the html
    parent_ratings = content.find_all('div', class_="star-by-breed clearfix default-border-bottom default-margin-bottom default-padding-bottom parent-characteristic")
    child_ratings = content.find_all('div',class_="js-list-item item-expandable-content default-border-bottom default-margin-bottom default-padding-bottom star-by-breed child-characteristic")
    
    for char in parent_ratings:
        attribute = char.find('span', class_="characteristic item-trigger-title").text
        rating = int(char.find('span',class_='star')['class'][1][-1])
        output[attribute] = rating

    for char in child_ratings:
        attribute = char.find('span',class_="characteristic item-trigger-title").text
        rating = int(char.find('span',class_="star").text)
        output[attribute] = rating        

    return output

In [9]:
# Attributes and accompanying ratings for 'affenpinscher'
page_scrape('https://dogtime.com/dog-breeds/affenpinscher')

{' Adaptability': 3,
 ' All Around Friendliness': 3,
 ' Health Grooming': 2,
 ' Trainability': 3,
 ' Exercise Needs': 4,
 'Adapts Well to Apartment Living': 5,
 'Good For Novice Owners': 4,
 'Sensitivity Level': 3,
 'Tolerates Being Alone': 1,
 'Tolerates Cold Weather': 3,
 'Tolerates Hot Weather': 3,
 'Affectionate with Family': 5,
 'Incredibly Kid Friendly Dogs': 1,
 'Dog Friendly': 4,
 'Friendly Toward Strangers': 3,
 'Amount Of Shedding': 1,
 'Drooling Potential': 1,
 'Easy To Groom': 3,
 'General Health': 4,
 'Potential For Weight Gain': 3,
 'Size': 1,
 'Easy To Train': 2,
 'Intelligence': 4,
 'Potential For Mouthiness': 4,
 'Prey Drive': 3,
 'Tendency To Bark Or Howl': 2,
 'Wanderlust Potential': 2,
 'Energy Level': 4,
 'Intensity': 3,
 'Exercise Needs': 3,
 'Potential For Playfulness': 4}

In [10]:
def dogtime_scrape(url_list):
    '''
    Input: List of dogs urls to scrape website
    Output: Dictionary: Keys are dog breed
            Values are dictionarys with attributes as keys and
            ratings as values
    '''
    output = {}
    for url in url_list:
        key = url.split('/')[-1]
        try:
            output[key] = page_scrape(url)
        except:
            #korean-jindo-dog & xoloitzcuintli brought up errors with a previous method
            print(key)
    return output

In [11]:
data = dogtime_scrape(breed_urls)

In [30]:
#data

In [13]:
# Convert dictionary into a readable dataframe
traits_df = pd.DataFrame(data).T
# Switch breeds from columns to rows; switch attributes from rows to columns
traits_df.head(10)

Unnamed: 0,Adaptability,All Around Friendliness,Exercise Needs,Health Grooming,Trainability,Adapts Well to Apartment Living,Affectionate with Family,Amount Of Shedding,Dog Friendly,Drooling Potential,...,Potential For Playfulness,Potential For Weight Gain,Prey Drive,Sensitivity Level,Size,Tendency To Bark Or Howl,Tolerates Being Alone,Tolerates Cold Weather,Tolerates Hot Weather,Wanderlust Potential
affenpinscher,3.0,3.0,4.0,2.0,3.0,5.0,5.0,1.0,4.0,1.0,...,4.0,3.0,3.0,3.0,1.0,2.0,1.0,3.0,3.0,2.0
afghan-hound,4.0,4.0,4.0,2.0,3.0,5.0,5.0,4.0,4.0,1.0,...,4.0,1.0,5.0,5.0,4.0,2.0,2.0,5.0,5.0,5.0
airedale-terrier,2.0,4.0,5.0,3.0,5.0,1.0,4.0,2.0,4.0,1.0,...,5.0,4.0,5.0,3.0,3.0,4.0,2.0,3.0,3.0,4.0
akita,3.0,2.0,4.0,4.0,4.0,2.0,5.0,5.0,1.0,5.0,...,5.0,4.0,4.0,5.0,4.0,5.0,1.0,5.0,2.0,4.0
alaskan-klee-kai,3.0,3.0,4.0,3.0,4.0,3.0,4.0,4.0,2.0,2.0,...,4.0,2.0,5.0,4.0,2.0,3.0,2.0,5.0,2.0,4.0
alaskan-malamute,2.0,4.0,5.0,3.0,4.0,1.0,4.0,5.0,2.0,1.0,...,5.0,3.0,4.0,4.0,4.0,5.0,1.0,5.0,2.0,5.0
american-bulldog,2.0,3.0,4.0,3.0,3.0,1.0,5.0,1.0,2.0,4.0,...,4.0,2.0,4.0,4.0,4.0,2.0,1.0,2.0,2.0,3.0
american-english-coonhound,3.0,5.0,5.0,3.0,5.0,1.0,5.0,3.0,4.0,1.0,...,5.0,3.0,5.0,5.0,3.0,5.0,2.0,4.0,4.0,5.0
american-eskimo-dog,4.0,5.0,4.0,3.0,4.0,5.0,5.0,5.0,5.0,1.0,...,5.0,4.0,3.0,4.0,2.0,4.0,2.0,5.0,3.0,3.0
american-foxhound,2.0,5.0,5.0,4.0,4.0,1.0,4.0,3.0,5.0,1.0,...,5.0,5.0,4.0,2.0,3.0,5.0,1.0,4.0,4.0,5.0


In [14]:
#Save data to .csv
traits_df.to_csv(r'/Users/Chris/Desktop/Galvanize/dsi-capstone/breed_traits.csv')

## Matching Kaggle Photo Labels to dogtime.com Labels

#### As a baseline model, I'll be using a photo classifier trained on photos from a kaggle competition that predicts a dogs breed by its photo, which came from the Stanford Dogs Dataset.  

#### The issue is the prediction output (i.e. name of the breed predicted) is different from the above matrix.

#### Example: the trained classifier would output 'mexican_hairless', but dogtime labels it as 'Xoloitzcuintli'

In [15]:
kaggle_labels = 'labels.csv'

In [16]:
kaggle_labels_df = pd.read_csv(kaggle_labels)

In [17]:
kaggle_labels_df.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [18]:
prediction_output = sorted(kaggle_labels_df['breed'].unique())

In [19]:
len(prediction_output)

120

In [20]:
prediction_output[:15]

['affenpinscher',
 'afghan_hound',
 'african_hunting_dog',
 'airedale',
 'american_staffordshire_terrier',
 'appenzeller',
 'australian_terrier',
 'basenji',
 'basset',
 'beagle',
 'bedlington_terrier',
 'bernese_mountain_dog',
 'black-and-tan_coonhound',
 'blenheim_spaniel',
 'bloodhound']

In [21]:
dogtime_breed_list[:15]

['affenpinscher',
 'afghan-hound',
 'airedale-terrier',
 'akita',
 'alaskan-klee-kai',
 'alaskan-malamute',
 'american-bulldog',
 'american-english-coonhound',
 'american-eskimo-dog',
 'american-foxhound',
 'american-pit-bull-terrier',
 'american-staffordshire-terrier',
 'american-water-spaniel',
 'anatolian-shepherd-dog',
 'appenzeller-sennenhunde']

In [22]:
copy = prediction_output.copy()

In [23]:
print(len(copy))

120


In [24]:
translation_dict = {}

In [25]:
while len(copy)>23:
    
    # 'beagle' = 'beagle'
    for i,dog in enumerate(copy):
        if dog in dogtime_breed_list:
            key = copy.pop(i)
            translation_dict[key] = key

    print('found: {},not found: {}'.format(len(translation_dict.keys()),len(copy)))

    # 'afghan_hound' doesn't equal 'afghan-hound' until you change '_' to '-'
    for i,dog in enumerate(copy):
        if dog.lower().replace(' ','-').replace('_','-') in dogtime_breed_list:
            key = copy.pop(i)
            translation_dict[key] = key.lower().replace(' ','-').replace('_','-')

    print('found: {},not found: {}'.format(len(translation_dict.keys()),len(copy)))
    
    # 'doberman' is in but doesn't equel 'doberman-pinscher'
    for i,dog in enumerate(copy):
        for dog2 in dogtime_breed_list:
            if dog.lower().replace(' ','-').replace('_','-') in dog2:
                key = copy.pop(i)
                translation_dict[key] = dog2

    print('found: {},not found: {}'.format(len(translation_dict.keys()),len(copy)))

translation_dict

found: 21,not found: 99
found: 54,not found: 66
found: 81,not found: 39
found: 82,not found: 38
found: 90,not found: 30
found: 96,not found: 24
found: 96,not found: 24
found: 96,not found: 24
found: 98,not found: 22


{'affenpinscher': 'affenpinscher',
 'basenji': 'basenji',
 'beagle': 'beagle',
 'bloodhound': 'bloodhound',
 'borzoi': 'borzoi',
 'boxer': 'boxer',
 'briard': 'briard',
 'chihuahua': 'chihuahua',
 'collie': 'collie',
 'keeshond': 'keeshond',
 'komondor': 'komondor',
 'newfoundland': 'newfoundland',
 'otterhound': 'otterhound',
 'pomeranian': 'pomeranian',
 'rottweiler': 'rottweiler',
 'saluki': 'saluki',
 'schipperke': 'schipperke',
 'shih-tzu': 'shih-tzu',
 'vizsla': 'vizsla',
 'weimaraner': 'weimaraner',
 'whippet': 'whippet',
 'afghan_hound': 'afghan-hound',
 'american_staffordshire_terrier': 'american-staffordshire-terrier',
 'australian_terrier': 'australian-terrier',
 'bedlington_terrier': 'bedlington-terrier',
 'black-and-tan_coonhound': 'black-and-tan-coonhound',
 'border_collie': 'border-collie',
 'bouvier_des_flandres': 'bouvier-des-flandres',
 'chesapeake_bay_retriever': 'chesapeake-bay-retriever',
 'cocker_spaniel': 'cocker-spaniel',
 'english_foxhound': 'english-foxhound',

In [26]:
# Edge cases: 'mexican_hairless' is the same breed as 'xoloitzuintli', 
# or pekingese and pekinese are spellt differently

translation_dict['bernese_mountain_dog'] = 'bernese-mountain-dog'
translation_dict['blenheim_spaniel'] = 'cavalier-king-charles-spaniel'
translation_dict['boston_bull'] = 'boston-terrier'
translation_dict['brabancon_griffon'] = 'brussels-griffon'
translation_dict['brittany_spaniel'] = 'brittany'
translation_dict['bull_mastiff'] = 'bullmastiff'
translation_dict['german_short-haired_pointer'] = 'german-shorthaired-pointer'
translation_dict['groenendael'] = 'belgian-sheepdog'
translation_dict['japanese_spaniel'] = 'japanese-chin'
translation_dict['maltese_dog'] = 'maltese'
translation_dict['mexican_hairless'] = 'xoloitzuintli'
translation_dict['miniature_poodle'] = 'poodle'
translation_dict['pekinese'] = 'pekinese'
translation_dict['scotch_terrier'] = 'scotch-terrier'
translation_dict['staffordshire_bullterrier'] = 'staffordshire-bull-terrier'
translation_dict['standard_poodle'] = 'poodle'
translation_dict['toy_poodle'] = 'poodle'
translation_dict['toy_terrier'] = 'toy-fox-terrier'
translation_dict['walker_hound'] = 'treeing-walker-coonhound'
translation_dict['wire-haired_fox_terrier'] = 'fox-terrier'
len(translation_dict.keys())

117

In [27]:
# The last 3 remaining are all wild animals, which are not on dogtime.com: 
translation_dict['african_hunting_dog'] = 'african_hunting_dog'
translation_dict['dhole'] = 'dhole'
translation_dict['dingo'] = 'dingo'
len(translation_dict.keys())

120

In [28]:
# Save dict object through pickle
pickle_out = open('translation_dict.pickle','wb')
pickle.dump(translation_dict, pickle_out)
pickle_out.close

<function BufferedWriter.close>

In [29]:
# Code to re-open pickled dictionary

# pickle_in = open('translation_dict.pickle','rb')
# translation_dict = pickle.load(pickle_in)