# Since there are over 1200 distinct dog breeds in the 'Breed' column, it is unreasonable to apply one-hot encoding to each unique breed. Doing so would most likely result in overfitting.

# The strategy here is to group the breeds either by size or by type/category of dog. The breed lists will be crawled from the following webpage:

### http://www.dogbreedslist.info/small-dog-breeds/#.Wfxwb2i0NPY

In [1]:
import requests
from bs4 import BeautifulSoup
import re

## 1) Let us first extract the dog list by size

#### First up is small sized dogs

In [8]:
def dog_size_crawler(url, selector, timeout=10):
    """Download one listing page and return the text of every element matching ``selector``.

    Args:
        url: Address of the page to fetch.
        selector: CSS selector identifying the elements whose text is collected.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of strings, one per matched element, in the order
        ``soup.select`` returns them.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failures or timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error
    # page and returning an empty (or wrong) list.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    return [dog.get_text() for dog in soup.select(selector)]

In [15]:
def dog_size_compile(url_list, selector):
    """Crawl every page in ``url_list`` and merge the results into one flat list.

    Args:
        url_list: Iterable of page URLs, crawled in the given order.
        selector: CSS selector passed unchanged to ``dog_size_crawler``.

    Returns:
        A single list holding the entries of each page, page after page.
    """
    return [
        breed_name
        for page_url in url_list
        for breed_name in dog_size_crawler(page_url, selector)
    ]

#### Small size

In [16]:
# Pages 1-6 of the small-breed listing differ only in the page number.
small_url_list = [
    'http://www.dogbreedslist.info/small-dog-breeds/list_2_{}.html#.Wfx1Mmi0NPY'.format(page_no)
    for page_no in range(1, 7)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
small_dog_compile = dog_size_compile(url_list=small_url_list, selector=selector)

In [18]:
# Show how many names were collected, then the names themselves.
print(len(small_dog_compile), small_dog_compile, sep='\n')

104
['Beagle', 'French Bulldog', 'Yorkshire Terrier', 'Dachshund', 'Miniature Schnauzer', 'Pembroke Welsh Corgi', 'Cavalier King Charles Spaniel', 'Shih Tzu', 'Boston Terrier', 'Pomeranian', 'Havanese', 'Shetland Sheepdog', 'Chihuahua', 'Pug', 'Maltese', 'West Highland White Terrier', 'Bichon Frise', 'Papillon', 'Scottish Terrier', 'Miniature Pinscher', 'Cardigan Welsh Corgi', 'Cairn Terrier', 'Lhasa Apso', 'Italian Greyhound', 'Chinese Crested', 'Coton De Tulear', 'Border Terrier', 'Jack Russell Terrier', 'Keeshond', 'Pekingese', 'Rat Terrier', 'Brussels Griffon', 'Silky Terrier', 'Norwich Terrier', 'American Hairless Terrier', 'Japanese Chin', 'Parson Russell Terrier', 'Welsh Terrier', 'Schipperke', 'Irish Terrier', 'Toy Fox Terrier', 'Tibetan Spaniel', 'Miniature Bull Terrier', 'American Eskimo Dog', 'Smooth Fox Terrier', 'English Toy Spaniel', 'Manchester Terrier', 'Norfolk Terrier', 'Australian Terrier', 'Bedlington Terrier', 'Xoloitzcuintli', 'Lakeland Terrier', 'Petit Basset Gri

#### Medium size

In [19]:
# Pages 1-6 of the medium-breed listing differ only in the page number.
medium_url_list = [
    'http://www.dogbreedslist.info/medium-dog-breeds/list_3_{}.html#.Wfx3bmi0NPY'.format(page_no)
    for page_no in range(1, 7)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
medium_dog_compile = dog_size_compile(url_list=medium_url_list, selector=selector)

In [20]:
# Show how many names were collected, then the names themselves.
print(len(medium_dog_compile), medium_dog_compile, sep='\n')

120
['Labrador Retriever', 'Bulldog', 'Poodle', 'Boxer', 'Siberian Husky', 'Australian Shepherd', 'Brittany', 'English Springer Spaniel', 'American Cocker Spaniel', 'Vizsla', 'Weimaraner', 'Miniature American Shepherd', 'Border Collie', 'Basset Hound', 'Shiba Inu', 'Belgian Malinois', 'Soft Coated Wheaten Terrier', 'Portuguese Water Dog', 'Australian Cattle Dog', 'Airedale Terrier', 'English Cocker Spaniel', 'Bull Terrier', 'Whippet', 'Shar-Pei', 'German Wirehaired Pointer', 'Samoyed', 'Wirehaired Pointing Griffon', 'Chow Chow', 'American Staffordshire Terrier', 'Staffordshire Bull Terrier', 'Standard Schnauzer', 'Nova Scotia Duck Tolling Retriever', 'Basenji', 'Tibetan Terrier', 'Norwegian Elkhound', 'Wire Fox Terrier', 'Belgian Tervuren', 'Boykin Spaniel', 'Lagotto Romagnolo', 'Pointer', 'American Eskimo Dog', 'Welsh Springer Spaniel', 'Bearded Collie', 'Kerry Blue Terrier', 'Treeing Walker Coonhound', 'Berger Picard', 'Redbone Coonhound', 'Clumber Spaniel', 'German Pinscher', 'Field

#### Large size

In [21]:
# Pages 1-6 of the large-breed listing differ only in the page number.
large_url_list = [
    'http://www.dogbreedslist.info/large-dog-breeds/list_4_{}.html#.Wfx9nWi0NPY'.format(page_no)
    for page_no in range(1, 7)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
large_dog_compile = dog_size_compile(url_list=large_url_list, selector=selector)

In [22]:
# Show how many names were collected, then the names themselves.
print(len(large_dog_compile), large_dog_compile, sep='\n')

93
['German Shepherd', 'Golden Retriever', 'Rottweiler', 'German Shorthaired Pointer', 'Doberman Pinscher', 'Bernese Mountain Dog', 'Collie', 'Cane Corso', 'Rhodesian Ridgeback', 'Chesapeake Bay Retriever', 'Akita', 'Bullmastiff', 'Bloodhound', 'Alaskan Malamute', 'Dogue de Bordeaux', 'Dalmatian', 'Old English Sheepdog', 'Irish Setter', 'Greater Swiss Mountain Dog', 'Bouvier des Flandres', 'Anatolian Shepherd Dog', 'Flat-Coated Retriever', 'Borzoi', 'English Setter', 'Gordon Setter', 'Spinone Italiano', 'Afghan Hound', 'Black Russian Terrier', 'Bluetick Coonhound', 'Saluki', 'Black and Tan Coonhound', 'Belgian Sheepdog', 'Boerboel', 'Briard', 'Tibetan Mastiff', 'Beauceron', 'Irish Red and White Setter', 'Ibizan Hound', 'Curly-Coated Retriever', 'Otterhound', 'Chinook', 'Komondor', 'Bergamasco', 'American Foxhound', 'Grand Bleu de Gascogne', 'Kangal Dog', 'Spanish Greyhound', 'Akbash Dog', 'Alano Espanol', 'Alapaha Blue Blood Bulldog', 'American Mastiff', 'American Staghound', 'Argentin

#### Giant size

In [23]:
# The giant-breed listing consists of a single page.
giant_url_list = [
    'http://www.dogbreedslist.info/giant-dog-breeds/list_5_1.html#.Wfx9nWi0NPY',
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
giant_dog_compile = dog_size_compile(url_list=giant_url_list, selector=selector)

In [24]:
# Show how many names were collected, then the names themselves.
print(len(giant_dog_compile), giant_dog_compile, sep='\n')

18
['Great Dane', 'Mastiff', 'Newfoundland', 'St. Bernard', 'Great Pyrenees', 'Irish Wolfhound', 'Giant Schnauzer', 'Leonberger', 'Neapolitan Mastiff', 'Greyhound', 'Scottish Deerhound', 'Kuvasz', 'Chart Polski', 'Landseer', 'Pakistani Mastiff', 'Shiloh Shepherd dog', 'Spanish Mastiff', 'King Shepherd']


## 2) Alternatively, we could extract the lists according to the grouping categories used by the American Kennel Club

### Reference: https://en.wikipedia.org/wiki/List_of_dog_breeds_recognized_by_the_American_Kennel_Club

In [25]:
def dog_type_crawler(url, selector, timeout=10):
    """Download one breed-group page and return the text of every element matching ``selector``.

    Args:
        url: Address of the page to fetch.
        selector: CSS selector identifying the elements whose text is collected.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of strings, one per matched element, in the order
        ``soup.select`` returns them.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failures or timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Surface HTTP errors rather than scraping an error page and quietly
    # returning an empty (or wrong) list.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    return [dog.get_text() for dog in soup.select(selector)]

def dog_type_compile(url_list, selector):
    """Crawl every page in ``url_list`` and merge the results into one flat list.

    Args:
        url_list: Iterable of page URLs, crawled in the given order.
        selector: CSS selector passed unchanged to ``dog_type_crawler``.

    Returns:
        A single list holding the entries of each page, page after page.
    """
    return [
        breed_name
        for page_url in url_list
        for breed_name in dog_type_crawler(page_url, selector)
    ]

#### Toy type

In [26]:
# Pages 1-2 of the toy-group listing differ only in the page number.
toy_url_list = [
    'http://www.dogbreedslist.info/toy-dog-breeds/list_6_{}.html#.WfyEA2i0NPY'.format(page_no)
    for page_no in range(1, 3)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
toy_dog_compile = dog_type_compile(url_list=toy_url_list, selector=selector)

In [27]:
# Show how many names were collected, then the names themselves.
print(len(toy_dog_compile), toy_dog_compile, sep='\n')

26
['Yorkshire Terrier', 'Cavalier King Charles Spaniel', 'Shih Tzu', 'Pomeranian', 'Havanese', 'Chihuahua', 'Pug', 'Maltese', 'Bichon Frise', 'Papillon', 'Miniature Pinscher', 'Italian Greyhound', 'Chinese Crested', 'Coton De Tulear', 'Pekingese', 'Brussels Griffon', 'Silky Terrier', 'Japanese Chin', 'Toy Fox Terrier', 'English Toy Spaniel', 'Affenpinscher', 'Toy Poodle', 'Karst Shepherd', 'Beaglier', 'Bolognese dog', 'Russian Tsvetnaya Bolonka']


#### Sporting

In [28]:
# Pages 1-2 of the sporting-group listing differ only in the page number.
sporting_url_list = [
    'http://www.dogbreedslist.info/sporting-dog-breeds/list_7_{}.html#.WfyEbmi0NPY'.format(page_no)
    for page_no in range(1, 3)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
sporting_dog_compile = dog_type_compile(url_list=sporting_url_list, selector=selector)

In [29]:
# Show how many names were collected, then the names themselves.
print(len(sporting_dog_compile), sporting_dog_compile, sep='\n')

30
['Labrador Retriever', 'Golden Retriever', 'German Shorthaired Pointer', 'Brittany', 'English Springer Spaniel', 'American Cocker Spaniel', 'Vizsla', 'Weimaraner', 'Chesapeake Bay Retriever', 'English Cocker Spaniel', 'German Wirehaired Pointer', 'Wirehaired Pointing Griffon', 'Irish Setter', 'Nova Scotia Duck Tolling Retriever', 'Flat-Coated Retriever', 'English Setter', 'Gordon Setter', 'Spinone Italiano', 'Boykin Spaniel', 'Pointer', 'Welsh Springer Spaniel', 'Irish Red and White Setter', 'Clumber Spaniel', 'Field Spaniel', 'American Water Spaniel', 'Wirehaired Vizsla', 'Curly-Coated Retriever', 'Sussex Spaniel', 'Barbet', 'Kooikerhondje']


#### Hound

In [30]:
# Pages 1-4 of the hound-group listing differ only in the page number.
hound_url_list = [
    'http://www.dogbreedslist.info/hound-dog-breeds/list_8_{}.html#.WfyEz2i0NPY'.format(page_no)
    for page_no in range(1, 5)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
hound_dog_compile = dog_type_compile(url_list=hound_url_list, selector=selector)

In [31]:
# Show how many names were collected, then the names themselves.
print(len(hound_dog_compile), hound_dog_compile, sep='\n')

65
['Beagle', 'Dachshund', 'Basset Hound', 'Rhodesian Ridgeback', 'Bloodhound', 'Whippet', 'Irish Wolfhound', 'Basenji', 'Norwegian Elkhound', 'Borzoi', 'Afghan Hound', 'Bluetick Coonhound', 'Saluki', 'Black and Tan Coonhound', 'Treeing Walker Coonhound', 'Redbone Coonhound', 'Petit Basset Griffon Vendeen', 'Greyhound', 'Ibizan Hound', 'Scottish Deerhound', 'Plott Hound', 'Portuguese Podengo Pequeno', 'Otterhound', 'Pharaoh Hound', 'American English Coonhound', 'Finnish Spitz', 'Cirneco dell’Etna', 'Harrier', 'English Foxhound', 'American Foxhound', 'Grand Bleu de Gascogne', 'Spanish Greyhound', 'Africanis', 'Ariegeois', 'Austrian Black and Tan Hound', 'Azawakh', 'Basset Bleu de Gascogne', 'Basset Fauve de Bretagne', 'Bavarian Mountain Hound', 'Black Mouth Cur', 'Black Norwegian Elkhound', 'Blue Picardy Spaniel', 'Deutsche Bracke', 'Dingo', 'Drever', 'Dunker', 'East Siberian Laika', 'Finnish Hound', 'French Spaniel', 'Francais Blanc et Noir', 'Polish Hunting Dog', 'Grand Basset Griffon

#### Terrier

In [32]:
# Pages 1-3 of the terrier-group listing differ only in the page number.
terrier_url_list = [
    'http://www.dogbreedslist.info/terrier-dog-breeds/list_9_{}.html#.WfyFRGi0NPY'.format(page_no)
    for page_no in range(1, 4)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
terrier_dog_compile = dog_type_compile(url_list=terrier_url_list, selector=selector)

In [33]:
# Show how many names were collected, then the names themselves.
print(len(terrier_dog_compile), terrier_dog_compile, sep='\n')

42
['Miniature Schnauzer', 'West Highland White Terrier', 'Soft Coated Wheaten Terrier', 'Airedale Terrier', 'Bull Terrier', 'Scottish Terrier', 'Cairn Terrier', 'American Staffordshire Terrier', 'Staffordshire Bull Terrier', 'Border Terrier', 'Jack Russell Terrier', 'Rat Terrier', 'Wire Fox Terrier', 'Norwich Terrier', 'American Hairless Terrier', 'Parson Russell Terrier', 'Welsh Terrier', 'Irish Terrier', 'Miniature Bull Terrier', 'Smooth Fox Terrier', 'Kerry Blue Terrier', 'Manchester Terrier', 'Norfolk Terrier', 'Australian Terrier', 'Bedlington Terrier', 'Lakeland Terrier', 'Sealyham Terrier', 'Dandie Dinmont Terrier', 'Skye Terrier', 'Glen of Imaal Terrier', 'Cesky Terrier', 'American Pit Bull Terrier', 'Austrian Pinscher', 'Braque du Bourbonnais', 'Brazilian Terrier', 'Dutch Smoushond', 'Jagdterrier', 'Japanese Terrier', 'Kromfohrlander', 'Patterdale Terrier', 'Mountain Feist', 'Lucas Terrier']


#### Working

In [34]:
# Pages 1-3 of the working-group listing differ only in the page number.
working_url_list = [
    'http://www.dogbreedslist.info/working-dog-breeds/list_10_{}.html#.WfyFsmi0NPY'.format(page_no)
    for page_no in range(1, 4)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
working_dog_compile = dog_type_compile(url_list=working_url_list, selector=selector)

In [35]:
# Show how many names were collected, then the names themselves.
print(len(working_dog_compile), working_dog_compile, sep='\n')

50
['Rottweiler', 'Boxer', 'Siberian Husky', 'Great Dane', 'Doberman Pinscher', 'Bernese Mountain Dog', 'Mastiff', 'Newfoundland', 'Cane Corso', 'Akita', 'Bullmastiff', 'St. Bernard', 'Portuguese Water Dog', 'Alaskan Malamute', 'Dogue de Bordeaux', 'Samoyed', 'Great Pyrenees', 'Old English Sheepdog', 'Greater Swiss Mountain Dog', 'Giant Schnauzer', 'Anatolian Shepherd Dog', 'Standard Schnauzer', 'Leonberger', 'Neapolitan Mastiff', 'Black Russian Terrier', 'Boerboel', 'Tibetan Mastiff', 'German Pinscher', 'Kuvasz', 'Chinook', 'Komondor', 'Akbash Dog', 'Alano Espanol', 'Alapaha Blue Blood Bulldog', 'American Mastiff', 'Appenzeller Sennenhund', 'Argentine Dogo', 'Ariege Pointer', 'Blue Lacy', 'Canadian Eskimo Dog', 'Fila Brasileiro', 'Perro de Presa Canario', 'Plummer Terrier', 'Seppala Siberian Sleddog', 'Tamaskan Dog', 'Wetterhoun', 'Alaskan Husky', 'Alopekis', 'Maremma Sheepdog', 'Mountain Cur']


#### Herding

In [36]:
# Pages 1-3 of the herding-group listing differ only in the page number.
herding_url_list = [
    'http://www.dogbreedslist.info/herding-dog-breeds/list_11_{}.html#.WfyF9Gi0NPY'.format(page_no)
    for page_no in range(1, 4)
]

# CSS path to the breed-name link of each entry on a listing page.
selector = 'body > div.main > div.main-r > div > div.list-01 > div.right > div.right-t > p > a'
herding_dog_compile = dog_type_compile(url_list=herding_url_list, selector=selector)

In [37]:
# Show how many names were collected, then the names themselves.
print(len(herding_dog_compile), herding_dog_compile, sep='\n')

48
['German Shepherd', 'Australian Shepherd', 'Pembroke Welsh Corgi', 'Shetland Sheepdog', 'Miniature American Shepherd', 'Collie', 'Border Collie', 'Belgian Malinois', 'Australian Cattle Dog', 'Cardigan Welsh Corgi', 'Bouvier des Flandres', 'Belgian Tervuren', 'Bearded Collie', 'Belgian Sheepdog', 'Briard', 'Beauceron', 'Berger Picard', 'Icelandic Sheepdog', 'Puli', 'Entlebucher Mountain Dog', 'Swedish Vallhund', 'Polish Lowland Sheepdog', 'Finnish Lapphund', 'Norwegian Buhund', 'Pyrenean Shepherd', 'Canaan Dog', 'Bergamasco', 'Kangal Dog', 'American White Shepherd', 'Australian Kelpie', 'Australian Stumpy Tail Cattle Dog', 'Belgian Shepherd Laekenois', 'Bouvier des Ardennes', 'Cão da Serra de Aires', 'Catahoula Leopard Dog', 'Catalan Sheepdog', 'Croatian Sheepdog', 'Czechoslovakian Wolfdog', 'Dutch Shepherd Dog', 'English Shepherd', 'Lancashire Heeler', 'Lapponian Herder', 'McNab', 'Mudi', 'Pumi', 'Saarloos wolfdog', 'Schapendoes', 'Polish Tatra Sheepdog']
