# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
import re

Note: the [Clean Breeds](https://github.com/gwoodstock/project4/blob/main/0_data_clean_breeds_info.ipynb) notebook must be run before this one.

In [2]:
# URL of the dogbreedinfo.com page listing hybrid (mixed-breed) dogs; this is the
# page requested and parsed in the cells below.
url = 'https://www.dogbreedinfo.com/hybriddogs.htm'

Get content from url

In [3]:
# Fetch the page. The timeout keeps the cell from hanging indefinitely if the site
# stalls, and raise_for_status() stops the notebook on an HTTP error instead of
# letting the later cells parse an error page.
res = requests.get(url, timeout=30)
res.raise_for_status()
res.status_code

200

Parse the HTML content into a BeautifulSoup object

In [4]:
# Parse the response body with the lxml parser, then collect every <li> found
# inside the first <div class="container"> on the page.
soup = BeautifulSoup(res.content, 'lxml')
list_items = soup.find('div', class_='container').find_all('li')

# Extract Breeds

In [5]:
# Multi-substring replacement technique adapted from Andrew Clark's answer at
# https://stackoverflow.com/questions/6116978/how-to-replace-multiple-substrings-of-a-string

# substrings to strip or normalise, mapped to their replacement text
rep = {'(': '', ')':'', 'mix':'', ' - ':'', '\xa0': ' ', '-\t':'', '- ':'', '= ': ''}
# keys are regex-escaped so they can be joined into one alternation pattern
rep = {re.escape(old): new for old, new in rep.items()}
pattern = re.compile("|".join(rep.keys()))


def _scrub(text):
    """Replace every substring matched by `pattern` with its mapped value from `rep`."""
    return pattern.sub(lambda hit: rep[re.escape(hit.group(0))], text)


def _parse_item(li):
    """Turn one <li> element into a {'name': ..., 'breeds': [...]} dict."""
    anchor = li.find('a')

    if anchor:
        # the name is the link text; the breeds are the last child of the <li>
        # (scrubbed first, lower-cased afterwards)
        dog_name = anchor.text.lower().strip()
        breed_text = _scrub(li.contents[-1]).lower()
    else:
        # no link: normalise the dash separator so "name - breeds" can be split
        flat = li.text.lower().replace(' -', ' - ').replace('- ', ' - ').replace('\xa0', ' ')
        pieces = flat.split(' - ')
        dog_name = _scrub(pieces[0].strip()).strip()
        breed_text = _scrub(pieces[1].strip())

    # parent breeds are separated by " x "
    parents = [part.strip() for part in breed_text.strip().split(' x ')]
    return {'name': dog_name, 'breeds': parents}


# one dict per dog, filled by walking the list items
dogs = []

# the first five <li> elements are skipped; iteration stops at the
# 'Mixed Breed Dog Information' item that follows the dog entries
for li in list_items[5:]:
    if li.text == 'Mixed Breed Dog Information':
        break
    dogs.append(_parse_item(li))


# Clean Breed Names

In [6]:
# Substrings that mark a breed description as needing manual clean-up.
# NOTE(review): breed strings are lower-cased during extraction, so the capitalised
# markers (' Miniature ', ' Toy ', ' Mini ') are not expected to match anything.
# Confirm before changing them: the cells below index breeds_to_fix by position.
suspect_markers = (' with ', ' or ', ' the ', ' ed ', ' Miniature ', ' Toy ', ' Mini ')

breeds_to_fix = []
names = []

# check every breed of every dog; a dog is recorded once per offending breed
for dog in dogs:
    for breed in dog['breeds']:
        if any(marker in breed for marker in suspect_markers):
            names.append(dog['name'])
            breeds_to_fix.append(dog)

Remove the flagged dogs from the main list

In [7]:
# Removes all the bad dogs (every entry whose name was flagged in `names`).
# The list is rebuilt instead of calling dogs.remove() while looping over dogs:
# removing during iteration shifts the remaining items left, so the entry right
# after each removed one was never examined and adjacent bad dogs survived.
# Slice assignment keeps `dogs` the same list object.
flagged_names = set(names)
dogs[:] = [dog for dog in dogs if dog['name'] not in flagged_names]

## Parse mixed breeds

Example:

In [8]:
# Inspect flagged entry 0 before re-adding it by hand. The index is positional, so
# it depends on the order in which the page was scraped.
breeds_to_fix[0]

{'name': 'aussie-corgi',
 'breeds': ['australian shepherd or miniature australian shepherd',
  'pembroke welch corgi']}

In [9]:
# Re-add aussie-corgi once per shepherd variety named in the flagged breed text.
dogs.extend(
    {"name": "aussie-corgi", "breeds": [shepherd, "pembroke welch corgi"]}
    for shepherd in ("australian shepherd", "miniature australian shepherd")
)

In [10]:
# Inspect flagged entry 1 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[1]

{'name': 'aussie pom',
 'breeds': ['australian shepherd or mini or toy australian shepherd',
  'pomeranian']}

In [11]:
# Re-add aussie pom once per shepherd variety (standard, toy, mini).
dogs.extend(
    {"name": "aussie pom", "breeds": [shepherd, "pomeranian"]}
    for shepherd in (
        "australian shepherd",
        "toy australian shepherd",
        "mini australian shepherd",
    )
)


In [12]:
# Inspect flagged entry 2 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[2]

{'name': 'austi-pap',
 'breeds': ['australian shepherd standard, toy or miniature', 'papillon']}

In [13]:
# Re-add austi-pap once per shepherd variety (standard, miniature, toy).
dogs.extend(
    {"name": "austi-pap", "breeds": [shepherd, "papillon"]}
    for shepherd in (
        "australian shepherd",
        "miniature australian shepherd",
        "toy australian shepherd",
    )
)

In [14]:
# Inspect flagged entry 3 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[3]

{'name': 'beagi', 'breeds': ['beagle', 'pembroke or cardigan welsh corgi']}

In [15]:
# Re-add beagi once per corgi variety. The flagged text "pembroke or cardigan
# welsh corgi" shares the trailing "welsh corgi", so the first variety is spelled
# out in full; a bare "pembroke" is not a breed name.
dogs.append({"name": "beagi", "breeds": ["beagle", "pembroke welsh corgi"]})

dogs.append({"name": "beagi", "breeds": ["beagle", "cardigan welsh corgi"]})

In [16]:
# Inspect flagged entry 4 (positional index; depends on scrape order).
breeds_to_fix[4]

{'name': 'border stack',
 'breeds': ['border collie ed with jack russell terrier',
  'staffordshire bull terrier']}

In [17]:
# NOTE(review): unlike the neighbouring cells, nothing here is appended to `dogs`.
# Both literals are bare expressions, so the cell only displays the second one,
# and the breed text is left uncleaned ("... ed with ..."). Confirm whether
# 'border stack' is meant to be left out or still needs a manual fix.
{
    "name": "border stack",
    "breeds": [
        "border collie ed with jack russell terrier",
        "staffordshire bull terrier",
    ],
}

{
    "name": "border stack",
    "breeds": [
        "border collie ed with jack russell terrier",
        "staffordshire bull terrier",
    ],
}

{'name': 'border stack',
 'breeds': ['border collie ed with jack russell terrier',
  'staffordshire bull terrier']}

In [18]:
# Inspect flagged entry 5 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[5]

{'name': 'bully pitsky',
 'breeds': ['american bully', 'siberian husky or alaskan husky']}

In [19]:
# Re-add bully pitsky once per husky variety.
dogs.extend(
    {"name": "bully pitsky", "breeds": ["american bully", husky]}
    for husky in ("siberian husky", "alaskan husky")
)

In [20]:
# Inspect flagged entry 6 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[6]

{'name': 'cotralian',
 'breeds': ['cocker spaniel',
  'standard, miniature or toy australian shepherd']}

In [21]:
# Re-add cotralian once per shepherd variety (standard, toy, miniature).
dogs.extend(
    {"name": "cotralian", "breeds": ["cocker spaniel", shepherd]}
    for shepherd in (
        "australian shepherd",
        "toy australian shepherd",
        "miniature australian shepherd",
    )
)

In [22]:
# Inspect flagged entry 7 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[7]

{'name': 'english cotralian',
 'breeds': ['english cocker spaniel',
  'standard, miniature or toy australian shepherd']}

In [23]:
# Re-add english cotralian once per shepherd variety (standard, toy, miniature).
dogs.extend(
    {"name": "english cotralian", "breeds": ["english cocker spaniel", shepherd]}
    for shepherd in (
        "australian shepherd",
        "toy australian shepherd",
        "miniature australian shepherd",
    )
)

In [24]:
# Inspect flagged entry 8 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[8]

{'name': 'labradoodle miniature',
 'breeds': ['labrador retriever', 'toy or miniature poodle']}

In [25]:
# Re-add labradoodle miniature once per poodle size.
dogs.extend(
    {"name": "labradoodle miniature", "breeds": ["labrador retriever", poodle]}
    for poodle in ("miniature poodle", "toy poodle")
)

In [26]:
# Inspect flagged entry 9 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[9]

{'name': 'miniature goldendoodle',
 'breeds': ['golden retriever', 'toy or miniature poodle']}

In [27]:
# Re-add miniature goldendoodle once per poodle size.
dogs.extend(
    {"name": "miniature goldendoodle", "breeds": ["golden retriever", poodle]}
    for poodle in ("miniature poodle", "toy poodle")
)

In [28]:
# Inspect flagged entry 10 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[10]

{'name': 'miniature labradoodle',
 'breeds': ['labrador retriever', 'toy or miniature poodle']}

In [29]:
# Re-add miniature labradoodle once per poodle size.
dogs.extend(
    {"name": "miniature labradoodle", "breeds": ["labrador retriever", poodle]}
    for poodle in ("miniature poodle", "toy poodle")
)

In [30]:
# Inspect flagged entry 11 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[11]

{'name': 'old anglican bulldogge',
 'breeds': ['american pit bull terrier or american staffordshire terrier',
  'bulldog']}

In [31]:
# Re-add old anglican bulldogge once per terrier alternative.
dogs.extend(
    {"name": "old anglican bulldogge", "breeds": [terrier, "bulldog"]}
    for terrier in ("american pit bull terrier", "american staffordshire terrier")
)

In [32]:
# Inspect flagged entry 12 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[12]

{'name': 'pitsky',
 'breeds': ['american pitbull terrier', 'siberian husky or alaskan husky']}

In [33]:
# Re-add pitsky once per husky variety.
dogs.extend(
    {"name": "pitsky", "breeds": ["american pitbull terrier", husky]}
    for husky in ("alaskan husky", "siberian husky")
)

In [34]:
# Inspect flagged entry 13 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[13]

{'name': 'shel-aussie',
 'breeds': ['australian shepherd or miniature australian shepherd',
  'shetland sheepdog']}

In [35]:
# Re-add shel-aussie once per shepherd variety.
dogs.extend(
    {"name": "shel-aussie", "breeds": [shepherd, "shetland sheepdog"]}
    for shepherd in ("australian shepherd", "miniature australian shepherd")
)

In [36]:
# Inspect flagged entry 14 before re-adding it by hand (positional index; depends
# on scrape order).
breeds_to_fix[14]

{'name': 'sibercaan',
 'breeds': ['siberian husky or native american indian dog', 'canaan dog']}

In [37]:
# Re-add sibercaan once per alternative first parent.
dogs.extend(
    {"name": "sibercaan", "breeds": [parent, "canaan dog"]}
    for parent in ("siberian husky", "native american indian dog")
)

In [38]:
# Manually add a 'labrabull' entry (labrador retriever x pit bull) to the list.
dogs.append({"name": "labrabull", "breeds": ['labrador retriever', 'pit bull']})

In [39]:
# Build a DataFrame from the list of dog dicts (columns: name, breeds) and preview
# the first two rows.
df = pd.DataFrame(dogs)
df.head(2)

Unnamed: 0,name,breeds
0,afador,"[afghan hound, labrador retriever]"
1,afaird,"[afghan hound, briard]"


In [40]:
def _normalise_breed_text(breeds):
    """Return str(breeds) with two breed spellings normalised.

    'german shepherd dog' becomes 'german shepherd', and the double-spaced
    'labrador  retriever' becomes 'labrador retriever'. The value is converted
    with str() first, so a list of breeds becomes its string representation.
    """
    text = str(breeds)
    text = text.replace('german shepherd dog', 'german shepherd')
    return text.replace('labrador  retriever', 'labrador retriever')


# NOTE(review): this only rewrites the DataFrame column. The `dogs` list that the
# aggregation cell iterates over keeps the original spellings; confirm that is intended.
df['breeds'] = df['breeds'].map(_normalise_breed_text)

In [41]:
# drop the 'minicoonhound' row (original note: "breed information available")
df = df.loc[df['name'].ne('minicoonhound')]

# Export Mutts

In [42]:
# Save the mutts table (name, breeds) to CSV without the index column.
# The path is relative to the notebook's working directory.
df.to_csv('../datasets/working_data/mutts.csv', index=False)

# Aggregate Dog Features

In [43]:
# Load the purebred table from the working-data folder (path is relative to the
# notebook's working directory).
purebred_stats = pd.read_csv('../datasets/working_data/breeds_info.csv')

Binarize pure breed column

In [None]:
# Mark every row as purebred (pure = 1); both parent columns, breed_1 and breed_2,
# repeat the breed's own name.
purebred_stats = purebred_stats.assign(
    pure=1,
    breed_1=purebred_stats['breed'],
    breed_2=purebred_stats['breed'],
)

In [45]:
# Preview the first three rows, including the pure / breed_1 / breed_2 columns.
# NOTE(review): the recorded output below lists five rows (0-4), so it appears to
# come from an earlier run; re-run the cell to refresh it.
purebred_stats.head(3)

Unnamed: 0,breed,classification,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,pure,breed_1,breed_2
0,border collie,brightest dogs,0.95,1.0,4.0,19.0,21.0,40.0,40.0,1,border collie,border collie
1,poodle,brightest dogs,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876,1,poodle,poodle
2,german shepherd,brightest dogs,0.95,1.0,4.0,22.0,26.0,75.0,90.0,1,german shepherd,german shepherd
3,golden retriever,brightest dogs,0.95,1.0,4.0,21.0,24.0,55.0,75.0,1,golden retriever,golden retriever
4,doberman pinscher,brightest dogs,0.95,1.0,4.0,26.0,28.0,60.0,100.0,1,doberman pinscher,doberman pinscher


In [46]:
# Summarise column dtypes and non-null counts of the purebred table.
purebred_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   breed               168 non-null    object 
 1   classification      141 non-null    object 
 2   obey                168 non-null    float64
 3   reps_lower          168 non-null    float64
 4   reps_upper          168 non-null    float64
 5   height_low_inches   168 non-null    float64
 6   height_high_inches  168 non-null    float64
 7   weight_low_lbs      168 non-null    float64
 8   weight_high_lbs     168 non-null    float64
 9   pure                168 non-null    int64  
 10  breed_1             168 non-null    object 
 11  breed_2             168 non-null    object 
dtypes: float64(7), int64(1), object(4)
memory usage: 15.9+ KB


Aggregate and merge the metrics from purebred dogs.

In [47]:
# Numeric columns whose mutt value is the mean of the two parent breeds' values.
STAT_COLUMNS = (
    'obey',
    'reps_lower',
    'reps_upper',
    'height_low_inches',
    'height_high_inches',
    'weight_low_lbs',
    'weight_high_lbs',
)


def average_parent_stats(dogs, purebred_stats, stat_columns=STAT_COLUMNS):
    """Build one stats record per mutt by averaging its two parent breeds.

    Parameters
    ----------
    dogs : list of dict
        Each dict has a 'name' and a 'breeds' list of parent breed names.
    purebred_stats : pandas.DataFrame
        Has a 'breed' column plus every column named in `stat_columns`.
    stat_columns : sequence of str
        Columns averaged across the first two parent breeds.

    Returns
    -------
    list of dict
        One record per mutt whose listed breeds are all present in
        `purebred_stats['breed']`, with keys: breed, the stat columns,
        pure (always 0), breed_1 and breed_2.

    Notes
    -----
    Entries with fewer than two breeds are skipped, because there is no second
    parent to average with. For entries with more than two breeds only the first
    two are averaged, as before.
    """
    # One row per breed, indexed by name. Keeping the first occurrence gives the
    # same value the boolean-mask lookup with `.values[0]` used to return.
    stats_by_breed = purebred_stats.drop_duplicates('breed').set_index('breed')
    known_breeds = set(stats_by_breed.index)

    records = []
    for dog in dogs:
        parents = dog['breeds']

        # need two parents, and every listed breed must be a known purebred
        if len(parents) < 2 or not set(parents).issubset(known_breeds):
            continue

        first, second = parents[0], parents[1]

        # the mutt's name goes in the 'breed' column
        record = {'breed': dog['name']}

        # each stat is the plain average of the two parents' values
        for column in stat_columns:
            record[column] = (
                stats_by_breed.at[first, column] + stats_by_breed.at[second, column]
            ) / 2

        # mutts are flagged 0 in the purebred column and keep both parent names
        record['pure'] = 0
        record['breed_1'] = first
        record['breed_2'] = second

        records.append(record)

    return records


# set of purebred names present in the stats table
purebreds = set(purebred_stats['breed'])

mutt_stats = average_parent_stats(dogs, purebred_stats)

mutts = pd.DataFrame(mutt_stats)
mutts.head(2)

Unnamed: 0,breed,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,pure,breed_1,breed_2
0,afador,0.525,41.0,52.0,23.0,25.5,52.5,70.0,0,afghan hound,labrador retriever
1,afaird,0.4,48.5,62.5,24.0,27.0,62.0,68.0,0,afghan hound,briard


Merge

In [48]:
# Stack purebreds on top of mutts and renumber the rows 0..n-1 in one step.
all_dogs = pd.concat([purebred_stats, mutts], ignore_index=True)
all_dogs.head()

Unnamed: 0,breed,classification,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,pure,breed_1,breed_2
0,border collie,brightest dogs,0.95,1.0,4.0,19.0,21.0,40.0,40.0,1,border collie,border collie
1,poodle,brightest dogs,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876,1,poodle,poodle
2,german shepherd,brightest dogs,0.95,1.0,4.0,22.0,26.0,75.0,90.0,1,german shepherd,german shepherd
3,golden retriever,brightest dogs,0.95,1.0,4.0,21.0,24.0,55.0,75.0,1,golden retriever,golden retriever
4,doberman pinscher,brightest dogs,0.95,1.0,4.0,26.0,28.0,60.0,100.0,1,doberman pinscher,doberman pinscher


In [49]:
# Summarise column dtypes and non-null counts of the combined purebred + mutt table.
all_dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   breed               1218 non-null   object 
 1   classification      141 non-null    object 
 2   obey                1218 non-null   float64
 3   reps_lower          1218 non-null   float64
 4   reps_upper          1218 non-null   float64
 5   height_low_inches   1218 non-null   float64
 6   height_high_inches  1218 non-null   float64
 7   weight_low_lbs      1218 non-null   float64
 8   weight_high_lbs     1218 non-null   float64
 9   pure                1218 non-null   int64  
 10  breed_1             1218 non-null   object 
 11  breed_2             1218 non-null   object 
dtypes: float64(7), int64(1), object(4)
memory usage: 114.3+ KB


# Export

In [50]:
# Write the combined purebred + mutt table to CSV without the index column.
# The path is relative to the notebook's working directory.
all_dogs.to_csv('../datasets/working_data/all_breeds_info.csv', index=False)

[Run Next](https://github.com/gwoodstock/project4/blob/main/scrapers/petfinder.ipynb): PetFinder API data scraper