# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

# Import Data

In [2]:
df = pd.read_csv('./datasets/working_data/austin.csv')

# Inspect Data

In [3]:
df.head(3)

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,age_out,sex,intact_in,intact_out,age,primary_color,secondary_color
0,A047759,Oreo,2014-04-02 15:55:00,Austin (TX),surrender,normal,10.0,Dachshund,Tricolor,2014-04-07 15:12:00,2004-04-02 00:00:00,transfer,10.0,1,0,0,Senior,tricolor,tricolor
1,A134067,Bandit,2013-11-16 09:02:00,12034 Research Blvd in Austin (TX),public_assist,injured,16.0,Shetland Sheepdog,Brown/White,2013-11-16 11:54:00,1997-10-16 00:00:00,return_owner,16.0,1,0,0,Senior,brown,white
2,A141142,Bettie,2013-11-16 14:46:00,Austin (TX),stray,aged,15.0,Labrador Retriever/Pit Bull,Black/White,2013-11-17 11:40:00,1998-06-01 00:00:00,return_owner,15.0,0,0,0,Senior,black,white


In [4]:
df['breed'].nunique()

2219

# Clean Breeds

2219 unique breeds!! That is far too many. Will need to reduce for modeling.

In [4]:
# lowercase and strip extra white space for breed names
df['breed'] = df['breed'].str.lower().str.strip()

# remove 'mix' at the end of the breed name
df['breed'] = df['breed'].str.replace(' mix', '')

# expand pitbull to pit bull for consistency
df['breed'].replace('pitbull', 'pit bull', inplace=True)

Identify unique singular breed observations

In [6]:
df[df['breed'].str.contains('/') == False]['breed'].nunique()

202

202 unique combo breeds. Will want to separate these into their own column and reduce wherever possible.

In [7]:
df[df['breed'].str.contains('/')].head(3)

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,age_out,sex,intact_in,intact_out,age,primary_color,secondary_color
2,A141142,Bettie,2013-11-16 14:46:00,Austin (TX),stray,aged,15.0,labrador retriever/pit bull,Black/White,2013-11-17 11:40:00,1998-06-01 00:00:00,return_owner,15.0,0,0,0,Senior,black,white
10,A212672,Cujo,2013-11-25 17:18:00,14611 Wells Port Dr in Austin (TX),stray,normal,13.0,german shepherd/labrador retriever,Tan/Black,2013-12-06 14:34:00,2000-03-05 00:00:00,return_owner,13.0,0,1,1,Senior,tan,black
16,A226069,Cedar,2015-10-06 12:29:00,Wheless Ln And Berkman Dr in Austin (TX),stray,normal,15.0,labrador retriever/beagle,Sable/White,2015-10-29 15:00:00,2000-06-17 00:00:00,adoption,15.0,1,0,0,Senior,sable,white


Identify unique combo breed observations

In [8]:
df[df['breed'].str.contains('/')]['breed'].nunique()

1843

Was able to reduce the total number of unique breeds by about 400. Still a lot of work to be done.

In [8]:
df['breed'].value_counts(ascending=True)[df['breed'].value_counts(ascending=True) == 1]

akita/great pyrenees                    1
beagle/treeing walker coonhound         1
queensland heeler/great dane            1
pit bull/american foxhound              1
pit bull/dogue de bordeaux              1
                                       ..
american bulldog/blue lacy              1
siberian husky/great pyrenees           1
shetland sheepdog/chihuahua longhair    1
queensland heeler/basset hound          1
border collie/shetland sheepdog         1
Name: breed, Length: 997, dtype: int64

## Initial breed name cleaning

Regex to replace breed names to remain consistent

In [9]:
df['breed'] = df['breed'].map(lambda x: re.sub('labrador retr\Z', 'labrador retriever', x))
df['breed'] = df['breed'].map(lambda x: re.sub('germ\sshepherd', 'german shepherd', x))
df['breed'] = df['breed'].map(lambda x: re.sub('^staffordshire$', 'american staffordshire terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('chihuahua shorthair', 'chihuahua', x))
df['breed'] = df['breed'].map(lambda x: re.sub('chihuahua longhair', 'chihuahua', x))
df['breed'] = df['breed'].map(lambda x: re.sub('chihuahua sh', 'chihuahua', x))
df['breed'] = df['breed'].map(lambda x: re.sub('chihuahua lh', 'chihuahua', x))
df['breed'] = df['breed'].map(lambda x: re.sub('airedale terr\Z', 'airedale terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('amer ', 'american', x))
df['breed'] = df['breed'].map(lambda x: re.sub('alask ', 'alaskan', x))
df['breed'] = df['breed'].map(lambda x: re.sub('am pit bull ter', 'pit bull', x))
df['breed'] = df['breed'].map(lambda x: re.sub('aust ', 'australian', x))
df['breed'] = df['breed'].map(lambda x: re.sub('american bulldog', 'bulldog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('english bulldog', 'bulldog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('black/tan hound', 'black and tan coonhound', x))
df['breed'] = df['breed'].map(lambda x: re.sub('yorkshire terr\Z', 'yorkshire terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('australiancattle dog', 'australian cattle dog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('anatol shepherd', 'anatolian sheepdog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('doberman pinsch\Z', 'doberman pinscher', x))
df['breed'] = df['breed'].map(lambda x: re.sub('poodle min\Z', 'miniature poodle', x))
df['breed'] = df['breed'].map(lambda x: re.sub('alaskan husky', 'siberian husky', x))
df['breed'] = df['breed'].map(lambda x: re.sub('rat terrier', 'fox terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('wire hair fox terrier', 'fox terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('americanbulldog', 'bulldog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('australianshepherd', 'australian shepherd', x))
df['breed'] = df['breed'].map(lambda x: re.sub('dachshund wirehair', 'dachshund', x))
df['breed'] = df['breed'].map(lambda x: re.sub('american staff\Z', 'pit bull', x))
df['breed'] = df['breed'].map(lambda x: re.sub('american staffordshire terrier', 'pit bull', x))
df['breed'] = df['breed'].map(lambda x: re.sub('dachshund longhair', 'dachshund', x))
df['breed'] = df['breed'].map(lambda x: re.sub('schnauzer min\Z', 'miniature schnauzer', x))
df['breed'] = df['breed'].map(lambda x: re.sub('flat coat retriever', 'flat-coated retriever', x))
df['breed'] = df['breed'].map(lambda x: re.sub('collie smooth', 'collie', x))
df['breed'] = df['breed'].map(lambda x: re.sub('chinese sharpei', 'chinese shar pei', x))
df['breed'] = df['breed'].map(lambda x: re.sub('queensland heeler', 'australian cattle dog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('rhod ridgeback', 'rhodesian ridgeback', x))
df['breed'] = df['breed'].map(lambda x: re.sub('german shorthair pointer', 'german shorthaired pointer', x))
df['breed'] = df['breed'].map(lambda x: re.sub('manchester terrier', 'standard manchester terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('collie rough', 'collie', x))
df['breed'] = df['breed'].map(lambda x: re.sub('golden retr\Z', 'golden retriever', x))
df['breed'] = df['breed'].map(lambda x: re.sub('soft coated wheaten terrier', 'soft-coated wheaten terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('west highland\Z', 'west highland white terrier', x))
df['breed'] = df['breed'].map(lambda x: re.sub('australian kelpie', 'australian cattle dog', x))
df['breed'] = df['breed'].map(lambda x: re.sub('redbone hound', 'redbone coonhound', x))
df['breed'] = df['breed'].map(lambda x: re.sub('eng bulldog', 'bulldog', x))

Import breed info for all dogs, convert it to a dictionary for more filtering.

In [10]:
all_dogs = pd.read_csv('./datasets/working_data/all_breeds_info.csv')
pure_dogs = pd.read_csv('./datasets/raw_data/dog_intelligence.csv')
master_list = set(list(all_dogs['breed']) + list(pure_dogs['Breed'].str.lower()))
all_dogs = all_dogs.to_dict(orient='records')

In [11]:
df.head(3)

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,age_out,sex,intact_in,intact_out,age,primary_color,secondary_color
0,A047759,Oreo,2014-04-02 15:55:00,Austin (TX),surrender,normal,10.0,dachshund,Tricolor,2014-04-07 15:12:00,2004-04-02 00:00:00,transfer,10.0,1,0,0,Senior,tricolor,tricolor
1,A134067,Bandit,2013-11-16 09:02:00,12034 Research Blvd in Austin (TX),public_assist,injured,16.0,shetland sheepdog,Brown/White,2013-11-16 11:54:00,1997-10-16 00:00:00,return_owner,16.0,1,0,0,Senior,brown,white
2,A141142,Bettie,2013-11-16 14:46:00,Austin (TX),stray,aged,15.0,labrador retriever/pit bull,Black/White,2013-11-17 11:40:00,1998-06-01 00:00:00,return_owner,15.0,0,0,0,Senior,black,white


New breed columns

In [12]:
df['breed_1'] = np.nan
df['breed_2'] = np.nan
df['pure'] = np.nan

Collect all the known breeds and compile in a new column.

In [13]:
def breed_check(x):
    if x in master_list:
        return x

df['breed_1'] = df['breed'].apply(breed_check)
df['breed_2'] = df['breed'].apply(breed_check)
df['pure'] = np.where(df['breed_1'] == df['breed_2'], 1, np.nan)

Observations with breed not missing from breed library.

In [15]:
df['breed_1'].isna().sum()

10805

Most common observation breed names not in main breed library.

In [18]:
df[df['breed_1'].isna()]['breed'].value_counts()[:10]

catahoula                                   575
chihuahua/dachshund                         334
black mouth cur                             305
dachshund/chihuahua                         292
labrador retriever/pit bull                 289
german shepherd/labrador retriever          215
labrador retriever/german shepherd          198
blue lacy                                   152
pit bull/labrador retriever                 136
labrador retriever/australian cattle dog    118
Name: breed, dtype: int64

Parse breed names if they contain a '/' and attempt to catch all known breed names in our new split breed columns.

In [19]:
# loops over all breeds in dataframe
df_mini = df[(df['breed_1'].isna()) & (df['breed'].str.contains('/'))].copy()
for i, dog in df_mini['breed'].items():

    # splits the dogs on the '/'
    breeds = dog.split('/')

    # checks if any items of breeds is in the master list of dog breeds
    if not set(breeds).isdisjoint(master_list):
        
        # checks both position combinations
        if (breeds[0] in master_list) and (breeds[1] in master_list):
            df.loc[i, 'breed_1'] = breeds[0]
            df.loc[i, 'breed_2'] = breeds[1]

Missing breeds have been reduced from just under 11,000 down to only 2,500!! Almost there.

In [19]:
df['breed_1'].isna().sum()

2579

Make classifier for 'mixed' breeds

In [20]:
for i, v in df[df['breed'].str.contains('/')]['breed'].items():
    v = v.split('/')
    if v[0] in master_list and not v[1] in master_list:
        df.loc[i, 'breed_1'] = v[0]
        df.loc[i, 'breed_2'] = 'mix'
    elif not v[0] in master_list and v[1] in master_list:
        df.loc[i, 'breed_1'] = v[1]
        df.loc[i, 'breed_2'] = 'mix'

In [21]:
df['breed_1'].isna().sum()

1869

Reduced missing breeds by 700

List remaining missing breeds. Probably best to drop these values.

In [23]:
set(df[df['breed_1'].isna()]['breed']).difference(master_list)

{'akbash',
 'alaskan klee kai',
 'bedlington terr',
 'black mouth cur',
 'black mouth cur/blue lacy',
 'black mouth cur/catahoula',
 'black mouth cur/staffordshire',
 'blue lacy',
 'blue lacy/staffordshire',
 'bluetick hound',
 'bluetick hound/treeing walker coonhound',
 'boerboel',
 'boykin span',
 'bruss griffon',
 'bull terrier miniature',
 'cane corso',
 'carolina dog',
 'carolina dog/catahoula',
 'catahoula',
 'catahoula/black mouth cur',
 'catahoula/carolina dog',
 'catahoula/english coonhound',
 'catahoula/staffordshire',
 'cavalier span',
 'chesa bay retr',
 'coton de tulear',
 'dachshund stan',
 'dandie dinmont',
 'doberman pinsch/black mouth cur',
 'doberman pinsch/catahoula',
 'dogo argentino',
 'dogue de bordeaux',
 'dutch sheepdog',
 'dutch shepherd',
 'eng toy spaniel',
 'english coonhound',
 'english pointer',
 'english shepherd',
 'entlebucher',
 'feist',
 'glen of imaal',
 'grand basset griffon vendeen',
 'greater swiss mountain dog',
 'hovawart',
 'jindo',
 'landseer'

List most common

In [24]:
df[df['breed_1'].isna()]['breed'].value_counts()[:20]

catahoula                   575
black mouth cur             305
blue lacy                   152
carolina dog                 80
bruss griffon                57
pbgv                         55
parson russell terrier       45
cavalier span                42
dutch shepherd               37
english coonhound            32
treeing walker coonhound     31
english pointer              27
st. bernard rough coat       27
st. bernard smooth coat      26
dogo argentino               24
bluetick hound               22
chesa bay retr               22
schnauzer giant              21
leonberger                   21
old bulldog                  15
Name: breed, dtype: int64

# Export Data

In [25]:
df.to_csv('./datasets/working_data/austin_clean_breed.csv', index=False)

[Run Next](https://github.com/gwoodstock/gooddogs/blob/main/3_data_clean_petfinder.ipynb): PetFinder API data cleaning