In [141]:
import pandas as pd

# Load the shelter-dog records from the CSV file in the working directory
dogs = pd.read_csv(filepath_or_buffer='./ShelterDogs.csv')


In [142]:
# Display the dtype of every column of the freshly loaded frame
dogs.dtypes

ID                     int64
name                  object
age                  float64
sex                   object
breed                 object
date_found            object
adoptable_from        object
posted                object
color                 object
coat                  object
size                  object
neutered              object
housebroken           object
likes_people          object
likes_children        object
get_along_males       object
get_along_females     object
get_along_cats        object
keep_in               object
dtype: object

In [143]:
# Frequency table before the change; dropna=False keeps the NaN row visible
print(dogs["keep_in"].value_counts(dropna=False))

# Store the column as a categorical so the .cat accessor is available
dogs["keep_in"] = dogs["keep_in"].astype("category")

# Register the extra categories. Only the ones not already present are added:
# add_categories raises ValueError for categories that already exist, which
# would make this cell fail when it is run a second time.
new_categories = ["Unknown History", "Open Yard (Countryside)"]
missing_categories = [
    category
    for category in new_categories
    if category not in dogs["keep_in"].cat.categories
]
if missing_categories:
    dogs["keep_in"] = dogs["keep_in"].cat.add_categories(missing_categories)

# Frequency table after the change, again including the NaN row
print(dogs['keep_in'].value_counts(dropna=False))

both flat and garden    1224
NaN                     1021
garden                   510
flat                     182
Name: keep_in, dtype: int64
both flat and garden       1224
NaN                        1021
garden                      510
flat                        182
Unknown History               0
Open Yard (Countryside)       0
Name: keep_in, dtype: int64


In [144]:
# Treat a missing answer as "maybe". The filled Series is assigned back to the
# column instead of calling fillna(inplace=True) on `dogs.likes_children`:
# that form mutates an intermediate Series and is not guaranteed to update
# `dogs` (it has no effect when pandas Copy-on-Write is enabled).
dogs["likes_children"] = dogs["likes_children"].fillna("maybe")

In [145]:
# Frequency of each answer in the likes_children column
likes_children_counts = dogs["likes_children"].value_counts()
print(likes_children_counts)

# Store the column as a categorical
dogs["likes_children"] = dogs["likes_children"].astype("category")

maybe    1718
yes      1172
no         47
Name: likes_children, dtype: int64


In [146]:
# Make sure the column is categorical before using the .cat accessor
dogs["likes_children"] = dogs["likes_children"].astype("category")

# Fold every "maybe" answer into "no"
answered_maybe = dogs["likes_children"] == "maybe"
dogs.loc[answered_maybe, "likes_children"] = "no"

# Categories and frequency table after the reassignment
print(dogs["likes_children"].cat.categories)
print(dogs["likes_children"].value_counts())

# Remove the "maybe" category, then show the frequency table again
without_maybe = dogs["likes_children"].cat.remove_categories(["maybe"])
dogs["likes_children"] = without_maybe
print(dogs["likes_children"].value_counts())

# Categories that remain after the removal
print(dogs["likes_children"].cat.categories)

Index(['maybe', 'no', 'yes'], dtype='object')
no       1765
yes      1172
maybe       0
Name: likes_children, dtype: int64
no     1765
yes    1172
Name: likes_children, dtype: int64
Index(['no', 'yes'], dtype='object')


In [147]:
# Fill missing answers with the placeholder "Maybe?" and store the column as a
# categorical. The result is assigned back to the column instead of calling
# fillna(inplace=True) on `dogs["get_along_males"]`: in-place filling of a
# column selected this way is not guaranteed to update `dogs` (it has no
# effect when pandas Copy-on-Write is enabled).
dogs["get_along_males"] = (
    dogs["get_along_males"].fillna("Maybe?").astype("category")
)

In [148]:
# Mapping of existing category label -> replacement label
my_changes = dict([("Maybe?", "Maybe")])

# Apply the renames from my_changes, then upper-case every category label
dogs["get_along_males"] = (
    dogs["get_along_males"]
    .cat.rename_categories(my_changes)
    .cat.rename_categories(lambda label: label.upper())
)

# Show the resulting categories
print(dogs["get_along_males"].cat.categories)

Index(['MAYBE', 'NO', 'YES'], dtype='object')


In [149]:
# Both of these coat types are folded into "medium"
update_coats = dict.fromkeys(['wirehaired', 'medium-long'], 'medium')

# Build coat_collapsed from coat with the mapping applied, stored as a categorical
dogs["coat_collapsed"] = dogs["coat"].replace(update_coats).astype("category")

# Frequency table of the collapsed coat types
print(dogs["coat_collapsed"].value_counts())

short     1972
medium     785
long       180
Name: coat_collapsed, dtype: int64


In [150]:
# Store size as a categorical so the .cat accessor can be used on it
dogs["size"] = dogs["size"].astype("category")

In [151]:
# Show the categories as they are before reordering
print(dogs["size"].cat.categories)

# Make size ordinal (small < medium < large) and write the result back to the column
size_order = ["small", "medium", "large"]
dogs["size"] = dogs["size"].cat.reorder_categories(size_order, ordered=True)

Index(['large', 'medium', 'small'], dtype='object')


In [152]:
# Display the current category labels of the size column
dogs["size"].cat.categories

Index(['small', 'medium', 'large'], dtype='object')

In [153]:
# Display the first rows of the size column together with its dtype summary
dogs['size'].head()

0     small
1     small
2    medium
3    medium
4     small
Name: size, dtype: category
Categories (3, object): ['small' < 'medium' < 'large']

In [154]:
# Repeat of the reordering step: size becomes ordinal (small < medium < large)
dogs["size"] = dogs["size"].cat.reorder_categories(
    new_categories=["small", "medium", "large"],
    ordered=True,
)

# `size` is a categorical grouping key. observed=False is passed explicitly so
# the result does not depend on the groupby default for `observed`, which
# recent pandas releases deprecate relying on and change to True.
dogs_by_size = dogs.groupby("size", observed=False)

# How many Male/Female dogs are available of each size?
print(dogs_by_size["sex"].value_counts())

# Do larger dogs need more room to roam?
print(dogs_by_size["keep_in"].value_counts(dropna=False))

size    sex   
small   male       260
        female     214
medium  male      1090
        female     854
large   male       331
        female     188
Name: sex, dtype: int64
size                           
small   both flat and garden       238
        flat                        80
        garden                      21
        Unknown History              0
        Open Yard (Countryside)      0
medium  both flat and garden       795
        garden                     317
        flat                        97
        Unknown History              0
        Open Yard (Countryside)      0
large   both flat and garden       191
        garden                     172
        flat                         5
        Unknown History              0
        Open Yard (Countryside)      0
Name: keep_in, dtype: int64


In [155]:
# Known misspellings and their corrections
replace_map = {"Malez": "male"}

# Normalise the raw text first: trim whitespace on both ends and lower-case it.
# Doing this before the replacement means variants such as " Malez" or "MALEZ"
# are corrected too, instead of slipping past an exact-match lookup.
dogs["sex"] = dogs["sex"].str.strip().str.lower()

# Apply the corrections, with the map keys normalised the same way as the data
normalized_map = {
    wrong.strip().lower(): right for wrong, right in replace_map.items()
}
dogs["sex"] = dogs["sex"].replace(normalized_map)

# Convert to a categorical Series
dogs["sex"] = dogs["sex"].astype("category")

print(dogs["sex"].value_counts())

male      1681
female    1256
Name: sex, dtype: int64


In [156]:
# Use the ID column as the row index
dogs = dogs.set_index(keys="ID")

# Store coat as a categorical
dogs["coat"] = dogs["coat"].astype("category")

In [157]:
# Show the coat value of the row whose index (ID) is 23807
is_target_id = dogs.index == 23807
print(dogs.loc[is_target_id, "coat"])

ID
23807    short
Name: coat, dtype: category
Categories (4, object): ['long', 'medium', 'short', 'wirehaired']


In [158]:
# Sex frequency table restricted to dogs whose coat is "long"
has_long_coat = dogs["coat"] == "long"
long_coat_sex = dogs.loc[has_long_coat, "sex"]
print(long_coat_sex.value_counts())

male      124
female     56
Name: sex, dtype: int64


In [159]:
# Mean age over the rows whose breed is exactly "English Cocker Spaniel"
is_cocker_spaniel = dogs["breed"] == "English Cocker Spaniel"
cocker_spaniel_ages = dogs.loc[is_cocker_spaniel, "age"]
print(cocker_spaniel_ages.mean())

8.186153846153847


In [160]:
# Count the dogs whose breed name contains the literal text "English"
# (regex=False: plain substring match, no pattern interpretation).
# na=False makes a missing breed count as "no match", so the mask is purely
# boolean and stays usable for row selection even if breed has missing values.
has_english = dogs["breed"].str.contains("English", regex=False, na=False)
print(dogs[has_english].shape[0])

35
