# Petfinder EDA

Basic EDA on adopted dogs data returned from the Petfinder API. This notebook includes analysis and exploration of raw data returned from the Petfinder API (before any other cleaning has been applied to it).

To get the data evaluated in this notebook, you can either run the script in this Repository called `data_getter.py`, or you can download the data file produced by running this script from our [Code for Chicago Google Drive](https://drive.google.com/drive/u/0/folders/16YyhvVVQVecoBtmWOVZOR0rynGKkmYj_), and saving it to the location `rescuechi/petfinder/data/chicago_animals.pkl`.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
plt.rcParams["figure.figsize"] = (20, 10)

In [3]:
# this file results from running `get_animals` in petfinder/data_getter.py
df = pd.read_pickle("data/chicago_il_animals_cleaned.pkl")

In [5]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
df.head()

Unnamed: 0,id,age,gender,size,coat,name,organization_id,organization_name,published_at,status_changed_at,los,breed_primary,breed_secondary,breed_mixed,breed_unknown,color_primary,color_secondary,color_tertiary,good_with_children,good_with_dogs,good_with_cats,attribute_spayed_neutered,attribute_house_trained,attribute_declawed,attribute_special_needs,attribute_shots_current,city,state
0,37845448,Young,Female,Small,Medium,Mika,IL608,Friends of Chicago Animal Care & Control,2017-04-13 22:22:16+00:00,2017-05-11 17:41:38+00:00,27,Schnauzer,Poodle,True,False,Gray / Blue / Silver,,,False,,,True,True,,False,True,Chicago,IL
1,37868394,Young,Female,Small,Long,Luna (LuLu),IL608,Friends of Chicago Animal Care & Control,2017-04-17 21:12:27+00:00,2017-04-25 02:33:04+00:00,7,Shih Tzu,,False,False,White / Cream,Black,,,,,True,False,,False,True,Chicago,IL
2,37868562,Adult,Male,Small,Medium,Jack,IL608,Friends of Chicago Animal Care & Control,2017-04-17 21:44:31+00:00,2017-04-25 02:31:52+00:00,7,Maltese,Poodle,True,False,White / Cream,,,,,,True,True,,False,True,Chicago,IL
3,38338429,Adult,Male,Small,Short,PinPon,IL608,Friends of Chicago Animal Care & Control,2017-06-07 22:25:36+00:00,2017-08-03 17:02:39+00:00,56,Pug,,False,False,Yellow / Tan / Blond / Fawn,,,False,,,True,True,,False,True,Chicago,IL
4,40204215,Adult,Male,Large,Short,Uncle Jesse,IL608,Friends of Chicago Animal Care & Control,2017-12-14 19:41:06+00:00,2018-02-06 19:38:45+00:00,53,Pit Bull Terrier,,False,False,Gray / Blue / Silver,,,,,,True,True,,False,True,Chicago,IL


In [None]:
df.columns

In [7]:
df.dtypes

id                                         int64
age                                       object
gender                                    object
size                                      object
coat                                      object
name                                      object
organization_id                           object
organization_name                         object
published_at                 datetime64[ns, UTC]
status_changed_at            datetime64[ns, UTC]
los                                        int64
breed_primary                             object
breed_secondary                           object
breed_mixed                                 bool
breed_unknown                               bool
color_primary                             object
color_secondary                           object
color_tertiary                            object
good_with_children                        object
good_with_dogs                            object
good_with_cats      

In [8]:
df.describe()

Unnamed: 0,id,los
count,2500.0,2500.0
mean,33208090.0,38.6284
std,10708150.0,94.438057
min,16403180.0,0.0
25%,21833730.0,10.0
50%,32985480.0,20.0
75%,44966740.0,40.0
max,58920660.0,2932.0


In [None]:
df["published_at"].head()

In [None]:
# cast to datetime
df["published_at"] = pd.to_datetime(df["published_at"])


In [None]:
df["status_changed_at"] = pd.to_datetime(df["status_changed_at"])

In [None]:
df["length_of_stay"] = (df["status_changed_at"] - df["published_at"]).dt.days

In [None]:
df.describe()

In [None]:
# explode the breeds column
df_breeds = df["breeds"].apply(pd.Series)
df_breeds.columns = [f"breed_{col}" for col in df_breeds.columns]


In [None]:
df_breeds.head()

In [None]:
df = pd.concat([df.drop(["breeds"], axis=1), df_breeds], axis=1)

In [None]:
df.head()

In [None]:
df["breed_primary"].value_counts()

In [None]:
df_agg = df.groupby("breed_primary")["length_of_stay"].agg(["median", "count"])

In [None]:
df_agg

In [None]:
# many rare breeds
df_agg["count"].hist(range=(0,500), bins=20)

In [None]:
# remove rare breeds
df_agg.loc[df_agg["count"]>25].sort_values(by="median", ascending=False).head(50)

In [None]:
# remove rare breeds
df_agg.loc[df_agg["count"]>25].sort_values(by="median", ascending=False).tail(50)

In [None]:
df_agg.columns

In [None]:
df_agg.plot.scatter(x="count", y="median")

In [None]:
# take the 25 most frequent breeds
df_common_breeds = df_agg.sort_values(by="count", ascending=False).head(25)

In [None]:
fig, ax = plt.subplots()
ax.scatter(df_common_breeds["count"], df_common_breeds["median"])
for i, txt in enumerate(df_common_breeds.index):
    ax.annotate(txt, (df_common_breeds["count"].iloc[i], df_common_breeds["median"].iloc[i]))
plt.xlabel("Count of Dogs")
plt.ylabel("Median Duration of Stay (days)")
plt.title("Shelter Duration of 25 Most Common Primary Breeds")

In [None]:
top_breeds = df_common_breeds.index

In [None]:
df_limited_to_common = df.loc[df["breed_primary"].isin(top_breeds), :]

In [None]:
# sort medians per breed
medians = df_limited_to_common.groupby("breed_primary")["length_of_stay"].median().sort_values(ascending=False)

In [None]:
ax = sns.boxplot(y="breed_primary", x="length_of_stay", data=df_limited_to_common, showfliers=False, order=medians.index)
# plt.xticks(rotation=90)