# Predicting Animal Adoptions

### Project Goal:

Using a few simple inputs (related to animal type, age of animal, color, etc), build an online tool that will predict whether an animal will be adopted or not from the Austin Animal Center.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 25)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, plot_confusion_matrix

## Data Exploration and Merging

Data Sources:
- [Austin Animal Center Intakes Data](https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Intakes/wter-evkm)
- [Austin Animal Center Outcomes Data](https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Outcomes/9t4d-g238)

In [None]:
# Load the intakes export (file downloaded 9/19/22);
# the 'DateTime' column is parsed into timestamps while reading
df_in = pd.read_csv(
    'data/Austin_Animal_Center_Intakes-091922.csv',
    parse_dates=['DateTime'],
)
df_in.head()

In [None]:
# Load the outcomes export (file downloaded 9/19/22);
# the 'DateTime' column is parsed into timestamps while reading
df_out = pd.read_csv(
    'data/Austin_Animal_Center_Outcomes-091922.csv',
    parse_dates=['DateTime'],
)
df_out.head()

In [None]:
# Count rows that repeat an (Animal ID, DateTime) pair already seen in the same table.
# Working assumption: these are data-entry slips (the same event keyed in twice).
print(f"Intake dupes: {df_in.duplicated(subset=['Animal ID', 'DateTime']).sum()}")
print(f"Outcome dupes: {df_out.duplicated(subset=['Animal ID', 'DateTime']).sum()}")

In [None]:
# Drop the repeated (Animal ID, DateTime) rows from both tables.
# keep='last' retains the final occurrence of each repeated pair.
df_in = df_in.drop_duplicates(subset=['Animal ID', 'DateTime'], keep='last')
df_out = df_out.drop_duplicates(subset=['Animal ID', 'DateTime'], keep='last')

### Data Cleaning: Merging Repeat Intakes/Outcomes

In [None]:
# Order both tables chronologically so the per-animal running counts
# computed in the next cell follow event order
df_in = df_in.sort_values(by='DateTime')
df_out = df_out.sort_values(by='DateTime')

In [None]:
# Number each animal's intakes and outcomes 1, 2, 3, ... in row order
# (cumcount is zero-based, hence the +1)
df_in['Intake Num'] = df_in.groupby('Animal ID', sort=False).cumcount().add(1)
df_out['Outcome Num'] = df_out.groupby('Animal ID', sort=False).cumcount().add(1)

In [None]:
# Spot-check one animal known to have repeat visits (ID A721033):
# the last five of its intake rows ...
df_in.loc[df_in['Animal ID'] == 'A721033'].tail(5)

In [None]:
# ... and the last five outcome rows for the same animal
df_out.loc[df_out['Animal ID'] == 'A721033'].tail(5)

In [None]:
# First pairing attempt: join an animal's n-th intake to its n-th outcome
# using the running-count columns.
# Inner join: an intake with no same-numbered outcome (or vice versa) is dropped.
# Column names shared by both tables get "_in" / "_out" suffixes.
df = df_in.merge(df_out, 
                 left_on=['Animal ID', 'Intake Num'], 
                 right_on=['Animal ID', 'Outcome Num'],
                 how='inner',
                 suffixes=("_in", "_out"))

In [None]:
# Preview the first rows of the merged intake/outcome table
df.head()

In [None]:
# Rows where the count-based pairing failed:
# the matched outcome is timestamped before the intake it was paired with
dirty = df.loc[df['DateTime_in'] > df['DateTime_out']]

In [None]:
# (rows, columns) of the mismatched pairs — the row count is how many
# intakes the simple running count paired incorrectly
dirty.shape

In [None]:
# Keep only the columns needed to re-key the mismatched intakes.
# .copy() detaches the result from `df`, so the in-place increment in the
# following cell does not trigger pandas' SettingWithCopyWarning.
dirty = dirty[['Animal ID', 'DateTime_in', 'Intake Num']].copy()

In [None]:
# Shift each mismatched intake to the next outcome number before re-merging.
# assign() builds a new frame rather than mutating a slice of `df` in place,
# which avoids pandas' SettingWithCopyWarning.
# Note: running this cell more than once shifts the numbers again.
dirty = dirty.assign(**{'Intake Num': dirty['Intake Num'] + 1})

In [None]:
# Preview the re-keyed mismatched intakes
dirty.head()

In [None]:
# Work on a copy of the intakes table so df_in itself stays unchanged
df_in_clean = df_in.copy()

In [None]:
# Attach the corrected intake number to the intakes that were mismatched.
# Left join keeps every intake; rows that were not mismatched get NaN.
# Both frames have an 'Intake Num' column, so the suffixes apply:
# 'Intake Num_first' is the original count, 'Intake Num_second' the shifted one.
df_in_clean = df_in_clean.merge(dirty,
                                left_on=['Animal ID', 'DateTime'], 
                                right_on=['Animal ID', 'DateTime_in'], 
                                how='left',
                                suffixes=('_first', '_second'))

In [None]:
# 'DateTime_in' was only needed as a join key; 'DateTime' carries the same value
df_in_clean = df_in_clean.drop(columns=['DateTime_in'])
# Sanity check: the non-null count of 'Intake Num_second' in the summary below
# should equal the number of dirty rows found above
df_in_clean.info()

In [None]:
# Final intake number: the shifted (second-attempt) value where one exists,
# otherwise the original running count
df_in_clean['Intake Num'] = (
    df_in_clean['Intake Num_second']
    .fillna(df_in_clean['Intake Num_first'])
)

In [None]:
# Second pairing attempt: same inner join as before, but keyed on the
# corrected 'Intake Num' from df_in_clean.
# Column names shared by both tables get "_in" / "_out" suffixes.
df = df_in_clean.merge(df_out, 
                       left_on=['Animal ID', 'Intake Num'], 
                       right_on=['Animal ID', 'Outcome Num'],
                       how='inner',
                       suffixes=("_in", "_out"))

In [None]:
# Re-check for pairs whose outcome precedes the intake;
# an empty result means the re-keying resolved them
df.loc[df['DateTime_in'] > df['DateTime_out']]

## Data Exploration and Feature Engineering

In [None]:
# First five merged rows, transposed so every column is visible as a row
df.head().T

In [None]:
# Narrow the merged table to the columns explored for modeling:
# mostly intake-side fields, plus the outcome timestamp, date of birth,
# and 'Outcome Type' (the source of the target)
data = df[['Animal ID', 'DateTime_in', 'Intake Type', 'Intake Condition', 
           'Animal Type_in', 'Sex upon Intake', 'Age upon Intake', 'Breed_in', 
           'Color_in', 'Intake Num', 'DateTime_out', 'Date of Birth', 'Outcome Type']]

In [None]:
# Preview the narrowed modeling table
data.head()

### Exploring Outcome Type - aka Target Engineering

In [None]:
# Row count per outcome type
data['Outcome Type'].value_counts()

In [None]:
# Exclude animals that went back to an owner ('Return to Owner' / 'Rto-Adopt').
# Rows with a missing outcome type are kept, as before.
# .copy() makes `data` an independent frame, so the column assignments in the
# feature-engineering cells do not raise SettingWithCopyWarning.
data = data.loc[~data['Outcome Type'].isin(['Return to Owner', 'Rto-Adopt'])].copy()

In [None]:
# Preview after removing the return-to-owner outcomes
data.head()

In [None]:
# Share of each remaining outcome type (fractions sum to 1)
data['Outcome Type'].value_counts(normalize=True)

In [None]:
# Binary target: 1 when the outcome was an adoption, 0 for every other outcome
data['Adopted'] = (data['Outcome Type'] == 'Adoption').astype(int)

In [None]:
# Class balance of the target (fraction adopted vs. not)
data['Adopted'].value_counts(normalize=True)

In [None]:
# Preview with the new 'Adopted' column
data.head()

### Exploring Object-Type Columns

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns
data.describe(include='O')

In [None]:
# Names of every object-dtype column, in column order
obj_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']

# For each of them except the first (Animal ID), show the number of
# distinct non-null values and the ten most frequent ones
for col in obj_cols[1:]:
    print(col)
    print(f"Uniques: {data[col].nunique()}")
    print(data[col].value_counts().head(10))
    print("*" * 20)

### Feature Engineering!

In [None]:
# Age in whole days at the time of intake: intake date minus date of birth.
# Measuring against the intake timestamp (not today's date) keeps the feature
# reproducible: with "today" as the reference every animal ages on each re-run,
# and the one-year 'Young' cut-off derived from this column stops describing
# the animal as it was when it entered the shelter.
data['Age in Days'] = (
    pd.to_datetime(data['DateTime_in']).dt.normalize()
    - pd.to_datetime(data['Date of Birth'])
).dt.days

In [None]:
# Indicator for animals whose color description mentions black
# (black animals are notoriously under-adopted).
# case=False matches regardless of capitalization; na=False maps a missing
# color to False so the column stays a clean boolean with no NaN.
data['Color_black'] = data['Color_in'].str.contains('black', case=False, na=False)

In [None]:
# 'Fixed' flags animals that arrive already neutered or spayed
data['Fixed'] = data['Sex upon Intake'].isin(['Neutered Male', 'Spayed Female'])

In [None]:
# Focus is on dogs and cats: fold 'Bird' and 'Livestock' into 'Other'
data['Animal Type_in'] = data['Animal Type_in'].replace(['Bird', 'Livestock'], 'Other')

In [None]:
# Boolean species indicators; a row with both False is neither a cat nor a dog
data['Type_Cat'] = data['Animal Type_in'].eq('Cat')
data['Type_Dog'] = data['Animal Type_in'].eq('Dog')

In [None]:
# True for every intake whose condition is anything other than 'Normal'
data['Intake Condition_Not Normal'] = data['Intake Condition'].ne('Normal')

In [None]:
# Indicator for female animals (matches 'Intact Female' and 'Spayed Female').
# na=False maps a missing sex value to False, so the column is a clean boolean
# and no NaN reaches the model's feature matrix.
data['Female'] = data['Sex upon Intake'].str.contains("Female", na=False)

In [None]:
# Collapse age into a single flag: True when 'Age in Days' is under 365 (< 1 year old)
data['Young'] = data['Age in Days'].lt(365)

In [None]:
# Preview with all engineered feature columns
data.head()

# Modeling

In [None]:
# Defining our used cols - all indicator boolean columns!
# The order matters: new inputs at prediction time must follow it.
used_cols = ['Color_black', 'Fixed', 'Type_Cat', 'Type_Dog', 
             'Intake Condition_Not Normal', 'Female', 'Young']

In [None]:
# Feature matrix (the indicator columns) and binary target
X = data[used_cols]
y = data['Adopted']

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=84)

## Model-Less Baseline

In [None]:
# Class shares in the training target: the larger share is the accuracy
# of always predicting the majority class
y_train.value_counts(normalize=True)

## Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split
logreg = LogisticRegression().fit(X_train, y_train)

# Accuracy on the training split
logreg.score(X_train, y_train)

In [None]:
# Hard class predictions and the predicted probability of class 1 (adopted)
test_preds = logreg.predict(X_test)
test_probas = logreg.predict_proba(X_test)[:,1]

# Held-out metrics: accuracy, F1 on the hard predictions, ROC-AUC on the probabilities
print(f"Test Acc: {logreg.score(X_test, y_test)}")
print(f"Test F1: {f1_score(y_test, test_preds)}")
print(f"Test ROCAUC: {roc_auc_score(y_test, test_probas)}")

# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(logreg, X_test, y_test);

## Decision Tree

In [None]:
# Shallow decision tree (depth capped at 5).
# random_state pins the estimator's internal randomness so that re-running
# the notebook reproduces the same tree and the same scores.
tree = DecisionTreeClassifier(max_depth=5, random_state=84)

tree.fit(X_train, y_train)

# Accuracy on the training split
tree.score(X_train, y_train)

In [None]:
# Hard class predictions and the predicted probability of class 1 (adopted)
test_preds = tree.predict(X_test)
test_probas = tree.predict_proba(X_test)[:,1]

# Held-out metrics: accuracy, F1 on the hard predictions, ROC-AUC on the probabilities
print(f"Test Acc: {tree.score(X_test, y_test)}")
print(f"Test F1: {f1_score(y_test, test_preds)}")
print(f"Test ROCAUC: {roc_auc_score(y_test, test_probas)}")

# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(tree, X_test, y_test);

## Random Forest

In [None]:
# Random forest with each tree's depth capped at 5.
# random_state fixes the bootstrap samples and feature sub-sampling, so the
# scores below and the model pickled later are reproducible across re-runs.
rf = RandomForestClassifier(max_depth=5, random_state=84)

rf.fit(X_train, y_train)

# Accuracy on the training split
rf.score(X_train, y_train)

In [None]:
# Hard class predictions and the predicted probability of class 1 (adopted)
test_preds = rf.predict(X_test)
test_probas = rf.predict_proba(X_test)[:,1]

# Held-out metrics: accuracy, F1 on the hard predictions, ROC-AUC on the probabilities
print(f"Test Acc: {rf.score(X_test, y_test)}")
print(f"Test F1: {f1_score(y_test, test_preds)}")
print(f"Test ROCAUC: {roc_auc_score(y_test, test_probas)}")

# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(rf, X_test, y_test);

We'd prefer false negatives over false positives: we would rather an animal be predicted not to be adopted and then actually get adopted than be predicted to be adopted and then not get adopted.

Because of that, and a marginally higher ROC-AUC score, we'll choose our (mostly untuned and could definitely be improved) Random Forest model to pickle and deploy.

# Save Model for Deployment

In [None]:
# New library!
import pickle

In [None]:
# Save the fitted random forest (`rf`) to a .sav file with pickle.
# The `with` block closes the file handle as soon as the dump finishes,
# so the bytes are flushed to disk before the file is read back.
with open("rf_model.sav", 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Round-trip check: load the model back from the .sav file.
# NOTE: only unpickle files you created or trust — pickle.load can run
# arbitrary code embedded in the file.
with open("rf_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Test accuracy of the in-memory model, for comparison with the reloaded one
rf.score(X_test, y_test)

In [None]:
# Test accuracy of the reloaded model — should equal the score in the cell above
loaded_model.score(X_test, y_test)

### Testing On a New Input

(aka what we'll need in our streamlit app!)

In [None]:
# The feature names, in order — to be copied into the streamlit app
print(used_cols)

In [None]:
# A single-row frame from the test set: the shape a new intake must take
X_test[:1]

In [None]:
# Column dtypes of the features; a new input needs to match these too
X_test.dtypes

In [None]:
# Example class prediction for just one test row
loaded_model.predict(X_test[:1])

In [None]:
# Class probabilities for the same row (one column per class)
loaded_model.predict_proba(X_test[:1])

In [None]:
# Example row of new inputs; values line up positionally with used_cols:
# black, not fixed, not a cat, a dog, normal intake condition, not female, young
example_row = [True, False, False, True, False, False, True]

In [None]:
# Wrap the example values in a one-row DataFrame whose columns are the feature names
new_test_example = pd.DataFrame([example_row], columns=used_cols, index=[0])
new_test_example

In [None]:
# The reloaded model can predict directly on the hand-built example row
loaded_model.predict(new_test_example)