# Kaggle Shelter Animal Outcome Data Challenge

The data comes from Austin Animal Center from October 1st, 2013 to March, 2016. Outcomes represent the status of animals as they leave the Animal Center. All animals receive a unique Animal ID during intake. 

In this competition, you are going to predict the outcome of the animal as they leave the Animal Center. These outcomes include: Adoption, Died, Euthanasia, Return to owner, and Transfer. 

### import libraries

In [13]:
import re
import datetime as dt
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

### import training data

In [14]:
data = pd.read_csv("train.csv")

print(data.shape)
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [15]:
# convert DateTime colume to actual datetime object
data['DateTime'] = pd.to_datetime(data['DateTime'])

def convert_outcome(x):
    if x == 'Adoption':
        return 0
    elif x == 'Died':
        return 1
    elif x == 'Euthanasia':
        return 2
    elif x == 'Return_to_owner':
        return 3
    elif x == 'Transfer':
        return 4
data['outcome'] = data['OutcomeType'].apply(convert_outcome)

# name or no name
data['has_name'] = data['Name'].apply(lambda x: pd.isnull(x))

# month of outcome
months_dict = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
               7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
data['outcome_month'] = data['DateTime'].apply(lambda x: months_dict[x.month])
month_df = pd.get_dummies(data['outcome_month'], prefix="month")
data = pd.merge(data, month_df, left_index=True, right_index=True)

# day of the week of outcome
days_dict = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
data['outcome_day'] = data['DateTime'].apply(lambda x: days_dict[x.weekday()])
day_df = pd.get_dummies(data['outcome_day'], prefix="day")
data = pd.merge(data, day_df, left_index=True, right_index=True)
    
# dog or cat
data['AnimalType'] = data['AnimalType'].apply(lambda x: x.lower())
animal_df = pd.get_dummies(data['AnimalType'], prefix='animal_type')
data = pd.merge(data, animal_df, left_index=True, right_index=True)

# sex 
data['SexuponOutcome'].fillna(value='Unknown', inplace=True)
data['SexuponOutcome'] = data['SexuponOutcome'].apply(lambda x: ('_').join(x.lower().split()))
sex_df = pd.get_dummies(data['SexuponOutcome'], prefix="sex")
data = pd.merge(data, sex_df, left_index=True, right_index=True)

# age in months
data['AgeuponOutcome'].dropna(inplace=True)
def age_in_months(x):
    x = x.split()
    age_num = int(x[0])
    time_unit = x[1]
    if time_unit == 'year' or time_unit == 'years':
        return age_num * 12.
    elif time_unit == 'month' or time_unit == 'months':
        return age_num * 1.
    elif time_unit == 'week' or time_unit == 'weeks':
        return age_num * 0.25
    elif time_unit == 'day' or time_unit == 'days':
        return age_num * (1/30)
data['age_in_months'] = data['AgeuponOutcome'].apply(age_in_months)

# drop some unnecessary columns
cols_to_drop = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
               'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'outcome_month',
               'outcome_day']
data.drop(cols_to_drop, axis=1, inplace=True)

# ----
# bag of words for breed & color
# ----
# combine breed and color into one field
data['breed_color'] = data['Breed'] + " " + data["Color"]
data.drop(['Breed', 'Color'], axis=1, inplace=True)

# lowercase and remove anything that's not alphabetic
data['breed_color'] = data['breed_color'].apply(lambda x: re.sub("[^a-z]", " ", x.lower()))

# compile all breed_color entries into a list
breed_color_list = data['breed_color'].tolist()

# train the bag of words
vectorizer = CountVectorizer(analyzer = "word", max_features = 200) 
breed_color_features = vectorizer.fit_transform(breed_color_list)

breed_color_features = pd.DataFrame(breed_color_features.toarray())

# get dictionary for feature names
vocab_dict = vectorizer.vocabulary_
inverse_vocab_dict = {v: k for k, v in vocab_dict.items()}

print(vocab_dict.values)

breed_color_features.rename(columns=inverse_vocab_dict, inplace=True)

data = pd.merge(data, breed_color_features, left_index=True, right_index=True)
data.drop("breed_color", axis=1, inplace=True)

# drop any remaining missing values
data.dropna(inplace=True)

# standardize age in months
min_max_scaler = MinMaxScaler()
data['age_in_months'] = min_max_scaler.fit_transform(data['age_in_months'].values.reshape(-1,1))

data.head()

<built-in method values of dict object at 0x1131bbd38>


Unnamed: 0,outcome,has_name,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,...,weimaraner,welsh,west,wheaten,whippet,white,wire,wirehair,yellow,yorkshire
0,3,False,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,False,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Split the Data into Training and Test Sets

In [16]:
# first, shuffle the data
data = data.sample(frac=1., random_state=14)

# split into training and test sets
train_frac = 0.7
split = int(train_frac * len(data))

train_data = data[:split]
test_data = data[split:]

# split into feature sets and labels
X_train = train_data.drop("outcome", axis=1)
y_train = train_data["outcome"]

X_test = test_data.drop("outcome", axis=1)
y_test = test_data["outcome"]

print("Breakdown of labels in y_train: ", Counter(y_train))
print("Breakdown of labels in y_test: ", Counter(y_test))

Breakdown of labels in y_train:  Counter({0: 7546, 4: 6565, 3: 3339, 2: 1103, 1: 144})
Breakdown of labels in y_test:  Counter({0: 3223, 4: 2841, 3: 1447, 2: 450, 1: 53})


### Train a Logisitic Regression Model

In [17]:
# parameters to search over
lr_parameters = {'penalty':('l1', 'l2'), 
                 'C':[0.1, 1, 10, 100]}

# instantiate the estimator
lr = LogisticRegression()

# perform the grid search (with 5-fold cross-validation)
clr = GridSearchCV(lr, lr_parameters, cv=5)
clr.fit(X_train, y_train)

# accuracy of the model
print('-' * 20)
print("Accurarcy: ", round(clr.score(X_test, y_test), 2))
print('-' * 20)

# print breakdown of each parameter combination
clr_cv_results = pd.DataFrame(clr.cv_results_)
clr_cv_results = clr_cv_results[['rank_test_score', 'param_C', 'param_penalty', 
                                 'mean_test_score', 'mean_train_score']]
clr_cv_results.sort_values(by='rank_test_score', inplace=True)
clr_cv_results

--------------------
Accurarcy:  0.64
--------------------


Unnamed: 0,rank_test_score,param_C,param_penalty,mean_test_score,mean_train_score
2,1,1.0,l1,0.638926,0.645852
3,2,1.0,l2,0.637963,0.646628
0,3,0.1,l1,0.63791,0.638458
5,4,10.0,l2,0.637321,0.648192
4,5,10.0,l1,0.637108,0.648219
6,5,100.0,l1,0.637108,0.648286
7,7,100.0,l2,0.637054,0.648339
1,8,0.1,l2,0.636519,0.642456


### Train a Random Forest

In [None]:
# parameters to search over
rf_parameters = {'n_estimators':(10, 100, 500),
                 'criterion': ['gini', 'entropy'],
                 'min_samples_split': (2, 10, 20), 
                 'min_samples_leaf': (1, 5, 10)}

# instantiate the estimator
rf = RandomForestClassifier()

# perform the grid search (with 5-fold cross-validation)
crf = GridSearchCV(rf, rf_parameters, cv=5)
crf.fit(X_train, y_train)

# accuracy of the model
print('-' * 20)
print("Accurarcy: ", round(crf.score(X_test, y_test), 2))
print('-' * 20)

# print breakdown of each parameter combination
crf_cv_results = pd.DataFrame(crf.cv_results_)
crf_cv_results = crf_cv_results[['rank_test_score', 
                                 'mean_test_score', 
                                 'mean_train_score',
                                 'param_n_estimators', 
                                 'param_criterion', 
                                 'param_min_samples_split', 
                                 'param_min_samples_leaf']]
crf_cv_results.sort_values(by='rank_test_score', inplace=True)
crf_cv_results

--------------------
Accurarcy:  0.66
--------------------


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_n_estimators,param_criterion,param_min_samples_split,param_min_samples_leaf
5,1,0.646146,0.832567,500,gini,10,1
32,2,0.644916,0.8279,500,entropy,10,1
4,3,0.644702,0.829518,100,gini,10,1
31,4,0.644381,0.825373,100,entropy,10,1
8,5,0.644328,0.766674,500,gini,20,1
7,6,0.643954,0.765016,100,gini,20,1
34,7,0.643312,0.755964,100,entropy,20,1
35,8,0.642777,0.757274,500,entropy,20,1
14,9,0.640424,0.677395,500,gini,10,5
41,10,0.639675,0.676245,500,entropy,10,5


### Train a GradientBoost Model

In [None]:
# parameters to search over
gb_parameters = {'n_estimators':(10, 100, 500),
                 'loss': ['deviance'],
                 'min_samples_split': (2, 10, 20), 
                 'min_samples_leaf': (1, 5, 10)}

# instantiate the estimator
gb = GradientBoostingClassifier()

# perform the grid search (with 5-fold cross-validation)
cgb = GridSearchCV(gb, gb_parameters, cv=5)
cgb.fit(X_train, y_train)

# accuracy of the model
print('-' * 20)
print("Accurarcy: ", round(cgb.score(X_test, y_test), 2))
print('-' * 20)

# print breakdown of each parameter combination
cgb_cv_results = pd.DataFrame(cgb.cv_results_)
cgb_cv_results = cgb_cv_results[['rank_test_score', 
                                 'mean_test_score', 
                                 'mean_train_score',
                                 'param_n_estimators', 
                                 'param_loss', 
                                 'param_min_samples_split', 
                                 'param_min_samples_leaf']]
cgb_cv_results.sort_values(by='rank_test_score', inplace=True)
cgb_cv_results

### Train a Support Vector Machine

In [None]:
#sv = SVC(probability=True)
#sv.fit(X_train, y_train)
#print(sv.score(X_test, y_test))

### clean the test data

In [None]:
data = pd.read_csv("test.csv")

orig_data = data.copy()

print(data.shape)

# convert DateTime colume to actual datetime object
data['DateTime'] = pd.to_datetime(data['DateTime'])

# name or no name
data['has_name'] = data['Name'].apply(lambda x: pd.isnull(x))

# month of outcome
months_dict = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
               7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
data['outcome_month'] = data['DateTime'].apply(lambda x: months_dict[x.month])
month_df = pd.get_dummies(data['outcome_month'], prefix="month")
data = pd.merge(data, month_df, left_index=True, right_index=True)

# day of the week of outcome
days_dict = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
data['outcome_day'] = data['DateTime'].apply(lambda x: days_dict[x.weekday()])
day_df = pd.get_dummies(data['outcome_day'], prefix="day")
data = pd.merge(data, day_df, left_index=True, right_index=True)

# dog or cat
data['AnimalType'] = data['AnimalType'].apply(lambda x: x.lower())
animal_df = pd.get_dummies(data['AnimalType'], prefix='animal_type')
data = pd.merge(data, animal_df, left_index=True, right_index=True)

# sex 
data['SexuponOutcome'].fillna(value='Unknown', inplace=True)
data['SexuponOutcome'] = data['SexuponOutcome'].apply(lambda x: ('_').join(x.lower().split()))
sex_df = pd.get_dummies(data['SexuponOutcome'], prefix="sex")
data = pd.merge(data, sex_df, left_index=True, right_index=True)


# age in months
data['AgeuponOutcome'].dropna(inplace=True)
def age_in_months(x):
    x = x.split()
    age_num = int(x[0])
    time_unit = x[1]
    if time_unit == 'year' or time_unit == 'years':
        return age_num * 12.
    elif time_unit == 'month' or time_unit == 'months':
        return age_num * 1.
    elif time_unit == 'week' or time_unit == 'weeks':
        return age_num * 0.25
    elif time_unit == 'day' or time_unit == 'days':
        return age_num * (1/30)
data['age_in_months'] = data['AgeuponOutcome'].apply(age_in_months)

# drop some unnecessary columns
cols_to_drop = ['ID', 'Name', 'DateTime',
               'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'outcome_month',
               'outcome_day']
data.drop(cols_to_drop, axis=1, inplace=True)

# ----
# bag of words for breed & color
# ----
# combine breed and color into one field
data['breed_color'] = data['Breed'] + " " + data["Color"]
data.drop(['Breed', 'Color'], axis=1, inplace=True)

# lowercase and remove anything that's not alphabetic
data['breed_color'] = data['breed_color'].apply(lambda x: re.sub("[^a-z]", " ", x.lower()))

# compile all breed_color entries into a list
breed_color_list = data['breed_color'].tolist()

# apply the bag of words
breed_color_features = vectorizer.transform(breed_color_list)

breed_color_features = pd.DataFrame(breed_color_features.toarray())

# get dictionary for feature names
vocab_dict = vectorizer.vocabulary_
inverse_vocab_dict = {v: k for k, v in vocab_dict.items()}

breed_color_features.rename(columns=inverse_vocab_dict, inplace=True)

data = pd.merge(data, breed_color_features, left_index=True, right_index=True)
data.drop("breed_color", axis=1, inplace=True)

# drop any remaining missing values
#data.dropna(inplace=True)

data['age_in_months'] = data['age_in_months'].fillna(value=data['age_in_months'].mean())

# standardize age in months
min_max_scaler = MinMaxScaler()
data['age_in_months'] = min_max_scaler.fit_transform(data['age_in_months'].values.reshape(-1,1))

data.head()

In [None]:
pred_df = pd.DataFrame(cgb.predict(data))
pred_df.columns = ['prediction']
pred_df.head()

In [None]:
pred_df['Adoption'] = pred_df['prediction'] == 0
pred_df['Died'] = pred_df['prediction'] == 1
pred_df['Euthanasia'] = pred_df['prediction'] == 2
pred_df['Return_to_owner'] = pred_df['prediction'] == 3
pred_df['Transfer'] = pred_df['prediction'] == 4

pred_df = pd.merge(orig_data, pred_df, left_index=True, right_index=True)

cols_to_keep = ['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']

pred_df = pred_df[cols_to_keep].astype(int)

pred_df.head()

In [None]:
prob_df = np.round(pd.DataFrame(cgb.predict_proba(data)), 3)
prob_df.columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']

prob_df = pd.merge(orig_data, prob_df, left_index=True, right_index=True)

cols_to_keep = ['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
prob_df = prob_df[cols_to_keep]

prob_df.head()

In [None]:
prob_df.to_csv("submission_2.18.2018.csv", index=False)