# Shelter animal outcomes XGBoost training

In [192]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, FunctionTransformer, Normalizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

from util import *

## Load data

In [193]:
# Read the raw training and test CSVs from the local input/ directory.
train_df, test_df = map(pd.read_csv, ("input/train.csv", "input/test.csv"))

In [194]:
# Preview the first rows of the raw training data (includes the outcome columns).
train_df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [195]:
# Preview the first rows of the raw test data (no outcome columns, key column is `ID`).
test_df.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


## Clean data

In [196]:
# Rename the training key column so it matches the test data's `ID` column.
train_df = train_df.rename(columns={'AnimalID': 'ID'})

In [197]:
# Add empty outcome columns so the test frame has the same column names as the
# training frame and the two can be concatenated.
# After combining, a null OutcomeType (the prediction target) identifies a test row.
test_df = test_df.assign(OutcomeType=np.nan, OutcomeSubtype=np.nan)

In [198]:
# Stack train and test rows into one DataFrame so every cleaning step is applied
# to both at once.
# Note: each frame keeps its own row labels, so index values repeat between the
# training rows and the test rows.
frames_to_stack = [train_df, test_df]
combined_df = pd.concat(frames_to_stack, axis=0)
combined_df

Unnamed: 0,ID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
...,...,...,...,...,...,...,...,...,...,...
11451,11452,,2014-07-08 14:50:00,,,Cat,Neutered Male,2 months,Domestic Shorthair Mix,Black
11452,11453,,2014-10-21 12:57:00,,,Cat,Intact Female,2 weeks,Domestic Shorthair Mix,Blue
11453,11454,,2014-09-29 09:00:00,,,Cat,Intact Female,1 year,Domestic Shorthair Mix,Calico
11454,11455,Rambo,2015-09-05 17:16:00,,,Dog,Neutered Male,6 years,German Shepherd Mix,Black/Tan


In [199]:
# Normalize the categorical text columns with the `normalize_name` helper from util
# (the rendered output further down shows lower-case, underscore-separated labels).
# NOTE(review): test rows carry NaN in the outcome columns and the later train/test
# split keys on OutcomeType being null, so this relies on normalize_name leaving
# missing values missing — confirm in util.
for text_col in ['AnimalType', 'SexuponOutcome', 'OutcomeType', 'OutcomeSubtype']:
    combined_df[text_col] = combined_df[text_col].apply(normalize_name)

# Label missing SexuponOutcome values as 'unknown' (NaN -> 'unknown').
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and no longer updates
# the parent frame under pandas Copy-on-Write, and this notebook silences the
# warning that would have flagged it.
combined_df['SexuponOutcome'] = combined_df['SexuponOutcome'].fillna('unknown')

In [200]:
# Turn Name into an indicator variable: 1 when the animal has a name, 0 when it does not.
# The mask is computed once, before either assignment changes the column.
has_name = combined_df['Name'].notnull()
combined_df.loc[has_name, 'Name'] = 1
combined_df.loc[~has_name, 'Name'] = 0

In [201]:
# Split the breed description into a primary breed (Breed), a secondary breed
# (Breed2) and a Mix indicator (1 for a mixed breed, otherwise 0).
# NOTE(review): this assumes clean_breed (from util) returns a "/"-separated string
# with exactly three parts — confirm in util. The parts come out of str.split as
# strings, so Mix is not numeric at this point.
breed_parts = combined_df['Breed'].apply(clean_breed).str.split("/", n=2, expand=True)
combined_df[['Breed', 'Breed2', 'Mix']] = breed_parts

# Empty strings anywhere in the frame are treated as missing values.
combined_df = combined_df.replace(to_replace="", value=np.nan)

In [202]:
# Colors are either a single value or a "/"-separated pair: split on the first "/"
# into Color and Color2 (Color2 is missing for single-color animals).
color_parts = combined_df["Color"].str.split("/", n=1, expand=True)
combined_df[["Color", "Color2"]] = color_parts

# Multicolor flag: 1.0 when a second color is present or the color is "Tricolor",
# otherwise 0.0.
has_second_color = combined_df["Color2"].notnull()
is_tricolor = combined_df["Color"] == "Tricolor"
combined_df["Multicolor"] = (has_second_color | is_tricolor).astype(float)

# Normalize both color columns with the shared helper from util.
for color_col in ("Color", "Color2"):
    combined_df[color_col] = combined_df[color_col].apply(normalize_name)

In [203]:
# Convert AgeuponOutcome to a single common unit (months) using clean_age from util.
age_text = combined_df['AgeuponOutcome']
combined_df['AgeuponOutcome'] = age_text.apply(clean_age)

In [204]:
# Parse the DateTime column, break it into calendar features, then drop the raw
# column: fractional hour of day (hour + minute / 60), day of week, month,
# day of month and year.
outcome_time = pd.to_datetime(combined_df["DateTime"]).dt
combined_df = combined_df.assign(
    Hour=outcome_time.hour + outcome_time.minute / 60,
    Weekday=outcome_time.weekday,
    Month=outcome_time.month,
    Day=outcome_time.day,
    Year=outcome_time.year,
).drop(columns=["DateTime"])

In [205]:
# Split the cleaned frame back into train and test: rows with an OutcomeType are
# training rows, rows without one are test rows (their empty outcome columns are
# dropped again). Writing the results to CSV is currently disabled.
is_labeled = combined_df['OutcomeType'].notnull()

train_clean_df = combined_df[is_labeled]
# train_clean_df.to_csv("data/train_clean_v2.csv", index=False)

test_clean_df = combined_df[~is_labeled].drop(columns=['OutcomeType', 'OutcomeSubtype'])
# test_clean_df.to_csv("data/test_clean_v2.csv", index=False)

In [206]:
# Inspect the cleaned training rows (outcome columns retained).
train_clean_df

Unnamed: 0,ID,Name,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Breed2,Mix,Color2,Multicolor,Hour,Weekday,Month,Day,Year
0,A671945,1,return_to_owner,,dog,neutered_male,12.000000,shetland_sheepdog,brown,,1,white,1.0,18.366667,2,2,12,2014
1,A656520,1,euthanasia,suffering,cat,spayed_female,12.000000,domestic_shorthair,cream_tabby,,1,,0.0,12.733333,6,10,13,2013
2,A686464,1,adoption,foster,dog,neutered_male,24.000000,pit_bull,blue,,1,white,1.0,12.466667,5,1,31,2015
3,A683430,0,transfer,partner,cat,intact_male,0.692308,domestic_shorthair,blue_cream,,1,,0.0,19.150000,4,7,11,2014
4,A667013,0,transfer,partner,dog,neutered_male,24.000000,lhasa_apso,tan,miniature_poodle,1,,0.0,12.866667,4,11,15,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26724,A702446,0,transfer,partner,cat,intact_male,1.000000,domestic_shorthair,brown_tabby,,1,white,1.0,11.933333,3,5,14,2015
26725,A718934,0,transfer,scrp,cat,spayed_female,3.000000,domestic_shorthair,brown_tabby,,1,,0.0,18.983333,2,1,20,2016
26726,A698128,1,adoption,,dog,neutered_male,48.000000,old_english_bulldog,white,,1,tan,1.0,13.550000,0,3,9,2015
26727,A677478,0,transfer,partner,cat,intact_male,0.923077,domestic_shorthair,black,,1,,0.0,12.366667,6,4,27,2014


In [207]:
# Inspect the cleaned test rows (outcome columns removed).
test_clean_df

Unnamed: 0,ID,Name,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Breed2,Mix,Color2,Multicolor,Hour,Weekday,Month,Day,Year
0,1,1,dog,intact_female,10.000000,labrador_retriever,red,,1,white,1.0,12.250000,0,10,12,2015
1,2,1,dog,spayed_female,24.000000,german_shepherd,black,siberian_husky,1,tan,1.0,17.983333,5,7,26,2014
2,3,1,cat,neutered_male,12.000000,domestic_shorthair,brown_tabby,,1,,0.0,12.333333,2,1,13,2016
3,4,1,dog,intact_male,4.000000,collie_smooth,tricolor,,1,,1.0,18.200000,5,12,28,2013
4,5,1,dog,neutered_male,24.000000,miniature_poodle,white,,1,,0.0,17.983333,3,9,24,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11451,11452,0,cat,neutered_male,2.000000,domestic_shorthair,black,,1,,0.0,14.833333,1,7,8,2014
11452,11453,0,cat,intact_female,0.461538,domestic_shorthair,blue,,1,,0.0,12.950000,1,10,21,2014
11453,11454,0,cat,intact_female,12.000000,domestic_shorthair,calico,,1,,0.0,9.000000,0,9,29,2014
11454,11455,1,dog,neutered_male,72.000000,german_shepherd,black,,1,tan,1.0,17.266667,5,9,5,2015
