# Shelter animal outcomes xbg training

In [86]:
import warnings

warnings.filterwarnings("ignore")


import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, FunctionTransformer, Normalizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn_pandas import DataFrameMapper
from util import *

## Load data

In [87]:
train_df = pd.read_csv("input/train.csv")
test_df = pd.read_csv("input/test.csv")

In [88]:
train_df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [89]:
test_df.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


## Clean data

In [90]:
# 使其與 test data 的 id coulumn 一致
train_df.rename(columns={"AnimalID": "ID"}, inplace=True)

In [91]:
# add empty outcome columns => 欄位數量、名稱一致，才能和 train data combine 再一起
# 用來區分 combine 後的 train, test data，如果 OutcomeType (target，預測目標) is null，代表是 test data
test_df["OutcomeType"] = np.nan
test_df["OutcomeSubtype"] = np.nan

In [92]:
# 先合併 train, test 成同一個 dataframe，方便清洗資料
combined_df = pd.concat([train_df, test_df], axis=0)
combined_df

Unnamed: 0,ID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
...,...,...,...,...,...,...,...,...,...,...
11451,11452,,2014-07-08 14:50:00,,,Cat,Neutered Male,2 months,Domestic Shorthair Mix,Black
11452,11453,,2014-10-21 12:57:00,,,Cat,Intact Female,2 weeks,Domestic Shorthair Mix,Blue
11453,11454,,2014-09-29 09:00:00,,,Cat,Intact Female,1 year,Domestic Shorthair Mix,Calico
11454,11455,Rambo,2015-09-05 17:16:00,,,Dog,Neutered Male,6 years,German Shepherd Mix,Black/Tan


In [93]:
# normalize several columns
combined_df["AnimalType"] = combined_df["AnimalType"].apply(normalize_name)
combined_df["SexuponOutcome"] = combined_df["SexuponOutcome"].apply(normalize_name)
combined_df["OutcomeType"] = combined_df["OutcomeType"].apply(normalize_name)
combined_df["OutcomeSubtype"] = combined_df["OutcomeSubtype"].apply(normalize_name)

# SexuponOutcome 有 'unkown' 值 => 直接填充成 nan 值
combined_df["SexuponOutcome"].fillna(value="unknown", inplace=True)

In [94]:
# convert name to indicator variable => 有名字或沒名字
combined_df.loc[~combined_df["Name"].isnull(), "Name"] = 1
combined_df.loc[combined_df["Name"].isnull(), "Name"] = 0

In [95]:
# 把 mix 品種分開成兩個欄位 Breed, Breed2
# 新增 Mix 的指標，如果是混種則標示 1，else 0
combined_df[["Breed", "Breed2", "Mix"]] = combined_df["Breed"].apply(clean_breed).str.split("/", n=2, expand=True)
combined_df.replace(to_replace="", value=np.nan, inplace=True)

# 標示 rare breed
dog_breed_threshold = 20
cat_breed_threshold = 15

# split by animal type
dogs_df = combined_df[combined_df["AnimalType"] == "dog"]
cats_df = combined_df[combined_df["AnimalType"] == "cat"]

all_dogs = pd.concat([dogs_df["Breed"], dogs_df["Breed2"]], axis=0).dropna()
all_cats = pd.concat([cats_df["Breed"], cats_df["Breed2"]], axis=0).dropna()

dog_breeds = all_dogs.value_counts()
total_dog_breeds = len(dog_breeds)
dog_breeds = dog_breeds[dog_breeds < dog_breed_threshold]
rare_dog_breeds = list(dog_breeds.index)

cat_breeds = all_cats.value_counts()
total_cat_breeds = len(cat_breeds)
cat_breeds = cat_breeds[cat_breeds < cat_breed_threshold]
rare_cat_breeds = list(cat_breeds.index)

rare_breeds = rare_dog_breeds + rare_cat_breeds
combined_df['Breed'].replace(to_replace=rare_breeds, value="rare", inplace=True)
combined_df['Breed2'].replace(to_replace=rare_breeds, value="rare", inplace=True)

In [96]:
# 把 SexuponOutcome 拆分 => Sex，Neutered
combined_df[["Neutered", "Sex"]] = (
    combined_df["SexuponOutcome"].apply(clean_normalized_sex).str.split("/", n=1, expand=True)
)
combined_df = combined_df.drop(["SexuponOutcome"], axis=1)

In [97]:
# 顏色有單一的跟混色的 => 拆分成 Color, Color2 columns
# 新增 Multicolor 欄位，有混色標示 1, else 0
combined_df[["Color", "Color2"]] = combined_df["Color"].str.split("/", n=1, expand=True)
combined_df.loc[~combined_df["Color2"].isnull(), "Multicolor"] = 1
combined_df.loc[combined_df["Color2"].isnull(), "Multicolor"] = 0
combined_df.loc[combined_df["Color"] == "Tricolor", "Multicolor"] = 1

combined_df["Color"] = combined_df["Color"].apply(normalize_name)
combined_df["Color2"] = combined_df["Color2"].apply(normalize_name)

# 標示 rare color
color_threshold = 20
all_colors = pd.concat([combined_df['Color'], combined_df['Color2']], axis=0).dropna()
color_counts = all_colors.value_counts()

total_colors = len(color_counts)
color_counts = color_counts[color_counts < color_threshold]
rare_colors = list(color_counts.index)

combined_df['Color'].replace(to_replace=rare_colors, value="rare", inplace=True)
combined_df['Color2'].replace(to_replace=rare_colors, value="rare", inplace=True)

In [98]:
# 把 AgeuponOutcome 轉換成相同單位 => month
combined_df['AgeuponOutcome'] = combined_df['AgeuponOutcome'].apply(clean_age)

In [99]:
combined_df["DateTime"] = pd.to_datetime(combined_df["DateTime"])

# 把 Datetime 拆分成 : 年，月，日，時，工作日
combined_df["Hour"] = combined_df["DateTime"].dt.hour + combined_df["DateTime"].dt.minute / 60
combined_df["Weekday"] = combined_df["DateTime"].dt.weekday
combined_df["Month"] = combined_df["DateTime"].dt.month
combined_df["Day"] = combined_df["DateTime"].dt.day
combined_df["Year"] = combined_df["DateTime"].dt.year

combined_df = combined_df.drop(["DateTime"], axis=1)

In [100]:
# 先使用一般的 label encoder 幫所有類別資料編碼
# 之後考慮使用 one hot encoding
# combined_df["Breed"] = LabelEncoder().fit_transform(combined_df["Breed"])
# combined_df["Breed2"] = LabelEncoder().fit_transform(combined_df["Breed2"])
# combined_df["Color"] = LabelEncoder().fit_transform(combined_df["Color"])
# combined_df["Color2"] = LabelEncoder().fit_transform(combined_df["Color2"])

combined_df = pd.concat([combined_df, pd.get_dummies(combined_df["Breed"]).astype("int32")], axis=1)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df["Breed2"]).astype("int32")], axis=1)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df["Color"]).astype("int32")], axis=1)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df["Color2"]).astype("int32")], axis=1)
combined_df = combined_df.drop(["Breed", "Breed2", "Color", "Color2"], axis=1)

# save the cleaned data
train_clean_df = combined_df[~combined_df["OutcomeType"].isnull()]
# train_clean_df.to_csv("data/train_clean_v2.csv", index=False)

test_clean_df = combined_df[combined_df["OutcomeType"].isnull()].drop(["OutcomeType", "OutcomeSubtype"], axis=1)
# test_clean_df.to_csv("data/test_clean_v2.csv", index=False)

In [101]:
train_clean_df

Unnamed: 0,ID,Name,OutcomeType,OutcomeSubtype,AnimalType,AgeuponOutcome,Mix,Neutered,Sex,Multicolor,...,seal_point,silver,silver_tabby,tan,tortie,tortie_point,tricolor,white,yellow,yellow_brindle
0,A671945,1,return_to_owner,,dog,12.000000,1,1,1,1.0,...,0,0,0,0,0,0,0,1,0,0
1,A656520,1,euthanasia,suffering,cat,12.000000,1,1,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,A686464,1,adoption,foster,dog,24.000000,1,1,1,1.0,...,0,0,0,0,0,0,0,1,0,0
3,A683430,0,transfer,partner,cat,0.692308,1,0,1,0.0,...,0,0,0,0,0,0,0,0,0,0
4,A667013,0,transfer,partner,dog,24.000000,1,1,1,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26724,A702446,0,transfer,partner,cat,1.000000,1,0,1,1.0,...,0,0,0,0,0,0,0,1,0,0
26725,A718934,0,transfer,scrp,cat,3.000000,1,1,0,0.0,...,0,0,0,0,0,0,0,0,0,0
26726,A698128,1,adoption,,dog,48.000000,1,1,1,1.0,...,0,0,0,1,0,0,0,0,0,0
26727,A677478,0,transfer,partner,cat,0.923077,1,0,1,0.0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
test_clean_df

Unnamed: 0,ID,Name,AnimalType,AgeuponOutcome,Mix,Neutered,Sex,Multicolor,Hour,Weekday,...,seal_point,silver,silver_tabby,tan,tortie,tortie_point,tricolor,white,yellow,yellow_brindle
0,1,1,dog,10.000000,1,0,0,1.0,12.250000,0,...,0,0,0,0,0,0,0,1,0,0
1,2,1,dog,24.000000,1,1,0,1.0,17.983333,5,...,0,0,0,1,0,0,0,0,0,0
2,3,1,cat,12.000000,1,1,1,0.0,12.333333,2,...,0,0,0,0,0,0,0,0,0,0
3,4,1,dog,4.000000,1,0,1,1.0,18.200000,5,...,0,0,0,0,0,0,0,0,0,0
4,5,1,dog,24.000000,1,1,1,0.0,17.983333,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11451,11452,0,cat,2.000000,1,1,1,0.0,14.833333,1,...,0,0,0,0,0,0,0,0,0,0
11452,11453,0,cat,0.461538,1,0,0,0.0,12.950000,1,...,0,0,0,0,0,0,0,0,0,0
11453,11454,0,cat,12.000000,1,0,0,0.0,9.000000,0,...,0,0,0,0,0,0,0,0,0,0
11454,11455,1,dog,72.000000,1,1,1,1.0,17.266667,5,...,0,0,0,1,0,0,0,0,0,0


In [103]:
train_clean_df.dtypes

ID                object
Name              object
OutcomeType       object
OutcomeSubtype    object
AnimalType        object
                   ...  
tortie_point       int32
tricolor           int32
white              int32
yellow             int32
yellow_brindle     int32
Length: 329, dtype: object

In [104]:
# test one hot encoding
# breed = train_clean_df['Breed']
# onehot = OneHotEncoder(sparse=False).fit_transform(np.array(LabelEncoder().fit_transform(breed)).reshape(-1, 1))
# pd.get_dummies(train_clean_df['Breed']).astype('int32')

## Training xgb

In [110]:
# breed_encoder = LabelEncoder().fit(combined_df["Breed"].unique().tolist())
# breed2_encoder = LabelEncoder().fit(combined_df["Breed2"].unique().tolist())
# color_encoder = LabelEncoder().fit(combined_df["Color"].unique().tolist())
# color2_encoder = LabelEncoder().fit(combined_df["Color2"].unique().tolist())

# setup a data frame mapper, convert breed to breed mean, encode the two categorical variables,
# pass the rest through
# skipping standardization since it's not needed for decision tree based classifiers
mapper = DataFrameMapper(
    [
        ("AnimalType", LabelBinarizer()),
        # ("SexuponOutcome", LabelBinarizer()),
    ],
    default=None,
    sparse=True,
)

# XGB classifier instance
xgb = XGBClassifier(
    max_depth=10,
    learning_rate=0.02,
    n_estimators=500,
    objective="multi:softprob",
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=5,
)

# xgb_params = {
# "xgb__colsample_bytree": [0.8],
# "xgb__subsample": [0.8],
# "xgb__n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
# "xgb__learning_rate": [0.01, 0.02, 0.03],
# "xgb__max_depth": [5, 10, 15, 20, 25, 30],
# "xgb__nthread": [5],
# }

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_clean_df["OutcomeType"])
X = train_clean_df.drop(["OutcomeType", "ID", "OutcomeSubtype"], axis=1)

pipeline = Pipeline([("mapper", mapper), ("xgb", xgb)])

## Evaluation

In [106]:
# grid = GridSearchCV(pipeline, xgb_params, scoring='neg_log_loss', cv=3)
# grid.fit(X, y)

In [107]:
# best : -0.7342839577072033
# grid.best_params_

In [108]:
scores = cross_validate(pipeline, X, y, scoring='neg_log_loss', cv=3)
scores

{'fit_time': array([223.56221986, 226.90380454, 227.77742028]),
 'score_time': array([0.60652471, 0.53612876, 0.69598126]),
 'test_score': array([-0.73439692, -0.73313144, -0.72202071])}

In [109]:
from numpy import mean


mean(scores["test_score"])

-0.7298496909140911

In [111]:
label_encoder.classes_

array(['adoption', 'died', 'euthanasia', 'return_to_owner', 'transfer'],
      dtype=object)