# Predict outcomes of shelter pets
___
Begin by importing useful libraries.

In [1]:
import pandas as pd
import numpy as np
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Use label encoder for `y` and create dummy variables with pandas

In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import log_loss, classification_report

from scipy import sparse

### Create function to load train/test data

In [4]:
def convage(age):
    """Convert an age description such as ``'2 years'`` into a number of days.

    Parameters
    ----------
    age : str or float
        Text made of a leading integer count and a trailing unit word
        (day/days, week/weeks, month/months, year/years). Surrounding
        whitespace and letter case are ignored. A float is returned
        unchanged, so a missing value (NaN) passes through for later
        imputation.

    Returns
    -------
    float
        The age in days, counting 30 days per month and 365 days per year.

    Raises
    ------
    ValueError
        If the text has no leading count or ends in an unrecognised unit.
    """
    dperiod = {'day': 1, 'days': 1, 'week': 7, 'weeks': 7,
               'month': 30, 'months': 30, 'year': 365, 'years': 365}
    if isinstance(age, float):
        return age

    text = age.strip()
    # Raw strings: '\d' and '\w' are invalid escape sequences in a plain literal.
    count = re.match(r'\d+', text)
    unit = re.search(r'\w+$', text)
    period = unit.group().lower() if unit else None
    if count is None or period not in dperiod:
        raise ValueError('cannot parse age: {!r}'.format(age))
    return float(int(count.group()) * dperiod[period])
    
def breedconv(breed):
    """Collapse a breed description into one of three integer categories.

    Matching is a case-sensitive substring test against lower-case literals.

    Returns
    -------
    int
        2 if the text contains 'pit bull'; otherwise 0 if it contains '/'
        or 'mix'; otherwise 1.
    """
    if 'pit bull' in breed:
        return 2
    is_cross = '/' in breed or 'mix' in breed
    return 0 if is_cross else 1

def readdata(trainortest,
             datadir='/Users/dbricare/Documents/Python/datasets/shelter-outcomes/',
             columns=None):
    """Load ``train.csv`` or ``test.csv`` and build the one-hot feature matrix.

    Parameters
    ----------
    trainortest : str
        Either ``'train'`` or ``'test'``; selects which CSV file is read.
    datadir : str, optional
        Directory holding the CSV files. Defaults to the location the
        notebook was originally run from.
    columns : sequence of str, optional
        If given, the feature frame is reindexed to exactly these columns
        (missing ones filled with 0, extra ones dropped). Pass the training
        frame's columns when loading other data so both matrices share one
        layout. By default the columns are whatever values occur in the file.

    Returns
    -------
    tuple
        For ``'train'``: ``(features, sery, dy)`` where ``sery`` holds the
        label-encoded ``OutcomeType`` values and ``dy`` maps each integer
        code to its class name.
        For ``'test'``: ``(features, ids)`` where ``ids`` is the ``ID`` column.
    """
    import os  # local import: only needed to build the CSV path

    assert trainortest in ('train', 'test'), "trainortest must be 'train' or 'test'"
    df = pd.read_csv(os.path.join(datadir, trainortest + '.csv'), sep=',', encoding='utf-8')

    # Missing sex values become their own category. The result is assigned back
    # rather than using inplace=True on a column selection, which is chained
    # assignment and is not guaranteed to update df in newer pandas.
    df['SexuponOutcome'] = df['SexuponOutcome'].fillna('Unknown')
    assert set(df['SexuponOutcome']) == {'Neutered Male', 'Spayed Female', 'Intact Male',
                                         'Intact Female', 'Unknown'}

    # Keep the categorical inputs and convert age text to days; missing ages
    # are replaced with the mean age of this file.
    dfX = df[['Name', 'AnimalType', 'SexuponOutcome', 'Breed']].copy()
    dfX['AgeDays'] = df['AgeuponOutcome'].apply(convage)
    dfX['AgeDays'] = dfX['AgeDays'].fillna(dfX['AgeDays'].mean())
    dfX['Breed'] = dfX['Breed'].str.lower().apply(breedconv)

    # Name becomes a has-name flag: 1 when a name is present, 0 when missing.
    dfX['Name'] = dfX['Name'].notnull().astype(int)

    # One categorical feature per calendar component of the outcome timestamp.
    when = pd.to_datetime(df['DateTime'])
    dfX['Year'] = when.dt.year
    dfX['Month'] = when.dt.month
    dfX['DayofMonth'] = when.dt.day
    dfX['DayofWeek'] = when.dt.dayofweek
    dfX['Hour'] = when.dt.hour
    # Minute is reduced to a flag: 0 when exactly on the hour, 1 otherwise.
    dfX['Minute'] = (when.dt.minute != 0).astype(int)

    # One-hot encode everything except AgeDays, which stays numeric and goes last.
    catcols = [col for col in dfX.columns if col != 'AgeDays']
    pieces = [pd.get_dummies(dfX[col], dummy_na=False, prefix=col) for col in catcols]
    pieces.append(dfX['AgeDays'])
    # astype(int) also truncates any fractional mean-imputed AgeDays value.
    dfdum = pd.concat(pieces, axis=1).astype(int)

    if columns is not None:
        dfdum = dfdum.reindex(columns=list(columns), fill_value=0)

    if trainortest == 'train':
        le = LabelEncoder()
        sery = le.fit_transform(df['OutcomeType'])
        dy = dict(enumerate(le.classes_))
        return dfdum, sery, dy
    return dfdum, df['ID']

### Examine loaded training data

In [5]:
# Build the one-hot training matrix, the encoded outcome labels, and the
# integer-code -> outcome-name mapping.
dfX, sery, dy = readdata('train')
print(dfX.shape)
dfX.head()

(26729, 89)


Unnamed: 0,Name_0,Name_1,AnimalType_Cat,AnimalType_Dog,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,Breed_0,...,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Minute_0,Minute_1,AgeDays
0,0,1,0,1,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,365
1,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,365
2,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,730
3,1,0,1,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,21
4,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,730


## Divide dataset into train and test, convert to sparse matrix if necessary

In [126]:
# n_iter=1, so the splitter yields exactly one (train, test) pair of index arrays.
sss = StratifiedShuffleSplit(sery, test_size=0.25, random_state=42, n_iter=1)
train_index, test_index = next(iter(sss))
X_train = dfX.iloc[train_index]
X_test = dfX.iloc[test_index]
y_train = sery[train_index]
y_test = sery[test_index]

def sparsity_ratio(X):
    """Return the fraction of entries of the 2-D array ``X`` that are zero."""
    n_nonzero = np.count_nonzero(X)
    n_cells = float(X.shape[0] * X.shape[1])
    return 1.0 - n_nonzero / n_cells

# Convert the training matrix to CSR only when at least 90% of its entries are
# zero; below that threshold it stays a DataFrame. X_test is not converted here.
spratio = sparsity_ratio(X_train.values)
print("Sparsity ratio:", spratio)

if spratio >= 0.9:
    X_train = sparse.csr_matrix(X_train)

print('Training data type:',type(X_train))

Sparsity ratio: 0.8764129024302062
Training data type: <class 'pandas.core.frame.DataFrame'>


## Use grid search to find best estimator and best parameters

Random forest, extra trees, AdaBoost, and gradient boosting classifiers were tested; gradient boosting is the one left enabled in the cell below.

In [104]:
# Search grid: n_estimators and learning_rate are pinned to a single value each,
# so only max_features actually varies.
params = {'n_estimators': [150], 'learning_rate' : [0.15], 'max_features' : [0.25,0.3,0.35]}

# Alternative estimators tried earlier, left disabled:
# clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),random_state=42)
# clf = ExtraTreesClassifier(max_features='auto', random_state=42)
# clf = RandomForestClassifier(random_state=42, n_jobs=3, class_weight='balanced_subsample', oob_score=True)
clf = GradientBoostingClassifier(max_depth=4, random_state=42)

# The 'log_loss' scores come back negated (the means printed below are negative),
# so the best candidate is the one closest to zero.
grid = GridSearchCV(clf, param_grid=params, n_jobs=3, cv=3, scoring='log_loss')
grid.fit(X_train, y_train)

print(grid.best_params_)
grid.grid_scores_

{'n_estimators': 150, 'max_features': 0.3, 'learning_rate': 0.15}


[mean: -0.77143, std: 0.01432, params: {'n_estimators': 150, 'max_features': 0.25, 'learning_rate': 0.15},
 mean: -0.76887, std: 0.00679, params: {'n_estimators': 150, 'max_features': 0.3, 'learning_rate': 0.15},
 mean: -0.77039, std: 0.01042, params: {'n_estimators': 150, 'max_features': 0.35, 'learning_rate': 0.15}]

### Test best fit parameters

In [105]:
# Held-out log loss recorded from earlier runs (model and settings -> loss):
# rf, 800 estimator, None maxdepth (0.79)
# rf, 800 estimators, 32 maxdepth (0.781)
# gb, 100 est, 4 maxdepth (0.773)
# gb, 150 est, 4 maxdepth (0.771)
# gb, 150 est, 4 maxdepth, 0.15 learnrate (0.771)
# gb, 150 est, 4 maxdepth, 0.15 learnrate, 0.2 maxfeatures (0.773)
# gb, 150 est, 4 maxdepth, 0.15 learnrate, 0.3 maxfeatures (0.775)
# gb, 150 est, 4 maxdepth, 0.15 learnrate, 0.2 maxfeatures (0.764) new features
# gb, 150 est, 4 maxdepth, 0.15 learnrate, 0.3 maxfeatures (0.763) new features

# Keep `results` across re-runs of this cell; create it only on the first run.
try:
    results
except NameError:
    results = {}

ypred = grid.predict_proba(X_test)
yerror = log_loss(y_test, ypred)
# Record the refit winning model. grid.get_params()['estimator'] is only the
# untuned template passed to GridSearchCV, so it would be identical for every score.
results[yerror] = grid.best_estimator_
print(yerror)

0.763019399211


### Train on whole dataset with best params from CV

In [6]:
# Refit on the full training set with the parameters chosen by the grid search.
# They are written out literally so this cell does not need `grid` to exist.
#clf.set_params(**grid.best_params_)
bestparams = {'n_estimators': 150, 'max_features': 0.3, 'learning_rate': 0.15}
clf = GradientBoostingClassifier(max_depth=4, random_state=42, **bestparams)

# Assigning to _ keeps the fitted estimator's repr out of the cell output.
_ = clf.fit(dfX, sery)

## Load test data for submission

In [77]:
# Build the submission feature matrix and keep the ID column for the output file.
# The shape printed here should have the same column count as the training matrix.
dfXtest, serID = readdata('test')
print(dfXtest.shape)
dfXtest.head()

(11456, 89)


Unnamed: 0,Name_0,Name_1,AnimalType_Cat,AnimalType_Dog,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,Breed_0,...,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Minute_0,Minute_1,AgeDays
0,0,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,300
1,0,1,0,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,730
2,0,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,365
3,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,120
4,0,1,0,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,730


## Calculate probabilities and save to csv

In [78]:
def proboutput(dfXtest, serID, dy, model=None):
    """Predict class probabilities and write them to a timestamped CSV file.

    Parameters
    ----------
    dfXtest : DataFrame
        Feature matrix passed to ``predict_proba``.
    serID : Series
        Identifier for each row of ``dfXtest``; it must be named ``'ID'``
        because that column is selected by name below.
    dy : dict
        Maps each probability column position (0, 1, ...) to its outcome name.
    model : estimator, optional
        Object providing ``predict_proba``. Defaults to the notebook-level
        ``clf``.

    Returns
    -------
    DataFrame
        Columns ``ID, Adoption, Died, Euthanasia, Return_to_owner, Transfer``.
        The same frame is written to ``animalshelterYYYYMMDD_HHMM.csv`` in the
        current working directory.
    """
    estimator = clf if model is None else model
    yprob = estimator.predict_proba(dfXtest)

    # Give the probabilities serID's index so concat pairs rows by position.
    # A default 0..n-1 index would misalign and yield NaN rows whenever serID
    # carries any other index.
    probs = pd.DataFrame(yprob, index=serID.index)
    dfres = pd.concat([serID, probs], axis=1)
    dfres.columns = [dy.get(col, col) for col in dfres.columns]
    dfres = dfres[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]
    now = datetime.datetime.now()
    dfres.to_csv('animalshelter'+now.strftime('%Y%m%d_%H%M')+'.csv', sep=',', encoding='utf-8', index=False)
    return dfres

In [79]:
# Write the submission CSV (side effect of proboutput) and preview the first rows.
dfres = proboutput(dfXtest, serID, dy)
dfres.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.026828,0.00255,0.030857,0.140623,0.799142
1,2,0.79674,0.000452,0.005702,0.156383,0.040722
2,3,0.503034,0.001012,0.010297,0.167838,0.317819
3,4,0.262617,0.006946,0.045311,0.338869,0.346257
4,5,0.383399,0.001088,0.018942,0.519301,0.07727


## Save model for web deployment

In [11]:
import joblib

In [12]:
# Persist the fitted classifier to gbc.pkl in the current working directory.
joblib.dump(clf, 'gbc.pkl', compress=True) 

['gbc.pkl']

In [13]:
# Round-trip check: reload the file just written and confirm the object type.
# Only load pickle files you created yourself; unpickling can execute arbitrary code.
test = joblib.load('gbc.pkl')
type(test)

sklearn.ensemble.gradient_boosting.GradientBoostingClassifier

In [14]:
# List the training feature columns; a single-row input for the model must use this exact layout.
dfX.columns

Index(['Name_0', 'Name_1', 'AnimalType_Cat', 'AnimalType_Dog',
       'SexuponOutcome_Intact Female', 'SexuponOutcome_Intact Male',
       'SexuponOutcome_Neutered Male', 'SexuponOutcome_Spayed Female',
       'SexuponOutcome_Unknown', 'Breed_0', 'Breed_1', 'Breed_2', 'Year_2013',
       'Year_2014', 'Year_2015', 'Year_2016', 'Month_1', 'Month_2', 'Month_3',
       'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9',
       'Month_10', 'Month_11', 'Month_12', 'DayofMonth_1', 'DayofMonth_2',
       'DayofMonth_3', 'DayofMonth_4', 'DayofMonth_5', 'DayofMonth_6',
       'DayofMonth_7', 'DayofMonth_8', 'DayofMonth_9', 'DayofMonth_10',
       'DayofMonth_11', 'DayofMonth_12', 'DayofMonth_13', 'DayofMonth_14',
       'DayofMonth_15', 'DayofMonth_16', 'DayofMonth_17', 'DayofMonth_18',
       'DayofMonth_19', 'DayofMonth_20', 'DayofMonth_21', 'DayofMonth_22',
       'DayofMonth_23', 'DayofMonth_24', 'DayofMonth_25', 'DayofMonth_26',
       'DayofMonth_27', 'DayofMonth_28', 'DayofM

In [21]:
# Scratch: an all-zero, single-row frame with the training columns, used as a
# template for scoring one animal.
dfsa = pd.DataFrame(np.zeros(shape=(1,len(dfX.columns)), dtype=int), columns=dfX.columns)
dfsa

Unnamed: 0,Name_0,Name_1,AnimalType_Cat,AnimalType_Dog,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,Breed_0,...,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Minute_0,Minute_1,AgeDays
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Scratch: switch on the 'Named' indicator column in the template row.
dname = {'Named':'1', 'Unnamed':'0'}
# .at is the scalar setter; DataFrame.set_value is deprecated and absent from pandas >= 1.0.
dfsa.at[0, 'Name_'+dname['Named']] = 1
dfsa

Unnamed: 0,Name_0,Name_1,AnimalType_Cat,AnimalType_Dog,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,Breed_0,...,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Minute_0,Minute_1,AgeDays
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Scratch: confirm that strptime with '%Y-%m-%d' exposes the day of month as an int.
dt = datetime.datetime.strptime("2013-10-06", "%Y-%m-%d")
dt.day

6

In [68]:
# Collect each distinct column-name prefix (the text before the last '_' that
# follows word characters), in first-seen order, then append the final column
# name, which has no prefix.
# Raw string: '\w' is an invalid escape sequence in a plain literal.
p = re.compile(r'\w+(?=_)')
chops = []
for col in dfX.columns[:-1]:
    curr = p.search(col).group()
    if curr not in chops:
        chops.append(curr)
chops = [s+'_' for s in chops]
chops.append(dfX.columns[-1])
print(chops)

['Name_', 'AnimalType_', 'SexuponOutcome_', 'Breed_', 'Year_', 'Month_', 'DayofMonth_', 'DayofWeek_', 'Hour_', 'Minute_', 'AgeDays']


In [72]:
# Integer code -> outcome name mapping produced when the training data was loaded.
dy

{0: 'Adoption',
 1: 'Died',
 2: 'Euthanasia',
 3: 'Return_to_owner',
 4: 'Transfer'}

In [73]:
# Class labels in the order used by the classifier; compare with the keys of dy.
clf.classes_

array([0, 1, 2, 3, 4])

In [123]:
def saformat(name,type,sex,breed,age,date,hour,minute, model=None):
    """Score a single animal and return its outcome probabilities as percentages.

    Parameters
    ----------
    name : str
        ``'Named'`` or ``'Unnamed'``.
    type : str
        Suffix of an ``AnimalType_*`` column (``'Cat'`` or ``'Dog'``).
    sex : str
        Suffix of a ``SexuponOutcome_*`` column, e.g. ``'Neutered Male'``.
    breed : str
        ``'Mix'``, ``'Pure breed'`` or ``'Pit bull'``.
    age : str or float
        Age text accepted by ``convage`` (e.g. ``'1 years'``) or a number of days.
    date : str
        Outcome date formatted as ``YYYY-MM-DD``.
    hour : str or int
        Suffix of an ``Hour_*`` column. The column list has no ``Hour_1``
        to ``Hour_4``, so those hours are rejected.
    minute : str or int
        Suffix of a ``Minute_*`` column (``0`` or ``1``).
    model : estimator, optional
        Object providing ``predict_proba``. Defaults to the notebook-level ``clf``.

    Returns
    -------
    list of str
        One ``'{:.1%}'``-formatted probability per class, in the model's class order.

    Raises
    ------
    KeyError
        If ``name`` or ``breed`` is not one of the listed choices.
    ValueError
        If ``date`` does not parse, or any input has no matching feature column.
    """
    cols = ['Name_0', 'Name_1', 'AnimalType_Cat', 'AnimalType_Dog',
       'SexuponOutcome_Intact Female', 'SexuponOutcome_Intact Male',
       'SexuponOutcome_Neutered Male', 'SexuponOutcome_Spayed Female',
       'SexuponOutcome_Unknown', 'Breed_0', 'Breed_1', 'Breed_2', 'Year_2013',
       'Year_2014', 'Year_2015', 'Year_2016', 'Month_1', 'Month_2', 'Month_3',
       'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9',
       'Month_10', 'Month_11', 'Month_12', 'DayofMonth_1', 'DayofMonth_2',
       'DayofMonth_3', 'DayofMonth_4', 'DayofMonth_5', 'DayofMonth_6',
       'DayofMonth_7', 'DayofMonth_8', 'DayofMonth_9', 'DayofMonth_10',
       'DayofMonth_11', 'DayofMonth_12', 'DayofMonth_13', 'DayofMonth_14',
       'DayofMonth_15', 'DayofMonth_16', 'DayofMonth_17', 'DayofMonth_18',
       'DayofMonth_19', 'DayofMonth_20', 'DayofMonth_21', 'DayofMonth_22',
       'DayofMonth_23', 'DayofMonth_24', 'DayofMonth_25', 'DayofMonth_26',
       'DayofMonth_27', 'DayofMonth_28', 'DayofMonth_29', 'DayofMonth_30',
       'DayofMonth_31', 'DayofWeek_0', 'DayofWeek_1', 'DayofWeek_2',
       'DayofWeek_3', 'DayofWeek_4', 'DayofWeek_5', 'DayofWeek_6', 'Hour_0',
       'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8', 'Hour_9', 'Hour_10', 'Hour_11',
       'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17',
       'Hour_18', 'Hour_19', 'Hour_20', 'Hour_21', 'Hour_22', 'Hour_23',
       'Minute_0', 'Minute_1', 'AgeDays']
    dfsa = pd.DataFrame(np.zeros((1,len(cols)), dtype=int), columns=cols)

    # Translate the user-facing choices into the suffixes used in column names.
    dname = {'Named':'1', 'Unnamed':'0'}
    dbreed = {'Mix':'0', 'Pure breed':'1', 'Pit bull':'2'}
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')

    # Each indicator column is named explicitly rather than paired by position,
    # so the mapping cannot drift if the column list is reordered.
    hot = ['Name_' + dname[name],
           'AnimalType_' + type,
           'SexuponOutcome_' + sex,
           'Breed_' + dbreed[breed],
           'Year_' + str(dt.year),
           'Month_' + str(dt.month),
           'DayofMonth_' + str(dt.day),
           'DayofWeek_' + str(dt.weekday()),
           'Hour_' + str(hour),
           'Minute_' + str(minute)]

    # Refuse values with no matching column instead of silently adding a new
    # column, which would change the feature count the model expects.
    unknown = [col for col in hot if col not in dfsa.columns]
    if unknown:
        raise ValueError('no feature column for: ' + ', '.join(unknown))

    dfsa.loc[0, hot] = 1
    # AgeDays is the only numeric feature; stored as an integer number of days.
    dfsa.at[0, 'AgeDays'] = int(convage(age))

    estimator = clf if model is None else model
    nparr = estimator.predict_proba(dfsa)[0]
    return ['{:.1%}'.format(float(p)) for p in nparr]

In [124]:
# Example: a named, neutered male mixed-breed dog, one year old, with an outcome
# recorded at 17:00 on 2016-01-01 (minute '0' selects the Minute_0 column).
probs = saformat('Named', 'Dog', 'Neutered Male', 'Mix', 
                    '1 years', '2016-01-01', '17', '0')
probs

['56.8%', '0.1%', '0.4%', '31.7%', '11.0%']