# Austin_Animal_Center_Machine_Learning

In [267]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [268]:
combined = pd.read_csv('aac_intakes_outcomes.csv')

In [269]:
#combined.info()

In [270]:
#select columns
combined = combined[['outcome_type','animal_type','intake_condition','intake_type','age_upon_intake_age_group']]

In [271]:
#Drop missing value
combined = combined.dropna()

In [272]:
combined.outcome_type.value_counts()

Adoption           33594
Transfer           23799
Return to Owner    14791
Euthanasia          6244
Died                 690
Disposal             304
Rto-Adopt            179
Missing               46
Relocate              15
Name: outcome_type, dtype: int64

In [273]:
#drop the rows with rare situations
combined = combined.loc[(combined.outcome_type == 'Adoption') | (combined.outcome_type == 'Transfer') | (combined.outcome_type == 'Return to Owner') | (combined.outcome_type == 'Euthanasia')]

In [274]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78428 entries, 0 to 79671
Data columns (total 5 columns):
outcome_type                 78428 non-null object
animal_type                  78428 non-null object
intake_condition             78428 non-null object
intake_type                  78428 non-null object
age_upon_intake_age_group    78428 non-null object
dtypes: object(5)
memory usage: 3.6+ MB


In [275]:
combined.outcome_type.value_counts()

Adoption           33594
Transfer           23799
Return to Owner    14791
Euthanasia          6244
Name: outcome_type, dtype: int64

# Model to predict the outcome of animals

In [276]:
#sample features and labels
X = combined[['animal_type','intake_condition','intake_type','age_upon_intake_age_group']]
y = combined.outcome_type

In [277]:
#get dummy variable
from sklearn import preprocessing

#encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()

# use df.apply() to apply le.fit_transform to all columns
X_2 = X.apply(le.fit_transform)
X_2.head()

Unnamed: 0,animal_type,intake_condition,intake_type,age_upon_intake_age_group
0,2,3,3,9
1,2,3,2,8
2,2,3,2,8
3,2,3,1,9
4,2,2,2,3


In [278]:
#INSTANTIATE
enc = preprocessing.OneHotEncoder(categories='auto')

#Fit
enc.fit(X_2)

#Transform
X_ohe = enc.transform(X_2).toarray()
X_ohe

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

# K-Nearest Neighbors 

In [279]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV


In [280]:
#split train and test data with cross validation
X_train,X_test,y_train,y_test = train_test_split(X_ohe,y,test_size=0.8,random_state=21,stratify=y)

In [16]:
#train KNeighbors model
param_grid={'n_neighbors':np.arange(1,30)}
knn=KNeighborsClassifier()
knn_cv=GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X_train,y_train)
print(knn_cv.best_params_,knn_cv.best_score_)

({'n_neighbors': 27}, 0.5691424928275423)


K=27 has the best accurate rate which is 57%.

In [281]:
#Use test group to check accuracy of the model

from sklearn.metrics import accuracy_score

#Train model with best parameter
knn=KNeighborsClassifier(n_neighbors=27)
knn.fit(X_train,y_train)

#predict test data
y_predict=knn.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is '+str(score))

The accuracy score is 0.57043813652519


The accuracy score for test data is consistent, which is 57%

In [292]:
#confusion materics

from sklearn.metrics import classification_report,confusion_matrix

confusion1=confusion_matrix(y_predict,y_test)
classifiction1=classification_report(y_predict,y_test)

print('Confusion matrix:\n',confusion1)
print('Classifiction report:\n',classifiction1)

Confusion matrix:
 [[16629   637  4386  6953]
 [  369  3260   142   470]
 [ 2617   349  6483  1756]
 [ 7261   749   822  9860]]
Classifiction report:
                  precision    recall  f1-score   support

       Adoption       0.62      0.58      0.60     28605
     Euthanasia       0.65      0.77      0.71      4241
Return to Owner       0.55      0.58      0.56     11205
       Transfer       0.52      0.53      0.52     18692

       accuracy                           0.58     62743
      macro avg       0.58      0.61      0.60     62743
   weighted avg       0.58      0.58      0.58     62743



# RandomForest

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
#train RandomForest Model
estimators=[10,100,300,500,800,1000]
criterions=['gini','entropy']
grid_param_rf={'n_estimators':estimators,'criterion':criterions,'bootstrap':[True,False]}
rf=RandomForestClassifier()
rf_cv=GridSearchCV(rf,grid_param_rf,cv=5)
rf_cv.fit(X_train,y_train)

print(rf_cv.best_params_,rf_cv.best_score_)

({'n_estimators': 1000, 'bootstrap': True, 'criterion': 'gini'}, 0.5813834874083519)


The parameter with best score is n_estimator=1000,bootstrap=True,criterion='gini'. The optimized score is 58%

In [283]:
#Check the test data accuracy score

#train modelwith best paramter
rf=RandomForestClassifier(n_estimators=1000,criterion='gini',bootstrap=True)
rf.fit(X_train,y_train)

#predict the test data
y_predict=rf.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is: '+str(score))

The accuracy score is: 0.5767177214988126


The test data has similar accuracy, which is 58%

In [291]:
#confusion materics

confusion2=confusion_matrix(y_predict,y_test)
classifiction2=classification_report(y_predict,y_test)

print('Confusion matrix:\n',confusion2)
print('Classifiction report:\n',classifiction2)

Confusion matrix:
 [[16629   637  4386  6953]
 [  369  3260   142   470]
 [ 2617   349  6483  1756]
 [ 7261   749   822  9860]]
Classifiction report:
                  precision    recall  f1-score   support

       Adoption       0.62      0.58      0.60     28605
     Euthanasia       0.65      0.77      0.71      4241
Return to Owner       0.55      0.58      0.56     11205
       Transfer       0.52      0.53      0.52     18692

       accuracy                           0.58     62743
      macro avg       0.58      0.61      0.60     62743
   weighted avg       0.58      0.58      0.58     62743



# Support Vector Machine

In [69]:
from sklearn.svm import SVC

In [31]:
#train SVM model
Cs=[0.01,0.1,1,10]
gammas=[0.001,0.01,0.1]
kernels=['linear', 'poly', 'rbf', 'sigmoid']
param_grid_svc={'C':Cs,'kernel':kernels,'gamma':gammas}
svc=SVC()
svc_cv=GridSearchCV(svc,param_grid_svc,cv=5)
svc_cv.fit(X_train,y_train)

print(svc_cv.best_params_,svc_cv.best_score_)

({'kernel': 'rbf', 'C': 10, 'gamma': 0.1}, 0.5820210392094358)


The parameter for best score is kerner='rbf',C=10,gamma=0.1. The optimized score is 58%.

In [285]:
#Check the test data accuracy score

#train modelwith best paramter
svc=SVC(C=10,kernel='rbf',gamma=0.1)
svc.fit(X_train,y_train)

#predict the test data
y_predict=svc.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is: '+str(score))

The accuracy score is: 0.5774668090464274


The test data has similar accuracy, which is 58%

In [290]:
#confusion materics

confusion3=confusion_matrix(y_predict,y_test)
classifiction3=classification_report(y_predict,y_test)

print('Confusion matrix:\n',confusion3)
print('Classifiction report:\n',classifiction3)

Confusion matrix:
 [[16629   637  4386  6953]
 [  369  3260   142   470]
 [ 2617   349  6483  1756]
 [ 7261   749   822  9860]]
Classifiction report:
                  precision    recall  f1-score   support

       Adoption       0.62      0.58      0.60     28605
     Euthanasia       0.65      0.77      0.71      4241
Return to Owner       0.55      0.58      0.56     11205
       Transfer       0.52      0.53      0.52     18692

       accuracy                           0.58     62743
      macro avg       0.58      0.61      0.60     62743
   weighted avg       0.58      0.58      0.58     62743



# Model Stacking

In [189]:
from sklearn.model_selection import StratifiedKFold

In [208]:
y = combined[['outcome_type']]

In [209]:
# use df.apply() to apply le.fit_transform to all columns
y_2 = le.fit_transform(y)

y_2

  y = column_or_1d(y, warn=True)


array([2, 2, 2, ..., 1, 2, 1])

In [210]:
#reset combined index
combined=combined.reindex(range(78428))

In [211]:
#define a function to make predictions on n-folds of train and test dataset
def Stacking(model,X,y,n_fold,name):
    
    #create empty dataframe for model result
    combined[name]=pd.Series()
    i=combined.columns.get_loc(name)
    
    #split train and test dataset
    nfolds=StratifiedKFold(n_splits=n_fold,random_state=1)
    
    #split train data and test data
    for train_i,test_i in nfolds.split(X,y):
        X1_train,X1_test=X[train_i],X[test_i]
        y1_train,y1_test=y[train_i],y[test_i]

        model.fit(X1_train,y1_train)
        #store predict result to dataframe
        combined.iloc[test_i,i]=model.predict(X1_test)
        
    return combined[[name]]

In [212]:
#knn model feature
knn_predict=Stacking(model=knn,X=X_ohe,y=y_2,n_fold=5,name='knn')
knn_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78428 entries, 0 to 78427
Data columns (total 1 columns):
knn    78428 non-null float64
dtypes: float64(1)
memory usage: 612.8 KB


In [213]:
# Randomforest model feature
rf_predict=Stacking(model=rf,X=X_ohe,y=y_2,n_fold=5,name='rf')
rf_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78428 entries, 0 to 78427
Data columns (total 1 columns):
rf    78428 non-null float64
dtypes: float64(1)
memory usage: 612.8 KB


In [214]:
# SVM model feature
svc_predict=Stacking(model=svc,X=X_ohe,y=y_2,n_fold=5,name='svc')
svc_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78428 entries, 0 to 78427
Data columns (total 1 columns):
svc    78428 non-null float64
dtypes: float64(1)
memory usage: 612.8 KB


In [216]:
#concate the result of all model, use it as a feature

df = pd.concat([knn_predict, rf_predict,svc_predict], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78428 entries, 0 to 78427
Data columns (total 3 columns):
knn    78428 non-null float64
rf     78428 non-null float64
svc    78428 non-null float64
dtypes: float64(3)
memory usage: 1.8 MB


In [179]:
new_X=df[['knn','rf','svc']]
new_y=df.outcome_type

In [248]:
#split train and test data with cross validation
new_X_train,new_X_test,new_y_train,new_y_test = train_test_split(df,y,test_size=0.5,random_state=21,stratify=y)

In [245]:
Cs=[0.01,0.1,1,10]
param_grid_logreg={'C':Cs,'multi_class':['auto'],'solver':['lbfgs']}
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,param_grid_logreg,cv=5)
logreg_cv.fit(new_X_train,new_y_train)
print(logreg_cv.best_params_,logreg_cv.best_score_)

{'C': 0.01, 'multi_class': 'auto', 'solver': 'lbfgs'} 0.469704697301984


In [287]:
#train the second model
model=LogisticRegression(C=0.01,multi_class='auto',solver='lbfgs')
model.fit(new_X_train,new_y_train)
prediction=model.predict(new_X_test)
model_score=accuracy_score(new_y_test,prediction)
print("The accuracy socre of second model is "+str(model_score))

The accuracy socre of second model is 0.46835313918498495


In [289]:
#confusion materics

from sklearn.metrics import classification_report,confusion_matrix

print('Confusion matrix:\n',confusion_matrix(new_y_test,prediction))
print('Classifiction report:\n',classification_report(new_y_test,prediction))

Confusion matrix:
 [[12183     1     0  4613]
 [ 2668     3     0   451]
 [ 6852     1     0   542]
 [ 5716     4     0  6180]]
Classifiction report:
                  precision    recall  f1-score   support

       Adoption       0.44      0.73      0.55     16797
     Euthanasia       0.33      0.00      0.00      3122
Return to Owner       0.00      0.00      0.00      7395
       Transfer       0.52      0.52      0.52     11900

       accuracy                           0.47     39214
      macro avg       0.33      0.31      0.27     39214
   weighted avg       0.38      0.47      0.39     39214

