# Austin_Animal_Center_Machine_Learning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [2]:
combined = pd.read_csv('aac_intakes_outcomes.csv')

In [3]:
#combined.info()

In [4]:
#select columns
combined = combined[['outcome_type','animal_type','intake_condition','intake_type','age_upon_intake_age_group']]

In [5]:
#Drop missing value
combined = combined.dropna()

In [6]:
combined.outcome_type.value_counts()

Adoption           33594
Transfer           23799
Return to Owner    14791
Euthanasia          6244
Died                 690
Disposal             304
Rto-Adopt            179
Missing               46
Relocate              15
Name: outcome_type, dtype: int64

In [7]:
#drop the rows with rare situations
combined = combined.loc[(combined.outcome_type == 'Adoption') | (combined.outcome_type == 'Transfer') | (combined.outcome_type == 'Return to Owner') | (combined.outcome_type == 'Euthanasia')]

In [8]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78428 entries, 0 to 79671
Data columns (total 5 columns):
outcome_type                 78428 non-null object
animal_type                  78428 non-null object
intake_condition             78428 non-null object
intake_type                  78428 non-null object
age_upon_intake_age_group    78428 non-null object
dtypes: object(5)
memory usage: 3.6+ MB


In [9]:
combined.outcome_type.value_counts()

Adoption           33594
Transfer           23799
Return to Owner    14791
Euthanasia          6244
Name: outcome_type, dtype: int64

# Model to predict the outcome of animals

In [104]:
#sample features and labels
X = combined[['animal_type','intake_condition','intake_type','age_upon_intake_age_group']]
y = combined.outcome_type

In [105]:
#get dummy variable
from sklearn import preprocessing

#encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()

# use df.apply() to apply le.fit_transform to all columns
X_2 = X.apply(le.fit_transform)
X_2.head()

Unnamed: 0,animal_type,intake_condition,intake_type,age_upon_intake_age_group
0,2,3,3,9
1,2,3,2,8
2,2,3,2,8
3,2,3,1,9
4,2,2,2,3


In [106]:
#INSTANTIATE
enc = preprocessing.OneHotEncoder(categories='auto')

#Fit
enc.fit(X_2)

#Transform
X_ohe = enc.transform(X_2).toarray()
X_ohe

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

# K-Nearest Neighbors 

In [13]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV


In [107]:
#split train and test data with cross validation
X_train,X_test,y_train,y_test = train_test_split(X_ohe,y,test_size=0.8,random_state=21,stratify=y)

In [16]:
#train KNeighbors model
param_grid={'n_neighbors':np.arange(1,30)}
knn=KNeighborsClassifier()
knn_cv=GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X_train,y_train)
print(knn_cv.best_params_,knn_cv.best_score_)

({'n_neighbors': 27}, 0.5691424928275423)


K=27 has the best accurate rate which is 57%.

In [15]:
#Use test group to check accuracy of the model

from sklearn.metrics import accuracy_score

#Train model with best parameter
knn=KNeighborsClassifier(n_neighbors=27)
knn.fit(X_train,y_train)

#predict test data
y_predict=knn.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is ',score)

The accuracy score is  0.57043813652519


The accuracy score for test data is consistent, which is 57%

# RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
#train RandomForest Model
estimators=[10,100,300,500,800,1000]
criterions=['gini','entropy']
grid_param_rf={'n_estimators':estimators,'criterion':criterions,'bootstrap':[True,False]}
rf=RandomForestClassifier()
rf_cv=GridSearchCV(rf,grid_param_rf,cv=5)
rf_cv.fit(X_train,y_train)

print(rf_cv.best_params_,rf_cv.best_score_)

({'n_estimators': 1000, 'bootstrap': True, 'criterion': 'gini'}, 0.5813834874083519)


The parameter with best score is n_estimator=1000,bootstrap=True,criterion='gini'. The optimized score is 58%

In [17]:
#Check the test data accuracy score

#train modelwith best paramter
rf=RandomForestClassifier(n_estimators=1000,criterion='gini',bootstrap=True)
rf.fit(X_train,y_train)

#predict the test data
y_predict=rf.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is: '+str(score))

The accuracy score is: 0.5765105270707489


The test data has similar accuracy, which is 58%

# Support Vector Machine

In [18]:
from sklearn.svm import SVC

In [31]:
#train SVM model
Cs=[0.01,0.1,1,10]
gammas=[0.001,0.01,0.1]
kernels=['linear', 'poly', 'rbf', 'sigmoid']
param_grid_svc={'C':Cs,'kernel':kernels,'gamma':gammas}
svc=SVC()
svc_cv=GridSearchCV(svc,param_grid_svc,cv=5)
svc_cv.fit(X_train,y_train)

print(svc_cv.best_params_,svc_cv.best_score_)

({'kernel': 'rbf', 'C': 10, 'gamma': 0.1}, 0.5820210392094358)


The parameter for best score is kerner='rbf',C=10,gamma=0.1. The optimized score is 58%.

In [19]:
#Check the test data accuracy score

#train modelwith best paramter
svc=SVC(C=10,kernel='rbf',gamma=0.1)
svc.fit(X_train,y_train)

#predict the test data
y_predict=svc.predict(X_test)

score = accuracy_score(y_predict,y_test)
print('The accuracy score is: '+str(score))

The accuracy score is: 0.5774668090464274


The test data has similar accuracy, which is 58%

# Model Stacking

In [108]:
from sklearn.model_selection import StratifiedKFold

In [109]:
#define a function to make predictions on n-folds of train and test dataset
def Stacking(model,X,y,n_fold,name):
    
    #split train and test dataset
    nfolds=StratifiedKFold(n_splits=n_fold,random_state=1)
    
    #create empty dataframe for model result
    test_pred=pd.DataFrame(columns=[name])
    
    #split train data and test data
    for train_i,test_i in nfolds.split(X,y):
        X_train,X_test=X[train_i],X[test_i]
        y_train,y_test=y[train_i],y[test_i]

        model.fit(X_train,y_train)
        #store predict result to list
        test_pred[test_i][name]=model.predict(X_test)
        
        return test_pred

In [110]:
#knn model freature

test_pred_knn=Stacking(model=knn,X=X_train,y=y_train,n_fold=5,name='knn')
test_pred_knn=pd.DataFrame(data={'knn':test_pred_knn})


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


ValueError: Input contains NaN

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
param_grid_logreg={'C':Cs}
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,param_grid_logreg,cv=5)
logreg_cv.fit(X_train,y_train)

In [None]:
print(logreg_cv.best_params,logreg_cv.best_score_)

In [18]:
#confusion materics

from sklearn.metrics import classification_report,confusion_matrix

print('Confusion matrix: ',confusion_matrix(y_test,y_predict))
print('Classifiction report: ',classification_report(y_test,y_predict))

('Confusion matrix: ', array([[16490,   409,  2662,  7315],
       [  842,  3135,   392,   626],
       [ 4393,   184,  6437,   819],
       [ 7052,   454,  1804,  9729]], dtype=int64))
('Classifiction report: ', u'                 precision    recall  f1-score   support\n\n       Adoption       0.57      0.61      0.59     26876\n     Euthanasia       0.75      0.63      0.68      4995\nReturn to Owner       0.57      0.54      0.56     11833\n       Transfer       0.53      0.51      0.52     19039\n\n      micro avg       0.57      0.57      0.57     62743\n      macro avg       0.60      0.57      0.59     62743\n   weighted avg       0.57      0.57      0.57     62743\n')
