In [100]:
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn import cross_validation
import cPickle

In [114]:
data = pd.read_csv('predict_data.csv', index_col=0)

In [102]:
# X and y split

In [116]:
X = data[data.columns[data.columns != 'outcometype']]
y = data['outcometype']

In [117]:
# Deal with Unbalanced Classes, you can try more in 

In [118]:
st = SMOTETomek()
X, y = st.fit_sample(X,y)

In [119]:
# Cross Validation

In [120]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = .3, random_state = 0)

# Prediction

### Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegressionCV

log_clf = LogisticRegressionCV()
log_clf.fit(X_train, y_train)
log_pred = log_clf.predict(X_test)

In [45]:
from sklearn.metrics import classification_report
print classification_report(y_test, log_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

             precision    recall  f1-score   support

   Adoption       0.57      0.89      0.69      3029
   Transfer       0.54      0.14      0.23      2701
Return_to_owner       0.49      0.04      0.07      1343
 Euthanasia       0.52      0.58      0.55      3186
       Died       0.54      0.73      0.62      3266

avg / total       0.54      0.54      0.49     13525



In [46]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, log_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

array([[2684,   29,   27,  193,   96],
       [ 723,  387,   12,  529, 1050],
       [ 766,   31,   53,  426,   67],
       [ 349,  171,   10, 1849,  807],
       [ 189,  101,    7,  582, 2387]])

### Nearest Neighbors

In [75]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_test)

In [76]:
from sklearn.metrics import classification_report
print classification_report(y_test, knn_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

             precision    recall  f1-score   support

   Adoption       0.63      0.74      0.68      3182
   Transfer       0.76      0.51      0.62      2772
Return_to_owner       0.40      0.37      0.38      1382
 Euthanasia       0.26      0.08      0.13       472
       Died       0.81      0.99      0.89      3265

avg / total       0.67      0.68      0.67     11073



### Random Forest Classifier

In [121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
rf = RandomForestClassifier()

parameters = {'n_estimators':range(5,20,5), 'criterion':['gini', 'entropy'], 'bootstrap': [True, False]}
rf_clf = GridSearchCV(rf, parameters)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)

In [122]:
from sklearn.metrics import classification_report
print classification_report(y_test, rf_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

             precision    recall  f1-score   support

   Adoption       0.67      0.79      0.73      3200
   Transfer       0.72      0.70      0.71      2812
Return_to_owner       0.45      0.39      0.42      1391
 Euthanasia       0.44      0.14      0.21       445
       Died       1.00      0.99      0.99      3225

avg / total       0.74      0.75      0.74     11073



In [123]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, rf_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])


array([[2534,  290,  371,    5,    0],
       [ 554, 1977,  225,   47,    9],
       [ 601,  217,  548,   24,    1],
       [  80,  228,   73,   61,    3],
       [   4,   35,    5,    2, 3179]])

### Gradient Boosting Classifier

In [84]:
from sklearn.ensemble import GradientBoostingClassifier


gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

In [86]:
from sklearn.metrics import classification_report
print classification_report(y_test, gb_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

             precision    recall  f1-score   support

   Adoption       0.70      0.82      0.76      3156
   Transfer       0.75      0.71      0.73      2814
Return_to_owner       0.49      0.48      0.48      1404
 Euthanasia       0.58      0.21      0.31       464
       Died       0.99      0.98      0.99      3237

avg / total       0.77      0.77      0.76     11075



In [92]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, gb_pred, labels = ['Adoption','Transfer', 'Return_to_owner','Euthanasia', 'Died' ])

array([[2590,  230,  327,    8,    1],
       [ 520, 1986,  259,   38,   11],
       [ 540,  168,  671,   25,    0],
       [  39,  214,  109,   98,    4],
       [   8,   42,    6,    0, 3181]])

## I Will Choose Random Forrest Classifier

In [124]:
with open('rforest_model.pkl', 'wb') as fid:
    cPickle.dump(rf_clf, fid) 