In [190]:
import pandas as pd # To load in one of the functionality that pandas provides (import pandas as pd)
import numpy as np # To load additional functionality and package that pandas relies on called NumPy (import numpy as np)
import category_encoders as ce
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, train_test_split
import scipy.stats as stats

In [191]:
pTrain = pd.read_csv("../data/pTrain.csv") #To read in the Dataset; note changed xlsx to csv
pTrain.DateTime = pd.to_datetime(pTrain.DateTime) #To make date/time readable

In [192]:
#Let's take a look at the first few rows of data
pTrain.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [139]:
#Let's take a look at the last rows of data
pTrain.tail()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
26724,A702446,,2015-05-14 11:56:00,Transfer,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White
26725,A718934,,2016-01-20 18:59:00,Transfer,Cat,Spayed Female,3 months,Domestic Shorthair Mix,Brown Tabby
26726,A698128,Zeus,2015-03-09 13:33:00,Adoption,Dog,Neutered Male,4 years,Old English Bulldog Mix,White/Tan
26727,A677478,,2014-04-27 12:22:00,Transfer,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Black
26728,A706629,,2015-07-02 09:00:00,Transfer,Cat,Intact Male,1 year,Domestic Shorthair Mix,Brown Tabby/White


In [193]:
#Check out to see what format columns are in
pTrain.dtypes

AnimalID                  object
Name                      object
DateTime          datetime64[ns]
OutcomeType               object
AnimalType                object
SexuponOutcome            object
AgeuponOutcome            object
Breed                     object
Color                     object
dtype: object

In [194]:
#Let's get an idea of how big this data set is
(numRows, numColumns) = pTrain.shape
print numRows
print numColumns

26729
9


In [195]:
#Lets get a sense of how many unique value for each feature
print "Number of unique Animal IDs: " , pTrain.AnimalID.nunique()
print "Number of unique Names: " , pTrain.Name.nunique()
print "Number of unique Outcome Types: " , pTrain.OutcomeType.nunique()
print "Number of unique Animal Types: " , pTrain.AnimalType.nunique()
print "Number of unique Sex Upon Outcomes: " , pTrain.SexuponOutcome.nunique()
print "Number of unique Age Upon Outcomes: " , pTrain.AgeuponOutcome.nunique()
print "Number of unique Breed Upon Outcomes: " , pTrain.Breed.nunique()



Number of unique Animal IDs:  26729
Number of unique Names:  6374
Number of unique Outcome Types:  5
Number of unique Animal Types:  2
Number of unique Sex Upon Outcomes:  5
Number of unique Age Upon Outcomes:  44
Number of unique Breed Upon Outcomes:  1380


In [196]:
# lets see how many cells have a name filled in; 
pTrain.isnull().sum()

AnimalID             0
Name              7691
DateTime             0
OutcomeType          0
AnimalType           0
SexuponOutcome       0
AgeuponOutcome       0
Breed                0
Color                0
dtype: int64

In [197]:
#since 30% of cells don't have names let's get rid of the Name column
del pTrain["Name"]

In [198]:
pTrain["month"] = pTrain.DateTime.dt.month

In [199]:
#Let's First concentrate on Animal types and what exactly they are
pTrain.AnimalType.unique()

array(['Dog', 'Cat'], dtype=object)

In [200]:
#Let's now work on converting our categorical target into a number
# The results shows that there are as many cats and dogs combined as Animal Ids and total rows
pTrain.AnimalType.value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

In [201]:
pTrain["Animal_Type"] = (pTrain.AnimalType=="Cat").astype(int)
pTrain.Animal_Type.value_counts()
#So we have developed a coding where Dog = 0 ~60% and Cat = 1 ~40%

0    15595
1    11134
Name: Animal_Type, dtype: int64

In [148]:
#Lets check that this is working
pTrain.tail()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,month,Animal_Type
26724,A702446,,2015-05-14 11:56:00,Transfer,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White,5,1
26725,A718934,,2016-01-20 18:59:00,Transfer,Cat,Spayed Female,3 months,Domestic Shorthair Mix,Brown Tabby,1,1
26726,A698128,Zeus,2015-03-09 13:33:00,Adoption,Dog,Neutered Male,4 years,Old English Bulldog Mix,White/Tan,3,0
26727,A677478,,2014-04-27 12:22:00,Transfer,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Black,4,1
26728,A706629,,2015-07-02 09:00:00,Transfer,Cat,Intact Male,1 year,Domestic Shorthair Mix,Brown Tabby/White,7,1


In [149]:
#Great! Now lets work on Breed using binary encoding:
#Step 1 Let's encode it
ce.BinaryEncoder(pTrain[["Breed"]])


BinaryEncoder(cols=None, drop_invariant=False, return_df=True,
       verbose=                                       Breed
0                      Shetland Sheepdog Mix
1                     Domestic Shorthair Mix
2                               Pit Bull Mix
3                     Domestic Shorthair Mix
4                Lhasa Apso/Miniature Poodle
5          Cairn Terri...       Domestic Shorthair Mix
26728                 Domestic Shorthair Mix

[26729 rows x 1 columns])

In [150]:
#Step 2 Let's consolidate the answer
binaryenc = ce.BinaryEncoder().fit_transform(pTrain[["Breed"]])


In [129]:
#To print out results
binaryenc

Unnamed: 0,Breed_0,Breed_1,Breed_2,Breed_3,Breed_4,Breed_5,Breed_6,Breed_7,Breed_8,Breed_9,Breed_10
0,0,0,1,1,0,0,0,0,0,1,0
1,0,1,0,1,1,0,0,0,1,0,1
2,0,1,0,1,1,0,0,1,1,0,1
3,0,1,0,1,1,0,0,0,1,0,1
4,0,0,0,0,1,1,0,1,1,1,1
5,0,0,1,0,1,0,0,0,1,0,0
6,0,1,0,1,1,0,0,0,1,0,1
7,0,1,0,1,1,0,0,0,1,0,1
8,0,0,1,0,0,1,1,1,0,0,0
9,0,1,1,1,0,0,0,1,0,1,0


In [202]:
#Great it works! now let's pass in Sex Upon Outcome, Breed, & Color and have it
#be part of the Dataset pTrain.
binaryenc = ce.BinaryEncoder(cols = ["SexuponOutcome","Breed", "Color"]).fit_transform(pTrain)

In [203]:
binaryenc

Unnamed: 0,SexuponOutcome_0,SexuponOutcome_1,SexuponOutcome_2,Breed_0,Breed_1,Breed_2,Breed_3,Breed_4,Breed_5,Breed_6,...,Color_6,Color_7,Color_8,AnimalID,DateTime,OutcomeType,AnimalType,AgeuponOutcome,month,Animal_Type
0,0,0,1,0,1,1,0,1,1,1,...,1,0,0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,1 year,2,0
1,0,0,0,1,0,1,0,1,0,1,...,0,0,1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,1 year,10,1
2,0,0,1,0,1,0,0,0,0,0,...,0,1,1,A686464,2015-01-31 12:28:00,Adoption,Dog,2 years,1,0
3,1,0,0,1,0,1,0,1,0,1,...,0,0,1,A683430,2014-07-11 19:09:00,Transfer,Cat,3 weeks,7,1
4,0,0,1,0,1,0,1,0,0,0,...,1,0,1,A667013,2013-11-15 12:52:00,Transfer,Dog,2 years,11,0
5,0,1,1,0,1,1,1,1,1,1,...,1,1,1,A677334,2014-04-25 13:04:00,Transfer,Dog,1 month,4,0
6,1,0,0,1,0,1,0,1,0,1,...,0,1,0,A699218,2015-03-28 13:11:00,Transfer,Cat,3 weeks,3,1
7,0,1,0,1,0,1,0,1,0,1,...,1,1,1,A701489,2015-04-30 17:02:00,Transfer,Cat,3 weeks,4,1
8,0,0,0,1,0,1,0,1,0,1,...,0,1,1,A671784,2014-02-04 17:17:00,Adoption,Dog,5 months,2,0
9,0,0,0,0,0,0,1,1,1,1,...,1,0,0,A677747,2014-05-03 07:48:00,Adoption,Dog,1 year,5,0


In [204]:
#Let's first concentrate on Outcome types and what exactly they are
pTrain.OutcomeType.unique()

array(['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'], dtype=object)

In [205]:
le = LabelEncoder()

In [206]:
#Let's get an array of numbers for Outcome type
le.fit_transform(pTrain.OutcomeType)


array([3, 2, 0, ..., 0, 4, 4])

In [207]:
#Let's make a new column
pTrain["newOutCome"]=le.fit_transform(pTrain.OutcomeType)

In [208]:
pTrain.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,month,Animal_Type,newOutCome
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,2,0,3
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,10,1,2
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,1,0,0
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,7,1,4
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,11,0,4


In [209]:
#binaryenc.corr()

In [162]:
pTrain.newOutCome.value_counts()
#Key 0 = Adoption 40%; 1= Died <1%; 2 = Euthanasia 6%; 3 = Return to Owner 18%; 4= Transfer 35%

0    10769
4     9422
3     4786
2     1555
1      197
Name: newOutCome, dtype: int64

In [163]:
logreg = LogisticRegression(C=1e9)
feature_cols = ['Animal_Type',"converted_age", "month"]
X = pTrain[feature_cols]
y = pTrain.newOutCome
logreg.fit(X, y)
outcome_pred_class_log = logreg.predict(X)

In [164]:
from sklearn import metrics

In [185]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.3)
y_test_pred = logreg.predict(X_test)
print "Test set accuracy of LR model: ", metrics.accuracy_score(y_test, y_test_pred)

 Test set accuracy of LR model:  0.481980296795


In [210]:
#compute null accuracy manually
#print "null accuracy on test set: ", y_test_pred.mean()

In [211]:
print "Classification Report: \n" , metrics.classification_report(y_test,y_test_pred)

Classification Report: 
             precision    recall  f1-score   support

          0       0.46      0.53      0.49      3202
          1       0.00      0.00      0.00        57
          2       0.32      0.02      0.04       470
          3       0.46      0.31      0.37      1419
          4       0.51      0.59      0.55      2871

avg / total       0.47      0.48      0.46      8019



In [226]:
#lr probabilities per category for first five samples
#predicted_probs_logreg = logreg.predict_proba(X_test).round(3)
#predictions_logreg = logreg.predict(X_test)

#print "Logistic Regression predicted probabilities for first five samples in test set:\n",predicted_probs_logreg[:5]
#print "Logistic Regression predictions for first five samples in test set:\n",predictions_logreg[:5]
y_test_logreg_df = pd.DataFrame(
    np.concatenate((
        predicted_probs_logreg,predictions_logreg.reshape((predictions_logreg.shape[0],-1)),
        y_test.reshape((y_test.shape[0],-1))),axis=1
    ),
    columns = ["class_0","class_1","class_2", "class_3","class_4","predicted","actual"])

y_test_logreg_df.head()

Unnamed: 0,class_0,class_1,class_2,class_3,class_4,predicted,actual
0,0.401,0.013,0.051,0.033,0.502,4.0,0.0
1,0.369,0.014,0.067,0.047,0.503,4.0,4.0
2,0.485,0.004,0.032,0.18,0.299,0.0,4.0
3,0.459,0.004,0.036,0.201,0.3,0.0,0.0
4,0.396,0.003,0.059,0.317,0.226,0.0,4.0


In [222]:
data = np.concatenate((
        predicted_probs_logreg,predictions_logreg.reshape((predictions_logreg.shape[0],-1)),
        y_test.reshape((y_test.shape[0],-1))),axis=1
    )

In [225]:
pd.DataFrame(data,columns=)

Unnamed: 0,0,1,2,3,4,5,6
0,0.401,0.013,0.051,0.033,0.502,4.0,0.0
1,0.369,0.014,0.067,0.047,0.503,4.0,4.0
2,0.485,0.004,0.032,0.180,0.299,0.0,4.0
3,0.459,0.004,0.036,0.201,0.300,0.0,0.0
4,0.396,0.003,0.059,0.317,0.226,0.0,4.0
5,0.394,0.015,0.048,0.031,0.513,4.0,3.0
6,0.188,0.002,0.127,0.550,0.133,3.0,3.0
7,0.416,0.013,0.047,0.029,0.496,4.0,0.0
8,0.447,0.003,0.044,0.241,0.265,0.0,2.0
9,0.497,0.004,0.032,0.178,0.290,0.0,4.0


In [216]:
#predicted_probs_logreg.shape
#predictions_logreg.reshape((predictions_logreg.shape[0],-1)).shape
y_test.reshape((y_test.shape[0],-1)).shape

(8019, 1)

In [166]:
y_test.value_counts()

0    3186
4    2853
3    1460
2     467
1      53
Name: newOutCome, dtype: int64

In [92]:
logreg.fit(X_train, y_train)
metrics.confusion_matrix(y_test,logreg.predict(X_test))

array([[1708,    0,   13,  211, 1272],
       [   8,    0,    1,    4,   33],
       [ 186,    0,   14,   87,  209],
       [ 860,    0,   12,  420,  152],
       [ 980,    0,    9,  160, 1680]])

In [167]:
np.mean(cross_val_score(logreg,X,y,cv=10,scoring="accuracy"))

0.47461778644854091

In [175]:
feature_cols = ['Animal_Type',"converted_age", "month"]
X_2 = pTrain[feature_cols]
y_2 = pTrain.newOutCome
X_train,X_test,y_train,y_test = train_test_split(X_2,y_2,test_size=0.3)

In [181]:
#Random forest of 500 "trees" or estimators
rf = RandomForestRegressor(n_estimators=500, bootstrap=True, oob_score=True, random_state=12)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

#print "Random Forest RMSE:",np.sqrt(mean_squared_error(y_train,y_pred_rf))
print "Random Forest RMSE:",np.sqrt(mean_squared_error(y_test,y_pred_rf))

Random Forest RMSE: 1.62290243505


In [158]:
#changing AgeUponOutcome column to weeks Step 1 parse at the "string numbers
pTrain["age_new"] = pTrain.AgeuponOutcome.str.split(" ")

In [159]:
pTrain.AgeuponOutcome.value_counts()

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
1 weeks       269
12 years      234
9 months      224
11 months     166
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
1 week         66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [160]:
def parse_age_outcome(x):
    if x[1] == "days":
        return int(x[0])
    elif x[1] == "weeks":
        return int(x[0])*7
    elif x[1] == "year" or x[1] == "years":
        return int(x[0]) * 365
    elif x[1] == "month" or x[1] == "months":
        return int(x[0]) * 30
    else:
        return int(x[0])

In [161]:
pTrain["converted_age"] = pTrain.age_new.map(lambda x: parse_age_outcome(x))

In [79]:
pTrain.converted_age.value_counts()

365     3969
730     3742
60      3397
1095    1823
30      1281
90      1277
1460    1071
1825     992
120      888
2190     670
21       659
150      652
180      588
2920     536
2555     531
14       529
300      457
3650     446
240      402
28       334
3285     288
210      288
7        269
4380     234
270      224
330      166
4745     143
1        132
4015     126
3        109
2         99
5110      97
5475      85
6         50
4         50
5840      36
5         24
0         22
6205      17
35        11
6570      10
6935       3
7300       2
Name: converted_age, dtype: int64

In [97]:
monthGroups = pTrain.groupby("month")
monthGroupsSizes = monthGroups.size()
print "The month with the most activity is:"
print monthGroupsSizes[monthGroupsSizes==monthGroupsSizes.max()]

The month with the most activity is:
month
10    2881
dtype: int64


In [98]:
monthGroups = pTrain.groupby("month")
monthGroupsSizes = monthGroups.size()
print "The month with the least activity is:"
print monthGroupsSizes[monthGroupsSizes==monthGroupsSizes.min()]

The month with the most activity is:
month
3    1498
dtype: int64
