In [16]:
import timeit
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Both raw CSV exports live in the same project folder. Keeping that folder in
# one constant means only this line has to change when the notebook is run
# from another machine.
DATA_DIR = Path(r'C:\Users\1\Desktop\school\CS-519 Machine Learning\Project')

# header=0: the first row of each file holds the column names.
intake = pd.read_csv(DATA_DIR / 'Austin_Animal_Center_Intakes(2).csv', header=0)
outcome = pd.read_csv(DATA_DIR / 'Austin_Animal_Center_Outcomes(2).csv', header=0)

In [18]:
# Columns excluded from the analysis:
#   - Name:           mostly blank
#   - MonthYear:      duplicates the information in DateTime
#   - Found Location: street names, almost all unique values
intake = intake.drop(columns=['Name', 'MonthYear', 'Found Location'])

In [19]:
# Remove the instances that are not Dog or Cat.
# A single vectorised mask replaces the per-category in-place drops; rows whose
# 'Animal Type' is missing are kept, exactly as before.
animals = ['Other', 'Bird', 'Livestock']
# .copy() gives an independent frame so later column assignments do not
# operate on a slice of the raw data.
intake = intake[~intake['Animal Type'].isin(animals)].copy()

In [20]:
# Parse the DateTime strings into real datetime values.
intake = intake.assign(DateTime=pd.to_datetime(intake['DateTime']))

In [21]:
# Order chronologically, then keep only the last (most recent) intake row
# for each Animal ID.
intake = (
    intake
    .sort_values(by=['DateTime'])
    .drop_duplicates(subset=['Animal ID'], keep='last')
)

In [22]:
# Apply the same cleaning to the outcome dataset:
#   - drop the columns that also appear in the intake set, plus
#     'Outcome Subtype', which is mostly blank
#   - parse DateTime
#   - keep only the most recent outcome row per Animal ID
outcome = (
    outcome
    .drop(columns=['Name', 'MonthYear', 'Animal Type', 'Breed', 'Color', 'Outcome Subtype'])
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .sort_values(by=['DateTime'])
    .drop_duplicates(subset=['Animal ID'], keep='last')
)

In [23]:
# Join intake and outcome records on Animal ID. A right join keeps every
# outcome row, so animals with an intake but no outcome (still in the
# shelter) are left out.
data = intake.merge(outcome, how='right', on='Animal ID')

In [24]:
# After the merge, DateTime_x is the intake timestamp and DateTime_y the
# outcome timestamp; their difference is the length of stay. The raw
# timestamps and the ID are not needed afterwards.
data = (
    data
    .assign(**{'Time in Shelter': data['DateTime_y'] - data['DateTime_x']})
    .drop(columns=['DateTime_x', 'DateTime_y', 'Animal ID'])
)
data.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Date of Birth,Outcome Type,Sex upon Outcome,Age upon Outcome,Time in Shelter
0,Stray,Normal,Dog,Spayed Female,7 years,Border Terrier/Border Collie,White/Tan,09/07/2006,Return to Owner,Spayed Female,7 years,07:48:00
1,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,09/24/2013,Transfer,Unknown,1 week,02:06:00
2,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,09/24/2013,Transfer,Unknown,1 week,02:11:00
3,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,09/24/2013,Transfer,Unknown,1 week,02:11:00
4,Stray,Injured,Dog,Intact Female,3 years,Pit Bull Mix,Blue/White,09/30/2010,Euthanasia,Intact Female,3 years,06:40:00


In [25]:
# 'Age upon Intake' and 'Date of Birth' carry no extra information once we
# have 'Age upon Outcome' and 'Time in Shelter', so drop them.
data = data.drop(columns=['Age upon Intake', 'Date of Birth'])

In [26]:
# Preview the first rows to check which columns remain.
data.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Outcome Type,Sex upon Outcome,Age upon Outcome,Time in Shelter
0,Stray,Normal,Dog,Spayed Female,Border Terrier/Border Collie,White/Tan,Return to Owner,Spayed Female,7 years,07:48:00
1,Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Transfer,Unknown,1 week,02:06:00
2,Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Transfer,Unknown,1 week,02:11:00
3,Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Transfer,Unknown,1 week,02:11:00
4,Stray,Injured,Dog,Intact Female,Pit Bull Mix,Blue/White,Euthanasia,Intact Female,3 years,06:40:00


In [27]:
# Ages are recorded in mixed units ("7 years", "1 week", ...); convert them
# all to the smallest unit, days.
import re
def timestr_to_days(s):
    """Convert an age string such as ``"7 years"`` or ``"1 week"`` to days.

    The value is coerced with ``str`` and searched for the first
    ``"<count> <unit>"`` pair. A unit containing "year", "month" or "week" is
    scaled by 365, 365 / 12 or 7 respectively; any other unit (including
    "day") is counted as days.

    Parameters
    ----------
    s : object
        Age description. Anything whose string form has no
        ``"<count> <unit>"`` pair (e.g. a missing value, which becomes
        ``"nan"``) is treated as unparseable.

    Returns
    -------
    int, float or None
        Age in days (a float for month-based ages), or ``None`` when no
        ``"<count> <unit>"`` pair is found.

    Raises
    ------
    ValueError
        If the count token is not an integer literal.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    match = re.search(r'([\w.-]+) ([\w.-]+)', str(s))
    if match is None:
        return None

    count, unit = match.groups()

    # Checked in this order; the first keyword found in the unit wins.
    days_per_unit = 1
    for keyword, factor in (("year", 365), ("month", 365 / 12), ("week", 7)):
        if keyword in unit:
            days_per_unit = factor
            break

    return int(count) * days_per_unit

In [28]:
# Add the animal's age in days as a new column, then drop the original
# free-text 'Age upon Outcome' column.
data['Age (Days)'] = list(map(timestr_to_days, data['Age upon Outcome']))
data = data.drop(columns=['Age upon Outcome'])

In [49]:
# For now, discard every row that has a missing value in any column
# (~8000 instances). Filling them instead can be discussed later.
data = data.dropna(axis=0, how='any')

In [50]:
# A label encoder will turn the categorical columns into numeric codes.
# Here we create it and split the frame into attributes X and target y.
# NOTE(review): y is the timedelta computed from the two DateTime columns;
# presumably it must be converted to a numeric unit before fitting a
# regressor - confirm in the modelling cells.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = data['Time in Shelter']
X = data.drop(columns=['Time in Shelter'])


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Outcome Type,Sex upon Outcome,Age (Days)
0,Stray,Normal,Dog,Spayed Female,Border Terrier/Border Collie,White/Tan,Return to Owner,Spayed Female,2555.0
1,Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Transfer,Unknown,7.0


In [55]:
# Categorical columns to convert to integer codes.
cols = ['Intake Type','Intake Condition', 'Animal Type', 'Sex upon Intake','Breed','Color','Outcome Type','Sex upon Outcome']

# The single encoder `le` is refit for every column, so after the loop it only
# remembers the last column's categories. Keep each column's fitted classes so
# codes can still be mapped back: label_classes[column][code] -> original label.
label_classes = {}
for column in cols:
    X[column] = le.fit_transform(X[column])
    label_classes[column] = le.classes_


In [56]:
# Preview the first rows of the encoded feature frame.
X.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Outcome Type,Sex upon Outcome,Age (Days)
0,4,5,1,3,502,546,6,3,2555.0
1,4,5,0,4,1104,358,8,4,7.0
2,4,5,0,4,1104,358,8,4,7.0
3,4,5,0,4,1104,358,8,4,7.0
4,4,3,1,0,1837,122,3,0,1095.0
