This notebook deals with cleaning the data.

In [123]:
import pandas as pd
import numpy as np
import datetime
from bs4 import BeautifulSoup as bf
import requests
import pickle

%matplotlib inline

In [92]:
# Load the raw training data.
prelim_data = pd.read_csv('animal_train.csv')

# Lowercase every column label so the columns are easier to access.
prelim_data.columns = prelim_data.columns.str.lower()

In [93]:
# Get an idea of the data: display summary statistics for the raw frame
prelim_data.describe()

Unnamed: 0,animalid,name,datetime,outcometype,outcomesubtype,animaltype,sexuponoutcome,ageuponoutcome,breed,color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A705677,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


In [94]:
# Delete the only row of data that is missing sexuponoutcome.
# .copy() makes `data` an independent frame, so the column assignments in the
# following cells do not trigger SettingWithCopyWarning.
data = prelim_data[prelim_data.sexuponoutcome.notnull()].copy()

### Age

In [95]:
# Convert the age column into a number of days
from functions.age_to_days import to_days

# Convert only the ages that are present
known_ages = data.ageuponoutcome[data.ageuponoutcome.notnull()]
data_age_list = to_days(known_ages)

# Impute the missing ages with the mean age
data_age_mean = np.mean(data_age_list)
# NOTE(review): the fill value is a mean computed from to_days output, yet the
# filled column is passed through to_days again — confirm that to_days accepts
# such a numeric value alongside the original age strings.
ages_filled = data.ageuponoutcome.fillna(data_age_mean)
data['age_in_days'] = to_days(ages_filled)

### Date

In [96]:
# Turn the date strings into timestamps.
# pd.to_datetime is vectorised and accepts values that are already datetimes,
# so re-running this cell does not fail the way a per-row strptime() call does
# once the column has been converted.
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')

In [97]:
# Separate the timestamp into year, month, day, weekday and hour.
# The .dt accessor returns a full Series for each part on both Python 2 and
# Python 3; map() would hand pandas a lazy iterator on Python 3, which cannot
# be assigned as a column.
timestamps = data['datetime']
data['year'] = timestamps.dt.strftime('%Y')
data['month'] = timestamps.dt.strftime('%b')    # abbreviated month name
data['day'] = timestamps.dt.strftime('%d')
data['weekday'] = timestamps.dt.strftime('%A')  # full weekday name
data['hour'] = timestamps.dt.hour

In [98]:
# Sterilisation status: the first word of sexuponoutcome
# ('Unknown' is a single word, so it is kept unchanged).
sterile_clean = [sex_label.split(' ')[0] for sex_label in data.sexuponoutcome]

# Collapse the two sterilised labels into a single 'Sterile' category.
data['sterile'] = [
    'Sterile' if first_word in ('Neutered', 'Spayed') else first_word
    for first_word in sterile_clean
]

## How To Deal With Color

In [99]:
def return_classes(color, color_order):
    """Return one '/'-separated component of a label.

    Parameters
    ----------
    color : str
        Label such as 'Black/White'. Despite the name, the notebook also
        passes breed labels through this function.
    color_order : int
        Zero-based position of the component to return.

    Returns
    -------
    str
        The requested component, or the string 'None' when the label has no
        component at that position or is not a string (e.g. NaN or None).
    """
    try:
        return color.split('/')[color_order]
    # Only the two expected failures are handled: a non-string label has no
    # .split (AttributeError) and a missing component is an IndexError.
    # Anything else (KeyboardInterrupt, genuine bugs) is allowed to surface.
    except (AttributeError, IndexError):
        return 'None'

In [100]:
# Split every dog's colour label into its first and second component.
dog_colors = data.loc[data.animaltype == 'Dog', 'color']
first_color_dog = [return_classes(label, 0) for label in dog_colors]
second_color_dog = [return_classes(label, 1) for label in dog_colors]

In [101]:
# Reduce each dog colour component to its first word, the same way the
# complete data set is reduced. Splitting the first component on '/' again
# would be a no-op, because it no longer contains a '/'.
reduced_first_color = [color.split()[0] for color in first_color_dog]
reduced_second_color = [color.split()[0] for color in second_color_dog]

In [102]:
# Split every animal's colour label into its first and second component
# ('None' when there is no second component).
first_color_complete = [return_classes(color, 0) for color in data.color]
second_color_complete = [return_classes(color, 1) for color in data.color]

# Keep only the first word of each component. List comprehensions are used
# instead of map() so that the results are real lists on Python 3 as well,
# where map() returns a one-shot iterator.
reduced_first_color_complete = [color.split()[0] for color in first_color_complete]
reduced_second_color_complete = [color.split()[0] for color in second_color_complete]

In [103]:
# Split every cat's colour label into its first and second component.
cat_colors = data.loc[data.animaltype == 'Cat', 'color']
first_color_cat = [return_classes(label, 0) for label in cat_colors]
second_color_cat = [return_classes(label, 1) for label in cat_colors]

In [104]:
# Side-by-side view of the cats' original colour label and its two components.
cat_color_labels = data.loc[data.animaltype == 'Cat', 'color']
df_cat = pd.DataFrame({
    'first_color_cat': first_color_cat,
    'second_color_cat': second_color_cat,
    'complete': cat_color_labels,
})

In [105]:
# Side-by-side view of every animal's original colour label and its two components.
complete_color_columns = {
    'first_color_complete': first_color_complete,
    'second_color_complete': second_color_complete,
    'all_complete': data.color,
}
df_complete = pd.DataFrame(complete_color_columns)

In [106]:
# Store the reduced colours on the frame. list() materialises the values first,
# so the assignment also works when they arrive as a lazy iterator (what map()
# returns on Python 3) rather than a list.
data['first_color'] = list(reduced_first_color_complete)
data['second_color'] = list(reduced_second_color_complete)

## Breeds

In [107]:
# Number of distinct breed labels
data.breed.nunique()

1380

There are a lot of breeds, so I reduced the dog breeds to their kennel club classes, and for cats I kept only the
top breeds.

I scraped the table from the American Kennel Club's Wikipedia page and added breeds that weren't there.

In [121]:
url = 'https://en.wikipedia.org/wiki/List_of_dog_breeds_recognized_by_the_American_Kennel_Club'

# Fail loudly on a hung or failed download instead of silently parsing an
# error page into an empty mapping.
response = requests.get(url, timeout=30)
response.raise_for_status()
breed_url = bf(response.text, 'lxml')

# Maps the link text of each list item to the second ', '-separated piece of
# the text that follows the link (presumably breed name -> AKC group — TODO
# confirm against the page layout).
breed_dict = {}
# NOTE(review): the first 26 <li> elements are skipped — presumably page
# navigation rather than breeds; this offset depends on the page layout.
for line in breed_url.find_all('li')[26:]:
    try:
        breed_dict[line.a.text] = line.a.next_sibling.split(', ')[1]
    except (AttributeError, IndexError, TypeError):
        # No link in this <li>, nothing usable after the link, or trailing
        # text without a ', ' separator: skip the item.
        continue

# Cats: the breeds that are kept, each mapped to its own label.
# NOTE(review): 'Domestic ShortHair' is capitalised differently from the other
# cat labels; it is left as is because this value is used for the
# reduced-breed category downstream.
breed_dict.update({
    'Domestic Shorthair': 'Domestic ShortHair',
    'Domestic Medium Hair': 'Domestic Medium Hair',
    'Domestic Longhair': 'Domestic Longhair',
    'Siamese': 'Siamese',
    'Snowshoe': 'Snowshoe',
})

# Dogs: manual entries that add to (or override) the scraped mapping.
breed_dict.update({
    'Shepherd': 'Herding',
    'Miniature Poodle': 'Toy',
    'Parson Russell Terrier': 'Terrier',
    'Chihuahua Shorthair': 'Toy',
    'Jack Russell Terrier': 'Terrier',
    'Chihuahua Longhair': 'Toy',
    'Anatol Shepherd': 'Working',
    'Pit Bull': 'Pit Bull',
    'American Bulldog': 'Non-Sporting',
    'Hound': 'Hound',
    'Pointer': 'Sporting',
    'Terrier': 'Terrier',
    'Bulldog': 'Non-Sporting',
    'Poodle': 'Toy',
    'other': 'other',
})

In [125]:
# Save the breed mapping to disk as a pickle file.
with open('breed_dict.pickle', 'wb') as pickle_file:
    pickle.dump(breed_dict, pickle_file)

<b>Then I made a dictionary so that we can iterate through the breeds and get back
the new condensed breed for each one.</b>

We then split each breed label into its separate breeds.

In [110]:
# Split each breed label on '/' into a first and a second breed
# ('None' when there is no second breed).
first_breed_complete = [return_classes(breed_label, 0) for breed_label in data.breed]
second_breed_complete = [return_classes(breed_label, 1) for breed_label in data.breed]

We then transform the breeds.

In [111]:
# Remove the 'Mix' marker (and surrounding whitespace) from each first breed.
without_mix_first_breed_complete = [
    breed_label.replace('Mix', '').strip()
    for breed_label in first_breed_complete
]

# Look each cleaned breed up in breed_dict.
# NOTE(review): in_dict is not defined anywhere in the visible part of this
# notebook — confirm where it comes from before running on a fresh kernel.
reduced_first_breeds = [
    in_dict(breed_name, breed_dict)
    for breed_name in without_mix_first_breed_complete
]

In [112]:
# Attach the raw breed components and the condensed first breed to the frame.
breed_columns = (
    ('first_breed', first_breed_complete),
    ('second_breed', second_breed_complete),
    ('reduced_first_breed', reduced_first_breeds),
)
for column_name, column_values in breed_columns:
    data[column_name] = column_values

In [113]:
# Side-by-side view of the original breed label, its two components and the
# condensed first breed.
breed_overview_columns = {
    'first_breed_complete': first_breed_complete,
    'second_breed_complete': second_breed_complete,
    'all_complete': data.breed,
    'reduce_first_breed': data.reduced_first_breed,
}
df_breed_complete = pd.DataFrame(breed_overview_columns)

In [114]:
## Make Dummy Variables

In [115]:
# Sex is the second word of sexuponoutcome; 'Unknown' has no second word and
# is kept as is.
data['sex'] = [
    status_sex if status_sex == 'Unknown' else status_sex.split(' ')[1]
    for status_sex in data.sexuponoutcome
]

# 0/1 indicator columns, computed with vectorised comparisons.
data['if_female'] = (data['sex'] == 'Female').astype(int)
data['if_dog'] = (data['animaltype'] == 'Dog').astype(int)
# An animal has a name when the value is not missing. A null check is used
# rather than an exact `type(name) == str` comparison, which would count any
# non-`str` text value (e.g. unicode on Python 2) as "no name".
data['has_name'] = data['name'].notnull().astype(int)

In [136]:
# One-hot encode the categorical columns; drop_first=True leaves out the first
# level of each feature.
month_dummies = pd.get_dummies(data['month'], prefix='month', drop_first=True)
fertility_dummies = pd.get_dummies(data['sterile'], prefix='fertility', drop_first=True)
weekday_dummies = pd.get_dummies(data['weekday'], prefix='weekday', drop_first=True)
first_breed_dummies = pd.get_dummies(data['first_breed'], prefix='first_breed', drop_first=True)
second_breed_dummies = pd.get_dummies(data['second_breed'], prefix='second_breed', drop_first=True)
first_color_dummies = pd.get_dummies(data['first_color'], prefix='first_color', drop_first=True)
second_color_dummies = pd.get_dummies(data['second_color'], prefix='second_color', drop_first=True)
# NOTE(review): same 'first_breed' prefix as first_breed_dummies above — the
# two would produce clashing column names if they were ever joined together.
reduced_first_breed_dummies = pd.get_dummies(
    data['reduced_first_breed'], prefix='first_breed', drop_first=True
)

In [140]:
# Assemble the modelling table: the target and plain feature columns, plus the
# selected dummy frames joined on the index.
base_columns = ['outcometype', 'age_in_days', 'hour', 'has_name', 'if_female', 'if_dog']
dummy_frames = [
    month_dummies,
    fertility_dummies,
    weekday_dummies,
    reduced_first_breed_dummies,
    first_color_dummies,
    second_color_dummies,
]
predict_data = data[base_columns].join(dummy_frames)

In [141]:
# Write the modelling table to predict_data.csv (the frame's index is written as well)
predict_data.to_csv('predict_data.csv')

In [142]:
predict_data.columns  # list the columns of the modelling table

Index([                     u'outcometype',
                            u'age_in_days',
                                   u'hour',
                               u'has_name',
                              u'if_female',
                                 u'if_dog',
                              u'month_Aug',
                              u'month_Dec',
                              u'month_Feb',
                              u'month_Jan',
                              u'month_Jul',
                              u'month_Jun',
                              u'month_Mar',
                              u'month_May',
                              u'month_Nov',
                              u'month_Oct',
                              u'month_Sep',
                      u'fertility_Sterile',
                      u'fertility_Unknown',
                         u'weekday_Monday',
                       u'weekday_Saturday',
                         u'weekday_Sunday',
                       u'weekday

In [None]:
predi