In [124]:
import pandas as pd
import pickle
import numpy as np
from datetime import datetime
import cPickle

In [125]:
# Read the test set and lower-case every column name so the rest of the
# notebook can use attribute access such as `test.ageuponoutcome`.
test = pd.read_csv('animal_test.csv')
test.columns = test.columns.str.lower()

# Load the breed lookup and the classifier from their pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open('breed_dict.pkl', 'rb') as breed_file:
    breed_dict = pickle.load(breed_file)

with open('rforest_model.pkl', 'rb') as model_file:
    rf_clf = cPickle.load(model_file)

In [126]:
from shelter_df.functions import return_classes, to_days, in_dict

In [127]:
# Preview the first rows to check the load and the lower-cased column names.
test.head()

Unnamed: 0,id,name,datetime,animaltype,sexuponoutcome,ageuponoutcome,breed,color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


# Cleaning Data

In [129]:
# Feature engineering for the test set. Every derived column is written back
# onto `test`; the intermediate lists keep their original names so that other
# cells can still refer to them.

# --- Age ---------------------------------------------------------------
# Mean age over the rows that actually have an age; used as the fill value
# for missing ages. `.notnull()` replaces the original unary minus on a
# boolean mask, which only worked through a pandas special case.
age_list = to_days(test.ageuponoutcome[test.ageuponoutcome.notnull()])
age_mean = np.mean(age_list)

# NOTE(review): the filled series mixes the original age strings with a
# numeric mean, so this relies on `to_days` accepting an already-numeric
# value -- confirm against shelter_df.functions.to_days.
test['age_in_days'] = to_days(test.ageuponoutcome.fillna(age_mean))

# --- Name --------------------------------------------------------------
# 1 when the animal has a name, 0 when the field is missing.
test['has_name'] = test.name.notnull().astype(int)

# --- Outcome timestamp ---------------------------------------------------
# Parse the timestamp once instead of once per derived column.
outcome_time = pd.to_datetime(test.datetime, format='%Y-%m-%d %H:%M:%S')
test['month'] = outcome_time.dt.strftime('%b')
test['year'] = outcome_time.dt.strftime('%Y')
test['day'] = outcome_time.dt.strftime('%d')
test['weekday'] = outcome_time.dt.strftime('%A')

# --- Sex / fertility ------------------------------------------------------
# Values look like "<status> <sex>" (e.g. "Spayed Female") or the single word
# "Unknown". A missing value is treated as "Unknown" so `.split` cannot fail
# on NaN.
# NOTE(review): confirm the training pipeline handles missing values the
# same way.
sex_status = test.sexuponoutcome.fillna('Unknown')
test['sex'] = [status if status == 'Unknown' else status.split(' ')[1]
               for status in sex_status]
sterile_clean = [status if status == 'Unknown' else status.split(' ')[0]
                 for status in sex_status]
# Neutered and spayed animals are collapsed into a single 'Sterile' level.
test['sterile'] = ['Sterile' if status in ('Neutered', 'Spayed') else status
                   for status in sterile_clean]
test['if_female'] = (test.sex == 'Female').astype(int)
test['if_dog'] = (test.animaltype == 'Dog').astype(int)
test['hour'] = outcome_time.dt.hour

# --- Breed ------------------------------------------------------------------
# Take the entry at position 0 of each breed string, drop the 'Mix' marker,
# then map the result through `breed_dict`.
# NOTE(review): `return_classes` and `in_dict` are project helpers; presumably
# they split a '/'-separated value and collapse breeds missing from the
# dictionary -- verify in shelter_df.functions.
first_breed_complete_test = [return_classes(breed, 0) for breed in test.breed]
without_mix_first_breed_complete_test = [breed.replace('Mix', '').strip()
                                         for breed in first_breed_complete_test]
reduced_first_breeds_test = [in_dict(breed, breed_dict)
                             for breed in without_mix_first_breed_complete_test]
test['reduced_first_breed'] = reduced_first_breeds_test

# --- Colour -----------------------------------------------------------------
# Keep only the first word of the first and second colour entries
# (e.g. 'Brown Tabby' -> 'Brown'). List comprehensions replace `map`, which is
# a lazy iterator on Python 3 and cannot be assigned to a column directly.
first_color_test = [return_classes(color, 0) for color in test.color]
second_color_test = [return_classes(color, 1) for color in test.color]
reduced_first_color_test = [color.split()[0] for color in first_color_test]
reduced_second_color_test = [color.split()[0] for color in second_color_test]
test['first_color'] = reduced_first_color_test
test['second_color'] = reduced_second_color_test

In [141]:
# One-hot encode the categorical features, dropping the first level of each
# to avoid a redundant column.
# NOTE(review): `drop_first=True` drops the first level present in *this*
# frame. If the test set lacks the level that was dropped during training,
# the columns will not line up with the model -- confirm against the
# training feature list.
test_month_dummies = pd.get_dummies(test['month'], prefix='month', drop_first=True)
test_fertility_dummies = pd.get_dummies(test.sterile, prefix='fertility', drop_first=True)
test_weekday_dummies = pd.get_dummies(test.weekday, prefix='weekday', drop_first=True)
test_breed_dummies = pd.get_dummies(test.reduced_first_breed, prefix='first_breed', drop_first=True)
test_first_color_dummies = pd.get_dummies(test.first_color, prefix='first_color', drop_first=True)
test_second_color_dummies = pd.get_dummies(test.second_color, prefix='second_color', drop_first=True)

# Assemble the feature matrix: numeric/binary features first, then each group
# of dummy columns in the order listed.
submission_data = test[['age_in_days', 'hour', 'has_name', 'if_female', 'if_dog']].join([
    test_month_dummies,
    test_fertility_dummies,
    test_weekday_dummies,
    test_breed_dummies,
    test_first_color_dummies,
    test_second_color_dummies,
])

# These indicator columns are added by hand, presumably because the classifier
# was trained with them and the test set has no such animals. An absent
# category must be encoded as all zeros; the original filled them with
# range(len(...)), i.e. a running row counter, which fed meaningless values
# to the model.
# NOTE(review): appending at the end assumes that is where the model expects
# these columns -- confirm against the training column order.
for absent_dummy in ['first_color_Ruddy', 'second_color_Flame', 'second_color_Pink']:
    if absent_dummy not in submission_data.columns:
        submission_data[absent_dummy] = 0

## Predict Outcome Probabilities and Build the Submission

In [152]:
# Score the assembled test feature matrix with the unpickled classifier.
# NOTE(review): presumably a scikit-learn estimator, so the result holds one
# probability column per class in `rf_clf.classes_` order -- confirm.
submission_pred = rf_clf.predict_proba(submission_data)

In [169]:
# Wrap the predicted probabilities in a frame labelled with the outcome names
# and indexed by a 1-based 'ID'.
# NOTE(review): the label order is hard-coded and must match the column order
# of `submission_pred` -- confirm against the classifier's class order.
outcome_labels = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
row_ids = range(1, len(submission_pred) + 1)
submission = pd.DataFrame(submission_pred, columns=outcome_labels, index=row_ids)
submission.index.name = 'ID'

In [171]:
# Write the submission to disk; the 'ID' index is emitted as the first column.
submission.to_csv('cv_kaggle_submission.csv')