In [1]:
import re
import datetime
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings('ignore')

%config InlineBackend.figure_format='retina'

In [2]:
train = pd.read_csv('data/train.csv.gz')
test = pd.read_csv('data/test.csv.gz')

In [3]:
# Header Cleaining Script
def clean_header(df):
    
    '''This function removes wierd characters and spaces from columns and keeps everything lowercase'''
    
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')','')

In [4]:
clean_header(train)
clean_header(test)

In [5]:
test.head()

Unnamed: 0,id,name,datetime,animaltype,sexuponoutcome,ageuponoutcome,breed,color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [6]:
train.isnull().sum()

animalid              0
name               7691
datetime              0
outcometype           0
outcomesubtype    13612
animaltype            0
sexuponoutcome        1
ageuponoutcome       18
breed                 0
color                 0
dtype: int64

In [7]:
train.outcometype.value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: outcometype, dtype: int64

In [8]:
train['sexuponoutcome'] = train.sexuponoutcome.replace('Unknown', 'Uknown Unknown').fillna('Uknown Unknown')

test['sexuponoutcome'] = test.sexuponoutcome.replace('Unknown', 'Uknown Unknown').fillna('Uknown Unknown')

train.head()

train['speutered'] = train.sexuponoutcome.apply(lambda x: str(x).split(' ')[0]).map({'Intact': 0, 
                                                                                     'Neutered': 1, 
                                                                                     'Spayed': 1,
                                                                                    'Unknown' : 2}).fillna('Unknown')

train['sex'] = train.sexuponoutcome.apply(lambda x: str(x).split(' ')[1]).map({'Male': 0, 'Female': 1, 'Unknown': 2})

test['speutered'] = test.sexuponoutcome.apply(lambda x: str(x).split(' ')[0]).map({'Intact': 0, 
                                                                                     'Neutered': 1, 
                                                                                     'Spayed': 1,
                                                                                    'Unknown' : 2}).fillna('Unknown')

test['sex'] = test.sexuponoutcome.apply(lambda x: str(x).split(' ')[1]).map({'Male': 0, 'Female': 1, 'Unknown': 2})
train['outcometype'] = train['outcometype'].replace({'Adoption': '0', 'Return_to_owner': '1', 
                                                     'Transfer': '2', 'Died':'3', 'Euthanasia': '4'}) 
# train['color'] = train['color'].replace({'Adoption': '0', 'black': '1'}) 

In [9]:
train['black'] = train['color'].apply(lambda x: 1 if x == "Black" else 0)
test['black'] = test['color'].apply(lambda x: 1 if x == "Black" else 0)

In [10]:
train['animaltype'] = train['animaltype'].replace({'Dog': '1', 'Cat': '0'})
test['animaltype'] = test['animaltype'].replace({'Dog': '1', 'Cat': '0'})

unpopular = ['Pit Bull Mix', 'Chihuahua Shorthair Mix', 
              'German Shepherd Mix', 'Rottweiler Mix', 
             'Pit Bull', 'Labrador Retriever/Pit Bull', 
             'Pit Bull/Labrador Retriever', 'Rottweiler']
train['unpopular'] = train['breed'].apply(lambda x: 1 if x in unpopular else 0)
test['unpopular'] = test['breed'].apply(lambda x: 1 if x in unpopular else 0)

In [11]:
#train.isnull().sum()

In [12]:
#train['outcomesubtype'].value_counts()

In [13]:
train['agemths'] = train['ageuponoutcome'].apply(lambda x: int(str(x).split(' ')[0])*12 if "year" in str(x) else int(str(x).split(' ')[0]) if "month" in str(x) else 0)
test['agemths'] = test['ageuponoutcome'].apply(lambda x: int(str(x).split(' ')[0])*12 if "year" in str(x) else int(str(x).split(' ')[0]) if "month" in str(x) else 0)
train['agemths'] = train['agemths'].fillna('0 Years')
test['agemths'] = test['agemths'].fillna('0 Years')

In [14]:
train['date'] = pd.to_datetime(train.datetime)
train['month'] = train['date'].apply(lambda x: x.month)
test['date'] = pd.to_datetime(test.datetime)
test['month'] = test['date'].apply(lambda x: x.month)

In [15]:
train = train[['month', 'agemths', 'outcometype', 'agemths', 'sex', 'speutered', 'breed', 'black', 'unpopular']]
test = test[['month', 'agemths',  'agemths', 'sex', 'speutered', 'breed', 'black', 'unpopular']]

In [16]:
train.head()

Unnamed: 0,month,agemths,outcometype,agemths.1,sex,speutered,breed,black,unpopular
0,2,12,1,12,0,1,Shetland Sheepdog Mix,0,0
1,10,12,4,12,1,1,Domestic Shorthair Mix,0,0
2,1,24,0,24,0,1,Pit Bull Mix,0,1
3,7,0,2,0,0,0,Domestic Shorthair Mix,0,0
4,11,24,2,24,0,1,Lhasa Apso/Miniature Poodle,0,0


In [17]:
test.head()

Unnamed: 0,month,agemths,agemths.1,sex,speutered,breed,black,unpopular
0,10,10,10,1,0,Labrador Retriever Mix,0,0
1,7,24,24,1,1,German Shepherd/Siberian Husky,0,0
2,1,12,12,0,1,Domestic Shorthair Mix,0,0
3,12,4,4,0,0,Collie Smooth Mix,0,0
4,9,24,24,0,1,Miniature Poodle Mix,0,0


In [18]:
train.isnull().sum()

month          0
agemths        0
outcometype    0
agemths        0
sex            0
speutered      0
breed          0
black          0
unpopular      0
dtype: int64

In [19]:
train.to_csv('data/train_clean.csv', index=False)
test.to_csv('data/test_clean.csv', index=False)