In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, cross_validation

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data_types = data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'AnimalID'
target_col = 'OutcomeType'

data_params = dict()


#Categorical columns:
data_params['cat_cols'] = list(data_types[data_types=='object'].index)
data_params['cat_cols'].remove('AnimalID')
data_params['cat_cols'].remove('OutcomeType')
data_params['cat_cols'].remove('OutcomeSubtype')

#Numeric columns:
data_params['num_cols'] = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

#ID
data_params['id_col'] = ['AnimalID']
#Target
data_params['tgt_col'] = ['OutcomeType']

print (data_params)

{'tgt_col': ['OutcomeType'], 'num_cols': [], 'id_col': ['AnimalID'], 'cat_cols': ['Name', 'DateTime', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']}


In [4]:
#Proprocessing: Label encoding Categorical columns this works well for tree-based models 
for col in data_params['cat_cols']:
    print ("Label encoding  %s" % (col))
    LBL = preprocessing.LabelEncoder()
    LBL.fit(data[col])
    data[col]=LBL.transform(data[col])

Label encoding  Name
Label encoding  DateTime


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


Label encoding  AnimalType
Label encoding  SexuponOutcome
Label encoding  AgeuponOutcome
Label encoding  Breed
Label encoding  Color


In [5]:
#Label target    
LBL = preprocessing.LabelEncoder()
LBL.fit(data['OutcomeType'])
tgt_cls = dict(zip(data['OutcomeType'].unique()
               , LBL.transform(data['OutcomeType'].unique())))

In [6]:
tgt_cls

{'Adoption': 0,
 'Died': 1,
 'Euthanasia': 2,
 'Return_to_owner': 3,
 'Transfer': 4}

In [7]:
data['OutcomeType']=LBL.transform(data['OutcomeType'])

In [8]:
train, validation = cross_validation.train_test_split(data, test_size = 0.2, random_state= 0)

In [9]:
simple_model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                max_features=0.5, max_depth=10,
                                oob_score=True)
full_cols = data_params['cat_cols'] + data_params['num_cols']
train_X = train[full_cols].values
train_y = train[data_params['tgt_col']].values
simple_model = simple_model.fit(train_X,train_y)



In [10]:
validation_X = validation[full_cols].values
validation_y = validation[data_params['tgt_col']].values
cv = simple_model.score(validation_X, validation_y )
cv

0.63075196408529743

Let's engineer the name by its length

In [11]:
tmp_data = pd.read_csv('train.csv')

In [12]:
def name_length(name):
    if type(name) is str:
        return len(name)
    else:
        return 0
data['LengthofName'] = tmp_data['Name'].apply(name_length)

In [13]:
data_params['cat_cols'].remove('Name')

In [14]:

data_params['num_cols'].append('LengthofName')

In [15]:
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,LengthofName
0,A671945,2352,3361,3,,1,3,6,1221,130,7
1,A656520,1848,350,2,Suffering,0,4,6,640,167,5
2,A686464,4441,12681,0,Foster,1,3,22,1066,86,6
3,A683430,0,7190,4,Partner,0,2,26,640,42,0
4,A667013,0,1233,4,Partner,1,3,22,914,274,0


In [16]:
train, validation = cross_validation.train_test_split(data, test_size = 0.2, random_state= 0)
name_model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                max_features=0.5, max_depth=10,
                                oob_score=True)
full_cols = data_params['cat_cols'] + data_params['num_cols']
train_X = train[full_cols].values
train_y = train[data_params['tgt_col']].values
name_model =name_model.fit(train_X,train_y)



In [17]:
validation_X = validation[full_cols].values
validation_y = validation[data_params['tgt_col']].values
cv = name_model.score(validation_X, validation_y )
cv

0.63374485596707819

In [18]:
data_params['cat_cols'].append('Name')
data_params['num_cols'].remove('LengthofName')

In [19]:
data_params['cat_cols']

['DateTime',
 'AnimalType',
 'SexuponOutcome',
 'AgeuponOutcome',
 'Breed',
 'Color',
 'Name']

Let's transform the age

In [20]:
def age_to_day(age):
    if type(age) is str:
        if 'day' in age:
            age_to_day = int(age.split(' ')[0])
        elif 'week' in age:
            age_to_day = int(age.split(' ')[0])*7
        elif 'month' in age:
            age_to_day = int(age.split(' ')[0])*30
        elif 'year' in age:
            age_to_day = int(age.split(' ')[0])*365
    else:
        age_to_day = 0
    return age_to_day

In [21]:
data['Ageindays'] = tmp_data['AgeuponOutcome'].apply(age_to_day)

In [22]:
data_params['cat_cols'].remove('AgeuponOutcome')
data_params['num_cols'].append('Ageindays')

In [23]:
train, validation = cross_validation.train_test_split(data, test_size = 0.2, random_state= 0)
age_model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                max_features=0.5, max_depth=10,
                                oob_score=True)
full_cols = data_params['cat_cols'] + data_params['num_cols']
train_X = train[full_cols].values
train_y = train[data_params['tgt_col']].values
age_model =age_model.fit(train_X,train_y)



In [24]:
validation_X = validation[full_cols].values
validation_y = validation[data_params['tgt_col']].values
cv = age_model.score(validation_X, validation_y )
cv

0.65057987280209506

Aha, enginerring the age helps performance!Let's add datetime!

In [25]:
tmp_data['DateTime'] =  pd.to_datetime(tmp_data['DateTime'], coerce=True)

  if __name__ == '__main__':


In [26]:
data["DateTime"+'_year']=tmp_data["DateTime"].dt.year
data["DateTime"+'_month']=tmp_data["DateTime"].dt.month
data["DateTime"+'_day']=tmp_data["DateTime"].dt.day
data["DateTime"+'_dayofweek']=tmp_data["DateTime"].dt.dayofweek
data["DateTime"+'_weekofyear']=tmp_data["DateTime"].dt.weekofyear
data["DateTime"+'_dayofyear'] = tmp_data["DateTime"].dt.dayofyear
data["DateTime"+'_quarter'] = tmp_data["DateTime"].dt.quarter
data["DateTime"+'_hour'] = tmp_data["DateTime"].dt.hour
data_params['num_cols'].append("DateTime"+'_year')
data_params['num_cols'].append("DateTime"+'_month')
data_params['num_cols'].append("DateTime"+'_day')
data_params['num_cols'].append("DateTime"+'_dayofweek')
data_params['num_cols'].append("DateTime"+'_weekofyear')
data_params['num_cols'].append("DateTime"+'_dayofyear')
data_params['num_cols'].append("DateTime"+'_quarter')
data_params['num_cols'].append("DateTime"+'_hour') 

In [27]:
train, validation = cross_validation.train_test_split(data, test_size = 0.2, random_state= 0)
date_model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                max_features=0.5, max_depth=10,
                                oob_score=True)
full_cols = data_params['cat_cols'] + data_params['num_cols']
train_X = train[full_cols].values
train_y = train[data_params['tgt_col']].values
date_model =date_model.fit(train_X,train_y)



In [28]:
validation_X = validation[full_cols].values
validation_y = validation[data_params['tgt_col']].values
cv = date_model.score(validation_X, validation_y )
cv

0.68649457538346426

Future direction:

1. The trees used here is very simple. Using complicated tree (over-fit??)
2. Boosting.
3. Enginerring more features (like breed/color)
