# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [3]:
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [4]:
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [5]:
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [3]:
original_df = pd.read_csv('../data/train.csv')

In [7]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [8]:
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed,PID
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189,7477.025799
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018,4310.921553
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,3768.25
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,7473.5
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0,11200.75
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0,14992.0


In [9]:
original_df.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2,0
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2,3
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2,4
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2,5
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1,6


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [4]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        #df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        #df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        #df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        #df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        #df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        #df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        #df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        #df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        #df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        #df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

Load the data...

In [5]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

Create the model and evaluate it

In [6]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it couldn be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

results = pd.DataFrame(columns=('clf', 'best_acc'))

In [159]:
y_train.value_counts(normalize=True)

4.0    0.277035
2.0    0.272715
3.0    0.217902
1.0    0.206156
0.0    0.026191
Name: AdoptionSpeed, dtype: float64

In [161]:
#X_train[best_feature_names].info()
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7407 entries, 8685 to 7270
Data columns (total 18 columns):
Age             7407 non-null int64
Breed1          7407 non-null int64
Breed2          7407 non-null int64
Color1          7407 non-null int64
Color2          7407 non-null int64
Color3          7407 non-null int64
Dewormed        7407 non-null int64
Fee             7407 non-null int64
FurLength       7407 non-null int64
Gender          7407 non-null int64
Health          7407 non-null int64
MaturitySize    7407 non-null int64
PID             7407 non-null int64
Quantity        7407 non-null int64
State           7407 non-null int64
Sterilized      7407 non-null int64
Type            7407 non-null int64
Vaccinated      7407 non-null int64
dtypes: int64(18)
memory usage: 1.1 MB


In [170]:
#X_train[best_feature_names].describe()
X_train.describe()

Unnamed: 0,Age,Breed1,Breed2,Color1,Color2,Color3,Dewormed,Fee,FurLength,Gender,Health,MaturitySize,PID,Quantity,State,Sterilized,Type,Vaccinated
count,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0,7407.0
mean,10.753746,265.071824,74.017011,2.240043,3.225462,1.832186,1.563251,21.378561,1.462535,1.771432,1.036857,1.862832,7506.54057,1.582962,41346.047658,1.9086,1.45349,1.722425
std,18.749651,60.321886,123.021531,1.749627,2.75666,2.965014,0.702232,79.112298,0.59471,0.683318,0.196835,0.55053,4315.216877,1.490666,32.468174,0.567021,0.497866,0.673894
min,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,3.0,1.0,41324.0,1.0,1.0,1.0
25%,2.0,265.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,3786.5,1.0,41326.0,2.0,1.0,1.0
50%,3.0,266.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,1.0,2.0,7525.0,1.0,41326.0,2.0,1.0,2.0
75%,12.0,307.0,179.0,3.0,6.0,5.0,2.0,0.0,2.0,2.0,1.0,2.0,11216.0,1.0,41401.0,2.0,2.0,2.0
max,255.0,307.0,307.0,7.0,7.0,7.0,3.0,3000.0,3.0,3.0,3.0,4.0,14988.0,20.0,41415.0,3.0,2.0,3.0


In [171]:
for col in X_train.columns:
    print(X_train[col].value_counts())

2      1716
1      1117
3       958
4       570
12      487
24      327
6       266
5       259
36      216
8       178
7       136
48      124
60       99
0        94
9        92
10       83
18       81
84       58
11       45
72       45
15       43
17       34
96       28
14       27
30       26
16       23
13       22
20       20
120      19
19       16
       ... 
33        2
212       2
63        2
68        1
35        1
43        1
44        1
135       1
122       1
95        1
86        1
82        1
238       1
66        1
76        1
80        1
45        1
88        1
100       1
112       1
46        1
144       1
156       1
168       1
180       1
81        1
61        1
57        1
147       1
255       1
Name: Age, Length: 92, dtype: int64
307    2945
266    1806
265     607
299     170
264     136
292     128
285     101
141     101
205      94
179      83
109      75
218      73
254      55
20       46
243      45
189      44
103      44
152      41
283      38
213 

In [172]:
X_train.columns

Index(['Age', 'Breed1', 'Breed2', 'Color1', 'Color2', 'Color3', 'Dewormed',
       'Fee', 'FurLength', 'Gender', 'Health', 'MaturitySize', 'PID',
       'Quantity', 'State', 'Sterilized', 'Type', 'Vaccinated'],
      dtype='object')

In [173]:
# comento porque algunos modelos se quejan del tipo
#for col in ['Vaccinated','Type','Sterilized','State','Quantity','MaturitySize','Health','Gender','FurLength','Dewormed','Color1', 'Color2', 'Color3']:
#    X_train[col] = pd.Categorical(X_train[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [174]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7407 entries, 8685 to 7270
Data columns (total 18 columns):
Age             7407 non-null int64
Breed1          7407 non-null int64
Breed2          7407 non-null int64
Color1          7407 non-null category
Color2          7407 non-null category
Color3          7407 non-null category
Dewormed        7407 non-null category
Fee             7407 non-null int64
FurLength       7407 non-null category
Gender          7407 non-null category
Health          7407 non-null category
MaturitySize    7407 non-null category
PID             7407 non-null int64
Quantity        7407 non-null category
State           7407 non-null category
Sterilized      7407 non-null category
Type            7407 non-null category
Vaccinated      7407 non-null category
dtypes: category(13), int64(5)
memory usage: 444.6 KB


In [175]:
from sklearn.tree import DecisionTreeClassifier as DT
start_time = time.time()
tree_param = {'criterion':('gini', 'entropy'),
              'min_samples_leaf':(1, 2, 5),
              'min_samples_split':(2, 3, 5, 10, 50, 100),
              'min_impurity_split':(1,2, 3, 5, 10, 50, 100),
              #'max_features': [10,50,100,200,300, None],
              'max_depth':[1,10,100,1000],
              'presort':[True, False]}
tree = DT(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=3, iid=False, n_jobs=-1)
tree_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')

Best Decision Tree accuracy:  0.3642514953231318
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=2,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=42, splitter='best')
The best classifier so far is: 
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=1.0, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
Seconds: 80.52197313308716




In [176]:
best_features = best_tree_clf.feature_importances_

In [181]:
best_n_features = sorted(best_features,reverse=True)[0:len([x for x in best_features if x!=0]) - 1]
best_n_features_index = [list(best_features).index(x) for x in best_n_features]
best_n_feature_names = X_train.drop('PID',axis=1).columns[best_n_features_index]

In [182]:
best_n_feature_names

Index(['Breed1', 'Age', 'Breed2', 'Sterilized', 'Quantity', 'FurLength',
       'State', 'Gender', 'MaturitySize', 'Color3', 'Dewormed', 'Fee',
       'Color2'],
      dtype='object')

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
start_time = time.time()
rfc_param = {
#    'n_estimators': [10, 100, 1000],
    'n_estimators': [10, 100, 1000,5, 50, 500],
    'criterion': ['gini','entropy'],
    'max_depth': [1, 10, 100, 1000,5, 50, 500],
    'min_samples_split': [2, 5, 10, 100],
#    'min_samples_leaf': [1, 2, 5, 10, 100]
}

rfc = RandomForestClassifier(random_state=0)
rfc_clf = GridSearchCV(rfc, rfc_param, scoring='accuracy', cv=5, iid=False, n_jobs=-1)
rfc_clf.fit(X.drop(["PID"], axis=1), y)
#rfc_clf.fit(X_train[best_n_feature_names], y_train)
best_rfc_clf = rfc_clf.best_estimator_
print('Best Random Forest accuracy: ', rfc_clf.best_score_)
print(best_rfc_clf)
results = results.append({'clf': best_rfc_clf, 'best_acc': rfc_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')

Best Random Forest accuracy:  0.39529207607330896
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_s

**And finally**, we predict the unknown label for the testing set

In [13]:
X.shape, XX.shape

((10582, 18), (4411, 18))

In [14]:
results[results.best_acc == results.best_acc.max()].clf.item()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
results

Unnamed: 0,clf,best_acc
0,"(DecisionTreeClassifier(class_weight=None, cri...",0.395292


In [21]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
#yy = results[results.best_acc == results.best_acc.max()].clf.item().predict(XX[best_30_feature_names])
yy = yy.astype(np.int)

In [24]:
#yy = results.clf.iloc[7].predict(XX.drop('PID',axis=1))
#yy = yy.astype(np.int)

The last thing we do is generating a file that should be *submitted* on kaggle

In [22]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])

In [23]:
submission.to_csv("../data/submission.csv", header=True, index=False)