# Diplodatos Kaggle Competition

We present this piece of code to create the baseline for the competition, and as an example of how to deal with this kind of problem. The main goals are that you:

1. Learn
1. Try different models and see which one best fits the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [18]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

In [19]:
# Read the lookup tables that map numeric IDs to human-readable names
breed, color, state = (
    pd.read_csv('../data/breed_labels.csv'),
    pd.read_csv('../data/color_labels.csv'),
    pd.read_csv('../data/state_labels.csv'),
)

Now we take a look at the labels, just to understand what these are

In [20]:
# Preview the first rows of the breed lookup table
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [21]:
# Preview the first rows of the color lookup table
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [22]:
# Display the state lookup table
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [23]:
# Read the raw training CSV for a first look at the data
original_df = pd.read_csv('../data/train.csv')

In [24]:
# List the column names of the raw training data
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [25]:
# Summary statistics of the raw training data
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed,PID
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189,7477.025799
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018,4310.921553
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,3768.25
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,7473.5
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0,11200.75
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0,14992.0


In [26]:
# Preview the first rows of the raw training data
original_df.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2,0
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2,3
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2,4
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2,5
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1,6


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [27]:
def transform_data(train_data_fname, test_data_fname):
    """Load the train and test CSVs and turn them into model-ready frames.

    Both files go through the same column transformations and are one-hot
    encoded together, so the returned train and test features always share
    exactly the same columns.

    The numeric codes are first replaced by readable labels, which makes the
    generated dummy column names self-explanatory. Color and breed names
    come from the module-level ``color`` and ``breed`` label frames, which
    must be loaded before this function is called.

    Args:
        train_data_fname: Path of the training CSV; it must contain the
            ``AdoptionSpeed`` target column.
        test_data_fname: Path of the test CSV.

    Returns:
        A tuple ``(X, y, XX, yy)``: ``X`` holds the training features, ``y``
        the training target, ``XX`` the test features, and ``yy`` is always
        ``None`` because the test labels are unknown.
    """
    # ID -> name lookups, built once and shared by every color/breed column.
    # ID 0 is mapped to "N/A"; it is set last so it wins over any ID 0 entry.
    color_names = dict(zip(color.ColorID, color.ColorName))
    color_names[0] = "N/A"
    breed_names = dict(zip(breed.BreedID, breed.BreedName))
    breed_names[0] = "N/A"

    replacements = {
        'Type': {1: 'Dog', 2: 'Cat'},
        'Gender': {1: 'Male', 2: 'Female', 3: 'Mixed'},
        'MaturitySize': {1: 'S', 2: 'M', 3: 'L', 4: 'XL', 0: 'N/A'},
        'FurLength': {1: 'S', 2: 'M', 3: 'L', 0: 'N/A'},
        # NOTE: code 2 is labelled 'N' here but 'F' for Dewormed/Sterilized;
        # kept as-is so the generated dummy column names do not change.
        'Vaccinated': {1: 'T', 2: 'N', 3: 'N/A'},
        'Dewormed': {1: 'T', 2: 'F', 3: 'N/A'},
        'Sterilized': {1: 'T', 2: 'F', 3: 'N/A'},
        'Health': {1: 'Healthy', 2: 'MinorInjury', 3: 'SeriousInjury', 0: 'N/A'},
        'Color1': color_names,
        'Color2': color_names,
        'Color3': color_names,
        'Breed1': breed_names,
        'Breed2': breed_names,
    }

    def transform_columns(df):
        """Drop the free-text column and replace numeric codes by labels."""
        df = df.drop(["Description"], axis=1)
        for column, mapping in replacements.items():
            df[column] = df[column].replace(mapping)
        return df

    df_train = transform_columns(pd.read_csv(train_data_fname))
    df_test = transform_columns(pd.read_csv(test_data_fname))

    # Take the target out BEFORE concatenating: the test rows have no label,
    # so keeping the column would fill it with NaN and silently turn the
    # integer classes into floats.
    y = df_train.pop('AdoptionSpeed')
    df_test = df_test.drop(['AdoptionSpeed'], axis=1, errors='ignore')

    df = pd.concat([df_train, df_test], sort=True)

    # One-hot encode every text column; the numeric columns
    # (Age, Fee, PID, Quantity, State) are left untouched.
    df = pd.get_dummies(df)

    # Split the jointly-encoded frame back into its train and test parts
    n = len(df_train)
    X = df.iloc[:n].copy()
    XX = df.iloc[n:].copy()
    yy = None

    return X, y, XX, yy

Load the data...

In [11]:
# Build the train/test frames; yy comes back as None (test labels are unknown)
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [29]:
# Preview the first rows of the encoded training features
X.head()

Unnamed: 0,Age,Fee,PID,Quantity,State,Breed1_Abyssinian,Breed1_Affenpinscher,Breed1_Airedale Terrier,Breed1_Akita,Breed1_American Bulldog,...,MaturitySize_S,MaturitySize_XL,Sterilized_F,Sterilized_N/A,Sterilized_T,Type_Cat,Type_Dog,Vaccinated_N,Vaccinated_N/A,Vaccinated_T
0,3,100,0,1,41326,0,0,0,0,0,...,1,0,1,0,0,1,0,1,0,0
1,4,150,3,1,41401,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,1,0,4,1,41326,0,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
3,3,0,5,1,41326,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,12,300,6,1,41326,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0


Split training dataset into train and "validation" 

(we won't be using the validation set in this example, because of the cross-validation; but it could be useful for you depending on your approach)

In [12]:
# Hold out 30% of the labelled data as a validation set (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Table collecting each tuned classifier together with its best CV accuracy
results = pd.DataFrame(columns=['clf', 'best_acc'])

We also scale the data to improve results and speed up the learning process

In [13]:
# Standardize the training features (PID is an identifier, not a feature).
# The fitted scaler is kept so the same transformation can be reused later.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train.drop(["PID"], axis=1))

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


Create a **Decision Tree** model and evaluate it

In [14]:
# Hyper-parameter grid explored for the decision tree
tree_param = {
    'criterion': ('gini', 'entropy'),
    'min_samples_leaf': (1, 2, 5),
    'min_samples_split': (2, 3, 5, 10, 50, 100),
}

# 3-fold cross-validated grid search scored on accuracy, using all cores
tree = DT(random_state=42)
tree_clf = GridSearchCV(
    estimator=tree,
    param_grid=tree_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
)
tree_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append(
    {'clf': best_tree_clf, 'best_acc': tree_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])

Best Decision Tree accuracy:  0.3519622095560508
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


Create a **Logistic Regression** model and evaluate it

In [15]:
# Hyper-parameter grid explored for logistic regression
log_regr_param = {
    'C': (1e+4, 1e+3, 1e+2),
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'max_iter': (5, 20, 100, 1000, 2000),
}

# 3-fold cross-validated grid search scored on accuracy, using all cores
log_regr = LogisticRegression(random_state=42)
log_reg_clf = GridSearchCV(
    estimator=log_regr,
    param_grid=log_regr_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
)
log_reg_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_log_reg_clf = log_reg_clf.best_estimator_
print('Best Logistic Regression accuracy: ', log_reg_clf.best_score_)
print(best_log_reg_clf)
results = results.append(
    {'clf': best_log_reg_clf, 'best_acc': log_reg_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])

KeyboardInterrupt: 

Create a **Perceptron** model and evaluate it

In [None]:
# Hyper-parameter grid explored for the perceptron
perceptron_param = {
    'shuffle': (True, False),
    'random_state': (1, 10, 20, 30, 40, 50, 100),
    'max_iter': (5, 20, 100, 1000, 2000),
}

# 3-fold cross-validated grid search scored on accuracy, using all cores
perceptron = Perceptron()
perceptron_clf = GridSearchCV(
    estimator=perceptron,
    param_grid=perceptron_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
)
perceptron_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_perceptron_clf = perceptron_clf.best_estimator_
print('Best Perceptron accuracy: ', perceptron_clf.best_score_)
print(best_perceptron_clf)
results = results.append(
    {'clf': best_perceptron_clf, 'best_acc': perceptron_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])

Create an **SVM** model and evaluate it

In [20]:
# Hyper-parameter grid explored for the linear SVM
svm_param = {
    'loss': ('hinge', 'squared_hinge'),
    'dual': (True, False),
    'C': (1e+3, 1e+2, 1e+1, 1),
    'tol': (1e-3, 1e-4, 1e-5),
    'random_state': (1, 10, 20, 30, 40, 50, 100),
    'max_iter': (5, 20, 100, 1000, 2000),
}

# 3-fold cross-validated grid search scored on accuracy, using all cores.
# error_score=0.0 makes parameter combinations that fail to fit score 0
# instead of aborting the whole search.
svm = LinearSVC()
svm_clf = GridSearchCV(
    estimator=svm,
    param_grid=svm_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
    error_score=0.0,
)
svm_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_svm_clf = svm_clf.best_estimator_
print('Best SVM accuracy: ', svm_clf.best_score_)
print(best_svm_clf)
results = results.append(
    {'clf': best_svm_clf, 'best_acc': svm_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])

Best SVM accuracy:  0.360060687360994
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.001, verbose=0)
The best classifier so far is: 
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.001, verbose=0)




Create **nearest-neighbour (NN)** models and evaluate them

In [8]:
# Hyper-parameter grid explored for k-nearest neighbours
knn_param = {
    'n_neighbors': (3, 4, 5, 6),
    'weights': ('uniform', 'distance'),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'leaf_size': (15, 30, 60),
    'p': (1, 2),
}

# 3-fold cross-validated grid search scored on accuracy, using all cores.
# error_score=0.0 makes parameter combinations that fail to fit score 0
# instead of aborting the whole search.
knn = KNeighborsClassifier()
knn_clf = GridSearchCV(
    estimator=knn,
    param_grid=knn_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
    error_score=0.0,
)
knn_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_knn_clf = knn_clf.best_estimator_
print('Best KNN accuracy: ', knn_clf.best_score_)
print(best_knn_clf)
results = results.append(
    {'clf': best_knn_clf, 'best_acc': knn_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])

Best KNN accuracy:  0.3187501055804416
KNeighborsClassifier(algorithm='auto', leaf_size=60, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=6, p=1,
           weights='distance')
The best classifier so far is: 
KNeighborsClassifier(algorithm='auto', leaf_size=60, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=6, p=1,
           weights='distance')


In [16]:
# Hyper-parameter grid explored for the radius-based neighbours classifier
rnn_param = {
    'weights': ('uniform', 'distance'),
    'radius': (1, 2, 3, 5, 10),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'leaf_size': (15, 30, 60),
    'p': (1, 2),
}

# outlier_label=0 is the class given to samples with no neighbour in the radius
rnn = RadiusNeighborsClassifier(outlier_label=0)

# 3-fold cross-validated grid search scored on accuracy, using all cores.
# error_score=0.0 makes parameter combinations that fail to fit score 0
# instead of aborting the whole search.
rnn_clf = GridSearchCV(
    estimator=rnn,
    param_grid=rnn_param,
    scoring='accuracy',
    cv=3,
    iid=False,
    n_jobs=-1,
    error_score=0.0,
)
rnn_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table
best_rnn_clf = rnn_clf.best_estimator_
print('Best NN accuracy: ', rnn_clf.best_score_)
print(best_rnn_clf)
results = results.append(
    {'clf': best_rnn_clf, 'best_acc': rnn_clf.best_score_},
    ignore_index=True,
)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax(), 'clf'])



Best NN accuracy:  0.3244198165543033
RadiusNeighborsClassifier(algorithm='brute', leaf_size=15, metric='minkowski',
             metric_params=None, n_jobs=None, outlier_label=0, p=2,
             radius=10, weights='distance')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


Create a **Nearest Centroid** model and evaluate it

In [33]:
# Hyper-parameter grid explored for the nearest-centroid classifier
nc_param = {'metric': ['euclidean', 'manhattan', 'cityblock', 'cosine', 'l1', 'l2'],
            'shrink_threshold': [None, 1e+3, 1e+2, 1e+1, 1, 1e-3, 1e-2, 1e-1]}

# 3-fold cross-validated grid search scored on accuracy, using all cores.
# error_score=0.0 makes parameter combinations that fail to fit score 0
# instead of aborting the whole search.
nc = NearestCentroid()
nc_clf = GridSearchCV(nc, nc_param, scoring='accuracy', cv=3, iid=False, n_jobs=-1, error_score=0.0)
nc_clf.fit(scaled_X_train, y_train)

# Report the winning configuration and record it in the results table.
# The label used to read "Best SVM accuracy" (copy-paste slip from the SVM cell).
best_nc_clf = nc_clf.best_estimator_
print('Best Nearest Centroid accuracy: ', nc_clf.best_score_)
print(best_nc_clf)
results = results.append({'clf': best_nc_clf, 'best_acc': nc_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best SVM accuracy:  0.3421109855524957
NearestCentroid(metric='cityblock', shrink_threshold=1)
The best classifier so far is: 
LogisticRegression(C=10000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=5,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='sag', tol=0.0001, verbose=0, warm_start=False)




**And finally**, we predict the unknown label for the testing set

In [21]:
# Sanity check: train and test features should have the same number of columns
X.shape, XX.shape

((10582, 360), (4411, 360))

In [22]:
# Predict with the classifier that reached the highest cross-validated
# accuracy, rather than whichever one happened to be stored first.
best_clf = results.loc[results['best_acc'].idxmax()]['clf']

# Every classifier was fitted on standardized features, so the test features
# must go through the same fitted scaler before predicting.
scaled_XX = scaler.transform(XX.drop(["PID"], axis=1))
yy = best_clf.predict(scaled_XX)

# Kaggle expects integer classes; the builtin int replaces the np.int alias,
# which has been removed from NumPy.
yy = yy.astype(int)

###### The last thing we do is generate the file that should be *submitted* on Kaggle

In [23]:
# Pair each test PID with its predicted adoption speed
submission = pd.DataFrame(
    data=list(zip(XX.PID, yy)),
    columns=["PID", "AdoptionSpeed"],
)

In [24]:
# Write the submission file with a header row and without the index column
submission.to_csv("../data/submission.csv", header=True, index=False)