In [5]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [41]:
# Fixed-seed random state (seed 0).
rng = np.random.RandomState(0)

# Each frame is tagged with its origin in a `type` column so the two can be
# stacked for feature extraction and separated again afterwards.
print ('reading in train data..')
train = pd.read_csv('train.csv').assign(type='train')

print ('reading in test data..')
test = pd.read_csv('test.csv').assign(type='test')
# Blank outcome placeholders on the test frame (presumably test.csv ships
# without these columns -- confirm against the file).
for placeholder in ('OutcomeSubtype', 'OutcomeType'):
    test[placeholder] = ''

def data_import(df1, df2):
    """Stack two frames and derive the modelling features.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to stack (df1 rows first). Column names are lower-cased after
        stacking; between them the frames must provide sexuponoutcome, breed,
        ageuponoutcome, datetime, name, color, animalid and outcomesubtype.

    Returns
    -------
    pandas.DataFrame
        The stacked frame (fresh 0..n-1 index) with these derived columns:
        sex, neutered, mix (0/1), ageinyears (missing values replaced by the
        mean over all stacked rows), year, month, wday (strings), hasname
        (0/1) and color_simple. The raw columns animalid, datetime, name,
        ageuponoutcome, sexuponoutcome, outcomesubtype, breed and color are
        dropped; every other input column is passed through.
    """
    print ('Running feature extraction process..')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # ignore_index avoids duplicate row labels coming from the two inputs.
    df = (pd.concat([df1, df2], ignore_index=True)
          .rename(columns=str.lower))

    # functions to get new parameters from the column
    def get_sex(x):
        x = str(x)
        if 'Male' in x: return 'male'
        if 'Female' in x: return 'female'
        return 'unknown'

    def get_neutered(x):
        x = str(x)
        if 'Spayed' in x or 'Neutered' in x: return 'neutered'
        if 'Intact' in x: return 'intact'
        return 'unknown'

    df['sex'] = df['sexuponoutcome'].apply(get_sex)
    df['neutered'] = df['sexuponoutcome'].apply(get_neutered)

    # (unit substring, units per year), checked in this order
    age_units = (('year', 1.), ('month', 12.), ('week', 52.), ('day', 365.))

    def calc_age_in_years(x):
        x = str(x)
        if x == 'nan': return np.nan
        try:
            age = int(x.split()[0])
        except (IndexError, ValueError):
            # blank or non-numeric age text is treated as missing
            return np.nan
        for unit, per_year in age_units:
            if unit in x:
                return age / per_year
        return np.nan

    df['ageinyears'] = df['ageuponoutcome'].apply(calc_age_in_years)

    # Creating some more date variables
    stamp = pd.to_datetime(df['datetime'])
    df['year'] = stamp.dt.year.astype(str)
    # The month gets its own column; it used to overwrite 'year'.
    df['month'] = stamp.dt.month.astype(str)
    df['wday'] = stamp.dt.dayofweek.astype(str)

    # A missing name counts as "no name", as does the 'Nameless' sentinel.
    df['hasname'] = (df['name'].notnull() & (df['name'] != 'Nameless')).astype(int)

    # na=False keeps a missing breed from breaking the integer cast.
    df['mix'] = df['breed'].str.contains('Mix', na=False).astype(int)

    # First colour word: split on '/' or a space and keep the leading token.
    df['color_simple'] = df['color'].str.split('/| ').str.get(0)

    print ('Dropping unused variables..')

    drop_cols = ['animalid', 'datetime', 'name', 'ageuponoutcome',
                 'sexuponoutcome', 'outcomesubtype', 'breed', 'color']
    df = df.drop(drop_cols, axis=1)

    # Using mean imputation of missing values. Can build on if necessary
    df['ageinyears'] = df['ageinyears'].fillna(df['ageinyears'].mean())

    return(df)

# Build the combined train + test feature frame once; prep_data() later
# selects the rows it needs through the `type` column.
df = data_import(train, test)

def prep_data(dataframe, type):
    """Turn one subset of the combined feature frame into model inputs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Combined frame holding at least the columns id, type and outcometype
        next to the predictor columns. It is not modified.
    type : str
        Value of the `type` column selecting the rows to return (the callers
        in this notebook pass 'train' or 'test'). The name shadows the
        builtin but is kept so existing calls keep working.

    Returns
    -------
    tuple
        (X, y, le, X_cols): the mean-imputed feature matrix produced by the
        imputer, the integer-encoded outcome labels, the LabelEncoder fitted
        on this subset's labels, and the feature column names.

    Notes
    -----
    One-hot encoding is performed on the whole frame before the rows are
    selected. Every subset therefore gets the same columns in the same order,
    which matters because X is handed on without its column labels. This
    replaces the former test-only `color_simple_Ruddy = 0` patch, which put
    that column in a different position for test than for train.
    """
    from sklearn import preprocessing

    frame = dataframe.drop('id', axis=1)
    row_mask = (frame['type'] == type).values

    # Encoding labels
    print ('Encoding labels of the outcome variable..')
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(np.asarray(frame.loc[row_mask, 'outcometype']))

    print ('Using one hot encoding for predictor variables..')
    predictors = frame.drop(['type', 'outcometype'], axis=1)
    # Cast to float so the matrix is purely numeric whatever dtype
    # get_dummies chooses for the indicator columns.
    encoded = pd.get_dummies(predictors).astype(float)
    X = encoded.loc[row_mask]
    X_cols = X.columns

    #Imputing missing values
    try:
        # scikit-learn >= 0.20
        from sklearn.impute import SimpleImputer
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    except ImportError:
        # older scikit-learn releases only ship the legacy Imputer
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)

    X = imp.fit_transform(X)

    return(X, y, le, X_cols)

print ('Running data preparation for train dataset')
X_train, y_train, le_train, X_train_cols = prep_data(df, 'train')

print ('Running data preparation for test dataset')
X_test, y_test, le_test, X_test_cols = prep_data(df, 'test')

# Column names present on only one side. A symmetric difference is used so
# that extra columns in the test set are caught as well as missing ones.
cols_equal = sorted(set(X_train_cols) ^ set(X_test_cols))

# prep_data hands back the imputer's transformed matrix rather than a labelled
# frame, so the columns must also line up position by position, not just by name.
if not cols_equal and list(X_train_cols) == list(X_test_cols):
    print ('Columns are the same!!')
else:
    print ('Columns are not the same..')

reading in train data..
reading in test data..
Running feature extraction process..
Dropping unused variables..
Running data preparation for train dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Running data preparation for test dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Columns are the same!!


In [7]:
from sklearn.svm import SVC

# Both models are built with probability=True so that predict_proba can be
# called on them afterwards.
linear_svm = SVC(C=0.025, kernel="linear", probability=True)
# No kernel is passed here; the estimator echoed by this cell reports kernel='rbf'.
rbf_svm = SVC(C=1, gamma=2, probability=True)

print ('Fitting linear SVM..')
linear_svm.fit(X_train, y_train)

print ('Fitting SVM with RBF kernel..')
# Left as the final bare expression so the notebook displays the fitted estimator.
rbf_svm.fit(X_train, y_train)

Fitting linear SVM..
Fitting SVM with RBF kernel..


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=2, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [91]:
def predict_output(clf, file, expected_rows=11456):
    """Predict class probabilities for X_test and save them as a submission CSV.

    Reads the notebook-level names X_test (feature matrix) and le_train
    (label encoder whose classes_ become the probability column headers).

    Parameters
    ----------
    clf : fitted classifier exposing predict_proba
        Model used for the prediction. It is now actually used; previously
        the global rbf_svm was called regardless of what was passed in.
    file : str
        Stem of the output file. The CSV is written to
        ./Submissions/<file>_<dd_mm_YYYY_HH_MM>.csv.
    expected_rows : int, optional
        Number of prediction rows required before anything is written.
        Defaults to 11456, the value that used to be hard-coded.

    Returns
    -------
    None
        Nothing is written when the row count does not match.
    """
    import os
    import time

    print ('Predicting outcomes..')
    y_pred = clf.predict_proba(X_test)
    n_rows = y_pred.shape[0]

    submission = pd.DataFrame(y_pred, columns=le_train.classes_)
    # 1-based row identifier as the first column
    submission.insert(0, 'ID', np.arange(1, n_rows + 1).astype(int))

    if n_rows != expected_rows:
        print ('Incorrect number of rows, please check')
        return

    print ('Correct number of rows')
    print ('Saving to CSV..')

    file_path = './Submissions/'
    # Create the target folder on first use instead of failing in to_csv.
    if not os.path.isdir(file_path):
        os.makedirs(file_path)

    current_time = time.strftime("%d_%m_%Y_%H_%M")
    file_name = file_path + file + '_' + current_time + '.csv'

    submission.to_csv(file_name, index=False)
    print ('Done!')

In [92]:
# Write the submission file for the RBF-kernel SVM; 'rbf_svm' becomes the
# stem of the CSV file name.
predict_output(rbf_svm, 'rbf_svm')

Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [94]:
# From the above OOB investigation we definitely want to look at RBF SVMs. This code is taken from:
# http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

# Utility function to move the midpoint of a colormap to be around
# the values of interest.

class MidpointNormalize(Normalize):
    """Normalize subclass that maps a chosen data value to the colormap centre.

    Calling an instance interpolates piecewise-linearly through three anchor
    points: vmin -> 0, midpoint -> 0.5 and vmax -> 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        """Remember `midpoint`, then defer the remaining setup to Normalize."""
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return `value` rescaled through the three anchors as a masked array.

        `clip` is accepted but not used by this implementation.
        """
        data_anchors = [self.vmin, self.midpoint, self.vmax]
        unit_anchors = [0, 0.5, 1]
        rescaled = np.interp(value, data_anchors, unit_anchors)
        return np.ma.masked_array(rescaled)

def svm_rbf(X, y):
    """Grid-search C and gamma for an RBF-kernel SVC and plot the scores.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to GridSearchCV.fit.
    y : array-like
        Class labels; also used to build the stratified shuffle splits.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can read best_params_,
        best_score_ and the per-candidate scores. Earlier the function
        returned nothing, which left the caller's `results` as None.

    Side effects
    ------------
    Prints the best parameter combination and shows a heat map of the
    validation score over the (C, gamma) grid.
    """
    print('Grid searching for the best RBF hyperparameters')
    ##############################################################################
    # Train classifiers
    #
    # For an initial search, a logarithmic grid with basis
    # 10 is often helpful. Using a basis of 2, a finer
    # tuning can be achieved but at a much higher cost.

    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    grid.fit(X, y)

    print("The best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))

    # The nine extra SVC fits that used to follow were stored in a list that
    # nothing read, so they have been removed to save training time.

    # Second field of each grid_scores_ entry is the mean validation score.
    # The flat list is reshaped to rows = C values, columns = gamma values.
    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    # NOTE(review): vmin=0.2 / midpoint=0.92 come from the scikit-learn example
    # this was adapted from -- confirm they suit this dataset's score range.
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()

    return grid

In [None]:
# Run the RBF hyper-parameter search on the training data. This is slow:
# a 13 x 13 (C, gamma) grid is evaluated on 5 shuffle splits.
results = svm_rbf(X_train, y_train)

Grid searching for the best RBF hyperparameters
