In [2]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [3]:
rng = np.random.RandomState(0)

train = pd.read_csv('train.csv')
train['type'] = 'train'

test = pd.read_csv('test.csv')
test['type'] = 'test'
test['OutcomeSubtype'] = ''
test['OutcomeType'] = ''

def data_import(df1, df2):

    df = (df1.append(df2)
         .rename(columns=str.lower))
    
    # functions to get new parameters from the column
    def get_sex(x):
        x = str(x)
        if x.find('Male') >= 0: return 'male'
        if x.find('Female') >= 0: return 'female'
        return 'unknown'
    
    def get_neutered(x):
        x = str(x)
        if x.find('Spayed') >= 0: return 'neutered'
        if x.find('Neutered') >= 0: return 'neutered'
        if x.find('Intact') >= 0: return 'intact'
        return 'unknown'

    df['sex'] = df.sexuponoutcome.apply(get_sex)
    df['neutered'] = df.sexuponoutcome.apply(get_neutered)
    
    def get_mix(x):
        x = str(x)
        if x.find('Mix') >= 0: return 'mix'
        return 'not'

    df['mix'] = df.breed.apply(get_mix)
    
    
    def calc_age_in_years(x):
        x = str(x)
        if x == 'nan': return np.nan
        age = int(x.split()[0])
        if x.find('year') > -1: return age 
        if x.find('month')> -1: return age / 12.
        if x.find('week')> -1: return age / 52.
        if x.find('day')> -1: return age / 365.
        else: return np.nan
    
    df['ageinyears'] = df.ageuponoutcome.apply(calc_age_in_years)
    
    # Creating some more date variables

    from datetime import datetime

    df['datetime'] = pd.to_datetime(df.datetime)
    df['year'] = df['datetime'].map(lambda x: x.year).astype(str)
    df['year'] = df['datetime'].map(lambda x: x.month).astype(str)
    df['wday'] = df['datetime'].map(lambda x: x.dayofweek).astype(str)
    
    def has_name(x):
        if x == 'Nameless': return 0
        else: return 1
    
    df['hasname'] = df['name'].map(has_name)
    
    drop_cols = ['animalid', 'datetime', 'name', 'ageuponoutcome', 'sexuponoutcome', 'outcomesubtype']

    df.drop(drop_cols, axis=1, inplace=True)

    df['mix'] = df['breed'].str.contains('Mix').astype(int)

    df['color_simple'] = df.color.str.split('/| ').str.get(0)
    df.drop(['breed', 'color'], axis = 1 , inplace = True)
    
    # Using mean imputation of missing values. Can build on if necessary
    
    df['ageinyears'] = df.ageinyears.fillna(df.ageinyears.mean())
    
    # Just using training data for model building
    
    return(df)

df = data_import(train, test)

def prep_data(dataframe, type):
    
    df = dataframe.copy()
    df.drop('id', axis = 1, inplace = True)
    df = df.loc[df.type == type,:]
    df.drop('type', axis = 1, inplace = True)
    
    # Encoding labels
    
    y = df['outcometype'].values
    
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    
    le.fit(y)
    
    y = le.transform(y)
    X = df
    X.drop(['outcometype'], axis=1, inplace=True)
    
    X = pd.get_dummies(X)
    
    from sklearn.preprocessing import Imputer

    # Imputing missing values
    imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    imp.fit(X)

    X = imp.transform(X)
    
    return(X, y, le)

X, y, le = prep_data(df, 'train')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

n_features = X_train.shape[1]
n_components = np.arange(0, n_features, 1)  # options for n_components

def compute_scores(X):
    
    # Fit the models
    pca = PCA()
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores

for X in [(X_train)]:
    pca_scores, fa_scores = compute_scores(X)
    n_components_pca = n_components[np.argmax(pca_scores)]
    n_components_fa = n_components[np.argmax(fa_scores)]

    pca = PCA(n_components='mle')
    pca.fit(X)
    n_components_pca_mle = pca.n_components_

    print("best n_components by PCA CV = %d" % n_components_pca)
    print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
    print("best n_components by PCA MLE = %d" % n_components_pca_mle)
    
pca = PCA(n_components= n_components_pca)
X_PC_48 = pca.fit(X_train).transform(X_train)

pca = PCA(n_components= n_components_pca_mle)
X_PC_mle = pca.fit(X_train).transform(X_train)

fa = FactorAnalysis(n_components= n_components_fa)
X_FA_11 = fa.fit(X_train).transform(X_train)

Automatically created module for IPython interactive environment
best n_components by PCA CV = 48
best n_components by FactorAnalysis CV = 11
best n_components by PCA MLE = 58


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis", "AdaBoosted decision trees"]

knn = KNeighborsClassifier(5)
linear_svm = SVC(kernel="linear", C=0.025)
rbf_svm = SVC(gamma=2, C=1)
d_tree = DecisionTreeClassifier(max_depth=5)
rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
adaboost = AdaBoostClassifier()
naive_bayes = GaussianNB()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
adaboost_rf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                         algorithm="SAMME",
                         n_estimators=200)

classifiers = [
    knn,
    linear_svm,
    rbf_svm,
    d_tree,
    rf,
    adaboost,
    naive_bayes,
    lda,
    qda,
    adaboost_rf
]

def est_score(X, y):

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X, y)
        score = clf.score(X, y)
        print(name, score)

In [9]:
# Looking at the original transformation
est_score(X_train, y_train)

Nearest Neighbors 0.702308354222
Linear SVM 0.62654794418
RBF SVM 0.877735792585
Decision Tree 0.63698604512
Random Forest 0.607579782259
AdaBoost 0.635639193385
Naive Bayes 0.0423135919787




Linear Discriminant Analysis 0.628119271204
Quadratic Discriminant Analysis 0.0178457854764
AdaBoosted decision trees 0.620561936473


In [10]:
# Looking at the first 4 principal components
est_score(X_PC_48, y_train)

Nearest Neighbors 0.702196116578
Linear SVM 0.626697594373
RBF SVM 0.877698380037
Decision Tree 0.636200381608
Random Forest 0.540087545363
AdaBoost 0.611208799431
Naive Bayes 0.174155411725
Linear Discriminant Analysis 0.628119271204




Quadratic Discriminant Analysis 0.0182199109581
AdaBoosted decision trees 0.604100415279


In [17]:
# Looking at the MLE determined PC transformation
est_score(X_PC_mle, y_train)

Nearest Neighbors 0.681132851959
Linear SVM 0.626622769277
RBF SVM 0.823337947548
Decision Tree 0.622582214075
Random Forest 0.580493097385
AdaBoost 0.614052153092
Naive Bayes 0.415017396835
Linear Discriminant Analysis 0.627707733174




Quadratic Discriminant Analysis 0.0184817987953
AdaBoosted decision trees 0.593587489244


In [11]:
# Looking at the determined FA transformation
# For some weird reason a large amount of these are the same.. FML
est_score(X_FA_11, y_train)

Nearest Neighbors 0.680870964121
Linear SVM 0.607841670096
RBF SVM 0.607841670096
Decision Tree 0.607841670096
Random Forest 0.607841670096
AdaBoost 0.607841670096
Naive Bayes 0.52830259269
Linear Discriminant Analysis 0.607841670096
Quadratic Discriminant Analysis 0.402895731228




AdaBoosted decision trees 0.607841670096


In [12]:
# From the above OOB investigation we definitely want ot look at RBF SVMs. This code is taken from:
# http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

# Utility function to move the midpoint of a colormap to be around
# the values of interest.

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

def svm_rbf(X, y):

    ##############################################################################
    # Train classifiers
    #
    # For an initial search, a logarithmic grid with basis
    # 10 is often helpful. Using a basis of 2, a finer
    # tuning can be achieved but at a much higher cost.

    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    grid.fit(X, y)

    print("The best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))

    C_2d_range = [1e-2, 1, 1e2]
    gamma_2d_range = [1e-1, 1, 1e1]
    classifiers = []
    for C in C_2d_range:
        for gamma in gamma_2d_range:
            clf = SVC(C=C, gamma=gamma)
            clf.fit(X, y)
            classifiers.append((C, gamma, clf))

    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()

Automatically created module for IPython interactive environment


In [None]:
svm_rbf(X_PC_48, y_train)