In [1]:
import os
import h5py
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.optimizers import SGD
import numpy as np
from sklearn.metrics import log_loss

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier 
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

# ========================================================
# Custom file with methods for NN, cross-validation, etc
# ========================================================
from nn_methods import run_model, get_index


Using Theano backend.


Here we have used several different classifiers (excluding NN) in order to compare logloss for different approaches. Metrics value logloss is calculated with 5-fold stratified cross-validation procedure. 

Recall about the task: multi-class classification probelm, 39 classes, 900K rows in a train set, 900K in test set. Main features are spatial data (X,Y coordinates and different transformations) and date features (year, month, day,..). 

Data preparation and exploration is considered in sfcrime_data_preparation.ipynb

### Load data

In [2]:
f = h5py.File("SFData.hdf5", "r")
Xtrain_d = f["X_train"][:]
ytrain = f["y_train"][:]
Xtest_d = f["X_test"][:]
dtest = f["X_test_ID"][:]
le_cat_class = f["le_cat_classes"][:]

In [34]:
# write results for submission
def write_result(y_t, filename):
    result = pd.DataFrame(y_t, index=dtest.Id)
    result.columns = le_cat.classes_
    result.to_csv(filename + '.csv', float_format='%.5f')

### Indicies for stratified k-fold cross-validation

In [3]:
k = 5
indtrain1, indtrain2 = get_index(Xtrain_d, k)

print map(lambda x: len(x), indtrain1)
print map(lambda x: len(x), indtrain2)

[702439, 702439, 702439, 702439, 702440]
[175610, 175610, 175610, 175610, 175609]


In [8]:
Xtrain_d.shape

(878049, 57)

### General method for classifiers with using k-fold cross-validation

In [4]:
def classifier_result(model, Xtrain_d, ytrain, indtrain1, indtrain2):
    losses = []
    allypred = np.zeros((len(Xtrain_d), 39)) # we will predict on each fold to make an ensemble
    for ind in range(k):
        print "===== FOLD %s/4 ======" % ind
        ind_test = indtrain1[ind]  # ind1
        ind_train = indtrain2[ind]  # ind2
        
        x_tr = Xtrain_d[ind_test,:]
        y_tr = ytrain[ind_test]
        
        x_te = Xtrain_d[ind_test,:]
        y_te = ytrain[ind_test]
        
        print "Fitting..."
        model.fit(x_tr, y_tr)
        
        print "Evaluating..."
        prob_te = model.predict_proba(x_te)
        ll = log_loss(y_te, prob_te)
        losses.append(ll)
        print ll
        allypred[ind_test, :] = prob_te
    return allypred, np.mean(losses)

## Logistic regression

In [33]:
# Don't recalculate! 
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs')
allypred, avg_loss = classifier_result(logreg, Xtrain_d, ytrain, indtrain1, indtrain2)

# avg_logloss = 2.67

Fitting...
Evaluating...
2.67145687663
Fitting...
Evaluating...
2.67035541355
Fitting...
Evaluating...
2.67112873925
Fitting...
Evaluating...
2.66960007376
Fitting...
Evaluating...
2.66967472531


In [None]:
avg_loss

In [36]:
print("Save to file...")
g = h5py.File("data_%s.hdf5" % "logreg", "w")
g.create_dataset("allypred", data=allypred)
g.create_dataset("avg_loss", data=avg_loss)
g.close()

Save to file...


In [None]:
# Final Log Regression.Training on all train set, calculate probabilities for test set 
logreg_final = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logreg_final.fit(Xtrain_d, ytrain)
prob_test = logreg_final.predict_proba(Xtest_d)
write_result(prob_test, 'submission_logreg')

## Random Forest

In [None]:
rand_forest = RandomForestClassifier(n_estimators=1000, max_depth=15)
allypred, avg_loss = classifier_result(rand_forest, Xtrain_d, ytrain, indtrain1, indtrain2)

# avg_logloss = 2.38

Fitting...


In [None]:
print("Save to file...")
g = h5py.File("data_%s.hdf5" % "rand_forest", "w")
g.create_dataset("allypred", data=allypred)
g.create_dataset("avg_loss", data=avg_loss)
g.close()

In [None]:
# Final Random Forest. Training on all train set, calculate probabilities for test set 
rf_final = RandomForestClassifier(n_estimators=1000, max_depth=15)
rf_final.fit(Xtrain_d, ytrain)
prob_test = rf_final.predict_proba(Xtest_d)
write_result(prob_test, 'submission_rf')

## XGBoost

In [None]:
def classifier_xg_boost_result(Xtrain_d, ytrain, indtrain1, indtrain2):
    param = {}
    param['booster'] = 'gbtree'
    param['objective'] = 'multi:softprob'
    param['num_class'] = 39
    param['eval_metric'] = 'logloss'
    # param['scale_pos_weight'] = 1.0
    param['bst:eta'] = 1
    param['bst:max_depth'] = 6
    # param['bst:colsample_bytree'] = 0.4
    # param['gamma'] = 0.5
    # param['min_child_weight'] = 5.
    param['max_delta_step'] = 1
    # param['silent'] = 1
    # param['nthread'] = 30
    num_round = 15
    plst = list(param.items())
    watchlist = []

    losses = []
    allypred = np.zeros((len(Xtrain_d), 39)) # we will predict on each fold to make an ensemble
    for ind in range(k):
        print "===== FOLD %s/4 ======" % ind
        ind_test = indtrain1[ind]  # ind1
        ind_train = indtrain2[ind]  # ind2
        
        x_tr = Xtrain_d[ind_test,:]
        y_tr = ytrain[ind_test]
        
        x_te = Xtrain_d[ind_test,:]
        y_te = ytrain[ind_test]
        
        dtrain_x = xgb.DMatrix(x_tr, label=y_tr)
        dtest_x = xgb.DMatrix(x_te, label=y_te)

        print "Fitting..."
        bst = xgb.train(plst, dtrain_x, num_round, watchlist)
        bst.save_model("xgboost_%s.model" % str(ind))
        
        print "Evaluating..."
        curpred = bst.predict(dtest_x)
        ll = log_loss(y_te, curpred)
        
        print ll
        losses.append(ll)
        allypred[ind_test, :] = curpred
    
    return allypred, np.mean(losses)



In [None]:
# average logloss for XgBoost
allypred, avg_loss = classifier_xg_boost_result(Xtrain_d, ytrain, indtrain1, indtrain2)

# avg. logloss for 70% of train set = 2.42

In [None]:
print("Save to file...")
g = h5py.File("data_%s.hdf5" % "xgboost", "w")
g.create_dataset("allypred", data=allypred)
g.create_dataset("avg_loss", data=avg_loss)
g.close()

In [None]:
# Final model for XgBoost, train on all data

dtrain_x = xgb.DMatrix(Xtrain_d, label=ytrain)
print "Fitting..."
bst = xgb.train(plst, dtrain_x, num_round, watchlist)
bst.save_model("final_xgboost.model")
curpred = bst.predict(Xtest_d)
write_result(curpred, 'submission_xgboost')


## KNN 

In [None]:
knn_classifier4 = KNeighborsClassifier(n_neighbors=4)
knn_classifier8 = KNeighborsClassifier(n_neighbors=8)
knn_classifier16 = KNeighborsClassifier(n_neighbors=16)
knn_classifier32 = KNeighborsClassifier(n_neighbors=32)
knn_classifier64 = KNeighborsClassifier(n_neighbors=64)


allypred4, avg_loss4 = classifier_result(knn_classifier4, Xtrain_d, ytrain, indtrain1, indtrain2)
allypred8, avg_loss8 = classifier_result(knn_classifier8, Xtrain_d, ytrain, indtrain1, indtrain2)
allypred16, avg_loss16 = classifier_result(knn_classifier16, Xtrain_d, ytrain, indtrain1, indtrain2)
allypred32, avg_loss32 = classifier_result(knn_classifier32, Xtrain_d, ytrain, indtrain1, indtrain2)
allypred64, avg_loss64 = classifier_result(knn_classifier64, Xtrain_d, ytrain, indtrain1, indtrain2)


Fitting...
Evaluating...


In [None]:
print avg_loss4, avg_loss8, avg_loss16, avg_loss32, avg_loss64