In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
def load_train_data(path, random_state=None):
    """Read a training CSV and return shuffled, standardised features and labels.

    The first column is discarded (presumably a row id -- confirm against the
    CSV), the last column is taken as the class label and everything in
    between is used as a float32 feature matrix.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.
    random_state : int or None, optional
        Seed for the row shuffle. With the default ``None`` the global NumPy
        RNG is used, exactly as before; pass an integer to get the same row
        order on every run.

    Returns
    -------
    X : ndarray of float32
        Features after ``StandardScaler.fit_transform``.
    y : ndarray of int32
        Labels encoded by ``LabelEncoder``.
    encoder : LabelEncoder
        Fitted on the label column; ``encoder.classes_`` holds the label names.
    scaler : StandardScaler
        Fitted on the training features, to be reused on test data.
    """
    df = pd.read_csv(path)
    X = df.values.copy()
    # Shuffle rows in place. A dedicated RandomState keeps the global RNG
    # untouched when the caller asks for a reproducible order.
    if random_state is None:
        np.random.shuffle(X)
    else:
        np.random.RandomState(random_state).shuffle(X)
    X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler

In [3]:
def load_test_data(path, scaler):
    """Read a test CSV and return scaled features together with the row ids.

    The first column is returned as an array of strings (the ids); all
    remaining columns are cast to float32 and passed through
    ``scaler.transform``.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    scaler : object
        Anything exposing ``transform(array)``; the notebook passes the
        scaler returned by ``load_train_data``.

    Returns
    -------
    (X, ids) : tuple
        The transformed feature matrix and the string ids.
    """
    raw = pd.read_csv(path).values.copy()
    features = raw[:, 1:].astype(np.float32)
    ids = raw[:, 0].astype(str)
    return scaler.transform(features), ids

In [4]:
def make_submission(clf, X_test, ids, encoder, name='my_submission.csv'):
    """Write class probabilities for X_test to a CSV submission file.

    The file starts with a header ``id,<class 1>,<class 2>,...`` built from
    ``encoder.classes_`` and then holds one line per sample: the sample id
    followed by the probabilities returned by ``clf.predict_proba``.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``.
    X_test : array-like
        Feature matrix handed to ``clf.predict_proba``.
    ids : iterable
        One id per row of ``X_test``; each is converted with ``str``.
    encoder : object
        Provides ``classes_``, the class names used for the header.
    name : str, optional
        Output path; the file is overwritten if it exists.
    """
    y_prob = clf.predict_proba(X_test)
    with open(name, 'w') as f:
        f.write('id,')
        f.write(','.join(encoder.classes_))
        f.write('\n')
        for sample_id, probs in zip(ids, y_prob):
            # Build a real list: on Python 3 `map` returns an iterator, which
            # cannot be concatenated to a list with `+`.
            fields = [str(sample_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))

In [5]:
def correctRounding(prob):
    """Clamp a probability into the closed interval [10e-15, 1 - 10e-15].

    Keeps the value away from exactly 0 and 1 so that taking its logarithm
    stays finite. Note that the literal ``10e-15`` equals ``1e-14``.
    """
    lower = 10e-15
    upper = 1. - 10e-15
    capped = upper if upper < prob else prob
    return lower if lower > capped else capped

In [6]:
from matplotlib import pyplot
def build_hist(arr, nbins):
    """Draw a histogram of ``arr`` with ``nbins`` bins.

    Thin wrapper around ``pyplot.hist``; whatever that call returns is
    handed back to the caller unchanged.
    """
    histogram = pyplot.hist(arr, bins=nbins)
    return histogram
%matplotlib inline

In [12]:
from math import log
def loss_func(predictions, actual):
    """Mean multi-class logarithmic loss (lower is better, always >= 0).

    Parameters
    ----------
    predictions : sequence of sequences
        ``predictions[i][k]`` is the predicted probability that sample ``i``
        belongs to class ``k``.
    actual : sequence of int
        True class index of every sample, used to index into the
        corresponding row of ``predictions``.

    Returns
    -------
    float
        The average of ``-log(p_true)`` over all samples, where ``p_true`` is
        first clamped by ``correctRounding`` so the logarithm stays finite.
    """
    # -log(p) is already non-negative for p in (0, 1]; negating the mean a
    # second time would turn the loss negative and invert any scorer built
    # on top of it.
    losses = [-log(correctRounding(row[true_class]))
              for row, true_class in zip(predictions, actual)]
    return np.mean(losses)

## Load Data

In [8]:
# Load the training set; the fitted encoder and scaler are kept so the test
# data and the submission file can reuse them.
X, y, encoder, scaler = load_train_data('train.csv')

In [9]:
# Load the test set, transforming its features with the scaler obtained from
# load_train_data above.
X_evaluate, ids = load_test_data('test.csv', scaler)

In [10]:
from sklearn import datasets
from sklearn import cross_validation

In [11]:
# Hold out 10% of the rows (test_size = 0.1) for evaluation, with a fixed
# random_state for the split itself.
# NOTE(review): sklearn.cross_validation was removed in newer scikit-learn
# releases (superseded by sklearn.model_selection) -- confirm the target version.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, random_state = 42, test_size = 0.1)

## Load Classifiers

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): _n_trees is not referenced in any cell shown here -- possibly dead.
_n_trees = 500
# Number of trees for the random forest built below.
_n_estimators = 500

# Candidate models; only randomForest is fitted in the cells shown here.
randomForest = RandomForestClassifier(n_jobs=3, n_estimators=_n_estimators,  )
# probability=True is what makes SVC expose predict_proba.
vector_machine = SVC(probability=True)
knn = KNeighborsClassifier(n_neighbors = 15)

In [79]:
# Fit the random forest on the training split; `model` is whatever fit() returns.
model = randomForest.fit(X_train, y_train)
# Per-class probabilities for the held-out split.
result = model.predict_proba(X_test)

In [102]:
# Sanity check: the class probabilities of the first held-out row should sum to 1.
sum(result[0])

1.0

In [80]:
# Evaluate loss_func for the random forest on the held-out split.
loss_func(result, y_test)

0.56595258280111793

## Serious boosting

In [16]:
def multiLoglossScoring(estimator, X, y):
    """Scorer callable with the ``(estimator, X, y)`` signature.

    Computes ``estimator.predict_proba(X)``, evaluates ``loss_func`` against
    ``y`` and returns the negated result. Model-selection tools treat a
    larger score as better, so this is only meaningful while ``loss_func``
    returns a non-negative, lower-is-better loss.
    """
    class_probabilities = estimator.predict_proba(X)
    loss = loss_func(class_probabilities, y)
    return -loss

In [None]:
from sklearn.grid_search import GridSearchCV
# Rebinds the module-level _n_estimators (500 earlier); randomForest above was
# already constructed with the old value and is not affected.
_n_estimators = 1000

# 3 x 2 x 2 = 12 parameter combinations, each fitted with 1000 boosting stages
# per cross-validation fold -- expect this cell to run for a long time.
parameterGrid = {'learning_rate': [0.1, 0.01, 0.001],
                 'max_depth': [4,6],
                 'min_samples_leaf' : [3,5]
                }
est = GradientBoostingClassifier(n_estimators=_n_estimators)

# NOTE(review): GridSearchCV keeps the parameter set with the HIGHEST score, so
# multiLoglossScoring must return higher-is-better values -- verify its sign.
gs_cv = GridSearchCV(est, parameterGrid, scoring = multiLoglossScoring, n_jobs = 3).fit(X_train, y_train)

print('Best hyperparameters: %r' % gs_cv.best_params_)

In [106]:
# NOTE(review): `boostring_tryout` is not defined in any cell shown here; it
# presumably came from a deleted or out-of-order cell, so this fails on a fresh
# kernel. Confirm which fitted model was meant (e.g. the grid-search result).
boosting_result = boostring_tryout.predict_proba(X_test)
# Evaluate loss_func for the boosting model on the held-out split.
loss_func(boosting_result, y_test)

0.67189894386361326

In [1]:
import theano