In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from ml_metrics import quadratic_weighted_kappa
from scipy.optimize import fmin_powell, fmin
from sklearn import preprocessing, cross_validation

%matplotlib inline

In [None]:
# Feature set option: "basic" features.
# Each loader cell in this notebook binds the same three names (X, Y, X_test),
# so whichever loader cell ran last decides what the model is trained on.
X = pd.read_csv('basic_features_train.csv')
X_test = pd.read_csv('basic_features_test.csv')
# The labels file has no header row: its first column becomes the index and
# column 1 holds the target, which is extracted as a 1-D NumPy array.
# `.values` replaces the deprecated `Series.as_matrix()`.
Y = pd.read_csv('basic_features_labels.csv', header=None, index_col=0)[1].values

In [2]:
# Feature set option: one-hot encoded features.
# Each loader cell in this notebook binds the same three names (X, Y, X_test),
# so whichever loader cell ran last decides what the model is trained on.
X = pd.read_csv('onehot_features_train.csv')
X_test = pd.read_csv('onehot_features_test.csv')
# The labels file has no header row: its first column becomes the index and
# column 1 holds the target, which is extracted as a 1-D NumPy array.
# `.values` replaces the deprecated `Series.as_matrix()`.
Y = pd.read_csv('onehot_features_labels.csv', header=None, index_col=0)[1].values

In [5]:
# Feature set option: one-hot + "nansum" features.
# Each loader cell in this notebook binds the same three names (X, Y, X_test),
# so whichever loader cell ran last decides what the model is trained on.
X = pd.read_csv('onehot_nansum_features_train.csv')
X_test = pd.read_csv('onehot_nansum_features_test.csv')
# The labels file has no header row: its first column becomes the index and
# column 1 holds the target, which is extracted as a 1-D NumPy array.
# `.values` replaces the deprecated `Series.as_matrix()`.
Y = pd.read_csv('onehot_nansum_features_labels.csv', header=None, index_col=0)[1].values

In [24]:
# Feature set option: one-hot + "eng" features.
# Each loader cell in this notebook binds the same three names (X, Y, X_test),
# so whichever loader cell ran last decides what the model is trained on.
X = pd.read_csv('onehot_eng_features_train.csv')
X_test = pd.read_csv('onehot_eng_features_test.csv')
# The labels file has no header row: its first column becomes the index and
# column 1 holds the target, which is extracted as a 1-D NumPy array.
# `.values` replaces the deprecated `Series.as_matrix()`.
Y = pd.read_csv('onehot_eng_features_labels.csv', header=None, index_col=0)[1].values

In [25]:
# Sanity check: show the dimensions of the train features, the test features
# and the label vector side by side.
print(*(table.shape for table in (X, X_test, Y)))

(59381, 977) (19765, 977) (59381,)


In [6]:
def eval_wrapper(yhat, y):
    """Score predictions against labels with quadratic weighted kappa.

    Both inputs are converted to NumPy arrays. The labels are cast to int;
    the predictions are rounded, clamped to the [min, max] range observed in
    the labels, and then cast to int before being scored.

    Parameters
    ----------
    yhat : array-like
        Raw (possibly fractional) predictions.
    y : array-like
        Reference labels.

    Returns
    -------
    The value returned by ``quadratic_weighted_kappa(yhat, y)`` on the
    cleaned-up integer arrays.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    preds = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(preds, labels)

In [7]:
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    """Shift the predictions of one bin by ``bin_offset`` and score the result.

    Parameters
    ----------
    data : 2-D array
        Laid out along the first axis as row 0 = raw predictions,
        row 1 = offset predictions, row 2 = labels.
    bin_offset : scalar or array broadcastable against the selected entries
        Amount added to the raw predictions that fall in the bin.
    sv : int
        Bin selector: a column belongs to the bin when its raw prediction,
        truncated with ``astype(int)``, equals ``sv``.
    scorer : callable
        Called as ``scorer(offset_predictions, labels)``.

    Returns
    -------
    Whatever ``scorer`` returns for rows 1 and 2 of ``data``.

    Notes
    -----
    ``data`` is modified in place: row 1 is overwritten for the selected
    columns, so adjustments made by earlier calls for other bins remain in
    row 1 on subsequent calls.
    """
    in_bin = data[0].astype(int) == sv
    data[1, in_bin] = data[0, in_bin] + bin_offset
    return scorer(data[1], data[2])

In [48]:
# XGBoost settings: squared-error regression on the label, evaluated with RMSE.
# `param`, `num_round` and `num_classes` are module-level names, so later cells
# can reuse them.
param = {   'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'nthread': 4,
            'max_depth': 7,
            'colsample_bytree':0.3,
            'subsample': 0.5,
            'eta': 0.01,
            'min_child_weight':50
        }
# NOTE(review): this seeds NumPy's global RNG only; presumably xgboost's row /
# column sampling is governed by its own `seed` parameter -- confirm.
np.random.seed(8888)

# 5-fold stratified split on the label vector.
skf = cross_validation.StratifiedKFold(Y, n_folds=5)

num_round = 2700
scores = []

# Number of offset bins to tune: bins 0 .. num_classes-1, where a prediction's
# bin is its value truncated with astype(int).
num_classes = Y.max()

for i, (train_index, test_index) in enumerate(skf):
    # `.values` replaces the deprecated `DataFrame.as_matrix()`.
    xg_train = xgb.DMatrix(X.iloc[train_index].values, label=Y[train_index])
    xg_test = xgb.DMatrix(X.iloc[test_index].values)
    # Train model
    bst = xgb.train(param, xg_train, num_round)
    # Predict on the training part of the fold. No eval set or early stopping
    # is passed to xgb.train above, so there is no "best" iteration to select;
    # predict with the full ensemble instead of ntree_limit=bst.best_iteration.
    y = bst.predict(xg_train)
    # Keep every prediction inside a bin that astype(int) maps to 0..8.
    y = np.clip(y, -0.99, 8.99)

    # Find the per-bin offsets that give the best score on the train part.
    # Rows of offset_train_preds: raw preds, offset preds, labels.
    offsets = np.ones(num_classes) * -0.5
    offset_train_preds = np.vstack((y, y, Y[train_index]))
    for j in range(num_classes):
        # apply_offset mutates offset_train_preds in place, so each bin is
        # tuned on top of the offsets already chosen for the previous bins.
        train_offset = lambda x: -apply_offset(offset_train_preds, x, j)
        offsets[j] = fmin_powell(train_offset, offsets[j], disp=True)

    # Predict on the held-out part of the fold and clip exactly like the train
    # predictions, so the bins used here match the bins the offsets were
    # fitted on.
    y = bst.predict(xg_test)
    y = np.clip(y, -0.99, 8.99)
    # Apply offsets found from optimizing on the train part.
    # Rows of data: raw preds, offset preds.
    data = np.vstack((y, y))
    bins = data[0].astype(int)
    for j in range(num_classes):
        data[1, bins == j] = data[0, bins == j] + offsets[j]

    # Snap to the valid label range 1..8.
    final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)

    # eval_wrapper's signature is (yhat, y): predictions first, labels second.
    # It clips its first argument to the range of its second, so the order
    # matters -- passing labels first would clip the true labels to the range
    # of the predictions.
    score = eval_wrapper(final_test_preds, Y[test_index])
    print('Fold {} scored {}'.format(i+1, score))
    scores.append(score)
# The second figure is two standard deviations, so label it as such.
print('Avg {}, 2*Std {}'.format(np.mean(scores), np.std(scores)*2))

Optimization terminated successfully.
         Current function value: -0.684509
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.688317
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.704946
         Iterations: 2
         Function evaluations: 40
Optimization terminated successfully.
         Current function value: -0.717137
         Iterations: 2
         Function evaluations: 48
Optimization terminated successfully.
         Current function value: -0.720807
         Iterations: 2
         Function evaluations: 35
Optimization terminated successfully.
         Current function value: -0.721101
         Iterations: 2
         Function evaluations: 48
Optimization terminated successfully.
         Current function value: -0.724702
         Iterations: 2
         Function evaluations: 49
Optimization terminated successful

In [49]:
# Refit on the complete training set and predict the competition test set.
# Relies on `param`, `num_round` and `num_classes` defined in the
# cross-validation cell, and on `apply_offset` defined earlier.
# `.values` replaces the deprecated `DataFrame.as_matrix()`.
xg_train = xgb.DMatrix(X.values, label=Y)
xg_test = xgb.DMatrix(X_test.values)
# Train model
bst = xgb.train(param, xg_train, num_round)
# Predict on train. No eval set or early stopping is passed to xgb.train, so
# there is no "best" iteration to select; predict with the full ensemble
# instead of ntree_limit=bst.best_iteration.
y = bst.predict(xg_train)
# Same clipping the cross-validation cell applies to its train predictions
# before fitting offsets: keeps every value in a bin astype(int) maps to 0..8.
y = np.clip(y, -0.99, 8.99)

# Find the per-bin offsets that give the best score on the training data.
# Rows of offset_train_preds: raw preds, offset preds, labels.
offsets = np.ones(num_classes) * -0.5
offset_train_preds = np.vstack((y, y, Y))
for j in range(num_classes):
    # apply_offset mutates offset_train_preds in place, so each bin is tuned
    # on top of the offsets already chosen for the previous bins.
    train_offset = lambda x: -apply_offset(offset_train_preds, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j], disp=True)

# Predict on test and keep the values inside the range the bins were fitted on.
y = bst.predict(xg_test)
y = np.clip(y, -0.99, 8.99)
# Apply offsets found from optimizing on the train.
# Rows of data: raw preds, offset preds. There are no labels for the test set,
# so no third row is stacked (a slice of the train labels would be unrelated
# to these rows).
data = np.vstack((y, y))
bins = data[0].astype(int)
for j in range(num_classes):
    data[1, bins == j] = data[0, bins == j] + offsets[j]

# Snap to the valid label range 1..8.
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)

Optimization terminated successfully.
         Current function value: -0.677192
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.681116
         Iterations: 1
         Function evaluations: 14
Optimization terminated successfully.
         Current function value: -0.698206
         Iterations: 2
         Function evaluations: 40
Optimization terminated successfully.
         Current function value: -0.710057
         Iterations: 2
         Function evaluations: 46
Optimization terminated successfully.
         Current function value: -0.714240
         Iterations: 2
         Function evaluations: 45
Optimization terminated successfully.
         Current function value: -0.714314
         Iterations: 2
         Function evaluations: 49
Optimization terminated successfully.
         Current function value: -0.718419
         Iterations: 2
         Function evaluations: 57
Optimization terminated successful

In [50]:
# Write the test-set predictions into the sample submission layout.
sample = pd.read_csv('sample_submission.csv')
# Visual check that there is exactly one prediction per submission row.
print(len(sample), len(final_test_preds))
# Bracket assignment always targets a column; attribute assignment
# (`sample.Response = ...`) only does so when the column already exists.
sample['Response'] = final_test_preds
# Make sure the output directory exists so to_csv does not fail on a fresh
# checkout.
submission_path = 'submissions/sub07.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sample.to_csv(submission_path, index=False)

19765 19765
