In [9]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np 
import xgboost as xgb
import seaborn as sns
from sklearn import preprocessing, cross_validation, neighbors, cluster
from scipy.optimize import fmin_powell, fmin

from ml_metrics import quadratic_weighted_kappa

%matplotlib inline

In [46]:
X = pd.read_csv('basic_features_train.csv')
Y = pd.read_csv('basic_features_labels.csv', header=None, index_col=0)
X_test = pd.read_csv('basic_features_test.csv')
Y = Y[1].as_matrix()

In [None]:
X = pd.read_csv('onehot_features_train.csv')
Y = pd.read_csv('onehot_features_labels.csv', header=None, index_col=0)
X_test = pd.read_csv('onehot_features_test.csv')
Y = Y[1].as_matrix()

In [None]:
X = pd.read_csv('onehot_nansum_features_train.csv')
Y = pd.read_csv('onehot_nansum_features_labels.csv', header=None, index_col=0)
X_test = pd.read_csv('onehot_nansum_features_test.csv')
Y = Y[1].as_matrix()

In [2]:
X = pd.read_csv('onehot_eng_features_train.csv')
Y = pd.read_csv('onehot_eng_features_labels.csv', header=None, index_col=0)
X_test = pd.read_csv('onehot_eng_features_test.csv')
Y = Y[1].as_matrix()

In [3]:
print(X.shape, X_test.shape, Y.shape)

(59381, 977) (19765, 977) (59381,)


In [4]:
def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)

In [5]:
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    # data has the format of pred=0, offset_pred=1, labels=2 in the first dim
    data[1, data[0].astype(int)==sv] = data[0, data[0].astype(int)==sv] + bin_offset
    score = scorer(data[1], data[2])
    return score

In [37]:
np.random.seed(8888)

km1 = cluster.MiniBatchKMeans(n_clusters=4, verbose=False)
km2 = cluster.MiniBatchKMeans(n_clusters=8, verbose=False)
km3 = cluster.MiniBatchKMeans(n_clusters=8, verbose=False)

kms = []

for i in range(10):
    kms.append( cluster.MiniBatchKMeans(n_clusters=2**(i+1)) )

skf = cross_validation.StratifiedKFold(Y, n_folds=5)


for i, (train_index, test_index) in enumerate(skf):
    preds = []
    for j, km in enumerate(kms):
        print('Fitting {}'.format(j))
        km.fit(X.iloc[train_index], Y[train_index])
        print('Predicting {}'.format(j))
        preds.append(km.predict(X.iloc[test_index]))
    
    #for p in kms
    #pred1 = km1.predict(X.iloc[test_index])
    #pred2 = km2.predict(X.iloc[test_index])
    #print(pred1, pred2)
    print(preds)
    meta_feat = pd.DataFrame(preds).T
    bst = xgb.XGBClassifier(n_estimators=100)
    
    bst.fit(meta_feat.as_matrix(), Y[test_index])
    pred = bst.predict(meta_feat.as_matrix())
    print('SCORE:', eval_wrapper(Y[test_index], pred))
    
meta_feat.head()

Fitting 0
Predicting 0
Fitting 1
Predicting 1
Fitting 2
Predicting 2
Fitting 3
Predicting 3
Fitting 4
Predicting 4
Fitting 5
Predicting 5
Fitting 6
Predicting 6
Fitting 7
Predicting 7
Fitting 8
Predicting 8
Fitting 9

  init_size=init_size)
  init_size=init_size)



Predicting 9
[array([1, 0, 0, ..., 0, 0, 0], dtype=int32), array([1, 2, 2, ..., 2, 2, 0], dtype=int32), array([1, 0, 3, ..., 0, 0, 3], dtype=int32), array([ 9, 15,  5, ...,  0, 15,  5], dtype=int32), array([ 3,  8, 14, ..., 16, 24, 14], dtype=int32), array([ 6, 63,  9, ..., 47,  9, 22], dtype=int32), array([106,  36,  52, ...,  83,  60, 115], dtype=int32), array([102, 210, 131, ..., 219, 130, 239], dtype=int32), array([ 88,  73, 372, ...,  85, 319, 320], dtype=int32), array([486,  12, 208, ..., 381, 695, 659], dtype=int32)]
SCORE: 0.08234160070932961
Fitting 0
Predicting 0
Fitting 1
Predicting 1
Fitting 2
Predicting 2
Fitting 3
Predicting 3
Fitting 4


KeyboardInterrupt: 

In [38]:
meta_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,1,1,9,3,6,106,102,88,486
1,0,2,0,15,8,63,36,210,73,12
2,0,2,3,5,14,9,52,131,372,208
3,0,2,5,0,30,32,19,26,387,790
4,0,2,5,0,30,32,99,246,181,439


In [50]:
np.random.seed(8888)

n_neighbors = 5

num_classes = Y.max()

# Train model
knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=4, leaf_size=100)
knn.fit(X.iloc[5000:].as_matrix(), Y[5000:])
print('finish fit')
y = knn.predict(X.iloc[:5000])
score = eval_wrapper(Y[:5000], y)
print(score)

finish fit
0.22447704222501919


In [8]:
np.random.seed(8888)

n_neighbors = 5


skf = cross_validation.StratifiedKFold(Y, n_folds=5)

num_classes = Y.max()

for i, (train_index, test_index) in enumerate(skf):
    print('Start fold {}'.format(i))
    # Train model
    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=4)
    knn.fit(X.iloc[train_index].as_matrix(), Y[train_index])
    print('Complete training')
    # Predict model on train
    y = knn.predict(X.iloc[0:5000])
    print('Complete prediction on train')
    y = np.clip(y, -0.99, 8.99)
    
    # Find the thresholds that give the best score
    offsets = np.ones(num_classes) * -0.5
    offset_train_preds = np.vstack((y, y, Y[train_index]))
    print('Finding optimal offsets')
    for j in range(num_classes):
        train_offset = lambda x: -apply_offset(offset_train_preds, x, j)
        offsets[j] = fmin_powell(train_offset, offsets[j], disp=True)
    
    # Predict model on test
    y = knn.predict(X[0:15])
    print('Complete prediction on test')
    # Apply offsets found from optimizing on the train
    data = np.vstack((y, y ))
    for j in range(num_classes):
        data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]
        
    final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)
    
    score = eval_wrapper(Y[test_index], final_test_preds)
    print('Fold {} scored {}'.format(i+1, score))
    scores.append(score)
print('Avg {}, Std {}'.format(np.mean(scores), np.std(scores)*2))

Start fold 0
Complete training
Complete prediction on train


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [None]:
xg_train = xgb.DMatrix( X.as_matrix(), label=Y )
xg_test = xgb.DMatrix( X_test.as_matrix() )
# Train model
bst = xgb.train(param, xg_train, num_round)
# Predict model on train
y = bst.predict(xg_train, ntree_limit=bst.best_iteration)

# Find the thresholds that give the best score
offsets = np.ones(num_classes) * -0.5
offset_train_preds = np.vstack((y, y, Y))
for j in range(num_classes):
    train_offset = lambda x: -apply_offset(offset_train_preds, x, j)
    offsets[j] = fmin_powell(train_offset, offsets[j], disp=True)

# Predict model on test
y = bst.predict(xg_test, ntree_limit=bst.best_iteration)
# Apply offsets found from optimizing on the train
data = np.vstack((y, y, Y[:len(y)]))
for j in range(num_classes):
    data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]

final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)

In [None]:
sample = pd.read_csv('sample_submission.csv')
print(len(sample), len(final_test_preds))
sample.Response = final_test_preds
sample.to_csv('submissions/sub07.csv', index=False)