In [17]:
from __future__ import division
import numpy as np
import pandas as pd
#import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

def logloss(attempt, actual, epsilon=1.0e-15):
    """Binary log loss (cross-entropy), i.e. the score of the bioresponse competition.

    Parameters
    ----------
    attempt : array-like of float
        Predicted probabilities of the positive class.
    actual : array-like of {0, 1}
        True labels, same shape as ``attempt``.
    epsilon : float, optional
        Predictions are clipped to ``[epsilon, 1 - epsilon]`` so that
        ``log(0)`` is never evaluated.

    Returns
    -------
    float
        Mean negative log-likelihood of ``actual`` under ``attempt``.
    """
    # Coerce to float arrays so plain Python lists work too; without this,
    # `1.0 - actual` raises TypeError when `actual` is a list.
    attempt = np.clip(np.asarray(attempt, dtype=float), epsilon, 1.0 - epsilon)
    actual = np.asarray(actual, dtype=float)
    return - np.mean(actual * np.log(attempt) + (1.0 - actual) * np.log(1.0 - attempt))

In [2]:
# Blending configuration.
n_folds = 10      # number of stratified folds used to build the blend features
verbose = True    # not read by any cell shown in this notebook
shuffle = False   # when True, the training rows are permuted before folding

# Fix NumPy's global RNG so the optional shuffle above is repeatable.
np.random.seed(0)

In [3]:
# Read the train/test tables and discard the 'Unnamed: 0' column in one step each.
data_trn = pd.read_csv('new_train.csv', index_col=False).drop('Unnamed: 0', axis=1)
data_tst = pd.read_csv('new_test.csv', index_col=False).drop('Unnamed: 0', axis=1)

# Every training column that is not the target is used as a feature.
target = ['ACTION']
feature_columns_to_use = [col for col in data_trn.columns if col not in target]

In [4]:
# Show the feature column names (all training columns except the target).
feature_columns_to_use

['RESOURCE_enum',
 'MGR_ID_enum',
 'ROLE_ROLLUP_1_enum',
 'ROLE_ROLLUP_2_enum',
 'ROLE_DEPTNAME_enum',
 'ROLE_TITLE_enum',
 'ROLE_FAMILY_DESC_enum',
 'ROLE_FAMILY_enum',
 'ROLE_CODE_enum']

In [5]:
# Preview only the first rows instead of rendering the whole training frame.
data_trn.head(10)

Unnamed: 0,RESOURCE_enum,MGR_ID_enum,ROLE_ROLLUP_1_enum,ROLE_ROLLUP_2_enum,ROLE_DEPTNAME_enum,ROLE_TITLE_enum,ROLE_FAMILY_DESC_enum,ROLE_FAMILY_enum,ROLE_CODE_enum,ACTION
0,3050,4440,21,65,319,4,7,65,4,1
1,644,162,21,69,310,34,62,67,38,1
2,2706,1679,50,58,14,0,2590,3,0,1
3,2615,931,21,69,184,22,2357,65,23,1
4,3616,1010,15,13,160,70,380,4,77,1
5,4172,1685,19,18,23,36,68,3,40,0
6,1329,2051,21,69,320,60,2813,8,66,1
7,756,615,21,22,119,224,2605,21,234,1
8,1842,67,21,74,229,240,2823,1,251,1
9,5481,3797,33,38,13,0,2836,3,0,1


In [6]:
# Feature matrix (2-D) and label vector (1-D) as NumPy arrays.
# `.values` replaces DataFrame.as_matrix, which is deprecated and was removed
# in pandas 1.0; the resulting arrays are the same.
X = data_trn[feature_columns_to_use].values
y = data_trn['ACTION'].values

In [7]:
# Test-set feature matrix and ids, in the same column order as X.
# `.values` replaces DataFrame.as_matrix (deprecated, removed in pandas 1.0).
X_submission = data_tst[feature_columns_to_use].values
y_id = data_tst['id'].values

# Optionally permute the training rows (uses the RNG seeded in the config cell).
if shuffle:
    idx = np.random.permutation(y.size)
    X = X[idx]
    y = y[idx]

# Materialise the (train_indices, test_indices) pairs so they can be iterated
# once per base classifier.
skf = list(StratifiedKFold(y, n_folds))

In [8]:
# Level-1 (base) models whose out-of-fold predictions become the blend features.
# The folds (`skf`) are already built in the previous cell, so they are not
# recomputed here.
clfs = [
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
]

In [9]:
# Inspect the folds: a list of (train_indices, test_indices) array pairs.
skf

[(array([ 3076,  3102,  3108, ..., 32766, 32767, 32768]),
  array([   0,    1,    2, ..., 3289, 3290, 3291])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([3076, 3102, 3108, ..., 6560, 6561, 6562])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([ 6474,  6487,  6504, ...,  9964,  9975, 10007])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([ 9824,  9825,  9826, ..., 13107, 13108, 13116])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([13109, 13110, 13111, ..., 16392, 16393, 16394])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([16251, 16283, 16287, ..., 19978, 20001, 20007])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([19650, 19651, 19652, ..., 23246, 23283, 23287])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([22910, 22911, 22912, ..., 26333, 26342, 26345])),
 (array([    0,     1,     2, ..., 32766, 32767, 32768]),
  array([26201, 26

In [10]:
# print(...) with a single string argument behaves the same on Python 2 and 3.
print("Creating train and test sets for blending.")

# One column per base classifier:
#   dataset_blend_train - one row per training sample, filled fold by fold
#   dataset_blend_test  - one row per submission sample
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

Creating train and test sets for blending.


In [11]:
# Build the blend features: for every base classifier, fit on each fold's
# training part, predict the held-out part (column j of dataset_blend_train)
# and predict the submission set (averaged over folds into dataset_blend_test).
# The %-formatted print(...) calls emit the same text on Python 2 and 3.
for j, clf in enumerate(clfs):
    print("%d %s" % (j, clf))
    # One column of submission-set predictions per fold for this classifier.
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        print("Fold %d" % i)
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        clf.fit(X_train, y_train)
        # Column 1 of predict_proba is taken as the probability of class 1.
        dataset_blend_train[test, j] = clf.predict_proba(X_test)[:, 1]
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
    # Average the per-fold submission predictions into a single column.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
print("Blending.")

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
2 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_n

In [12]:
# X_train is whatever the stacking loop left behind (the last fold's training
# rows); preview only the first rows rather than rendering the whole matrix.
pd.DataFrame(X_train).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,3050,4440,21,65,319,4,7,65,4
1,644,162,21,69,310,34,62,67,38
2,2706,1679,50,58,14,0,2590,3,0
3,2615,931,21,69,184,22,2357,65,23
4,3616,1010,15,13,160,70,380,4,77
5,4172,1685,19,18,23,36,68,3,40
6,1329,2051,21,69,320,60,2813,8,66
7,756,615,21,22,119,224,2605,21,234
8,1842,67,21,74,229,240,2823,1,251
9,5481,3797,33,38,13,0,2836,3,0


In [13]:
# Level-2 model: logistic regression fitted on the base models' out-of-fold
# predictions, then applied to their fold-averaged submission predictions.
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
pred_min = y_submission.min()
pred_range = y_submission.max() - pred_min
# Skip the rescale when every prediction is identical; dividing by a zero
# range would turn the whole submission into NaN.
if pred_range > 0:
    y_submission = (y_submission - pred_min) / pred_range

# Submission ids are 1-based row numbers; derive the count from the
# predictions instead of hard-coding the test-set size.
# NOTE(review): this overrides the ids read from the test file's 'id' column
# earlier in the notebook - confirm that is intended.
y_id = range(1, len(y_submission) + 1)
print("Saving Results.")

Linear stretch of predictions to [0,1]
Saving Results.


In [14]:
# np.savetxt(fname='test.csv', X=zip(y_id,y_submission), fmt='%0.9f')

In [15]:
def fn_WriteSubmission(_file_name, _y_predict, _y_id):
    """Write predictions to a two-column CSV with header ``Id,ACTION``.

    Parameters
    ----------
    _file_name : str
        Destination path of the CSV file.
    _y_predict : array-like
        Values written to the ``ACTION`` column.
    _y_id : array-like
        Row labels written to the ``Id`` column; same length as ``_y_predict``.
    """
    columns = {"ACTION": _y_predict}
    # index_label supplies the header for the index column at write time.
    pd.DataFrame(columns, index=_y_id).to_csv(_file_name, index_label='Id')
# Write the blended predictions to disk.
fn_WriteSubmission(_file_name="blending.csv", _y_predict=y_submission, _y_id=y_id)

In [16]:
# 1-based submission ids, sized from the predictions rather than a hard-coded
# test-set length.
y_id = range(1, len(y_submission) + 1)