In [1]:
from __future__ import division
import numpy as np
import pandas as pd
#import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

def logloss(attempt, actual, epsilon=1.0e-15):
    """Mean binary log loss (cross-entropy) of predicted probabilities.

    Parameters
    ----------
    attempt : array-like
        Predicted probabilities of the positive class.
    actual : array-like
        True labels (0 or 1), same shape as ``attempt``.
    epsilon : float, optional
        Predictions are clipped to ``[epsilon, 1 - epsilon]`` so that the
        logarithms stay finite for predictions of exactly 0 or 1.

    Returns
    -------
    float
        The negated mean log-likelihood; lower is better.
    """
    # Coerce to float arrays so plain Python lists work too: `1.0 - actual`
    # raises TypeError when `actual` is a list.
    actual = np.asarray(actual, dtype=float)
    attempt = np.clip(np.asarray(attempt, dtype=float), epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) + (1.0 - actual) * np.log(1.0 - attempt))

In [2]:
# Run configuration.
n_folds = 10     # number of stratified cross-validation folds
shuffle = False  # when True, the training rows are permuted before folding
verbose = True

# Fix NumPy's global random state so the optional shuffle is repeatable.
np.random.seed(0)

In [3]:
# Read the train and test tables and discard the 'Unnamed: 0' column in each.
data_trn, data_tst = [
    pd.read_csv(csv_path, index_col=False).drop('Unnamed: 0', axis=1)
    for csv_path in ('new_train.csv', 'new_test.csv')
]

# Every training column that is not the target is used as a feature.
target = ['ACTION']
feature_columns_to_use = [name for name in data_trn.columns if name not in target]

In [4]:
# Display the feature column names (every training column except the target).
feature_columns_to_use

['RESOURCE_enum',
 'MGR_ID_enum',
 'ROLE_ROLLUP_1_enum',
 'ROLE_ROLLUP_2_enum',
 'ROLE_DEPTNAME_enum',
 'ROLE_TITLE_enum',
 'ROLE_FAMILY_DESC_enum',
 'ROLE_FAMILY_enum',
 'ROLE_CODE_enum']

In [5]:
# Training feature matrix and 1-D label vector as NumPy arrays.
# `.values` replaces DataFrame.as_matrix, which is deprecated and was removed
# in pandas 1.0; the resulting arrays are the same.
X = data_trn[feature_columns_to_use].values
y = data_trn[['ACTION']].values.ravel()

In [6]:
# Submission-set feature matrix (same column order as X) and its id vector.
# `.values` replaces DataFrame.as_matrix, which is deprecated and was removed
# in pandas 1.0; the resulting arrays are the same.
X_submission = data_tst[feature_columns_to_use].values
y_id = data_tst[['id']].values.ravel()

In [7]:
#X, y, X_submission = load_data.load()

if shuffle:
    # Apply one random permutation to features and labels so rows stay aligned.
    idx = np.random.permutation(y.size)
    X, y = X[idx], y[idx]
    print('shuffle')

In [8]:
# Materialise the stratified (train, test) index pairs once so every model
# is evaluated on the same folds.
skf = list(StratifiedKFold(y, n_folds))

# Base models: random forest and extra-trees with each split criterion,
# followed by one gradient-boosting model.
clfs = [
    forest_cls(n_estimators=100, n_jobs=-1, criterion=split_criterion)
    for forest_cls in (RandomForestClassifier, ExtraTreesClassifier)
    for split_criterion in ('gini', 'entropy')
]
clfs.append(
    GradientBoostingClassifier(
        learning_rate=0.05,
        subsample=0.5,
        max_depth=6,
        n_estimators=50,
    )
)

In [9]:
pd.DataFrame(skf) # one row per fold: column 0 holds the training indices, column 1 the held-out (test) indices

Unnamed: 0,0,1
0,"[3076, 3102, 3108, 3120, 3161, 3182, 3206, 320...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[3076, 3102, 3108, 3120, 3161, 3182, 3206, 320..."
2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6474, 6487, 6504, 6520, 6536, 6550, 6557, 656..."
3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[9824, 9825, 9826, 9827, 9828, 9829, 9830, 983..."
4,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[13109, 13110, 13111, 13112, 13113, 13114, 131..."
5,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[16251, 16283, 16287, 16292, 16355, 16383, 163..."
6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[19650, 19651, 19652, 19653, 19654, 19655, 196..."
7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[22910, 22911, 22912, 22913, 22914, 22916, 229..."
8,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[26201, 26202, 26203, 26205, 26206, 26207, 262..."
9,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[29463, 29482, 29484, 29496, 29497, 29498, 294..."


In [10]:
print("Creating train and test sets for blending.")

# One column per base classifier; rows follow the training set and the
# submission set respectively. Both start as zeros and are filled later.
dataset_blend_train = np.zeros((len(X), len(clfs)))
dataset_blend_test = np.zeros((len(X_submission), len(clfs)))

Creating train and test sets for blending.


In [11]:
# Print each base classifier together with its position in clfs.
for j, clf in enumerate(clfs):
    print("%d %s" % (j, clf))

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
2 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
       

In [12]:
# Print the train/test index arrays of every fold as a sanity check.
for i, (train, test) in enumerate(skf):
    print("Fold %d" % i)
    print("%s train" % (train,))
    print("%s test" % (test,))

Fold 0
[ 3076  3102  3108 ..., 32766 32767 32768] train
[   0    1    2 ..., 3289 3290 3291] test
Fold 1
[    0     1     2 ..., 32766 32767 32768] train
[3076 3102 3108 ..., 6560 6561 6562] test
Fold 2
[    0     1     2 ..., 32766 32767 32768] train
[ 6474  6487  6504 ...,  9964  9975 10007] test
Fold 3
[    0     1     2 ..., 32766 32767 32768] train
[ 9824  9825  9826 ..., 13107 13108 13116] test
Fold 4
[    0     1     2 ..., 32766 32767 32768] train
[13109 13110 13111 ..., 16392 16393 16394] test
Fold 5
[    0     1     2 ..., 32766 32767 32768] train
[16251 16283 16287 ..., 19978 20001 20007] test
Fold 6
[    0     1     2 ..., 32766 32767 32768] train
[19650 19651 19652 ..., 23246 23283 23287] test
Fold 7
[    0     1     2 ..., 32766 32767 32768] train
[22910 22911 22912 ..., 26333 26342 26345] test
Fold 8
[    0     1     2 ..., 32766 32767 32768] train
[26201 26202 26203 ..., 29493 29494 29495] test
Fold 9
[    0     1     2 ..., 29493 29494 29495] train
[29463 29482 29484 .

In [13]:
# Build the level-one features: for each base classifier, collect its
# out-of-fold predictions on the training rows and its fold-averaged
# predictions on the submission rows.
for j, clf in enumerate(clfs):
    print("%d %s" % (j, clf))
    # One column per fold of this classifier's submission-set predictions.
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        print("Fold %d" % i)
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        # Refit on this fold's training rows only.
        clf.fit(X_train, y_train)

        # Positive-class probability for the held-out rows; stored in the
        # column of the training blend matrix that belongs to classifier j.
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission

        # The same fitted model also scores the whole submission set.
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]

    # Average the per-fold submission predictions into one column.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)
print("Blending.")

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
2 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_n

In [14]:
# Out-of-fold predictions on the training rows: one column per base classifier.
pd.DataFrame(dataset_blend_train)

Unnamed: 0,0,1,2,3,4
0,1.00,1.00,1.00,1.00,0.975568
1,0.98,0.98,0.96,0.97,0.963451
2,0.92,0.91,0.99,0.97,0.929209
3,0.97,1.00,0.98,0.98,0.962979
4,0.99,1.00,1.00,1.00,0.966847
5,0.97,0.97,0.97,1.00,0.953429
6,0.99,1.00,1.00,1.00,0.952111
7,1.00,0.99,1.00,1.00,0.960088
8,0.98,1.00,0.99,0.98,0.947600
9,0.96,1.00,1.00,1.00,0.952761


In [15]:
# Training labels (the ACTION column), for comparison with the predictions.
pd.DataFrame(y)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,0
6,1
7,1
8,1
9,1


In [16]:
# Fold-averaged predictions on the submission rows: one column per base classifier.
pd.DataFrame(dataset_blend_test)

Unnamed: 0,0,1,2,3,4
0,0.976,0.969,0.981,0.976,0.895547
1,0.994,0.996,0.995,0.996,0.962269
2,0.999,0.999,1.000,0.998,0.962711
3,0.999,0.997,0.998,0.999,0.956033
4,1.000,0.999,1.000,1.000,0.960941
5,1.000,1.000,0.999,1.000,0.971858
6,0.987,0.989,0.990,0.992,0.955504
7,0.998,0.997,0.999,0.999,0.977640
8,0.919,0.894,0.895,0.880,0.908795
9,0.996,0.999,1.000,1.000,0.941998


In [17]:
# Second-level model: a logistic regression whose inputs are the base
# classifiers' predictions (one feature per classifier).
clf = LogisticRegression().fit(dataset_blend_train, y)

# Positive-class probability for every submission row.
blend_proba = clf.predict_proba(dataset_blend_test)
y_submission = blend_proba[:, 1]

In [18]:
# Blended positive-class probabilities from the logistic-regression model.
pd.DataFrame(y_submission)

array([ 0.96623001,  0.97571218,  0.9763573 , ...,  0.96968655,
        0.95039842,  0.97705343])

In [19]:
print("Linear stretch of predictions to [0,1]")
# Min-max rescale the blended predictions. Skip the division when every
# prediction is identical, since a zero range would turn them all into NaN.
pred_min = y_submission.min()
pred_range = y_submission.max() - pred_min
if pred_range > 0:
    y_submission = (y_submission - pred_min) / pred_range

# Positional 1-based ids, sized from the predictions rather than a
# hard-coded row count (previously range(1, 58922)).
y_id = range(1, len(y_submission) + 1)
print("Saving Results.")
#np.savetxt(fname='test.csv', X=y_submission, fmt='%0.9f')

Linear stretch of predictions to [0,1]
Saving Results.


In [22]:
# Blended predictions after the min-max rescaling.
pd.DataFrame(y_submission)

Unnamed: 0,0
0,0.986732
1,0.996564
2,0.997233
3,0.996529
4,0.997125
5,0.998006
6,0.994906
7,0.998050
8,0.975008
9,0.995291


In [20]:
# np.savetxt(fname='test.csv', X=zip(y_id,y_submission), fmt='%0.9f')

In [21]:
def fn_WriteSubmission(_file_name, _y_predict, _y_id):
    """Write predictions to a CSV with an 'Id' index column and an 'ACTION' column.

    _file_name -- destination handed to DataFrame.to_csv
    _y_predict -- predicted values, one per row
    _y_id      -- row identifiers, used as the index of the output table
    """
    table = pd.DataFrame({"ACTION": _y_predict}, index=_y_id)
    # index_label sets the header of the index column in the written file.
    table.to_csv(_file_name, index_label='Id')
# Write the blended predictions, indexed by y_id, to blending.csv.
fn_WriteSubmission("blending.csv",y_submission,y_id)