In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
%matplotlib inline



In [3]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini coefficient of `pred` against `actual`.

    Rows are ranked by descending prediction; ties are broken by original
    row order (the row index is the secondary sort key). The cumulative
    share of `actual` along that ranking is then compared with the share
    expected from a uniform ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels or losses).
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept only so existing callers keep working.

    Returns
    -------
    float
        The Gini value. If `actual` sums to zero the division below yields
        nan/inf, as in the original implementation.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    n = len(actual)
    # Columns: actual, prediction, original row index.
    # The builtin `float` replaces the removed `np.float` alias (same dtype).
    table = np.asarray(np.c_[ actual, pred, np.arange(n) ], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then index.
    order = np.lexsort((table[:,2], -1*table[:,1]))
    ranked_actual = table[order, 0]
    totalLosses = ranked_actual.sum()
    giniSum = ranked_actual.cumsum().sum() / totalLosses

    giniSum -= (n + 1) / 2.
    return giniSum / n
 
def gini_normalized(a, p):
    """Gini of predictions `p` against actuals `a`, divided by gini(a, a).

    The denominator is the Gini obtained when the actuals are ranked by
    themselves, so the result is the achieved Gini as a fraction of that.
    """
    achieved = gini(a, p)
    self_ranked = gini(a, a)
    return achieved / self_ranked
 
def test_gini():
    """Check gini() and gini_normalized() against a table of known values."""
    tolerance = 1e-6

    def close(x, y):
        return abs(x - y) < tolerance

    # Each row: (actual, pred, expected gini, expected normalized gini)
    cases = [
        ([1, 2, 3], [10, 20, 30], 0.111111, 1),
        ([1, 2, 3], [30, 20, 10], -0.111111, -1),
        ([1, 2, 3], [0, 0, 0], -0.111111, -1),
        ([3, 2, 1], [0, 0, 0], 0.111111, 1),
        ([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8),
        ([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1),
        ([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0),
        ([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428, 0.6),
        ([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1),
        ([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666, -0.333333),
    ]
    for actual, pred, expected, expected_norm in cases:
        assert close(gini(actual, pred), expected)
        assert close(gini_normalized(actual, pred), expected_norm)

Read data

In [4]:
#X_train= np.load(open('../wp004/train_matrix.bin','rb'))
#y_train= np.load(open('../wp004/train_labels.bin','rb'))

In [5]:
#X_test= np.load(open('../wp004/test_matrix.bin','rb'))
#y_test= np.load(open('../wp004/test_labels.bin','rb'))

In [9]:
# Read in the wp007 feature set and split it into train and test partitions.
# train_test_split is taken from sklearn.model_selection (the module this
# notebook already uses for cross-validation); the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
import sklearn.model_selection
df= pd.read_csv('../data/train.csv',usecols= ['id','target'])
# Split row positions rather than the data itself, so the same positions can
# select matching rows from both the label file and the feature file.
train,test= sklearn.model_selection.train_test_split(np.arange(df['target'].count()),test_size= 0.33,random_state=0)
y_train= df.target.values[train]
y_test= df.target.values[test]
df2= pd.read_csv('../wp007/full_248_train.csv')
# DataFrame.as_matrix() has been removed from pandas; .values is the
# equivalent accessor and is already used for the labels above.
X_train= df2.values[train,:]
X_test= df2.values[test,:]

In [10]:
# Drop the notebook's reference to the full feature DataFrame; the X_train /
# X_test arrays sliced from it are what the following cells use.
del df2

In [11]:
# import pandas as pd
# df= pd.DataFrame(X_train,columns= ['x'+str(i) for i in range(np.shape(X_train)[1])])
# df.loc[:,'y']= y_train
# df.to_csv('train.csv',index=False,columns=['y']+['x'+str(i) for i in range(np.shape(X_train)[1])])
# 
# df= pd.DataFrame(X_test,columns= ['x'+str(i) for i in range(np.shape(X_train)[1])])
# df.loc[:,'y']= np.zeros(np.shape(X_test)[0])
# df.to_csv('test.csv',index=False,columns=['y']+['x'+str(i) for i in range(np.shape(X_train)[1])])

Train LightGBM models (with an optional stacking ensemble)

In [12]:
# import sklearn.linear_model
from datetime import datetime,timedelta

In [13]:
# tc= datetime.now()
# lr= sklearn.linear_model.LogisticRegression(penalty='l2',
#                                             C=1.,
#                                             class_weight='balanced'
#                                             )
# lr.fit(X_train,y_train)
# print 'Training took: ',(datetime.now() - tc).total_seconds(),' seconds'

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Per-sample weights loaded from disk; no active code in the cells shown here
# reads them. The context manager closes the file handle once loading is done.
# Assumes the file holds a single array (np.save format) — TODO confirm.
with open('sample_weights.bin','rb') as weights_file:
    sample_weights = np.load(weights_file)

class Ensemble(object):
    """Two-level stacking ensemble built from out-of-fold predictions.

    Each base model is fitted on the training part of every fold and predicts
    the held-out part, so every training row receives one out-of-fold
    probability per base model. Those columns train the stacker. The
    matching test-set columns are the per-fold test predictions averaged
    over the folds.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used for the out-of-fold predictions.
    stacker : estimator
        Second-level classifier; must provide `fit` and `predict_proba`.
    base_models : sequence of estimators
        First-level classifiers; each must provide `fit` and `predict_proba`.
        They are refitted in place, once per fold.
    random_state : int, optional
        Seed for the shuffled StratifiedKFold split (default 2016, the value
        that used to be hard-coded).
    stacker_cv : int, optional
        `cv` argument for the cross-validated stacker score that is printed
        (default 3, the value that used to be hard-coded).
    """

    def __init__(self, n_splits, stacker, base_models, random_state=2016, stacker_cv=3):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state
        self.stacker_cv = stacker_cv

    def fit_predict(self, X, y, T):
        """Fit base models and stacker on (X, y); return stacker probabilities for T.

        Parameters
        ----------
        X : array-like
            Training features; converted with np.array and indexed by row.
        y : array-like
            Training labels, used for stratification and fitting.
        T : array-like
            Test features to predict.

        Returns
        -------
        numpy.ndarray
            Column 1 of the stacker's `predict_proba` on the stacked test
            features, one value per row of T.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                   random_state=self.random_state)
        # Materialise the folds once so every base model sees the same split.
        folds = list(splitter.split(X, y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))
        for i, clf in enumerate(self.base_models):

            # One column of test predictions per fold for this base model.
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X[train_idx], y[train_idx])

                # Out-of-fold prediction fills only the held-out rows.
                S_train[test_idx, i] = clf.predict_proba(X[test_idx])[:,1]
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=self.stacker_cv, scoring='roc_auc')
        # The printed score is 2*AUC - 1, not the raw AUC.
        print("Stacker score: %.5f" % (2.*results.mean()-1.))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [32]:

# Start the wall-clock timer that the end of this cell reports.
tc = datetime.now()
import lightgbm.sklearn

# Hyper-parameters for the first LightGBM classifier. The max_depth and
# random_state entries are left disabled, as before.
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1300,
    # 'max_depth': 10,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'num_leaves': 25,
    # 'random_state': 0,
}
gbm1 = lightgbm.sklearn.LGBMClassifier(**lgb_params)

# Alternative LightGBM configurations; each one is expanded into its own
# classifier further down in this cell.
lgb_params_2 = dict(
    learning_rate=0.005,
    n_estimators=3700,
    subsample=0.7,
    subsample_freq=2,
    colsample_bytree=0.3,
    num_leaves=16
)

lgb_params_3 = dict(learning_rate=0.02, n_estimators=800, max_depth=4)

lgb_params_4 = dict(
    learning_rate=0.05,
    n_estimators=600,
    num_leaves=35,
    min_child_samples=500
)

# lgb_params_5 = {
#     'learning_rate':0.01,
#     'n_estimators':1300,
#     'num_leaves':25,
#     'subsample': 0.8,
#     'subsample_freq': 10,
#     'colsample_bytree': 0.8 ,  
#     'min_child_samples': 500,
#     'num_leaves': 25
# }

# One classifier per alternative configuration. A generator is unpacked so no
# loop variable is left behind in the notebook namespace.
gbm2, gbm3, gbm4 = (
    lightgbm.sklearn.LGBMClassifier(**config)
    for config in (lgb_params_2, lgb_params_3, lgb_params_4)
)
# gbm5 = lightgbm.sklearn.LGBMClassifier(**lgb_params_5)

# Only gbm1 is trained in the active code; y_pred holds column 1 of its
# predict_proba output for the held-out split.
gbm1.fit(X_train, y_train)
y_pred = gbm1.predict_proba(X_test)[:, 1]

import sklearn.neural_network

# Settings for a two-hidden-layer MLP. The classifier is constructed here but
# not fitted by any active line in this cell.
mlp_params = dict(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    alpha=0.00000001,
    batch_size='auto',
    learning_rate_init=0.001,
    shuffle=True,
    tol=0.0001,
    early_stopping=True,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08
)
mlp = sklearn.neural_network.MLPClassifier(**mlp_params)
#mlp.fit(X_train,y_train)
#y_pred= mlp.predict_proba(X_test)[:,1]


import sklearn.linear_model

# Settings for a logistic-regression base model; the model itself is left
# disabled below, so only the parameter dict is created.
lr_base_params = dict(C=100.0, penalty='l2', class_weight='balanced')
# lr_base= sklearn.linear_model.LogisticRegression(**lr_base_params)
#lr_base.fit(X_train,y_train)
#y_pred= lr_base.predict_proba(X_test)[:,1]

# Default-configured logistic regression intended as the second-level stacker.
lr_stacker = sklearn.linear_model.LogisticRegression()

# lgbm_stacker= lightgbm.sklearn.LGBMClassifier(**{'learning_rate':0.01,
#                                                  'n_estimators':500,
#                                                  'num_leaves':6,
#                                                  'min_child_samples':500})

#import sklearn.ensemble
#rf_stacker= sklearn.ensemble.RandomForestClassifier(n_estimators=300,
#                                                   max_depth=7,
#                                                   min_samples_leaf= 300)

# knnparams={
#     'n_neighbors':5, 
#     'weights':'uniform', 
#     'algorithm':'kd_tree'
# }
# import sklearn.neighbors
# knn= sklearn.neighbors.KNeighborsClassifier(**knnparams)
# knn.fit(X_train,y_train)
# y_pred= knn.predict_proba(X_test)

# tc= datetime.now()
# stack = Ensemble(n_splits=3,
# stacker = lr_stacker,
#         base_models = (gbm1, gbm2, gbm3, gbm4))#, mlp, lr_base))        
# y_pred = stack.fit_predict(X_train, y_train, X_test) 

# A single pre-formatted argument keeps this line valid, with the same text,
# under both the Python 2 print statement and the Python 3 print function.
print('Training took:  %s  seconds' % ((datetime.now() - tc).total_seconds(),))

Training took:  153.139017  seconds


In [33]:
# Alias for the positive-class probabilities in y_pred; `prob` is not
# referenced again in the cells shown here.
prob= y_pred #gbm.predict_proba(X_test)

In [34]:
import sklearn.metrics
# ROC AUC of the predicted probabilities on the held-out split. A single
# pre-formatted argument keeps the line valid on both Python 2 and Python 3.
print('AUC=  %s' % (sklearn.metrics.roc_auc_score(y_test,y_pred),))

AUC=  0.639781503074


In [31]:
# Gini coefficient expressed through the AUC: Gini = 2*AUC - 1.
2.*sklearn.metrics.roc_auc_score(y_test,y_pred)-1.

0.28036380223742596

In [111]:
# Normalized Gini from the notebook's own gini helpers, for comparison with
# the 2*AUC - 1 figure. gini() breaks prediction ties by original row order,
# so the two numbers need not match exactly.
gini_normalized(y_test,y_pred)

0.27265345351512127

In [24]:
import pickle as pkl
# NOTE(review): in the cells shown here `stack` is only assigned inside a
# commented-out block of the training cell, so on a fresh kernel this raises
# NameError until that block is re-enabled — confirm which object to save.
# The context manager flushes and closes the file even if pickling fails.
with open('simple_stacker.pkl','wb') as pkl_file:
    pkl.dump(stack, pkl_file)