In [15]:
import pandas as pd
import numpy as np
from datetime import date
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# %env JOBLIB_TEMP_FOLDER=/tmp
%matplotlib inline

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


def attach_file_handler(target_logger, log_path, level=logging.DEBUG,
                        fmt="[%(asctime)s]: %(message)s"):
    """Attach an append-mode FileHandler for ``log_path`` to ``target_logger``.

    The call is idempotent: if the logger already carries a FileHandler that
    points at the same absolute path, that handler is reused instead of adding
    a second one. Without this guard, every re-run of the notebook cell stacks
    another handler on the root logger and each record is written N times.

    Parameters
    ----------
    target_logger : logging.Logger
        Logger that should write to the file.
    log_path : str
        Path of the log file (opened with mode ``'a'``).
    level : int
        Level set on the handler.
    fmt : str
        Format string for the handler's formatter.

    Returns
    -------
    logging.FileHandler
        The handler that is attached to ``target_logger`` for ``log_path``.
    """
    # FileHandler stores the absolute path in ``baseFilename``; compare like with like.
    resolved = os.path.abspath(log_path)
    for handler in target_logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == resolved:
            break
    else:
        handler = logging.FileHandler(log_path, mode='a')
        target_logger.addHandler(handler)
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter(fmt))
    return handler


logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger_output = attach_file_handler(logger, 'o2o-v2-stack.log')
# Kept as a module-level name because the original cell exposed it.
formatter = logger_output.formatter

In [9]:
def get_model_input(model='gbdt', set_param=False, train=False, pred=False,
                    data_dir='../input'):
    """Load the prepared CSV files and return the frames for one workflow.

    The mode is chosen by the first truthy flag, checked in the order
    ``set_param``, ``train``, ``pred``.

    Parameters
    ----------
    model : str
        For any value other than ``'xgb'`` the missing values in dataset1,
        dataset2 and dataset3 are replaced with -999. For ``'xgb'`` they are
        left as NaN (presumably because XGBoost handles missing values itself).
    set_param : bool
        Return features/labels of dataset1 only.
    train : bool
        Return an ordered 80/20 split of dataset1 + dataset2 (no shuffling).
    pred : bool
        Return all of dataset1 + dataset2 for fitting, plus dataset3 and the
        Submission frame.
    data_dir : str
        Directory holding ``dataset1.csv``, ``dataset2.csv``, ``dataset3.csv``
        and ``Submission.csv``. Defaults to the original hard-coded location.

    Returns
    -------
    tuple
        ``(X_train, Y_train)`` for ``set_param``;
        ``(X_train, Y_train, X_test, Y_test)`` for ``train``;
        ``(X_train, Y_train, X_pred, Submission)`` for ``pred``.
        The ``Y_*`` items are single-column DataFrames holding ``'label'``.

    Raises
    ------
    ValueError
        If none of ``set_param``, ``train`` or ``pred`` is set. Previously the
        function fell through and returned ``None``, which only surfaced later
        as an unpacking error at the call site.
    """
    # Fail fast, before reading the CSV files.
    if not (set_param or train or pred):
        raise ValueError(
            "get_model_input: one of set_param, train or pred must be True")

    def _read(file_name):
        return pd.read_csv(os.path.join(data_dir, file_name))

    dataset1 = _read('dataset1.csv')
    dataset2 = _read('dataset2.csv')
    dataset3 = _read('dataset3.csv')
    Submission = _read('Submission.csv')
    print('...read dataset complete...')

    if model != 'xgb':
        # If the model is not xgb, fill missing values with -999.
        dataset1 = dataset1.fillna(-999)
        dataset2 = dataset2.fillna(-999)
        dataset3 = dataset3.fillna(-999)

    if set_param:
        Y_train = dataset1[['label']]
        X_train = dataset1.drop(columns='label')
        return X_train, Y_train

    dataset = pd.concat([dataset1, dataset2], axis=0)

    if train:
        # Positional split: first 4/5 of the rows train, the remainder tests.
        split_point = len(dataset) * 4 // 5
        train_data = dataset.iloc[:split_point, :]
        test_data = dataset.iloc[split_point:, :]
        Y_train = train_data[['label']]
        X_train = train_data.drop(columns='label')
        Y_test = test_data[['label']]
        X_test = test_data.drop(columns='label')
        print(dataset.shape, train_data.shape, test_data.shape)
        return X_train, Y_train, X_test, Y_test

    # Only ``pred`` can be truthy here.
    Y_train = dataset[['label']]
    X_train = dataset.drop(columns='label')
    X_pred = dataset3
    return X_train, Y_train, X_pred, Submission

In [10]:
# Stacking run: fetch the full training frames, the frame to predict on,
# and the Submission frame in one call.
(X_train, Y_train,
 X_pred, Submission) = get_model_input(pred=True, model='stack')

...read dataset complete...


In [11]:
# Create the heamy dataset: training features/labels plus the frame to predict on.
dataset = Dataset(X_train=X_train, y_train=Y_train, X_test=X_pred)

In [12]:
# Initialize the first-stage models.
# Both estimators are stochastic (GBDT: subsample < 1 and max_features='sqrt';
# random forest: bootstrap rows and max_features='sqrt'). Without a fixed
# random_state every run produces a different fit, so pin the seed here.
# 621 is the same value the stacking step uses for its fold seed.
MODEL_SEED = 621

gbdt_params = {
    'verbose': 1,
    'learning_rate': 0.1,
    'n_estimators': 80,
    'subsample': 0.8,
    'max_features': 'sqrt',
    'min_samples_leaf': 60,
    'min_samples_split': 210,
    'max_depth': 12,
    'random_state': MODEL_SEED,
}
model_gbdt = Classifier(dataset=dataset, estimator=GradientBoostingClassifier, parameters=gbdt_params, name='gbdt')

rf_params = {
    'verbose': 1,
    'n_estimators': 80,
    'max_features': 'sqrt',
    'min_samples_leaf': 20,
    'min_samples_split': 60,
    'max_depth': 20,
    'random_state': MODEL_SEED,
}
model_rf = Classifier(dataset=dataset, estimator=RandomForestClassifier, parameters=rf_params, name='rf')

In [13]:
# Stack models: put both first-stage classifiers in one pipeline;
# stack() returns a new dataset with out-of-fold predictions.
pipeline = ModelsPipeline(model_gbdt, model_rf)
stack_kwargs = dict(k=5, seed=621)
stack_ds = pipeline.stack(**stack_kwargs)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5908           0.0539            5.64m
         2           0.5570           0.0312            5.99m
         3           0.5303           0.0241            6.05m
         4           0.5107           0.0188            5.99m
         5           0.4946           0.0147            5.73m
         6           0.4824           0.0124            5.72m
         7           0.4694           0.0102            5.66m
         8           0.4589           0.0090            5.62m
         9           0.4509           0.0076            5.55m
        10           0.4427           0.0063            5.45m
        20           0.4016           0.0017            4.41m
        30           0.3824           0.0007            3.46m
        40           0.3715           0.0003            2.62m
        50           0.3648           0.0002            1.83m
        60           0.3588           0.0002            1.15m
       

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.2s finished


In [16]:
# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
# Second-stage predictions; presumably one value per row of the frame that was
# passed in as X_pred -- TODO confirm against heamy's predict() contract.
results = stacker.predict()
# Adds/overwrites the 'Proba' column on the Submission frame in place.
Submission['Proba'] = results
# Written without the index and without a header row.
Submission.to_csv('stack_preds.csv', index=False, header=False)

In [23]:
# Validate results using 5-fold cross-validation (k=5), scored with ROC AUC.
# NOTE(review): the printed "accuracy" lines appear to be values of the scorer
# passed here (roc_auc_score), not classification accuracy -- confirm.
val_results = stacker.validate(k=5, scorer=metrics.roc_auc_score)

Metric: roc_auc_score
Folds accuracy: [0.9003401112589613, 0.8987312523495665, 0.9035328764639095, 0.9009359857845852, 0.9020440833287333]
Mean accuracy: 0.9011168618371512
Standard Deviation: 0.0016139427874324295
Variance: 2.6048113211051603e-06
