In [1]:
import pandas as pd
import numpy as np
from datetime import date
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# %env JOBLIB_TEMP_FOLDER=/tmp
%matplotlib inline

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger_output = logging.FileHandler('o2o-v2-blend.log', mode='a')
logger_output.setLevel(logging.DEBUG)
formatter = logging.Formatter("[%(asctime)s]: %(message)s")
logger_output.setFormatter(formatter)
logger.addHandler(logger_output)

In [2]:
def get_model_input(model='gbdt', set_param=False, train=False, pred=False):
    dataset1 = pd.read_csv('../input/dataset1.csv')
    dataset2 = pd.read_csv('../input/dataset2.csv')
    dataset3 = pd.read_csv('../input/dataset3.csv')
    Submission = pd.read_csv('../input/Submission.csv')
    print('...read dataset complete...')

    if model != 'xgb':
        # 如果模型不是xgb填充-999
        dataset1.fillna(-999, inplace=True)
        dataset2.fillna(-999, inplace=True)
        dataset3.fillna(-999, inplace=True)
    dataset = pd.concat([dataset1, dataset2], axis=0)
    if set_param:
        Y_train = dataset1[['label']]
        X_train = dataset1.drop(columns='label')
        return X_train, Y_train
    if train:
        split_point = len(dataset) * 4 // 5
        train_data = dataset.iloc[: split_point, :]
        test_data = dataset.iloc[split_point:, :]
        Y_train = train_data[['label']]
        X_train = train_data.drop(columns='label')
        Y_test = test_data[['label']]
        X_test = test_data.drop(columns='label')
        print(dataset.shape, train_data.shape, test_data.shape)
        return X_train, Y_train, X_test, Y_test
    if pred:
        Y_train = dataset[['label']]
        X_train = dataset.drop(columns='label')
        X_pred = dataset3
        return X_train, Y_train, X_pred, Submission

In [3]:
# blend
# load data
X_train, Y_train, X_pred, Submission = get_model_input(model='blend', pred=True)

...read dataset complete...


In [4]:
# creat dataset
dataset = Dataset(X_train, Y_train, X_pred)

In [5]:
# initialize model
gbdt_params = {'verbose': 1, 'learning_rate': 0.1, 'n_estimators': 80, 'subsample': 0.8, 'max_features': 'sqrt', 'min_samples_leaf': 60, 'min_samples_split': 210, 'max_depth': 12}
model_gbdt = Classifier(dataset=dataset, estimator=GradientBoostingClassifier, parameters=gbdt_params, name='gbdt')
rf_params = {'verbose': 1, 'n_estimators': 80, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 60, 'max_depth': 20}
model_rf = Classifier(dataset=dataset, estimator=RandomForestClassifier, parameters=rf_params, name='rf')

In [6]:
# Blend models
# Returns new dataset
pipeline = ModelsPipeline(model_gbdt, model_rf)
blend_ds = pipeline.blend(proportion=0.2, seed=621)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5921           0.0513            7.75m
         2           0.5572           0.0322            7.22m
         3           0.5320           0.0241            6.91m
         4           0.5119           0.0183            6.66m
         5           0.4967           0.0152            6.55m
         6           0.4839           0.0124            6.51m
         7           0.4706           0.0111            6.23m
         8           0.4612           0.0087            6.17m
         9           0.4526           0.0080            6.09m
        10           0.4455           0.0064            5.97m
        20           0.4018           0.0017            4.84m
        30           0.3815           0.0007            3.78m
        40           0.3707           0.0003            2.83m
        50           0.3635           0.0003            1.98m
        60           0.3584           0.0001            1.23m
       

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    4.8s finished


In [7]:
# Train LinearRegression on blended data (second stage)
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
results = blender.predict()
Submission['Proba'] = results
Submission.to_csv('blend_preds.csv', index=False, header=False)

In [10]:
# Validate results using 5 fold cross-validation
val_results = blender.validate(k=5, scorer=metrics.roc_auc_score)

Metric: roc_auc_score
Folds accuracy: [0.8969898121801695, 0.9022143528915822, 0.9024800566156469, 0.9029154449240097, 0.9078609308271796]
Mean accuracy: 0.9024921194877177
Standard Deviation: 0.003445461511184118
Variance: 1.1871205025051145e-05
