In [12]:
import pandas as pd
import numpy as np
from datetime import date
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# %env JOBLIB_TEMP_FOLDER=/tmp
%matplotlib inline

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# pd.set_option('display.max_rows', None)
# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Root-logger setup: append log records to 'o2o-v1-stacking.log' in the
# current working directory, formatted as "[timestamp]: message".
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-running this cell must not stack another FileHandler on the root logger
# (every extra handler would write each record to the file one more time),
# so a handler that already targets the same file is reused.
_log_path = os.path.abspath('o2o-v1-stacking.log')
logger_output = next(
    (
        handler for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == _log_path
    ),
    None,
)
if logger_output is None:
    logger_output = logging.FileHandler('o2o-v1-stacking.log', mode='a')
    logger.addHandler(logger_output)

# The handler itself accepts DEBUG, but the root logger level set above (INFO)
# is applied first, so DEBUG records never reach it.
logger_output.setLevel(logging.DEBUG)
formatter = logging.Formatter("[%(asctime)s]: %(message)s")
logger_output.setFormatter(formatter)

In [2]:
def get_model_input(model='gbdt', set_param=False, train=False, pred=False, input_dir='../input'):
    """Load the prepared CSV tables and return the frames for one workflow mode.

    Reads ``dataset1.csv``, ``dataset2.csv``, ``dataset3.csv`` and
    ``Submission.csv`` from ``input_dir``. Unless ``model`` is ``'xgb'``,
    missing values in the three dataset tables are replaced with ``-999``.

    Exactly one mode is served per call; when several flags are true the
    first match in the order ``set_param``, ``train``, ``pred`` wins.

    Parameters
    ----------
    model : str
        Name of the model the data is prepared for. Only the value ``'xgb'``
        changes behavior (missing values are left untouched).
    set_param : bool
        Return ``(X_train, Y_train)`` built from ``dataset1`` only.
    train : bool
        Return ``(X_train, Y_train, X_test, Y_test)``: ``dataset1`` and
        ``dataset2`` are concatenated and split by row position, the first
        ``len * 4 // 5`` rows for training and the rest for testing (no
        shuffling).
    pred : bool
        Return ``(X_train, Y_train, X_pred, Submission)``: training data is
        ``dataset1`` + ``dataset2``, ``X_pred`` is ``dataset3`` and
        ``Submission`` is the submission table as read from disk.
    input_dir : str
        Directory containing the four CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        See the mode descriptions above. Every ``Y_*`` is a single-column
        DataFrame holding the ``label`` column; every ``X_*`` derived from
        ``dataset1``/``dataset2`` has that column dropped.

    Raises
    ------
    ValueError
        If none of ``set_param``, ``train`` or ``pred`` is true. Previously
        this case read all files and silently returned ``None``.
    """
    if not (set_param or train or pred):
        raise ValueError(
            "get_model_input: one of set_param, train or pred must be True"
        )

    dataset1 = pd.read_csv(os.path.join(input_dir, 'dataset1.csv'))
    dataset2 = pd.read_csv(os.path.join(input_dir, 'dataset2.csv'))
    dataset3 = pd.read_csv(os.path.join(input_dir, 'dataset3.csv'))
    Submission = pd.read_csv(os.path.join(input_dir, 'Submission.csv'))
    print('...read dataset complete...')

    if model != 'xgb':
        # Every model other than xgb gets missing values replaced with -999.
        dataset1 = dataset1.fillna(-999)
        dataset2 = dataset2.fillna(-999)
        dataset3 = dataset3.fillna(-999)

    # Row-wise concatenation; the original row indices of both frames are kept.
    dataset = pd.concat([dataset1, dataset2], axis=0)

    if set_param:
        Y_train = dataset1[['label']]
        X_train = dataset1.drop(columns='label')
        return X_train, Y_train

    if train:
        # Positional 80/20 split (integer floor), in file order.
        split_point = len(dataset) * 4 // 5
        train_data = dataset.iloc[: split_point, :]
        test_data = dataset.iloc[split_point:, :]
        Y_train = train_data[['label']]
        X_train = train_data.drop(columns='label')
        Y_test = test_data[['label']]
        X_test = test_data.drop(columns='label')
        print(dataset.shape, train_data.shape, test_data.shape)
        return X_train, Y_train, X_test, Y_test

    # Only `pred` can be true here because of the guard at the top.
    Y_train = dataset[['label']]
    X_train = dataset.drop(columns='label')
    X_pred = dataset3
    return X_train, Y_train, X_pred, Submission

In [3]:
# Blending workflow: load the inputs via get_model_input in `pred` mode.
# model='blend' is not 'xgb', so the loader fills missing values with -999.
X_train, Y_train, X_pred, Submission = get_model_input(model='blend', pred=True)

...read dataset complete...


In [4]:
# Create the heamy Dataset from the training features, the training labels
# and the frame to predict on (passed positionally, in that order).
dataset = Dataset(X_train, Y_train, X_pred)

In [7]:
# First-stage model: a GradientBoostingClassifier wrapped in a heamy Classifier.
# NOTE(review): no 'random_state' is set although subsample < 1 is used, so
# repeated runs may differ — confirm whether that is intended.
gbdt_params = {
    'verbose': 1,
    'learning_rate': 0.1,
    'n_estimators': 80,
    'subsample': 0.8,
    'max_features': 9,
    'min_samples_leaf': 40,
    'min_samples_split': 200,
    'max_depth': 12,
}
model_gbdt = Classifier(
    dataset=dataset,
    estimator=GradientBoostingClassifier,
    parameters=gbdt_params,
    name='gbdt',
)

In [8]:
# Blend the first-stage models; only model_gbdt is in the pipeline here.
# blend() returns a new dataset (blend_ds) that feeds the second stage.
# NOTE(review): proportion=0.2 is presumably the holdout fraction used for
# blending — confirm against the heamy documentation.
pipeline = ModelsPipeline(model_gbdt)
blend_ds = pipeline.blend(proportion=0.2, seed=621)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5892           0.0538            5.70m
         2           0.5556           0.0319            5.10m
         3           0.5325           0.0224            4.79m
         4           0.5147           0.0172            5.55m
         5           0.4998           0.0146            5.99m
         6           0.4856           0.0116            6.22m
         7           0.4730           0.0103            6.53m
         8           0.4633           0.0084            6.69m
         9           0.4539           0.0076            6.71m
        10           0.4482           0.0067            6.85m
        20           0.4032           0.0018            5.55m
        30           0.3848           0.0006            4.28m
        40           0.3734           0.0004            3.20m
        50           0.3661           0.0003            2.24m
        60           0.3598           0.0000            1.44m
       

In [13]:
# Second stage: wrap LinearRegression in a heamy Regressor over the blended
# dataset and predict. `results` holds the second-stage predictions, which are
# continuous scores because the estimator is a regressor.
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
results = blender.predict()

In [14]:
# Print the prediction array's shape, then display the array itself as the
# cell output (numpy abbreviates long arrays with "...").
print(results.shape)
results

(116204, 1)


array([[0.02012357],
       [0.05021644],
       [0.05229346],
       ...,
       [0.05966063],
       [0.08550992],
       [0.06294739]])

In [16]:
# Validate the second-stage model using 5-fold cross-validation.
# NOTE: this rebinds `results`, replacing the prediction array returned by
# blender.predict() with the validation output.
# NOTE(review): no scorer is passed, so presumably no metric is reported —
# confirm against the heamy documentation.
results = blender.validate(k=5)

In [18]:
# Display the cross-validation output: per the recorded output, a tuple of two
# lists of per-fold arrays (presumably true labels and predictions — confirm).
results

([array([[0],
         [0],
         [0],
         ...,
         [0],
         [0],
         [0]], dtype=int64), array([[0],
         [1],
         [0],
         ...,
         [0],
         [0],
         [0]], dtype=int64), array([[0],
         [0],
         [0],
         ...,
         [0],
         [0],
         [0]], dtype=int64), array([[0],
         [1],
         [0],
         ...,
         [1],
         [0],
         [0]], dtype=int64), array([[0],
         [0],
         [0],
         ...,
         [0],
         [0],
         [0]], dtype=int64)], [array([[0.00258482],
         [0.10541922],
         [0.00144992],
         ...,
         [0.01353952],
         [0.04069764],
         [0.01465232]]), array([[0.02472527],
         [0.86899181],
         [0.07765581],
         ...,
         [0.00469142],
         [0.10899011],
         [0.01078428]]), array([[0.01637234],
         [0.00331525],
         [0.00194804],
         ...,
         [0.00069003],
         [0.0650459 ],
         [

In [10]:
# Display the blended dataset's repr.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, so it was run out of order — re-run top to bottom to confirm.
blend_ds

Dataset(72e3790fc507abf359d2b625e2fea9dd)