In [1]:
import pandas as pd
import numpy as np
from datetime import date
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# %env JOBLIB_TEMP_FOLDER=/tmp
%matplotlib inline

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger_output = logging.FileHandler('o2o-v2-weighted.log', mode='a')
logger_output.setLevel(logging.DEBUG)
formatter = logging.Formatter("[%(asctime)s]: %(message)s")
logger_output.setFormatter(formatter)
logger.addHandler(logger_output)

In [2]:
def get_model_input(model='gbdt', set_param=False, train=False, pred=False):
    dataset1 = pd.read_csv('../input/dataset1.csv')
    dataset2 = pd.read_csv('../input/dataset2.csv')
    dataset3 = pd.read_csv('../input/dataset3.csv')
    Submission = pd.read_csv('../input/Submission.csv')
    print('...read dataset complete...')

    if model != 'xgb':
        # 如果模型不是xgb填充-999
        dataset1.fillna(-999, inplace=True)
        dataset2.fillna(-999, inplace=True)
        dataset3.fillna(-999, inplace=True)
    dataset = pd.concat([dataset1, dataset2], axis=0)
    if set_param:
        Y_train = dataset1[['label']]
        X_train = dataset1.drop(columns='label')
        return X_train, Y_train
    if train:
        split_point = len(dataset) * 4 // 5
        train_data = dataset.iloc[: split_point, :]
        test_data = dataset.iloc[split_point:, :]
        Y_train = train_data[['label']]
        X_train = train_data.drop(columns='label')
        Y_test = test_data[['label']]
        X_test = test_data.drop(columns='label')
        print(dataset.shape, train_data.shape, test_data.shape)
        return X_train, Y_train, X_test, Y_test
    if pred:
        Y_train = dataset[['label']]
        X_train = dataset.drop(columns='label')
        X_pred = dataset3
        return X_train, Y_train, X_pred, Submission

In [3]:
# weighted
# load data
X_train, Y_train, X_pred, Submission = get_model_input(model='weighted', pred=True)

...read dataset complete...


In [4]:
# creat dataset
dataset = Dataset(X_train, Y_train, X_pred)

In [5]:
# initialize model
gbdt_params = {'verbose': 1, 'learning_rate': 0.01, 'n_estimators': 1800, 'subsample': 0.8, 'max_features': 'sqrt', 'min_samples_leaf': 60, 'min_samples_split': 210, 'max_depth': 12}
model_gbdt = Classifier(dataset=dataset, estimator=GradientBoostingClassifier, parameters=gbdt_params, name='gbdt')
rf_params = {'verbose': 1, 'n_estimators': 1800, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 60, 'max_depth': 20}
model_rf = Classifier(dataset=dataset, estimator=RandomForestClassifier, parameters=rf_params, name='rf')

In [6]:
pipeline = ModelsPipeline(model_gbdt, model_rf)

In [7]:
weights = pipeline.find_weights(metrics.roc_auc_score)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.5929           0.0509           13.22m
         2           0.5618           0.0315            9.96m
         3           0.5340           0.0236            9.24m
         4           0.5150           0.0189            8.52m
         5           0.5010           0.0147            7.97m
         6           0.4858           0.0125            7.58m
         7           0.4755           0.0102            7.36m
         8           0.4668           0.0094            7.11m
         9           0.4545           0.0080            6.88m
        10           0.4486           0.0070            6.75m
        20           0.4026           0.0019            5.21m
        30           0.3822           0.0010            3.94m
        40           0.3727           0.0003            2.87m
        50           0.3644           0.0002            1.99m
        60           0.3583           0.0000            1.26m
       

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    1.7s finished


Best Score (roc_auc_score): 0.8992923439289697
Best Weights: [0.50003465 0.49996535]


In [22]:
results = pipeline.weight(weights)

In [26]:
results.models

[gbdt, rf]

In [24]:
weights

array([0.50003465, 0.49996535])

In [27]:
# read preds.csv
gbdt = pd.read_csv('./gbdt_preds_19.csv', header=None)
gbdt.columns = ['User_id', 'Coupon_id', 'Date_received', 'Proba']
rf = pd.read_csv('./rf_preds_4.csv', header=None)
rf.columns = ['User_id', 'Coupon_id', 'Date_received', 'Proba']

# weighted models preds
weighted = pd.read_csv('../input/Submission.csv')
weighted['Proba'] = weights[0] * gbdt.Proba + weights[1] * rf.Proba
weighted.to_csv('weighted_preds.csv', index=False, header=False)