In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
import sklearn
# import seaborn as sns
%matplotlib inline
# NOTE(review): absolute, machine-specific Kaggle data directory. None of the cells
# visible in this notebook reference it (the CSV files are read via relative paths).
data_path = '/Users/dirlt/.kaggle/competitions/bike-sharing-demand/'
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.base import BaseEstimator, RegressorMixin
import skopt
from collections import defaultdict

Reference notebooks:
- https://www.kaggle.com/miteshyadav/comprehensive-eda-with-xgboost-top-10-percentile/notebook
- https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile

# Load Data

In [None]:
# Read the pre-processed train / test tables from the working directory.
df = pd.read_csv('mytrain.csv')
test_df = pd.read_csv('mytest.csv')

# Features: everything except the three target columns.
X = df.drop(columns=['casual', 'registered', 'count'])
# Targets are modelled on the log1p scale; predictions are mapped back with expm1 later.
y = np.log1p(df[['casual', 'registered', 'count']])

In [None]:
def input_fn(x, casual = True):
    """Return ``x`` without the columns that are withheld from the regressors.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table. Must contain every column in the fixed exclusion list
        (``DataFrame.drop`` raises ``KeyError`` otherwise); ``datetime`` is
        dropped as well, but only when it is present.
    casual : bool
        Accepted for call-site symmetry (casual vs. registered model); it is
        not used by the body, so both models receive the same columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``x`` itself is not modified.
    """
    excluded = ['dt_day', 'dt_hour', 'season', 'weather', 'dt_year', 'dt_month', 'dt_weekday', 'atemp']
    # The raw timestamp only exists in some inputs, so drop it conditionally.
    excluded += ['datetime'] if 'datetime' in x.columns else []
    return x.drop(columns=excluded)

def make_cv(X,n = 2):
    """Yield ``n`` (train_index, test_index) splits based on the ``dt_day`` column.

    Fold ``i`` (0-based) holds out the rows whose ``dt_day`` is ``18 - i`` or
    ``19 - i``; every other row goes to the training side. Consecutive folds
    therefore overlap by one day.
    NOTE(review): 18/19 are presumably the last days of each month present in
    the training data -- confirm against the dataset.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``dt_day`` column.
    n : int
        Number of folds to generate.

    Yields
    ------
    (pandas.Index, pandas.Index)
        Index labels (not positions) of the training and held-out rows.
    """
    day_of_month = X['dt_day']
    for i in range(n):
        # One vectorized membership test replaces two per-row Python lambdas.
        held_out = day_of_month.isin([18 - i, 19 - i])
        yield X[~held_out].index, X[held_out].index

def rmse(x, y):
    """Root-mean-squared error: the square root of ``mean_squared_error(x, y)``."""
    mse = mean_squared_error(x, y)
    return mse ** 0.5

def print_features(names, values, thres = 0.01):
    """Print features ranked by importance, stopping below a threshold.

    Pairs ``names`` with ``values``, sorts them by descending value, and prints
    one ``- name value`` line per feature until the first value smaller than
    ``thres``. Finishes with a single comma-separated line of the names that
    were printed.

    Parameters
    ----------
    names : iterable of str
    values : iterable of float
        Importance scores, aligned with ``names``.
    thres : float
        Cut-off; features scoring below it (and everything ranked after) are omitted.
    """
    ranked = sorted(zip(names, values), key = lambda pair: -pair[1])
    kept = []
    for name, value in ranked:
        if value < thres:
            # Sorted descending, so nothing after this point can pass either.
            break
        print('- {} {:.2f}'.format(name, value))
        kept.append(name)
    print(','.join(kept))

class MyEstimator(BaseEstimator, RegressorMixin):
    """Regressor that models ``casual`` and ``registered`` separately and sums them.

    Two wrapped regressors are trained on the same feature view (``input_fn``):
    ``ma`` on ``y['casual']`` and ``mb`` on ``y['registered']``. ``predict``
    maps both outputs back from the log1p scale with ``expm1``, adds them, and
    returns the total on the log1p scale again.

    Hyper-parameters are addressed with a prefix: ``a_<name>`` is forwarded to
    ``ma``, ``b_<name>`` to ``mb``, and a bare ``n_estimators`` to both.
    """

    def __init__(self, **kwargs):
        """Store the wrapped regressors and forward remaining keyword arguments.

        Parameters
        ----------
        ma, mb : estimator
            Regressors for the casual / registered targets. Both are expected
            to be supplied; the other methods read ``self.ma`` and ``self.mb``.
        **kwargs
            Any other keys are handed to ``set_params`` (prefix rules apply).
        """
        if 'ma' in kwargs:
            self.ma = kwargs.pop('ma')
        if 'mb' in kwargs:
            self.mb = kwargs.pop('mb')
        self.set_params(**kwargs)

    def fit(self, X, y):
        """Fit ``ma`` on ``y['casual']`` and ``mb`` on ``y['registered']``.

        The column names seen by each model are remembered in ``self.ca`` /
        ``self.cb`` for ``print_features``. Returns ``self`` so the call can be
        chained, as scikit-learn estimators conventionally allow.
        """
        input_a = input_fn(X, casual=True)
        input_b = input_fn(X, casual=False)
        self.ma.fit(input_a, y['casual'])
        self.mb.fit(input_b, y['registered'])
        self.ca = input_a.columns
        self.cb = input_b.columns
        return self

    def _predict_total(self, X):
        """Combine both models' predictions into a total on the log1p scale."""
        ya = self.ma.predict(input_fn(X, casual=True))
        yb = self.mb.predict(input_fn(X, casual=False))
        return np.log1p(np.expm1(ya) + np.expm1(yb))

    def predict(self, X, n = None):
        """Predict the total on the log1p scale.

        Parameters
        ----------
        X : pandas.DataFrame
        n : int or None
            When truthy, predict using only the first ``n`` entries of each
            model's ``estimators_``. Requires both wrapped models to expose
            that attribute. The full ensembles are put back afterwards, even
            if prediction raises.
        """
        if not n:
            return self._predict_total(X)
        full_a = self.ma.estimators_
        full_b = self.mb.estimators_
        self.ma.estimators_ = full_a[:n]
        self.mb.estimators_ = full_b[:n]
        try:
            return self._predict_total(X)
        finally:
            # Always restore, otherwise a failed call would leave the models truncated.
            self.ma.estimators_ = full_a
            self.mb.estimators_ = full_b

    def score(self, X, y , n = None):
        """Return the negative RMSE between ``y['count']`` and the prediction.

        Negated so that larger is better. ``n`` is forwarded to ``predict``.
        """
        y2 = self.predict(X, n = n)
        return -rmse(y['count'], y2)

    def set_params(self, **params):
        """Route parameters to the wrapped models by prefix.

        ``a_<name>`` goes to ``ma``, ``b_<name>`` goes to ``mb``, and
        ``n_estimators`` goes to both. Keys matching none of these rules are
        ignored. Returns ``self``.
        """
        pa = {}
        pb = {}
        for k, v in params.items():
            if k.startswith('a_'):
                pa[k[2:]] = v
            elif k.startswith('b_'):
                pb[k[2:]] = v
            elif k == 'n_estimators':
                # Let both regressors share the same n_estimators.
                pa[k] = v
                pb[k] = v
        self.ma.set_params(**pa)
        self.mb.set_params(**pb)
        return self

    def get_params(self, deep = True):
        """Return both models' parameters under ``a_`` / ``b_`` prefixes.

        The wrapped estimators themselves are included as ``ma`` and ``mb``.
        """
        pa = self.ma.get_params(deep)
        # Fixed: this used to read self.ma again, so every b_* entry reported
        # (and, when re-applied, overwrote mb with) the casual model's values.
        pb = self.mb.get_params(deep)
        p = {}
        for k in pa:
            p['a_' + k] = pa[k]
        for k in pb:
            p['b_' + k] = pb[k]
        p['ma'] = self.ma
        p['mb'] = self.mb
        return p

    def print_features(self, thres = 0.005):
        """Print each model's ``feature_importances_`` via the module-level ``print_features``.

        Requires a prior ``fit`` (uses the stored column names).
        """
        ca = self.ca
        cb = self.cb
        print('=====casual=====')
        print_features(ca, self.ma.feature_importances_, thres)
        print('=====registered=====')
        print_features(cb, self.mb.feature_importances_, thres)

In [None]:
# Display the feature columns that remain after input_fn drops the excluded ones.
input_fn(X).columns

In [None]:
def skopt_min_fn(model, X, y, params, n_calls = 10, n_cv = 2):
    """Tune ``model`` with ``skopt.forest_minimize`` over the ``make_cv`` folds.

    Parameters
    ----------
    model : estimator
        Needs ``set_params``, ``fit`` and ``score``. It is mutated in place and
        is left fitted with the last parameters that were evaluated.
    X, y : pandas objects
        Indexed with ``.loc`` using the label indices yielded by ``make_cv``.
    params : dict
        Maps parameter name to a search dimension accepted by skopt.
    n_calls : int
        Number of objective evaluations requested from skopt.
    n_cv : int
        Number of folds requested from ``make_cv``.

    Returns
    -------
    The skopt result object, with two extra attributes: ``best_params``
    (name -> value at the optimum) and ``best_score`` (same as ``fun``).
    """
    names = list(params.keys())
    spaces = list(params.values())
    # Objective values already computed, keyed by the parameter tuple.
    cache = {}

    def _objective(point):
        candidate = dict(zip(names, point))
        print('use params = {}'.format(candidate))
        key = tuple(point)
        if key in cache:
            return cache[key]
        model.set_params(**candidate)
        fold_scores = []
        for train_idx, test_idx in make_cv(X, n_cv):
            model.fit(X.loc[train_idx], y.loc[train_idx])
            fold_scores.append(model.score(X.loc[test_idx], y.loc[test_idx]))
        # The mean score is negated so that skopt, which minimizes, sees a loss.
        loss = -np.mean(fold_scores)
        print('score = {}'.format(loss))
        cache[key] = loss
        return loss

    res = skopt.forest_minimize(func = _objective, dimensions = spaces, n_calls = n_calls)
    res.best_params = dict(zip(names, res.x))
    res.best_score = res.fun
    return res

In [None]:
def select_n(model, X, y, ns, n_cv = 4):
    """Score several ensemble sizes while fitting only once per fold.

    The model is fitted with ``n_estimators = max(ns)`` on each ``make_cv``
    fold and then scored on the held-out rows once per ``n`` in ``ns`` through
    ``model.score(..., n = n)``.

    Parameters
    ----------
    model : estimator
        Needs ``set_params``, ``fit`` and a ``score`` that accepts ``n``.
    X, y : pandas objects
        Indexed with ``.loc`` using the label indices yielded by ``make_cv``.
    ns : sequence of int
        Candidate ensemble sizes.
    n_cv : int
        Number of folds requested from ``make_cv``.

    Returns
    -------
    (dict, list)
        ``{n: mean score}`` and the same pairs as a list sorted by descending
        mean score.
    """
    per_n = defaultdict(list)
    largest = max(ns)
    print('set n_estimators = {}'.format(largest))
    model.set_params(**{'n_estimators': largest})
    for fold, (train_idx, test_idx) in enumerate(make_cv(X, n_cv)):
        print('doing fit cv = {} ...'.format(fold))
        model.fit(X.loc[train_idx], y.loc[train_idx])
        X_val, y_val = X.loc[test_idx], y.loc[test_idx]
        for n in ns:
            per_n[n].append(model.score(X_val, y_val, n = n))
    mean_scores = {n: np.mean(per_n[n]) for n in ns}
    ranking = sorted(mean_scores.items(), key = lambda item: -item[1])
    return mean_scores, ranking

# RF model

In [None]:
%%time
print('cv for rf model')
# Two identically configured forests: rf0 models `casual`, rf1 models `registered`.
rf0, rf1 = (
    RandomForestRegressor(n_estimators=200, random_state = 42, verbose=0, n_jobs=4)
    for _ in range(2)
)
rf = MyEstimator(ma = rf0, mb = rf1)

In [None]:
# %%time
# params = {'a_min_samples_split': [3, 10], 'b_min_samples_split': [3, 10]}
# rf_opt = skopt_min_fn(rf, X, y, params, n_calls = 20, n_cv = 4)
# print(rf_opt.best_params, rf_opt.best_score)

In [None]:
# params = {'a_min_samples_split': [8,9,10,11,12], 'b_min_samples_split': [4,5,6,7,8]}
# rf_cv = GridSearchCV(rf, params, cv = make_cv(X,2), n_jobs = 1, verbose = 1)
# rf_cv.fit(X, y)
# print(rf_cv.best_score_, rf_cv.best_params_)

In [None]:
# %%time
# rf.set_params(**{'a_min_samples_split': 10, 'b_min_samples_split': 7})
# rf_scores = select_n(rf, X, y, ns = range(100, 2000, 100), n_cv = 4)
# print(rf_scores[1][:5])

In [None]:
%%time
# Tuned settings: a_* -> casual model, b_* -> registered model, n_estimators -> both.
rf_best_params = {'a_min_samples_split': 10, 'b_min_samples_split': 7, 'n_estimators': 1700}
rf.set_params(**rf_best_params)
rf.fit(X, y)

# Predictions come back on the log1p scale; convert to whole, non-negative counts.
output_y = rf.predict(test_df)
output = np.maximum(np.round(np.expm1(output_y)).astype(int), 0)

# `output` is already an integer array, so no further cast of the column is needed.
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=['datetime', 'count'])
df_output.to_csv('submission-rf.csv', index = False)

In [None]:
# Shell out to the Kaggle CLI to upload submission-rf.csv; the -m text records the settings used.
!kaggle competitions submit -c bike-sharing-demand -f submission-rf.csv -m 'ms=(10,7), n = 1700'

# GBM

In [None]:
%%time
print('cv for gbm model')
# Two identically configured boosters: gbm0 models `casual`, gbm1 models `registered`.
gbm0, gbm1 = (
    GradientBoostingRegressor(n_estimators=200, random_state = 42, verbose=0)
    for _ in range(2)
)
gbm = MyEstimator(ma = gbm0, mb = gbm1)

In [None]:
# %%time
# params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [3,4,5,6,7,8]}
# gbm_opt = skopt_min_fn(gbm, X, y, params, n_calls = 20, n_cv = 4)
# print(gbm_opt.best_params, gbm_opt.best_score)

In [None]:
# params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [3,4,5,6,7,8]}
# gbm_cv = GridSearchCV(gbm, params, cv = make_cv(X,2), n_jobs = 4, verbose = 1)
# gbm_cv.fit(X, y)
# print(gbm_cv.best_score_, gbm_cv.best_params_)

In [None]:
# %%time
# gbm.set_params(**{'a_max_depth': 4, 'b_max_depth':6})
# gbm_scores = select_n(gbm, X, y, ns = range(100, 2000, 20), n_cv = 4)
# print(gbm_scores[1][:5])

In [None]:
%%time
# Tuned settings: a_* -> casual model, b_* -> registered model, n_estimators -> both.
gbm_best_params = {'a_max_depth': 4, 'b_max_depth':6, 'n_estimators': 360}
gbm.set_params(**gbm_best_params)
gbm.fit(X, y)

# Predictions come back on the log1p scale; convert to whole, non-negative counts.
output_y = gbm.predict(test_df)
output = np.maximum(np.round(np.expm1(output_y)).astype(int), 0)

# `output` is already an integer array, so no further cast of the column is needed.
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=['datetime', 'count'])
df_output.to_csv('submission-gbm.csv', index = False)

In [None]:
# Shell out to the Kaggle CLI to upload submission-gbm.csv. The (4,6) in the -m text matches
# the a_max_depth / b_max_depth values in gbm_best_params.
!kaggle competitions submit -c bike-sharing-demand -f submission-gbm.csv -m 'ms=(4,6), n = 360'

# XGB

In [None]:
%%time
print('cv for xgb model')
# Two identically configured boosters: xgb0 models `casual`, xgb1 models `registered`.
# NOTE(review): xgboost documents `verbosity` as its logging control; confirm that
# `verbose=0` is honored (and not just passed through) by the installed version.
xgb0, xgb1 = (
    XGBRegressor(n_estimators=200, random_state = 42, verbose=0, n_jobs=4)
    for _ in range(2)
)
xgb = MyEstimator(ma = xgb0, mb = xgb1)

In [None]:
# params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [3,4,5,6,7,8], 'n_estimators': [100,1000]}
# xgb_opt = skopt_min_fn(xgb, X, y, params, n_calls = 50, n_cv = 4)
# print(xgb_opt.best_params, xgb_opt.best_score)

In [None]:
# params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [5,6,7,8]}
# xgb_cv = GridSearchCV(xgb, params, cv = make_cv(X,2), n_jobs = 1, verbose = 1)
# xgb_cv.fit(X, y)
# print(xgb_cv.best_score_, xgb_cv.best_params_)

In [None]:
%%time
# Tuned settings: a_* -> casual model, b_* -> registered model, n_estimators -> both.
xgb_best_params = {'a_max_depth': 5, 'b_max_depth': 4,'n_estimators': 925}
xgb.set_params(**xgb_best_params)
xgb.fit(X, y)

# Predictions come back on the log1p scale; convert to whole, non-negative counts.
output_y = xgb.predict(test_df)
output = np.maximum(np.round(np.expm1(output_y)).astype(int), 0)

# `output` is already an integer array, so no further cast of the column is needed.
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=['datetime', 'count'])
df_output.to_csv('submission-xgb.csv', index = False)

# average

In [None]:
# Blend the three per-model submissions into one by averaging their counts row by row.
df_rf = pd.read_csv('submission-rf.csv')
df_gbm = pd.read_csv('submission-gbm.csv')
df_xgb = pd.read_csv('submission-xgb.csv')

# Work on an explicit copy so that writing the blended column can never alias df_rf.
df_avg = df_rf.copy()

# Exact arithmetic mean of the three models. The earlier `(sum + 1) * 0.33` was not a
# mean: it scaled the result by 0.99 and added a constant 0.33 to every row.
df_avg['count'] = np.round((df_rf['count'] + df_gbm['count'] + df_xgb['count']) / 3).astype(int)
# Two-model alternative (RF + GBM only):
# df_avg['count'] = np.round((df_rf['count'] + df_gbm['count'] + 1) * 0.5).astype(int)
df_avg.to_csv('submission.csv', index = False)

In [None]:
# Shell out to the Kaggle CLI to upload the blended submission.csv.
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m 'avg'

[submission link](https://www.kaggle.com/c/bike-sharing-demand/submissions?sortBy=date&group=all&page=1)